#!/usr/bin/perl
#
# (c) Copyright 2002 Software Garden, Inc.
# All Rights Reserved.
# Subject to Software License at the end of this file
#
#
# NOTE: You must have these packages:
#

use strict;
use CGI qw(:standard);
use LWP::UserAgent;
use HTML::Parser;
use URI;

# Remember when we started

my $start_program_time = (times)[0];

# Get config file if it exists. It is the same filename as this, in same
# directory, with ".cfg" at the end. Names are case sensitive.
# Format:
#    name=value # optional comment

my %config_values;
my $configfile = $ENV{SCRIPT_NAME} . ".cfg";
$configfile =~ s/^.*\//.\//;

# The config file is optional, so a failed open just leaves the defaults
# in place. The handle is lexical and the mode explicit so the file name
# can never be interpreted as anything but a file to read.

if (open(my $config_fh, '<', $configfile))
{
   while (<$config_fh>)
   {
      chomp;
      s/#.*//;     # strip trailing comment
      s/^\s+//;
      s/\s+$//;
      next unless length;
      my ($var_name, $var_value) = split(/\s*=\s*/, $_, 2);
      $config_values{$var_name} = $var_value;
   }
   close $config_fh;
}

# pick_setting returns the first candidate that is defined and matches
# $pattern. The numeric request parameters end up inside a regex quantifier
# and inside the page's CSS, so a value that is not a plain number is
# skipped and the next source (config file, then built-in default) is used.

sub pick_setting
{
   my ($pattern, @candidates) = @_;

   foreach my $candidate (@candidates)
   {
      return $candidate if defined $candidate && $candidate =~ $pattern;
   }
   return;
}

my $count_pattern = qr/\A[1-9][0-9]{0,3}\z/;                  # 1 .. 9999
my $size_pattern  = qr/\A[1-9][0-9]{0,2}(?:\.[0-9]{1,2})?\z/; # point sizes

# Initialize a variety of values

my $agent = $config_values{'agent'} || "Mozilla/4.0 (compatible)";
my $to_print = "";
sub add_to_print;
sub add_to_print_count;
sub url_escape;
my $line_pos = 0;
my $first_char = -1;

# Get CGI object to get parameters. Each of these may come from the request
# or from the config file:
#    &linemax=num, &indentmax=num   line breaking (defaults 100 and 25)
#    &fontsize=num, &titlesize=num  sizes in points (defaults 9 and 10)
#    &shorttags=yes                 to turn on short tags format

my $q = CGI->new;
my $line_max   = pick_setting($count_pattern, scalar $q->param('linemax'),   $config_values{'linemax'},   100);
my $indent_max = pick_setting($count_pattern, scalar $q->param('indentmax'), $config_values{'indentmax'}, 25);
my $fontsize   = pick_setting($size_pattern,  scalar $q->param('fontsize'),  $config_values{'fontsize'},  9);
my $titlesize  = pick_setting($size_pattern,  scalar $q->param('titlesize'), $config_values{'titlesize'}, 10);
my $short_tags = (($q->param('shorttags') || $config_values{'shorttags'} || 'no') ne 'no');

# Output start of HTML, including style information. The style sheet is
# passed as {-code => ...} so that it is emitted inline rather than being
# taken for the address of an external style sheet.

print $q->header();
print $q->start_html(-title => "Show Source",
                     -style => { -code => <<"EOF" });
PRE {font-size:${fontsize}pt; margin-left:10pt;}
PRE.header {font-size:${fontsize}pt; margin-left:10pt; color:gray}
TD.white {font-family:verdana,helvetica,sans-serif; font-size:${titlesize}pt; color:white; padding:10pt;}
DIV.footer {font-family:verdana,helvetica,sans-serif; font-size:7pt; text-align:center}
EOF

# &url parameter has URL to list

my $url = $q->param('url');
my $res;

if ($url)
{
   # Display URL broken into no more than titlemax characters at a time.
   # The URL comes straight from the request, so it is HTML-escaped before
   # being written into the page.

   my $wrapped_url = $url;
   my $titlemax = pick_setting($count_pattern, scalar $q->param('titlemax'), $config_values{'titlemax'}, 75);
   $wrapped_url =~ s/(.{$titlemax})/$1\n/g;
   $wrapped_url = $q->escapeHTML($wrapped_url);

   print <<"EOF";
  $wrapped_url  
EOF

   # 100 characters for debugging line-breaking code, saved in a comment (move it above EOF to show)
#1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890

   # Use LWP to get the page. We accept any type. Failure often still has some content worth displaying.
   # The status (e.g., 200 OK, or 404 Not Found) is displayed at the end of the page.
   # Only web schemes are allowed: without this restriction a request such as
   # ?url=file:///etc/passwd would list files from the server itself.

   my $ua = LWP::UserAgent->new;
   $ua->agent($agent);
   $ua->protocols_allowed(['http', 'https']);
   my $req = HTTP::Request->new(GET => $url);
   $req->header('Accept' => '*/*');
   $res = $ua->request($req);
   my $file = $res->content;

   # Remember time after file retrieved, then start listing in fixed width font

   my $got_file_time = (times)[0];

   add_to_print '
';

   if ($res->content_type eq 'text/html')
   {
      # HTML code is parsed by HTML::Parser. Handlers are called after each
      # start tag (<tag>), after each end tag (</tag>), etc.,
      # to output what is encountered.

      my $p = HTML::Parser->new(api_version => 3);
      $p->handler(start => \&start_handler, 'self,text,tagname,attr');
      $p->handler(end => \&end_handler, 'self,text');
      $p->handler(text => \&text_handler, 'self,text');
      $p->handler(declaration => \&declaration_handler, 'self,text');
      $p->handler(comment => \&comment_handler, 'self,text');
      $p->handler(process => \&process_handler, 'self,text');

      $p->parse($file);

      # The whole document was handed over in one call, so tell the parser
      # there is nothing more to come. Without this, text that the parser
      # is still holding back at the end of the input (for example a final
      # run of text or an unterminated tag) never reaches a handler and is
      # missing from the listing.
      $p->eof;
   }
   else
   {
      # Other file types are just output, with line-breaking and escaping

      add_to_print_count $file;
   }

   # End of the listing proper; everything below is the trailer.
   add_to_print "\r\n
\r\n";

   # Output header information

   add_to_print "
RESPONSE HEADER VALUES\r\n";

   # scan() hands each header field to the callback as (name, value);
   # each pair is listed on its own line.
   my $list_header_field = sub
   {
      my ($field_name, $field_value) = @_;
      add_to_print_count "${field_name}: ${field_value}\r\n";
   };
   $res->scan($list_header_field);

   add_to_print "\r\nRESPONSE PROTOCOL, CODE, AND BASE\r\n";

   my $protocol    = $res->protocol;
   my $status_line = $res->status_line;
   my $base        = $res->base;
   add_to_print_count "${protocol} ${status_line}\r\n${base}\r\n";

   add_to_print '
';

   # Everything collected so far goes out in one piece.
   print "$to_print\r\n";

   # Get time after all this and then output subtotals along with links to display in other ways

   my $end_time      = (times)[0];
   my $fetch_seconds = $got_file_time - $start_program_time;
   my $parse_seconds = $end_time - $got_file_time;
   my $time_string   = sprintf ("Page fetch: %.2f seconds, Parsing: %.2f seconds",
                                $fetch_seconds, $parse_seconds);
   my $show_again    = $ENV{SCRIPT_NAME} . '?url=' . url_escape($url, '^A-Za-z0-9\-\.\/');

   print <<"EOF";
 
 
EOF
}
else
{
   # No URL -- output instructions

   print <<"EOF";
To show the source of a URL, invoke this program with "?url=url-to-show".

An easy way is to create a "bookmarklet" by using "Add to Favorites" or
"Bookmark this link" on the following link:


EOF
}

# Finish -- output closing stuff

print $q->end_html();

# # # # # # # # # #
# Subroutines
# # # # # # # # # #

# # # # # # # # # #
# add_to_print appends the value of its argument to $to_print.
# Nothing is escaped or counted, so this is the routine to use for markup
# that belongs to the listing page itself.

sub add_to_print
{
   $to_print .= $_[0];
}

# # # # # # # # # #
# add_to_print_count processes each character given to it and then adds them
# to $to_print. The processing is as follows:
#
#    Tabs are treated as 5 spaces.
#    Leading white space (up to $line_max - $indent_max) is counted for each line.
#    Lines longer than $line_max are broken. Each fragment is indented by
#       the amount of the leading white space and then ". . ." is added.
#       The strings $line_end and $line_start are output around the
#       break to let you continue links without having the indenting
#       shown as a link. (Both are treated as empty when not supplied.)
#    & and < characters are escaped.
#
# Arguments: ($st, $line_end, $line_start). The current column and the
# indent of the current line are kept in the file-level variables $line_pos
# and $first_char, so consecutive calls continue the same output line.
#
# This is all done in a simple character by character loop to aid in adding
# functionality.

sub add_to_print_count
{
   my ($st, $line_end, $line_start) = @_;

   # Most callers pass only the text
   $line_end   = '' unless defined $line_end;
   $line_start = '' unless defined $line_start;
   return unless defined $st;

   $st =~ s/\t/     /g;   # a tab counts as 5 columns

   my $out = '';

   foreach my $ch ($st =~ /(.)/gs)
   {
      # A line end restarts both the column count and the indent detection.
      # The -1 becomes 0 below so the line end itself takes no column.
      if (($ch eq "\n") || ($ch eq "\r"))
      {
         $line_pos = -1;
         $first_char = -1;
      }
      $line_pos++;

      # Line is full: close any open link, start a new line indented like
      # this one, mark it as a continuation, and reopen the link.
      if ($line_pos > $line_max)
      {
         my $indent = $first_char > 1 ? " " x ($first_char - 1) : "";
         $out .= "$line_end\r\n$indent. . .$line_start";
         $line_pos = 5 + $first_char;   # ". . ." takes 5 columns
      }

      # The indent of a line is the column of its first non-blank
      # character, capped so fragments never start further right than
      # $line_max - $indent_max.
      if ($first_char == -1
          && (($line_pos > $line_max - $indent_max)
              || ($ch ne " " && $ch ne "\r" && $ch ne "\n")))
      {
         $first_char = $line_pos;
      }

      # The text being listed must show up literally in the browser
      if ($ch eq '&')
      {
         $out .= '&amp;';
      }
      elsif ($ch eq '<')
      {
         $out .= '&lt;';
      }
      else
      {
         $out .= $ch;
      }
   }

   add_to_print($out);
}

# # # # # # # # # #
# url_escape converts all characters matched by the given character class
# into %XX form to URL-encode.
# This is similar to URI::Escape, which isn't always available.

sub url_escape
{
   # $text  - the string to encode
   # $codes - the inside of a regex character class naming the characters
   #          to encode; callers pass a negated class ("^...") listing the
   #          characters that are safe to keep
   # Returns the encoded copy of $text.

   my ($text, $codes) = @_;

   $text = '' unless defined $text;
   $text =~ s/([$codes])/sprintf "%%%02X", ord($1)/egi;
   $text =~ tr/ /+/;   # any blank that was allowed to stay is sent as "+"
   return $text;
}

# # # # # # # # # #
# _link_tags_for returns the ($a_start, $a_end) markup that turns an
# attribute value into a link, or two empty strings when no link can be made.
#
#    $target - the href/src value as found in the page (may be relative)
#    $mode   - 'listing' to link back to this script with ?url= set to the
#              absolute form of $target, or 'browser' to link straight to
#              the absolute URL in a separate window
#
# Only http and https targets are linked, so that values such as
# "javascript:..." copied from the listed page never become live links.

sub _link_tags_for
{
   my ($target, $mode) = @_;

   return ('', '') unless defined $target && length $target && $res;

   # Relative references are resolved against the base of the fetched page
   my $absolute = eval { URI->new_abs($target, $res->base) };
   return ('', '') unless $absolute;

   my $scheme = $absolute->scheme;
   return ('', '') unless defined $scheme && $scheme =~ /\Ahttps?\z/i;

   if ($mode eq 'listing')
   {
      my $href = $ENV{SCRIPT_NAME} . '?url=' . url_escape($absolute->as_string, '^A-Za-z0-9\-\.\/');
      return ('<a href="' . $href . '">', '</a>');
   }

   my $href = $absolute->as_string;
   $href =~ s/&/&amp;/g;
   $href =~ s/"/&quot;/g;
   $href =~ s/</&lt;/g;
   $href =~ s/>/&gt;/g;
   return ('<a href="' . $href . '" target="_blank">', '</a>');
}

# # # # # # # # # #
# _add_linked_tag outputs a start tag whose $attr_name attribute (href or
# src) is shown as a link.
#
#    $thetext    - the tag exactly as it appeared in the page
#    $tagname    - the tag name
#    $attr_name  - 'href' or 'src'
#    $attr_value - the parsed value of that attribute, if any
#    $mode       - link mode, passed on to _link_tags_for
#
# The link markup goes out through add_to_print so it is neither escaped
# nor counted; everything that is part of the listing goes through
# add_to_print_count, which is also given the markup so it can close and
# reopen the link around a line break.

sub _add_linked_tag
{
   my ($thetext, $tagname, $attr_name, $attr_value, $mode) = @_;

   my ($a_start, $a_end) = _link_tags_for($attr_value, $mode);

   if ($short_tags)
   {
      if ($attr_value)
      {
         add_to_print_count("<$tagname $attr_name=\"");
         add_to_print($a_start);
         add_to_print_count($attr_value, $a_end, $a_start);
         add_to_print($a_end);
         add_to_print_count('">');
      }
      else
      {
         add_to_print_count("<$tagname>");
      }
      return;
   }

   # Find the text around the attribute value and output normally along with
   # the value as a link.
   # NOTE: Does not make links of values that aren't in quotes

   if ($thetext =~ /^(.*)(\s$attr_name=(["']))(.*?)(\3)(.*)$/is)
   {
      my ($before, $opening, $value, $closing, $after) = ($1, $2, $4, $5, $6);

      add_to_print_count($before . $opening);
      add_to_print($a_start);
      add_to_print_count($value, $a_end, $a_start);
      add_to_print($a_end);
      add_to_print_count($closing . $after);
   }
   else
   {
      add_to_print_count($thetext);
   }
}

# # # # # # # # # #
# Handler for "start tag" events
#
# Receives ($self, $thetext, $tagname, $attr): the parser, the tag as
# written, the tag name, and a hash reference of the tag's attributes.
# In short tags mode only the tag name is listed, plus the href/src value
# for the tags that get links.

sub start_handler
{
   my ($self, $thetext, $tagname, $attr) = @_;

   # Process tags with "href=" attributes that can be looked at with this program

   if ($tagname eq 'a' || $tagname eq 'area' || $tagname eq 'link')
   {
      # Handle "<a name=...>" as special case: there is nothing to link to

      if ($tagname eq 'a' && $attr->{name})
      {
         if ($short_tags)
         {
            add_to_print_count("<$tagname name=\"" . $attr->{name} . '">');
         }
         else
         {
            add_to_print_count($thetext);
         }
      }
      else
      {
         # Make the text of the href value a link to this script with the
         # &url argument an absolute reference to the href value

         _add_linked_tag($thetext, $tagname, 'href', $attr->{href}, 'listing');
      }
   }

   # Process tags with "src=" attributes that should be looked at
   # in a separate window processed by the browser (e.g., images)

   elsif ($tagname eq 'img' || $tagname eq 'embed' || $tagname eq 'input')
   {
      _add_linked_tag($thetext, $tagname, 'src', $attr->{src}, 'browser');
   }

   # Process tags with "src=" attributes that can be looked at with this program
   # ("link" is not listed here: it is always handled by the href case above)

   elsif ($tagname eq 'script' || $tagname eq 'frame' || $tagname eq 'layer' || $tagname eq 'ilayer')
   {
      _add_linked_tag($thetext, $tagname, 'src', $attr->{src}, 'listing');
   }

   # All other tags

   elsif ($short_tags)
   {
      add_to_print_count("<" . $tagname . ">");
   }
   else
   {
      add_to_print_count($thetext);
   }
}

# # # # # # # # # #
# Process end of tag event -- just output as is

sub end_handler
{
   my ($self, $thetext) = @_;
   add_to_print_count("$thetext");
}

# # # # # # # # # #
# Process text event. Output text in bold.
# The bold markup is closed and reopened around line breaks so the
# continuation marker is not shown in bold.

sub text_handler
{
   my ($self, $thetext) = @_;
   add_to_print("<b>");
   add_to_print_count("$thetext", '</b>', '<b>');
   add_to_print("</b>");
}

# # # # # # # # # #
# Process declaration event -- just output as is

sub declaration_handler
{
   my ($self, $thetext) = @_;
   add_to_print_count("$thetext");
}

# # # # # # # # # #
# Process comment event -- just output as is

sub comment_handler
{
   my ($self, $thetext) = @_;
   add_to_print_count("$thetext");
}

# # # # # # # # # #
# Process process event -- just output as is

sub process_handler
{
   my ($self, $thetext) = @_;
   add_to_print_count("$thetext");
}

__END__

=head1 NAME

srcsvc - Server-based service to view the source HTML code of a web page using a browser.

=head1 SYNOPSIS

B<srcsvc>[B<?url=>url-to-use] [B<&shorttags=>yes|no] [B<&fontsize=>num] [B<&titlesize=>num] [B<&linemax=>num] [B<&titlemax=>num] [B<&indentmax=>num]

=head1 DESCRIPTION

I<srcsvc> is a Perl program that runs as a CGI script on a web server. It is usually invoked using a bookmarklet while viewing a web page. The returned HTML will be a page that shows the source HTML that makes up the original page. The effect is similar to using "View / Source" or "View / Page Source" in a browser, with some nice additions:

=over 5

=item *

Long lines are broken at a maximum length and wrapped, with the fragments indented appropriately.

=item *

href= and src= attribute values are displayed as links that either pop up a window (for images), or display the contents listed by this program (html pages, css and Javascript files, etc.). This is useful for looking at the files in a frameset, or reading boilerplate Javascript.

=item *

The text is displayed in bold, with markup (tags, etc.) shown normally. This makes it easier to look through code when using the text as a guide.

=item *

You can optionally display in "short tags" mode with most HTML tags reduced to just their names, without the attributes or their values shown (except for href and src). This form is useful when you just want to understand the basic structure of a page. Indenting is preserved.

=item *

Values in the HTTP header are displayed below the listing. Most of these are not available in normal source listings or page info, since they come from the request itself. These include the server type, last modified date/time, cookies, etc. In addition, the protocol and status values in the response are displayed, along with the URI base. This base may be set by various HTTP/HTML means, and may be the result of redirection (and therefore not correspond to the requested URL).

=item *

A configuration file may be used to override default values for a variety of things.

=back

=head1 OPTIONS

Here are the values that may be changed from either the configuration file or in the request. The configuration file must be in the same directory on the server as this program, with the same filename with the addition of ".cfg" at the end. It is a text file in the following format:

   # comment
   name1=value1
   name2=value2 # optional comment

The names of the variables that may be set are:

=over 5

=item B<agent>

The string to be sent with the request for the file to be listed indicating which "Agent" is making the URL request. This is usually the name of a browser or spider. Some servers return different HTML for different agents. Most server logs log this value. The default is "Mozilla/4.0 (compatible)" (without the quotes) and can only be set in the configuration file.

=item B<fontsize>

The size in points for the listing. This is used in a CSS style as font-size:B<fontsize>pt. The default is 9 (font-size:9pt).

=item B<linemax>

The maximum line length in characters. Lines of HTML or text longer than this will be broken at this length, with the remaining fragments indented. The default value is 100.

=item B<indentmax>

Fragments of lines that are broken because they are too long are indented to line up appropriately under the start of the line. In some cases this results in very short fragments. This value lets you set the maximum indent. Lines will not be indented more than B<indentmax> characters. In all cases, the string ". . ." is added to the beginning of the fragment to indicate a continued line. The default is 25.

=item B<titlesize>

The size in points for the URL displayed as a title. This is used in a CSS style as font-size:B<titlesize>pt. The default is 10 (font-size:10pt).

=item B<titlemax>

The URL displayed as a title at the beginning of the page is broken into lines no longer than B<titlemax> characters. The default is 75.

=item B<shorttags>

You can optionally display in "short tags" mode with most HTML tags reduced to just their names, without the attributes or their values shown (except for href and src, and "