Tutorial02: Parsing HTML with Perl

Download the source code

Comment on this tutorial

#####################################################################
# Author:     Steve DeGraeve
# URL:        www.degraeve.com
#
# What?
#   This is a stripped down version of how to retrieve and then 
#   parse through an HTML document.
#
#   The script breaks the returned HTML into the HTML code and 
#   HTML content.  This is to facilitate translating only the 
#   words in the content of a web page.  You wouldn't want to 
#   replace every instance of 'img' in a page  because that 
#   would destroy all of your img tags.
#
#   It uses the LWP library that can be found at:
#   http://www.perl.com/CPAN-local/modules/by-module/LWP/
#   It makes retrieving the HTML content MUCH easier.
#
#   All this script does is take the page contents in and spit them
#   back out.  It doesn't fix URLs or paths to images, i.e. relative
#   images and links won't work.  Fixing these is the hard part --
#   and the FUN part.
#
#   For a complete working copy, see:
#   http://www.degraeve.com/translator.shtml
#
# Associated Files?
#   stripped.cgi  (this file)
#   test.htm
#
#####################################################################

use LWP::Simple;

# ---------------------------------------------------------------------
# Main script: fetch the page named by the "url" form parameter and echo
# it back out, separating the markup (tags) from the text content so the
# two can be treated differently (e.g. translating only the text).
# ---------------------------------------------------------------------

ReadParse(*input);      # fills %input with the decoded GET/POST parameters
$url = $input{'url'};

print "Content-type: text/html\n\n";

# SECURITY: $url comes straight from the request.  LWP can also fetch
# file: and ftp: URLs, so only absolute http(s) URLs are accepted here;
# otherwise this script could be used to read files off the server.
unless (defined $url && $url =~ m{\Ahttps?://}i) {
  print "<p>Please supply an absolute http:// or https:// URL.</p>\n";
  exit;
}

$content = get($url);  # RETRIEVE THE HTML DOCUMENT (undef when it fails)
unless (defined $content) {
  print "<p>Sorry, the requested page could not be retrieved.</p>\n";
  exit;
}

{
  # Parser state is kept lexical to this block so it cannot collide with
  # the variables used by the subroutines further down the file.
  my $html        = 0;     # THE FLAG: true while we are inside <...>
  my $htmlcode    = "";    # THE HTML CODE (the tag currently being read)
  my $htmlcontent = "";    # THE CONTENT OF THE PAGE (text between tags)

  for my $i (0 .. length($content) - 1) {

    my $char = substr($content, $i, 1);

    if ($char eq "<") {
      # A tag starts: the text collected so far is complete.
      # YOU CAN TRANSLATE THIS BUFFER :)
      print $htmlcontent;
      $htmlcontent = "";
      $html = 1;
    }

    # Every character belongs to exactly one of the two buffers.
    if ($html) {
      $htmlcode .= $char;
    }
    else {
      $htmlcontent .= $char;
    }

    if ($char eq ">") {
      # The tag is complete (a stray ">" in text simply stays in the text).
      $html = 0;
      # YOU CAN TRANSLATE THIS BUFFER :)
      print $htmlcode;
      $htmlcode = "";
    }

  }

  # Flush what is still buffered: text after the last tag, or a tag that
  # was never closed.  At most one of the two buffers is non-empty here.
  print $htmlcontent;
  print $htmlcode;
}


##########################################################################
#    THE FOLLOWING IS TO GET THE PARAMS PASSED 
#    TO THE SCRIPT THRU THE GET/POST METHODS
#    I DIDN'T WRITE THIS PART...
#

# ReadParse - decode the form data sent with the current CGI request.
#
#   Usage:    ReadParse(*input);   # results land in %input
#             ReadParse();         # results land in %in
#   Returns:  the number of key=value pairs found in the request.
#
# The raw data is taken from QUERY_STRING for GET requests and from STDIN
# (CONTENT_LENGTH bytes) for POST requests.  Through the glob alias the
# caller's scalar receives the raw string, its array the individual
# pairs, and its hash the decoded values.  Values of a repeated key are
# joined with "\0".  The characters * $ < > # % are removed from values.
sub ReadParse {
  local (*in) = @_ if @_;    # alias $in, @in and %in to the caller's glob
  my ($key, $val);

  # Read in text
  if (MethGet()) {
    $in = $ENV{'QUERY_STRING'};
  } elsif (MethPost()) {
    read(STDIN, $in, $ENV{'CONTENT_LENGTH'} || 0);
  }
  $in = '' unless defined $in;   # nothing to parse, e.g. run outside a web server

  @in = split(/[&;]/, $in);

  foreach my $pair (@in) {       # $pair aliases the element, so @in is updated too
    # Convert plus's to spaces
    $pair =~ s/\+/ /g;

    # Split into key and value on the first "=".  A missing key or value
    # (e.g. "flag" or an empty pair) is treated as the empty string.
    ($key, $val) = split(/=/, $pair, 2);
    $key = '' unless defined $key;
    $val = '' unless defined $val;

    # Convert %XX escapes to the bytes they stand for.  Only real hex
    # digits are decoded (so "%zz" no longer turns into a NUL byte), and
    # the unsigned "C" template covers the whole 0-255 range.
    $key =~ s/%([0-9A-Fa-f]{2})/pack("C", hex($1))/ge;
    $val =~ s/%([0-9A-Fa-f]{2})/pack("C", hex($1))/ge;

    # Associate key and value
    $in{$key} .= "\0" if defined $in{$key};   # \0 is the multiple separator
    $in{$key} .= $val;

    # Get rid of any weird characters.
    $in{$key} =~ tr/*$<>#%//d;
  }

  return scalar(@in);
}

# MethGet - true when the current request was made with the GET method.
# Returns false (without an "uninitialized value" warning) when
# REQUEST_METHOD is not set, e.g. when the script is run by hand.
sub MethGet {
  my $method = $ENV{'REQUEST_METHOD'};
  return (defined($method) && $method eq "GET");
}

# MethPost - true when the current request was made with the POST method.
# Returns false (without an "uninitialized value" warning) when
# REQUEST_METHOD is not set, e.g. when the script is run by hand.
sub MethPost {
  my $method = $ENV{'REQUEST_METHOD'};
  return (defined($method) && $method eq "POST");
}