# Tutorial02: Parsing HTML with Perl
# » Download the source code
# » Comment on this tutorial
#####################################################################
# Author: Steve DeGraeve
# URL: www.degraeve.com
#
# What?
# This is a stripped down version of how to retrieve and then
# parse through an HTML document.
#
# The script breaks the returned HTML into the HTML code and
# HTML content. This is to facilitate translating only the
# words in the content of a web page. You wouldn't want to
# replace every instance of 'img' in a page because that
# would destroy all of your img tags.
#
# It uses the LWP library that can be found at:
# http://www.perl.com/CPAN-local/modules/by-module/LWP/
# It makes retrieving the HTML content MUCH easier.
#
# All this script does is take the file contents in and spit them
# back out. It doesn't fix URLs or paths to images,
# i.e. relative images and links won't work. Fixing these is
# the hard part -- and the FUN part.
#
# For a complete working copy, see:
# http://www.degraeve.com/translator.shtml
#
# Associated Files?
# stripped.cgi (this file)
# test.htm
#
#####################################################################
use LWP::Simple;
# Main script: fetch the page named by the 'url' CGI parameter and echo it
# back, splitting it on the fly into markup (<...>) and page text so that
# either buffer can be post-processed (e.g. translated) before printing.

# Read the CGI parameters into %input.
&ReadParse(*input);
$url = $input{'url'};
$url = "" unless defined($url);

print "Content-type: text/html\n\n";

# SECURITY: 'url' comes straight from the request, and LWP will also fetch
# non-web schemes such as file: (local files on this server). Only web
# addresses are retrieved.
$content = undef;
if ($url =~ m{^https?://}i) {
    $content = get($url); # RETRIEVE THE HTML DOCUMENT
}
if (!defined($content)) {
    # get() returns undef on failure; say so rather than send an empty page.
    print "<p>Could not retrieve the requested URL ",
          "(only http:// and https:// addresses are supported).</p>\n";
    $content = "";
}

$len = length($content);
$html = "0"; # THE FLAG: "1" while inside <...>, "0" while in page text
$htmlcode = ""; # THE HTML CODE
$htmlcontent = ""; # THE CONTENT OF THE PAGE
for ($i = 0; $i < $len; $i++) {
    $char = substr($content, $i, 1);
    if ($char eq "<") {
        # A tag starts here: the text collected so far is complete.
        # YOU CAN TRANSLATE THIS BUFFER :)
        print $htmlcontent;
        $htmlcontent = "";
        $html = "1";
    }
    if ($html eq "1") {
        $htmlcode = $htmlcode . $char;
    }
    else {
        $htmlcontent = $htmlcontent . $char;
    }
    if ($char eq ">") {
        # The tag is complete (the ">" itself was stored above).
        $html = "0";
        # YOU CAN TRANSLATE THIS BUFFER :)
        print $htmlcode;
        $htmlcode = "";
    }
}
# Flush whatever is still buffered at end of input: text after the last tag,
# or an unterminated tag. Without this the tail of the document is dropped.
print $htmlcontent;
print $htmlcode;
$htmlcontent = "";
$htmlcode = "";
##########################################################################
# THE FOLLOWING IS TO GET THE PARAMS PASSED
# TO THE SCRIPT THRU THE GET/POST METHODS
# I DIDN'T WRITE THIS PART...
#
# ReadParse - decode the CGI request parameters into a hash.
#
# Usage:
#   &ReadParse(*input);   # fills %input (also uses $input and @input)
#   &ReadParse;           # fills the package's own %in
#
# For GET requests the data comes from $ENV{'QUERY_STRING'}; for POST
# requests CONTENT_LENGTH bytes are read from STDIN. Pairs are separated
# by '&' or ';'. When a key occurs more than once its values are joined
# with "\0". The characters * $ < > # % are removed from stored values
# (keys are not filtered). Existing hash entries are not cleared first.
#
# Returns the number of key/value pairs found in the request.
sub ReadParse {
    # Alias *in to the caller's glob so that $in, @in and %in below are
    # really the caller's variables.
    local (*in) = @_ if @_;
    my ($key, $val);

    # Read in text
    if (MethGet()) {
        $in = $ENV{'QUERY_STRING'};
    }
    elsif (MethPost()) {
        # A missing CONTENT_LENGTH is treated as an empty body.
        read(STDIN, $in, $ENV{'CONTENT_LENGTH'} || 0);
    }
    $in = "" unless defined($in);

    @in = split(/[&;]/, $in);
    foreach my $pair (@in) {
        # Convert plus's to spaces (modifies the element of @in in place).
        $pair =~ s/\+/ /g;

        # Split into key and value on the first '='. An empty pair or a
        # pair without '=' leaves one or both undefined, so default them.
        ($key, $val) = split(/=/, $pair, 2);
        $key = "" unless defined($key);
        $val = "" unless defined($val);

        # Convert %XX escapes to the byte they encode. Only two real hex
        # digits are decoded; a malformed escape such as "%zz" is left
        # alone instead of being turned into a NUL byte.
        $key =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
        $val =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

        # Associate key and value; \0 is the multiple separator.
        $in{$key} .= "\0" if (defined($in{$key}));
        $in{$key} .= $val;

        # Get rid of any weird characters.
        $in{$key} =~ s/[*\$<>#%]//g;
    }
    return scalar(@in);
}
# MethGet - true when the current CGI request uses the GET method,
# judged by an exact string comparison against $ENV{'REQUEST_METHOD'}.
sub MethGet {
    my $method = $ENV{'REQUEST_METHOD'};
    my $is_get = ($method eq "GET");
    return $is_get;
}
# MethPost - true when the current CGI request uses the POST method,
# judged by an exact string comparison against $ENV{'REQUEST_METHOD'}.
sub MethPost {
    my $method = $ENV{'REQUEST_METHOD'};
    my $is_post = ($method eq "POST");
    return $is_post;
}