package XML::Parser::LiteHeavy; ############################################################################## # XML::Parser::LiteHeavy # # Copyright (C) 2004 Makamaka Hannyaharamitu (makamaka[at]donzoko.net) # # 2004-08-26 modified for Perl 5.8.4 and more. (original is LiteHeavy 1.03) # ############################################################################## # ORIGINAL SOURCE ... XML::Parser::Lite # ====================================================================== # # Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com) # SOAP::Lite is free software; you can redistribute it # and/or modify it under the same terms as Perl itself. # # $Id: Lite.pm,v 1.1.1.1 2002/11/01 14:53:57 paulclinger Exp $ # # ====================================================================== ############################################################################## use strict; use vars qw($VERSION $self %STACK %LEVEL %STREAM); $VERSION = 1.03; my @Handlers = qw(Start End Char Proc Comment CdataStart CdataEnd); compile(); sub new { my ($self, %args) = @_; my $class = ref($self) || $self; return $self if ref $self; $args{Pkg} ||= caller; $self = bless \%args => $class; $self->setHandlers(); # clear first if( my $style = $args{Style} ){ no strict 'refs'; local $^W; my $pkg = __PACKAGE__ . '::Style::' . $style; eval qq| require $pkg |; if($@){ die $@ } for my $name ( qw(Init Final), @Handlers ){ my $sub = $pkg . '::' . $name; $self->{subs}->{$name} = (defined *{$sub}) ? *{$sub} : sub {}; if($] < 5.006){ eval qq| &$sub |; if($@){ $self->{subs}->{$name} = sub {} } } } } else{ $self->setHandlers(%{$args{Handlers} || {}}); } return $self; } #sub DESTROY{ # print $_[0], " is destroyed.\n"; #} sub setHandlers { my $self = shift; unless(@_){ for(@Handlers){ $self->{subs}->{$_} = sub {}; } } while(@_){ my($name => $func) = splice(@_, 0, 2); $self->{subs}->{$name} = $func ? $func : sub {}; } return $self; } sub regexp { my $patch = shift || ''; my $package = __PACKAGE__; # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998. # Copyright (c) 1998, Robert D. Cameron. # The following code may be freely used and distributed provided that # this copyright and citation notice remains intact and that modifications # or additions are clearly identified. my $TextSE = "[^<]+"; my $UntilHyphen = "[^-]*-"; my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; my $CommentCE = "$Until2Hyphens>?"; my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>"; my $S = "[ \\n\\t\\r]+"; my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; my $Name = "(?:$NameStrt)(?:$NameChar)*"; my $QuoteSE = "\"[^\"]*\"|'[^']*'"; my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; my $S1 = "[\\n\\r\\t ]"; my $UntilQMs = "[^?]*\\?+"; my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S"; my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?"; my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; my $PI_CE = "$Name(?:$PI_Tail)?"; # these expressions were modified for backtracking and events my $EndTagCE = "($Name)(?{${package}::end(\$self,\$2)})(?:$S)?>"; my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'"; my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::start(\$self,\$3,\@{\$^R||[]})})(?{\${7} and ${package}::end(\$self,\$3)})"; my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; # Next expression is under "black magic". # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE', # but it doesn't work under Perl 5.005 and only magic with # (?:....)?? solved the problem. # I would appreciate if someone let me know what is the right thing to do # and what's the reason for all this magic. # Seems like a problem related to (?:....)? rather than to ?{} feature. # Tests are in t/31-xmlparserlite.t if you decide to play with it. "(?:($TextSE)(?{${package}::char(\$self,\$1)}))$patch|$MarkupSPE"; # after Perl 5.8.4, this regex doesn't work... =pod my $TextSE = "[^<]+"; my $UntilHyphen = "[^-]*-"; my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; my $CommentCE = "($Until2Hyphens>)?(?{${package}::comment(\$self,\$2)})"; my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; my $CDATA_CE = "($UntilRSBs(?:[^\\]>]$UntilRSBs)*)>(?{${package}::cdata(\$self,\$3)})"; my $S = "[ \\n\\t\\r]+"; my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; my $Name = "(?:$NameStrt)(?:$NameChar)*"; my $QuoteSE = "\"[^\"]*\"|'[^']*'"; my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; my $S1 = "[\\n\\r\\t ]"; my $UntilQMs = "[^?]*\\?+"; my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; my $PI_CE = "($Name(?:$PI_Tail))?(?{local \$${package}::str = defined \$5 ? \$5 : \$4; ${package}::pi(\$self,\$${package}::str)})"; my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$PI_CE)|%$Name;|$S"; my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?"; my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; # these expressions were modified for backtracking and events my $EndTagCE = "($Name)(?{${package}::end(\$self,\$6)})(?:$S)?>"; my $AttValSE = "\"(?:[^<\"]*)\"|'(?:[^<']*)'"; my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?($AttValSE)(?{[\@{\$^R||[]},\$8=>\$\+]}))*(?:$S)?(/)?>(?{${package}::start(\$self,\$7,\@{\$^R||[]})})(?{\${10} and ${package}::end(\$self,\$7)})"; my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; # Next expression is under "black magic". # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE', # but it doesn't work under Perl 5.005 and only magic with # (?:....)?? solved the problem. # I would appreciate if someone let me know what is the right thing to do # and what's the reason for all this magic. # Seems like a problem related to (?:....)? rather than to ?{} feature. # Tests are in t/31-xmlparserlite.t if you decide to play with it. "(?:($TextSE)(?{${package}::char(\$self,\$1)}))$patch|$MarkupSPE"; =cut } sub compile { local $^W; #$_ = regexp(); #print qq{sub parse_re { local \$self = shift; use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1}; # try regexp as it should be, apply patch if doesn't work foreach (regexp(), regexp('??')) { eval qq{sub parse_re { local \$self = shift; use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1} or die; last if eval { parse_re(__PACKAGE__,'bar'); 1 } }; *compile = sub {}; } sub parse { my ($self,$source) = @_; if(ref($source) =~ /^IO::/ or ref(\$source) eq 'GLOB'){ local $/ = undef; $source = <$source>; } if(!$self->{subs}->{Init}){ $self->{subs}->{Init} = sub {1}; } $self->init(); eval{ $self->parse_re($source) }; if($@){ $self->release(); die $@; } $self->process(); if(!$self->{subs}->{Final}){ $self->{subs}->{Final} = sub {1}; } my (@result,$result) = ((),undef); if(wantarray){ @result = $self->final(); } else{ $result = $self->final(); } $self->release(); return wantarray ? @result : $result; } sub parsefile{ my $self = shift; my $file = shift; local(*FILE); open(FILE, $file) or die "Couldn't open $file:\n$!"; binmode(FILE); my (@result,$result) = ((),undef); if(wantarray){ @result = $self->parse(*FILE); } else{ $result = $self->parse(*FILE); } close(FILE); return wantarray ? @result : $result; } sub process{ my $self = shift; my $cdata = 0; my $root = 0; for my $item ( @{ $STREAM{$self} } ){ if(!$root){ next if($item->[0] eq 'Char'); $root = 1 if($item->[0] eq 'Start' and $item->[2] == 1); } $cdata = 1 if($item->[0] eq 'CdataStart'); $cdata = 0 if($item->[0] eq 'CdataEnd'); _comment($item->[1]) if($item->[0] eq 'Comment'); _cdata($item->[1]) if($cdata); _pi($item->[1]) if($item->[0] eq 'Proc'); _expand_entity($item->[1]) if(!$cdata and $item->[0] eq 'Char'); _attr($item->[1]) if($item->[0] eq 'Start'); $self->{subs}->{$item->[0]}->(@{$item->[1]}); } } sub release{ my $self = shift; delete($STACK{$self}); delete($LEVEL{$self}); delete($STREAM{$self}); $self; } sub init{ # In start, end, char and so on, $self can't be used as hash! my $self = shift; $STACK{$self} = []; $LEVEL{$self} = 0; $STREAM{$self} = []; $self->{subs}->{Init}->($self, @_); } sub final { my $self = shift; my $stack = $STACK{$self}->[-1]; if( @{$STACK{$self}} ){ $self->release() and die "not properly closed tag '$stack'\n"; } if( !$LEVEL{$self} ){ $self->release() and die "no element found\n"; } $self->{subs}->{Final}->($self, @_); } # # In start, end, char and so on, $self can't be used as hash! ...Why? # sub start { my $self = shift; die "multiple roots, wrong element '$_[0]'\n" if $LEVEL{$self}++ && ! @{ $STACK{$self} }; push @{ $STACK{$self} },$_[0]; push @{ $STREAM{$self} }, ['Start',[$self,@_],$LEVEL{$self}]; } sub char { my $self = shift; push @{ $STREAM{$self} }, ['Char',[$self,@_]]; return if @{ $STACK{$self} }; # check for junk before or after element # can't use split or regexp due to limitations in ?{} implementation, # will iterate with loop, but we'll do it no more than two times, so # it shouldn't affect performance for (my $i=0; $i < length $_[0]; $i++) { die "junk '$_[0]' @{[$LEVEL{$self} ? 'after' : 'before']} XML element\n" if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there } } sub end { my $self = shift; pop(@{$STACK{$self}}) eq $_[0] or die "mismatched tag '$_[0]'\n"; push @{ $STREAM{$self} }, ['End',[$self,@_]]; } sub comment { my $self = shift; push @{ $STREAM{$self} }, ['Comment',[$self,@_]]; } sub _comment{ my $item = shift; $item->[1] =~ s/-->$//; } sub cdata { my $self = shift; push @{ $STREAM{$self} }, ['CdataStart',[$self]]; push @{ $STREAM{$self} }, ['Char',[$self,@_]]; push @{ $STREAM{$self} }, ['CdataEnd',[$self]]; } sub _cdata{ my $item = shift; $item->[1] =~ s/]]$// if(defined $item->[1]); } sub pi{ my $self = shift; defined $_[0] or die "invalid pi\n"; my ($target,$data) = split(/\s+/,$_[0],2); return if($target eq 'xml'); push @{ $STREAM{$self} }, ['Proc',[$self,$target,$data]]; } sub _pi{ my $item = shift; $item->[2] =~ s/\?>$//; } sub _attr{ my $item = shift; for(@{$item}){ s/^["']//; s/["']$//; } _expand_entity($item); } sub _expand_entity{ my $item = shift; for(@{$item}){ s/<//g; s/'/'/g; s/"/"/g; s/&#[xX](\d+);/ chr($1) /eg; s/&/&/g; } } #====================================================================== 1; __END__ =head1 NAME XML::Parser::LiteHeavy - Lightweight regexp-based XML parser =head1 SYNOPSIS use XML::Parser::LiteHeavy; $p1 = new XML::Parser::LiteHeavy; $p1->setHandlers( Start => sub { shift; print "start: @_\n" }, Char => sub { shift; print "char: @_\n" }, End => sub { shift; print "end: @_\n" }, ); $p1->parse('Hello World!'); $p2 = new XML::Parser::LiteHeavy( Handlers => { Start => sub { shift; print "start: @_\n" }, Char => sub { shift; print "char: @_\n" }, End => sub { shift; print "end: @_\n" }, } ); $p2->parse('Hello cruel World!'); $p3 = new XML::Parser::LiteHeavy(Style => 'Tree'); $tree = $p3->parsefile('test.xml'); =head1 DESCRIPTION This module was made from XML::Parser::Lite which gives you access to XML parser with interface similar to XML::Parser interface. Some calls are supported (init, final, start, char, end, comment, cdatastart, cdataend and proc). =head1 METHODS =over 4 =item new See XML::Parser. Available options are 'Handlers', 'Pkg' and 'Style'. =item setHandlers See XML::Parser. Available handlers are 'Init', 'Final', 'Start', 'End' 'Char', 'Comment', 'CdataStart', 'CdataEnd' and 'Proc'. =item parse See XML::Parser. Almost options are not available. =item parsefile See XML::Parser. Almost options are not available. =back =head2 STYLE You can use 'Tree' and 'Objects' style. =head1 SEE ALSO XML::Parser XML::Parser::Lite =head1 COPYRIGHT Copyright (C) 2004 Makamaka Hannyaharamitu This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. This module is based on XML::Parser::Lite. Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com) Its parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html Copyright (c) 1998, Robert D. Cameron. =head1 AUTHOR Makamaka Hannyaharamitu (makamaka[at]donzoko.net) =cut