package XML::Parser::LiteHeavy;
##############################################################################
# XML::Parser::LiteHeavy
#
# Copyright (C) 2004 Makamaka Hannyaharamitu (makamaka[at]donzoko.net)
#
# 2004-08-26 modified for Perl 5.8.4 and more. (original is LiteHeavy 1.03)
#
##############################################################################
# ORIGINAL SOURCE ... XML::Parser::Lite
# ======================================================================
#
# Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com)
# SOAP::Lite is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# $Id: Lite.pm,v 1.1.1.1 2002/11/01 14:53:57 paulclinger Exp $
#
# ======================================================================
##############################################################################
use strict;
use vars qw($VERSION $self %STACK %LEVEL %STREAM);
$VERSION = 1.03;
my @Handlers = qw(Start End Char Proc Comment CdataStart CdataEnd);
compile();
sub new {
my ($self, %args) = @_;
my $class = ref($self) || $self;
return $self if ref $self;
$args{Pkg} ||= caller;
$self = bless \%args => $class;
$self->setHandlers(); # clear first
if( my $style = $args{Style} ){
no strict 'refs'; local $^W;
my $pkg = __PACKAGE__ . '::Style::' . $style;
eval qq| require $pkg |;
if($@){ die $@ }
for my $name ( qw(Init Final), @Handlers ){
my $sub = $pkg . '::' . $name;
$self->{subs}->{$name} = (defined *{$sub})
? *{$sub} : sub {};
if($] < 5.006){
eval qq| &$sub |;
if($@){ $self->{subs}->{$name} = sub {} }
}
}
}
else{
$self->setHandlers(%{$args{Handlers} || {}});
}
return $self;
}
#sub DESTROY{
# print $_[0], " is destroyed.\n";
#}
sub setHandlers {
my $self = shift;
unless(@_){
for(@Handlers){
$self->{subs}->{$_} = sub {};
}
}
while(@_){
my($name => $func) = splice(@_, 0, 2);
$self->{subs}->{$name} = $func ? $func : sub {};
}
return $self;
}
sub regexp {
my $patch = shift || '';
my $package = __PACKAGE__;
# This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.
my $TextSE = "[^<]+";
my $UntilHyphen = "[^-]*-";
my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
my $CommentCE = "$Until2Hyphens>?";
my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
my $S = "[ \\n\\t\\r]+";
my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
my $Name = "(?:$NameStrt)(?:$NameChar)*";
my $QuoteSE = "\"[^\"]*\"|'[^']*'";
my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
my $S1 = "[\\n\\r\\t ]";
my $UntilQMs = "[^?]*\\?+";
my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
my $PI_CE = "$Name(?:$PI_Tail)?";
# these expressions were modified for backtracking and events
my $EndTagCE = "($Name)(?{${package}::end(\$self,\$2)})(?:$S)?>";
my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'";
my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::start(\$self,\$3,\@{\$^R||[]})})(?{\${7} and ${package}::end(\$self,\$3)})";
my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
# Next expression is under "black magic".
# Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
# but it doesn't work under Perl 5.005 and only magic with
# (?:....)?? solved the problem.
# I would appreciate if someone let me know what is the right thing to do
# and what's the reason for all this magic.
# Seems like a problem related to (?:....)? rather than to ?{} feature.
# Tests are in t/31-xmlparserlite.t if you decide to play with it.
"(?:($TextSE)(?{${package}::char(\$self,\$1)}))$patch|$MarkupSPE";
# after Perl 5.8.4, this regex doesn't work...
=pod
my $TextSE = "[^<]+";
my $UntilHyphen = "[^-]*-";
my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
my $CommentCE = "($Until2Hyphens>)?(?{${package}::comment(\$self,\$2)})";
my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
my $CDATA_CE = "($UntilRSBs(?:[^\\]>]$UntilRSBs)*)>(?{${package}::cdata(\$self,\$3)})";
my $S = "[ \\n\\t\\r]+";
my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
my $Name = "(?:$NameStrt)(?:$NameChar)*";
my $QuoteSE = "\"[^\"]*\"|'[^']*'";
my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
my $S1 = "[\\n\\r\\t ]";
my $UntilQMs = "[^?]*\\?+";
my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
my $PI_CE = "($Name(?:$PI_Tail))?(?{local \$${package}::str = defined \$5 ? \$5 : \$4; ${package}::pi(\$self,\$${package}::str)})";
my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$PI_CE)|%$Name;|$S";
my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
# these expressions were modified for backtracking and events
my $EndTagCE = "($Name)(?{${package}::end(\$self,\$6)})(?:$S)?>";
my $AttValSE = "\"(?:[^<\"]*)\"|'(?:[^<']*)'";
my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?($AttValSE)(?{[\@{\$^R||[]},\$8=>\$\+]}))*(?:$S)?(/)?>(?{${package}::start(\$self,\$7,\@{\$^R||[]})})(?{\${10} and ${package}::end(\$self,\$7)})";
my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
# Next expression is under "black magic".
# Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
# but it doesn't work under Perl 5.005 and only magic with
# (?:....)?? solved the problem.
# I would appreciate if someone let me know what is the right thing to do
# and what's the reason for all this magic.
# Seems like a problem related to (?:....)? rather than to ?{} feature.
# Tests are in t/31-xmlparserlite.t if you decide to play with it.
"(?:($TextSE)(?{${package}::char(\$self,\$1)}))$patch|$MarkupSPE";
=cut
}
sub compile { local $^W;
#$_ = regexp();
#print qq{sub parse_re { local \$self = shift; use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1};
# try regexp as it should be, apply patch if doesn't work
foreach (regexp(), regexp('??')) {
eval qq{sub parse_re { local \$self = shift; use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1} or die;
last if eval { parse_re(__PACKAGE__,'bar'); 1 }
};
*compile = sub {};
}
sub parse {
my ($self,$source) = @_;
if(ref($source) =~ /^IO::/ or ref(\$source) eq 'GLOB'){
local $/ = undef;
$source = <$source>;
}
if(!$self->{subs}->{Init}){ $self->{subs}->{Init} = sub {1}; }
$self->init();
eval{ $self->parse_re($source) };
if($@){
$self->release();
die $@;
}
$self->process();
if(!$self->{subs}->{Final}){ $self->{subs}->{Final} = sub {1}; }
my (@result,$result) = ((),undef);
if(wantarray){
@result = $self->final();
}
else{
$result = $self->final();
}
$self->release();
return wantarray ? @result : $result;
}
sub parsefile{
my $self = shift;
my $file = shift;
local(*FILE);
open(FILE, $file) or die "Couldn't open $file:\n$!";
binmode(FILE);
my (@result,$result) = ((),undef);
if(wantarray){
@result = $self->parse(*FILE);
}
else{
$result = $self->parse(*FILE);
}
close(FILE);
return wantarray ? @result : $result;
}
sub process{
my $self = shift;
my $cdata = 0;
my $root = 0;
for my $item ( @{ $STREAM{$self} } ){
if(!$root){
next if($item->[0] eq 'Char');
$root = 1 if($item->[0] eq 'Start' and $item->[2] == 1);
}
$cdata = 1 if($item->[0] eq 'CdataStart');
$cdata = 0 if($item->[0] eq 'CdataEnd');
_comment($item->[1]) if($item->[0] eq 'Comment');
_cdata($item->[1]) if($cdata);
_pi($item->[1]) if($item->[0] eq 'Proc');
_expand_entity($item->[1])
if(!$cdata and $item->[0] eq 'Char');
_attr($item->[1]) if($item->[0] eq 'Start');
$self->{subs}->{$item->[0]}->(@{$item->[1]});
}
}
sub release{
my $self = shift;
delete($STACK{$self});
delete($LEVEL{$self});
delete($STREAM{$self});
$self;
}
sub init{ # In start, end, char and so on, $self can't be used as hash!
my $self = shift;
$STACK{$self} = [];
$LEVEL{$self} = 0;
$STREAM{$self} = [];
$self->{subs}->{Init}->($self, @_);
}
sub final {
my $self = shift;
my $stack = $STACK{$self}->[-1];
if( @{$STACK{$self}} ){
$self->release() and die "not properly closed tag '$stack'\n";
}
if( !$LEVEL{$self} ){
$self->release() and die "no element found\n";
}
$self->{subs}->{Final}->($self, @_);
}
#
# In start, end, char and so on, $self can't be used as hash! ...Why?
#
sub start {
my $self = shift;
die "multiple roots, wrong element '$_[0]'\n" if $LEVEL{$self}++ && ! @{ $STACK{$self} };
push @{ $STACK{$self} },$_[0];
push @{ $STREAM{$self} }, ['Start',[$self,@_],$LEVEL{$self}];
}
sub char {
my $self = shift;
push @{ $STREAM{$self} }, ['Char',[$self,@_]];
return if @{ $STACK{$self} };
# check for junk before or after element
# can't use split or regexp due to limitations in ?{} implementation,
# will iterate with loop, but we'll do it no more than two times, so
# it shouldn't affect performance
for (my $i=0; $i < length $_[0]; $i++) {
die "junk '$_[0]' @{[$LEVEL{$self} ? 'after' : 'before']} XML element\n"
if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there
}
}
sub end {
my $self = shift;
pop(@{$STACK{$self}}) eq $_[0] or die "mismatched tag '$_[0]'\n";
push @{ $STREAM{$self} }, ['End',[$self,@_]];
}
sub comment {
my $self = shift;
push @{ $STREAM{$self} }, ['Comment',[$self,@_]];
}
sub _comment{
my $item = shift;
$item->[1] =~ s/-->$//;
}
sub cdata {
my $self = shift;
push @{ $STREAM{$self} }, ['CdataStart',[$self]];
push @{ $STREAM{$self} }, ['Char',[$self,@_]];
push @{ $STREAM{$self} }, ['CdataEnd',[$self]];
}
sub _cdata{
my $item = shift;
$item->[1] =~ s/]]$// if(defined $item->[1]);
}
sub pi{
my $self = shift;
defined $_[0] or die "invalid pi\n";
my ($target,$data) = split(/\s+/,$_[0],2);
return if($target eq 'xml');
push @{ $STREAM{$self} }, ['Proc',[$self,$target,$data]];
}
sub _pi{
my $item = shift;
$item->[2] =~ s/\?>$//;
}
sub _attr{
my $item = shift;
for(@{$item}){
s/^["']//;
s/["']$//;
}
_expand_entity($item);
}
sub _expand_entity{
my $item = shift;
for(@{$item}){
s/<//g;
s/'/'/g;
s/"/"/g;
s/[xX](\d+);/ chr($1) /eg;
s/&/&/g;
}
}
#======================================================================
1;
__END__
=head1 NAME
XML::Parser::LiteHeavy - Lightweight regexp-based XML parser
=head1 SYNOPSIS
use XML::Parser::LiteHeavy;
$p1 = new XML::Parser::LiteHeavy;
$p1->setHandlers(
Start => sub { shift; print "start: @_\n" },
Char => sub { shift; print "char: @_\n" },
End => sub { shift; print "end: @_\n" },
);
$p1->parse('Hello World!');
$p2 = new XML::Parser::LiteHeavy(
Handlers => {
Start => sub { shift; print "start: @_\n" },
Char => sub { shift; print "char: @_\n" },
End => sub { shift; print "end: @_\n" },
}
);
$p2->parse('Hello cruel World!');
$p3 = new XML::Parser::LiteHeavy(Style => 'Tree');
$tree = $p3->parsefile('test.xml');
=head1 DESCRIPTION
This module was made from XML::Parser::Lite which gives you access to
XML parser with interface similar to XML::Parser interface.
Some calls are supported (init, final, start, char, end, comment, cdatastart,
cdataend and proc).
=head1 METHODS
=over 4
=item new
See XML::Parser. Available options are 'Handlers', 'Pkg' and 'Style'.
=item setHandlers
See XML::Parser. Available handlers are 'Init', 'Final', 'Start', 'End'
'Char', 'Comment', 'CdataStart', 'CdataEnd' and 'Proc'.
=item parse
See XML::Parser. Almost options are not available.
=item parsefile
See XML::Parser. Almost options are not available.
=back
=head2 STYLE
You can use 'Tree' and 'Objects' style.
=head1 SEE ALSO
XML::Parser XML::Parser::Lite
=head1 COPYRIGHT
Copyright (C) 2004 Makamaka Hannyaharamitu
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
This module is based on XML::Parser::Lite.
Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com)
Its parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
Copyright (c) 1998, Robert D. Cameron.
=head1 AUTHOR
Makamaka Hannyaharamitu (makamaka[at]donzoko.net)
=cut