package ExtractUniqueBlock; use strict; use warnings; use Digest::MD5; use Encode; use Encode::Guess; use HTML::TokeParser; use IO::File; our $VERSION = '0.01'; # Tagset (HTML4.01 Transitional block-like elements) our %TS = map { $_ => 1 } qw(p h1 h2 h3 h4 h5 h6 ul ol dir menu pre dl div center noscript noframes blockquote form isindex hr table fieldset address); our %NS = map { $_ => 1 } qw(script style); # TokeParser Text Map our %TP = ('S' => 4, 'E' => 2, 'T' => 1, 'C' => 1, 'D' => 1, 'PI' => 2); sub new { my ($class, %opt) = @_; my $self; $self->{'opt'} = { 'sim_threshold' => (defined $opt{'sim_threshold'}) ? $opt{'sim_threshold'} : 0.85, 'bdf_threshold' => (defined $opt{'bdf_threshold'}) ? $opt{'bdf_threshold'} : 0, 'utf8_decode' => (defined $opt{'utf8_decode'}) ? $opt{'utf8_decode'} : 1, 'block_id_attribute' => $opt{'block_id_attribute'}, }; $self->{'md5'} = []; $self->{'encoding'} = []; $self->{'blockset'} = []; $self->{'vectorset'} = []; $self->{'extract'} = { 'text' => [], 'html' => [], 'block_id' => [], }; return bless $self, $class; } sub add_html { my ($self, $html, $encoding) = @_; if (utf8::is_utf8($html)) { utf8::encode($html); $encoding = 'utf8'; } elsif ($self->{'opt'}->{'utf8_decode'}) { $encoding = html_to_encoding($html); if (! $encoding) { $encoding = 'utf8'; } } else { undef $encoding; } my ($blockset, $vectorset) = _get_blockset($html); push @{$self->{'md5'}}, Digest::MD5::md5_hex($html); push @{$self->{'encoding'}}, $encoding; push @{$self->{'blockset'}}, $blockset; push @{$self->{'vectorset'}}, $vectorset; return $self; } sub add_file { my ($self, $file, $encoding) = @_; my $io1 = IO::File->new($file) or die $!; my $html = join("\n", $io1->getlines); $io1->close; $self->add_html($html, $encoding); return $self; } sub extract { my ($self, @lists) = @_; foreach my $html (@lists) { $self->add_html($html); } $self->_compute_bdf(); my $max_set = scalar(@{$self->{'bdf'}}); for (my $i = 0; $i < $max_set; $i++) { my (@html, @text, %block_id); my $max_block = scalar(@{$self->{'bdf'}->[$i]}); for (my $j = 0; $j < $max_block; $j++) { if ($self->{'bdf'}->[$i]->[$j] <= $self->{'opt'}->{'bdf_threshold'}) { my (@block_text, @block_html); # ブロックをHTML/TEXTに展開 foreach my $token (@{$self->{'blockset'}->[$i]->[$j]}) { my $html = ($self->{'encoding'}->[$i]) ? Encode::decode($self->{'encoding'}->[$i], $token->[$TP{$token->[0]}]) : $token->[$TP{$token->[0]}]; if ($token->[0] eq 'T') { $html =~ s/^\n+//o; $html =~ s/\n+$//o; push @block_text, $html; } push @block_html, $html; } push @text, \@block_text; push @html, \@block_html; # BlockID を記録(実験用, ブロック先頭タグ内の block 属性リスト) if ($self->{'opt'}->{'block_id_attribute'}) { if ($self->{'blockset'}->[$i]->[$j]->[0]->[2]) {; my $block_id = $self->{'blockset'}->[$i]->[$j]->[0]->[2]->{$self->{'opt'}->{'block_id_attribute'}}; $block_id{$block_id} = $self->{'bdf'}->[$i]->[$j] if $block_id; } } else { $block_id{$j} = $self->{'bdf'}->[$i]->[$j]; } } } push @{$self->{'extract'}->{'text'}}, \@text; push @{$self->{'extract'}->{'html'}}, \@html; push @{$self->{'extract'}->{'block_id'}}, \%block_id; } return $self; } sub list_text { my $self = shift; return @{$self->{'extract'}->{'text'}}; } sub list_html { my $self = shift; return @{$self->{'extract'}->{'html'}}; } sub list_block_id { my $self = shift; return map { [ sort { $a <=> $b } keys %{$_} ] } @{$self->{'extract'}->{'block_id'}}; } sub text { my $self = shift; return _to_text($self->{'extract'}->{'text'}); } sub html { my $self = shift; return _to_text($self->{'extract'}->{'html'}); } sub block_id { my $self = shift; return @{$self->{'extract'}->{'block_id'}}; } sub html_to_encoding { my ($html) = @_; my $enc; my @encoding = (($html =~ /charset=([\w_-]+)/g), ($html =~ /encoding="([\w_-]+)"/g)); foreach my $val (@encoding) { if (Encode::find_encoding($val)) { $enc = $val; last; } } if (! $enc) { my $guess = Encode::Guess::guess_encoding($html); if (ref $guess) { $enc = $guess->name; } } return $enc; } sub _compute_bdf { my $self = shift; my @bdf; my @sim; my $max = scalar(@{$self->{'blockset'}}); # 比較元HTML for (my $i = 0; $i < $max; $i++) { my $max_block_i = scalar(@{$self->{'blockset'}->[$i]}); # 比較元ブロック for (my $j = 0; $j < $max_block_i; $j++) { $bdf[$i][$j] = 0; # キャッシュ使用を監視 my @used; # キャッシュ利用 foreach my $cache (@{$sim[$i][$j]}) { $used[$cache->[0]] = 1; $bdf[$i][$j]++; } # 比較先HTML foreach my $k ($i + 1 ... $max - 1, 0 ... $i) { if ($i == $k || defined $used[$k]) { next; } # HTML(MD5)比較(同じページはスキップ) if ($self->{'md5'}->[$i] eq $self->{'md5'}->[$k]) { next; } # 探索打ち切り if ($bdf[$i][$j] > $self->{'opt'}->{'bdf_threshold'}) { last; } my $max_block_k = scalar(@{$self->{'blockset'}->[$k]}); # 比較先ブロック for (my $l = 0; $l < $max_block_k; $l++) { my $sim = _cosine_similarity($self->{'vectorset'}->[$i]->[$j], $self->{'vectorset'}->[$k]->[$l]); # ブロックの一致判断 if ($sim >= $self->{'opt'}->{'sim_threshold'}) { $bdf[$i][$j]++; # 結果をキャッシュ push @{$sim[$k][$l]}, [ $i, $j, $sim ]; last; } } } } } $self->{'bdf'} = \@bdf; } sub _get_blockset { my ($html) = @_; my (@blockset, @vectorset); my $p = HTML::TokeParser->new(\$html); my @stack; while (my $token = $p->get_token()) { push @stack, $token; if ($token->[0] eq 'E' && $TS{$token->[1]}) { my (@block, $vector); my $is_skip = 0; while (scalar(@stack)) { my $token = pop @stack; unshift @block, $token; # Create Vector if ($token->[0] eq 'S') { if ($is_skip) { $is_skip = 0; next; } # Tag Name Vector (Number of Tags) $vector->{'|T|' . $token->[1]}++; foreach my $key ('alt', 'title') { if ($token->[2]{$key}) { map { $vector->{$_}++ } _string_to_token($token->[2]{$key}); } } if ($TS{$token->[1]}) { last; } } elsif ($token->[0] eq 'T' && ! $is_skip) { map { $vector->{$_}++ } _string_to_token($token->[1]); } elsif ($token->[0] eq 'E' && $NS{$token->[1]}) { $is_skip = 1; } } push @blockset, \@block; push @vectorset, $vector; } } return \@blockset, \@vectorset; } sub _string_to_token { my @tmp = split(/\r|\n/o, lc $_[0]); # my @tmp = split(/\s+/o, lc $_[0]); my @token; foreach my $token (@tmp) { $token =~ s/\s+/ /go; if ($token) { push @token, $token; } } return @token; } sub _cosine_similarity { my ($vector_1, $vector_2) = @_; # Inner Product my $inner_product = 0.0; map { if ($vector_2->{$_}) { $inner_product += $vector_1->{$_} * $vector_2->{$_}; } } keys %{$vector_1}; # Euclidean Norm my $norm_1 = 0.0; map { $norm_1 += $_ ** 2 } values %{$vector_1}; $norm_1 = sqrt($norm_1); my $norm_2 = 0.0; map { $norm_2 += $_ ** 2 } values %{$vector_2}; $norm_2 = sqrt($norm_2); return ($norm_1 && $norm_2) ? $inner_product / ($norm_1 * $norm_2) : 0.0; } sub _to_text { my $list = shift; return map { join("\n", map { join("\n", @{$_}) } @{$_}) } @{$list}; } 1;