#! /usr/local/bin/perl use 5.008; use strict; use warnings; use Getopt::Long; use Encode qw(encode_utf8); sub load_trie { my($fh) = @_; my($root); $root = {}; while (defined(my $line = readline($fh))) { my($node); chomp $line; $node = $root; foreach my $char (split(//, $line)) { for my $next ($node->{children}->{$char}) { $next ||= {}; $node = $next; } } $node->{match} = 1; } $root; } sub fold_trie { my($node) = @_; my($newnode); $newnode = {}; if ($node->{match}) { $newnode->{match} = 1; } if (my $children = $node->{children}) { for my $newchildren ($newnode->{children}) { foreach my $edge (keys(%{$children})) { my($child); $child = fold_trie($children->{$edge}); if (!$child->{match} && (my $grandchildren = $child->{children})) { if (keys(%{$grandchildren}) == 1) { my($childedge); ($childedge, $child) = %{$grandchildren}; $edge .= $childedge; } } $newchildren->{$edge} = $child; } } } $newnode; } sub optimise { my($trie) = @_; my(%dawgix, %map, @trienodes, @dawgnodes, $dawg); push(@trienodes, $trie); for (my $trieix = 0; $trieix<@trienodes; $trieix++) { my($children); $children = $trienodes[$trieix]->{children}; if ($children) { push(@trienodes, values(%{$children})); } } while (@trienodes) { my($trienode, $children, $key); $trienode = pop(@trienodes); $children = $trienode->{children}; $key = $trienode->{match} ? "\x01" : "\x00"; if ($children) { $key .= pack( "(wa*w)*", map((length($_), encode_utf8($_), $map{0+$children->{$_}}), sort(keys(%{$children})))); } for my $dawgix ($dawgix{$key}) { if (! defined($dawgix)) { my(%dawgnode); $dawgix = @dawgnodes; if ($trienode->{match}) { $dawgnode{match} = 1; } if ($children) { my(%children); while (my($edge, $child) = each(%{$children})) { ($children{$edge} = $dawgnodes[$map{0+$child}]) ->{refcnt}++; } $dawgnode{children} = \%children; } push(@dawgnodes, \%dawgnode); } $map{0+$trienode} = $dawgix; } } $dawg = $dawgnodes[-1]; $dawg->{refcnt}++; $dawg; } sub fold_dawg { my($node, $seen) = @_; $seen ||= {}; for my $newnode ($seen->{0+$node}) { if (!$newnode) { $newnode = {}; if ($node->{match}) { $newnode->{match} = 1; } $newnode->{refcnt} = $node->{refcnt}; if (my $children = $node->{children}) { for my $newchildren ($newnode->{children}) { foreach my $edge (keys(%{$children})) { my($child); $child = fold_dawg($children->{$edge}, $seen); if (!$child->{match} && $child->{refcnt} < 2 && (my $grandchildren = $child->{children})) { if (keys(%{$grandchildren}) == 1) { my($childedge); ($childedge, $child) = %{$grandchildren}; $edge .= $childedge; } } $newchildren->{$edge} = $child; } } } } return $newnode; } } use constant { bare_id_re => qr/^(?:[A-Za-z_][0-9A-Za-z_]*|-?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+))$/, keyword_re => qr/^(?:digraph|edge|graph|node|strict|subgraph)$/, }; sub quote_id { my($id) = @_; if ($id =~ bare_id_re && $id !~ keyword_re) { $id; } else { $id =~ s/\"/\\\"/g; qq{"$id"}; } } sub dump_dawg { my($dawg, $fh) = @_; my($node, %ix, @todo, $nodeix, $nextix); print $fh "digraph {\n"; print $fh " rankdir=LR;\n"; print $fh " node [shape=circle];\n"; $node = $dawg; $ix{0+$node} = $nodeix = $nextix++; if ($node->{match}) { print $fh (" ", quote_id($nodeix), " [shape=doublecircle, style=filled];\n"); } else { print $fh " ", quote_id($nodeix), " [style=filled];\n"; } for (;;) { if (my $children = $node->{children}) { foreach my $edge (sort(keys(%{$children}))) { my($child); $child = $children->{$edge}; for my $childix ($ix{0+$child}) { if (!defined($childix)) { $childix = $nextix++; push(@todo, $child); } print $fh (" ", quote_id($nodeix), "->", quote_id($childix), " [label=", quote_id($edge), "];\n"); } } } $node = shift(@todo) or last; $nodeix = $ix{0+$node}; if ($node->{match}) { print $fh " ", quote_id($nodeix), " [shape=doublecircle];\n"; } } print $fh "}\n"; } my @stats = ( [nodes => "total nodes"], [nodes_none => " nodes with no edges"], [nodes_single => " nodes with only single-character edges"], [nodes_multi => " nodes with only multi-character edges"], [nodes_both => " nodes with both single- and multi-character edges"], [edges => "total edges"], [edges_single => " single-character edges"], [edges_multi => " multi-character edges"], [chars => "total edge characters"]); sub analyse_dawg { my($dawg, $name) = @_; my(%seen, @todo, %stats); %stats = map(($_->[0] => 0), @stats); push(@todo, $dawg); $seen{0+$dawg} = 1; while (@todo) { my($node, $single, $multi); $node = shift(@todo); $stats{nodes}++; if (my $children = $node->{children}) { foreach my $edge (keys(%{$children})) { my($child); if (length($edge) > 1) { $multi++; } else { $single++; } $stats{chars} += length($edge); $child = $children->{$edge}; if (! $seen{0+$child}++) { push(@todo, $child); } } if ($single) { $stats{edges} += $single; $stats{edges_single} += $single; if ($multi) { $stats{edges} += $multi; $stats{edges_multi} += $multi; $stats{nodes_both}++; } else { $stats{nodes_single}++; } } else { if ($multi) { $stats{edges} += $multi; $stats{edges_multi} += $multi; $stats{nodes_multi}++; } } } else { $stats{nodes_none}++; } } print "=== ", $name, " ===\n"; foreach my $stat (@stats) { printf("%10u %s\n", $stats{$stat->[0]}, $stat->[1]); } print "\n"; } sub usage { print STDERR "Usage: $0 [ --dot ] filename...\n"; exit(1); } my($dot); GetOptions("dot" => \$dot) && @ARGV>0 or usage(); sub process_dawg { my($dawg, $name) = @_; analyse_dawg($dawg, $name); if ($dot) { my($fh); open($fh, ">", "$name.dot") or die "$name.dot: $!"; dump_dawg($dawg, $fh); close($fh); } } foreach my $arg (@ARGV) { my($plain, $fold, $opt, $fold_opt, $opt_fold); if (open(my $fh, "<", $arg)) { $plain = load_trie($fh); close($fh); } else { warn "$arg: $!\n"; next; } process_dawg($plain, $arg); $opt = optimise($plain); process_dawg($opt, "${arg}_opt"); $fold = fold_trie($plain); undef $plain; process_dawg($fold, "${arg}_fold"); $opt_fold = fold_dawg($opt); undef $opt; process_dawg($opt_fold, "${arg}_opt_fold"); $fold_opt = optimise($fold); undef $fold; process_dawg($fold_opt, "${arg}_fold_opt"); }