#!/usr/bin/perl -w
# Copyright © 2004 Jamie Zawinski <jwz@jwz.org>
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation. No representations are made about the suitability of this
# software for any purpose. It is provided "as is" without express or
# implied warranty.
#
# Created: 3-Mar-2004 by Jamie Zawinski, Anonymous, and Jacob Post.
#
##############################################################################
#
# This is a program that can read the Mozilla URL history file --
# normally $HOME/.mozilla/default/*.slt/history.dat -- and prints out
# a list of URLs and their time of last access. With no arguments,
# it prints lines like
#
# 1078333826 1 http://www.jwz.org/hacks/
#
# where the first number is a ctime (number of seconds since Jan 1 1970 GMT)
# and the second number is how many times this URL was visited. The URLs are
# printed most-recent-first.
#
# With -vv, it prints all the information known about each URL,
# including time of first visit, last visit, document title, etc.
#
# With --html, it produces HTML output instead of plain text.
#
# With "--age 2H", it limits itself to URLs that were loaded within the
# last two hours. Likewise with "sec", "min", "day", "month", etc.
#
##############################################################################
#
# And Now, The Ugly Truth Laid Bare:
#
# In Netscape Navigator 1.0 through 4.0, the history.db file was just a
# Berkeley DBM file. You could trivially bind to it from Perl, and
# pull out the URLs and last-access time. In Mozilla, this has been
# replaced with a "Mork" database for which no tools exist.
#
# Let me make it clear that McCusker is a complete barking lunatic.
# This is just about the stupidest file format I've ever seen.
#
# http://www.mozilla.org/mailnews/arch/mork/primer.txt
# http://jwz.livejournal.com/312657.html
# http://www.jwz.org/doc/mailsum.html
# http://bugzilla.mozilla.org/show_bug.cgi?id=241438
#
# In brief, let's count its sins:
#
# - Two different numerical namespaces that overlap.
#
# - It can't decide what kind of character-quoting syntax to use:
# Backslash? Hex encoding with dollar-sign?
#
# - C++ line comments are allowed sometimes, but sometimes // is just
# a pair of characters in a URL.
#
# - It goes to all this serious compression effort (two different
# string-interning hash tables) and then writes out Unicode strings
# without using UTF-8: writes out the unpacked wchar_t characters!
#
# - Worse, it hex-encodes each wchar_t with a 3-byte encoding,
# meaning the file size will be 3x or 6x (depending on whether
# whchar_t is 2 bytes or 4 bytes.)
#
# - It masquerades as a "textual" file format when in fact it's just
# another binary-blob file, except that it represents all its magic
# numbers in ASCII. It's not human-readable, it's not hand-editable,
# so the only benefit there is to the fact that it uses short lines
# and doesn't use binary characters is that it makes the file bigger.
# Oh wait, my mistake, that isn't actually a benefit at all.
#
# Pure comedy.
#
##############################################################################
require 5;
use diagnostics;
use strict;
use POSIX qw(strftime);
my $progname = $0; $progname =~ s@.*/@@g;
my $version = q{ $Revision: 1.14 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
my $verbose = 0;
my $show_all_p = 1;
my (%key_table, %val_table, %row_hash);
my ($total, $skipped) = (0, 0);
# Returns a list of hashes, the contents of the mork file.
#
sub mork_parse_file {
my ($file, $age) = @_;
local $/ = undef;
local *IN;
my $since = ($age ? time() - $age : 0);
##########################################################################
# Define the messy regexen up here
##########################################################################
my $top_level_comment = qr@//.*\n@;
my $key_table_re = qr/ < \s* < # "< <"
\( a=c \) > # "(a=c)>"
(?> ([^>]*) ) > \s* # Grab anything that's not ">"
/sx;
my $value_table_re = qr/ < ( .*?\) )> \s* /sx;
my $table_re = qr/ \{ -? # "{" or "{-"
[\da-f]+ : # hex, ":"
(?> .*?\{ ) # Eat up to a {...
((?> .*?\} ) # and then the closing }...
(?> .*?\} )) # Finally, grab the table section
\s* /six;
my $row_re = qr/ ( (?> \[ [^]]* \] # "["..."]"
\s*)+ ) # Perhaps repeated many times
/sx;
my $section_begin_re = qr/ \@\$\$\{ # "@$${"
([\dA-F]+) # hex
\{\@ \s* # "{@"
/six;
my $section_end_re = undef;
my $section = "top level";
##########################################################################
# Read in the file.
##########################################################################
open (IN, "<$file") || error ("$file: $!");
print STDERR "$progname: reading $file...\n" if ($verbose);
my $body = <IN>;
close IN;
$body =~ s/\r\n/\n/gs; # Windows Mozilla uses \r\n
$body =~ s/\r/\n/gs; # Presumably Mac Mozilla is similarly dumb
$body =~ s/\\\\/\$5C/gs; # Sometimes backslash is quoted with a
# backslash; convert to hex.
$body =~ s/\\\)/\$29/gs; # close-paren is quoted with a backslash;
# convert to hex.
$body =~ s/\\\n//gs; # backslash at end of line is continuation.
##########################################################################
# Figure out what we're looking at, and parse it.
##########################################################################
print STDERR "$progname: $file: parsing...\n" if ($verbose);
pos($body) = 0;
my $length = length($body);
while( pos($body) < $length ) {
# Key table
if ( $body =~ m/\G$key_table_re/gc ) {
mork_parse_key_table($file, $section, $1);
# Values
} elsif ( $body =~ m/\G$value_table_re/gco ) {
mork_parse_value_table($file, $section, $1);
# Table
} elsif ( $body =~ m/\G$table_re/gco ) {
mork_parse_table($file, $section, $age, $since, $1);
# Rows (-> table)
} elsif ( $body =~ m/\G$row_re/gco ) {
mork_parse_table($file, $section, $age, $since, $1);
# Section begin
} elsif ( $body =~ m/\G$section_begin_re/gco ) {
$section = $1;
$section_end_re = qr/\@\$\$\}$section\}\@\s*/s;
# Section end
} elsif ( $section_end_re && $body =~ m/\G$section_end_re/gc ) {
$section_end_re = undef;
$section = "top level";
# Comment
} elsif ( $body =~ m/\G$top_level_comment/gco ) {
#no-op
} else {
# $body =~ m/\G (.{0,300}) /gcsx; print "<$1>\n";
error("$file: $section: Cannot parse");
}
}
if($section_end_re) {
error("$file: Unterminated section $section");
}
print STDERR "$progname: $file: sorting...\n" if ($verbose);
my @entries = sort { $b->{LastVisitDate} <=>
$a->{LastVisitDate} } values(%row_hash);
print STDERR "$progname: $file: done! ($total total, $skipped skipped)\n"
if ($verbose);
(%key_table, %val_table, %row_hash, $total, $skipped) = ();
return \@entries;
}
##########################################################################
# parse a row and column table
##########################################################################
sub mork_parse_table {
my($file, $section, $age, $since, $table_part) = (@_);
print STDERR "\n" if ($verbose > 3);
# Assumption: no relevant spaces in values in this section
$table_part =~ s/\s+//g;
# print $table_part; #exit(0);
#Grab each complete [...] block
while( $table_part =~ m/\G [^[]* \[ # find a "["
( [^]]+ ) \] # capture up to "]"
/gcx ) {
$_ = $1;
my %hash;
my ($id, @cells) = split (m/[()]+/s);
next unless scalar(@cells);
# Trim junk
$id =~ s/^-//;
$id =~ s/:.*//;
if($row_hash{$id}) {
%hash = ( %{$row_hash{$id}} );
} else {
%hash = ( 'ID' => $id,
'LastVisitDate' => 0 );
}
foreach (@cells) {
next unless $_;
my ($keyi, $which, $vali) =
m/^\^ ([-\dA-F]+)
([\^=])
(.*)
$/xi;
error ("$file: unparsable cell: $_\n") unless defined ($vali);
# If the key isn't in the key table, ignore it
#
my $key = $key_table{$keyi};
next unless defined($key);
my $val = ($which eq '='
? $vali
: $val_table{$vali});
if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') {
$val = int ($val / 1000000); # we don't need milliseconds, dude.
}
$hash{$key} = $val;
#print "$id: $key -> $val\n";
}
if ($age && ($hash{LastVisitDate} || $since) < $since) {
print STDERR "$progname: $file: skipping old: " .
"$hash{LastVisitDate} $hash{URL}\n"
if ($verbose > 3);
$skipped++;
next;
}
$total++;
$row_hash{$id} = \%hash;
}
}
##########################################################################
# parse a values table
##########################################################################
sub mork_parse_value_table {
my($file, $section, $val_part) = (@_);
return unless $val_part;
my @pairs = split (m/\(([^\)]+)\)/, $val_part);
$val_part = undef;
print STDERR "\n" if ($verbose > 3);
foreach (@pairs) {
next unless (m/[^\s]/s);
my ($key, $val) = m/([\dA-F]*)[\t\n ]*=[\t\n ]*(.*)/i;
if (! defined ($val)) {
print STDERR "$progname: $file: $section: unparsable val: $_\n";
next;
}
# Assume that URLs and LastVisited are never hexilated; so
# don't bother unhexilating if we won't be using Name, etc.
if($show_all_p && $val =~ m/\$/) {
# Approximate wchar_t -> ASCII and remove NULs
$val =~ s/\$00//g; # faster if we remove these first
$val =~ s/\$([\dA-F]{2})/chr(hex($1))/ge;
}
$val_table{$key} = $val;
print STDERR "$progname: $file: $section: val $key = \"$val\"\n"
if ($verbose > 3);
}
}
##########################################################################
# parse a key table
##########################################################################
sub mork_parse_key_table {
my ($file, $section, $key_table) = (@_);
print STDERR "\n" if ($verbose > 3);
$key_table =~ s@\s+//.*$@@gm;
my @pairs = split (m/\(([^\)]+)\)/s, $key_table);
$key_table = undef;
foreach (@pairs) {
next unless (m/[^\s]/s);
my ($key, $val) = m/([\dA-F]+)\s*=\s*(.*)/i;
error ("$file: $section: unparsable key: $_") unless defined ($val);
# If we're only emitting URLs and dates, don't even bother
# saving the other fields that we aren't interested in.
#
next if (!$show_all_p &&
$val ne 'URL' && $val ne 'LastVisitDate' &&
$val ne 'VisitCount');
$key_table{$key} = $val;
print STDERR "$progname: $file: $section: key $key = \"$val\"\n"
if ($verbose > 3);
}
}
sub html_quote {
my ($s) = @_;
$s =~ s/&/&/g;
$s =~ s/</</g;
$s =~ s/>/>/g;
$s =~ s/\"/"/g;
return $s;
}
sub html_wrap {
my ($s) = @_;
$s = html_quote ($s);
# while there are non-wrappable chunks of 30 characters,
# insert wrap points at certain punctuation characters every 10 characters.
while ($s =~ m/[^\s]{30}/s) {
last unless ($s =~ s@([^\s]{10})([/;,])([^/\s])@$1$2 $3@gs ||
$s =~ s@([^\s]{10})([-_\$\#?.]|&|%(2F|2C|26))@$1 $2@gs);
}
# if we still have non-wrappable chunks of 40 characters,
# insert wrap points every 30 characters no matter what.
while ($s =~ m/[^\s]{40}/s) {
last unless ($s =~ s@([^\s]{30})@$1 @gs);
}
return $s;
}
sub format_urls {
my ($results, $html_p) = @_;
print "<TABLE BORDER=0 CELLPADDING=" . ($show_all_p ? "4" : "0") .
" CELLSPACING=0>\n"
if ($html_p);
foreach my $hash (@$results) {
if ($show_all_p) {
#
# Print every field in the hash.
#
if ($html_p) {
print " <TR>\n";
print " <TD NOWRAP ALIGN=RIGHT VALIGN=TOP>$hash->{ID} </TD>\n";
print " <TD NOWRAP>\n";
print " <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>\n";
}
my %key_sort_table = (
'ID' => ' 0 ',
'URL' => ' 1 ',
'Name' => ' 2 ',
'Hostname' => ' 3 ',
'FirstVisitDate' => ' 4 ',
'LastVisitDate' => ' 5 '
);
foreach my $key (sort { ($key_sort_table{$a} || $a) cmp
($key_sort_table{$b} || $b)
} (keys(%$hash))) {
my $val = $hash->{$key};
if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') {
$val = localtime ($val);
}
if ($html_p) {
next if ($key eq 'ID');
$key = html_quote ($key);
$val = ($key eq 'URL'
? "<A HREF=\"$val\">" . html_wrap ($val) . "</A>"
: html_wrap ($val));
print " <TR>\n";
print " <TD VALIGN=TOP NOWRAP ALIGN=RIGHT>$key: </TD>\n";
print " <TD VALIGN=TOP>$val</TD>\n";
print " </TR>\n";
} else {
print sprintf ("%14s = %s\n", $key, $val);
}
}
if ($html_p) {
print " </TABLE>\n";
print " </TD>\n";
print " </TR>\n";
}
print "\n";
} else {
#
# Print just the URLs and their last-load-times.
#
my $url = $hash->{'URL'};
my $date = $hash->{'LastVisitDate'} || 0;
my $count = $hash->{'VisitCount'} || 1;
next unless defined ($url);
if ($html_p) {
$date = strftime("%d %b %l:%M %p", localtime ($date));
my $u2 = html_wrap ($url);
print " <TR>";
print "<TD VALIGN=TOP ALIGN=RIGHT NOWRAP>";
print "($count) " if ($count > 1);
print "$date </TD>";
print "<TD VALIGN=TOP><A HREF=\"$url\">$u2</A></TD>";
print "</TR>\n";
} else {
print "$date\t$count\t$url\n";
}
}
}
print "</TABLE>\n" if ($html_p);
}
sub error {
($_) = @_;
print STDERR "$progname: $_\n";
exit 1;
}
sub usage {
print STDERR "usage: $progname [--verbose] [--html] [--age secs] " .
"mork-input-file\n" .
"\t'age' can be of the form '2h', '3d', etc.\n";
exit 1;
}
sub main {
my ($file, $age, $html_p);
while ($#ARGV >= 0) {
$_ = shift @ARGV;
if ($_ eq "--verbose") { $verbose++; }
elsif (m/^-v+$/) { $verbose += length($_)-1; }
elsif ($_ eq "--age") { $age = shift @ARGV; }
elsif ($_ eq "--html") { $html_p = 1; }
elsif (m/^-./) { usage; }
elsif (!defined($file)) { $file = $_; }
else { usage; }
}
usage() unless defined($file);
$show_all_p = ($verbose > 1);
if (!$age) {
} elsif ($age =~ m/^(\d+)\s*s(ec(onds?)?)?$/i) {
$age = $1 + 0;
} elsif ($age =~ m/^(\d+)\s*m(in(utes?)?)?$/i) {
$age = $1 * 60;
} elsif ($age =~ m/^(\d+)\s*h(ours?)?$/i) {
$age = $1 * 60 * 60;
} elsif ($age =~ m/^(\d+)\s*d(ays?)?$/i) {
$age = $1 * 60 * 60 * 24;
} elsif ($age =~ m/^(\d+)\s*w(eeks?)?$/i) {
$age = $1 * 60 * 60 * 24 * 7;
} elsif ($age =~ m/^(\d+)\s*m(on(ths?)?)?$/i) {
$age = $1 * 60 * 60 * 24 * 30;
} elsif ($age =~ m/^(\d+)\s*y(ears?)?$/i) {
$age = $1 * 60 * 60 * 24 * 365;
} else {
error ("unparsable: --age $age");
}
my $results = mork_parse_file ($file, $age);
format_urls ($results, $html_p);
}
main;
exit 0;
syntax highlighted by
Code2HTML, v. 0.9.1