Search This Blog

2008-01-10

Automatically parse the Firefox history file


#!/usr/bin/perl -w
# Copyright © 2004 Jamie Zawinski <jwz@jwz.org>
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation. No representations are made about the suitability of this
# software for any purpose. It is provided "as is" without express or
# implied warranty.
#
# Created: 3-Mar-2004 by Jamie Zawinski, Anonymous, and Jacob Post.
#
##############################################################################
#
# This is a program that can read the Mozilla URL history file --
# normally $HOME/.mozilla/default/*.slt/history.dat -- and prints out
# a list of URLs and their time of last access. With no arguments,
# it prints lines like
#
# 1078333826 1 http://www.jwz.org/hacks/
#
# where the first number is a ctime (number of seconds since Jan 1 1970 GMT)
# and the second number is how many times this URL was visited. The URLs are
# printed most-recent-first.
#
# With -vv, it prints all the information known about each URL,
# including time of first visit, last visit, document title, etc.
#
# With --html, it produces HTML output instead of plain text.
#
# With "--age 2H", it limits itself to URLs that were loaded within the
# last two hours. Likewise with "sec", "min", "day", "month", etc.
#
##############################################################################
#
# And Now, The Ugly Truth Laid Bare:
#
# In Netscape Navigator 1.0 through 4.0, the history.db file was just a
# Berkeley DBM file. You could trivially bind to it from Perl, and
# pull out the URLs and last-access time. In Mozilla, this has been
# replaced with a "Mork" database for which no tools exist.
#
# Let me make it clear that McCusker is a complete barking lunatic.
# This is just about the stupidest file format I've ever seen.
#
# http://www.mozilla.org/mailnews/arch/mork/primer.txt
# http://jwz.livejournal.com/312657.html
# http://www.jwz.org/doc/mailsum.html
# http://bugzilla.mozilla.org/show_bug.cgi?id=241438
#
# In brief, let's count its sins:
#
# - Two different numerical namespaces that overlap.
#
# - It can't decide what kind of character-quoting syntax to use:
# Backslash? Hex encoding with dollar-sign?
#
# - C++ line comments are allowed sometimes, but sometimes // is just
# a pair of characters in a URL.
#
# - It goes to all this serious compression effort (two different
# string-interning hash tables) and then writes out Unicode strings
# without using UTF-8: writes out the unpacked wchar_t characters!
#
# - Worse, it hex-encodes each wchar_t with a 3-byte encoding,
# meaning the file size will be 3x or 6x (depending on whether
# whchar_t is 2 bytes or 4 bytes.)
#
# - It masquerades as a "textual" file format when in fact it's just
# another binary-blob file, except that it represents all its magic
# numbers in ASCII. It's not human-readable, it's not hand-editable,
# so the only benefit there is to the fact that it uses short lines
# and doesn't use binary characters is that it makes the file bigger.
# Oh wait, my mistake, that isn't actually a benefit at all.
#
# Pure comedy.
#
##############################################################################


require 5;
use diagnostics;
use strict;
use POSIX qw(strftime);

# Program base name (strip any leading directory path from $0).
my $progname = $0;
$progname =~ s{ .* / }{}gx;

# Numeric portion of the RCS revision string.
my $version = q{ $Revision: 1.14 $ };
$version =~ s/^[^0-9]+([0-9.]+).*$/$1/;

my $verbose    = 0;   # bumped by -v / --verbose
my $show_all_p = 1;   # print every known field, not just URL/date/count

# Shared parser state, filled in by the mork_parse_* routines.
my (%key_table, %val_table, %row_hash);
my ($total, $skipped) = (0, 0);

# Parse a Mork database file and return a reference to a list of row
# hashes (one per history entry), sorted most-recently-visited first.
#
#   $file - path to the history.dat (Mork) file.
#   $age  - optional maximum age in seconds; rows last visited earlier
#           than (now - $age) are skipped.  0/undef keeps everything.
#
sub mork_parse_file {
    my ($file, $age) = @_;
    local $/ = undef;               # slurp mode: read the whole file at once

    # Rows visited before $since get dropped (0 = no age filtering).
    my $since = ($age ? time() - $age : 0);

    ##########################################################################
    # Define the messy regexen up here
    ##########################################################################

    my $top_level_comment = qr@//.*\n@;

    my $key_table_re = qr/ < \s* <              # "< <"
                           \( a=c \) >          # "(a=c)>"
                           (?> ([^>]*) ) > \s*  # Grab anything that's not ">"
                         /sx;

    my $value_table_re = qr/ < ( .*?\) )> \s* /sx;

    my $table_re = qr/ \{ -?                    # "{" or "{-"
                       [\da-f]+ :               # hex, ":"
                       (?> .*?\{ )              # Eat up to a {...
                       ((?> .*?\} )             # and then the closing }...
                        (?> .*?\} ))            # Finally, grab the table section
                       \s* /six;

    my $row_re = qr/ ( (?> \[ [^]]* \]          # "["..."]"
                         \s*)+ )                # Perhaps repeated many times
                   /sx;

    my $section_begin_re = qr/ \@\$\$\{         # "@$${"
                               ([\dA-F]+)       # hex
                               \{\@ \s*         # "{@"
                             /six;

    my $section_end_re = undef;
    my $section = "top level";

    ##########################################################################
    # Read in the file.
    ##########################################################################

    # Three-arg open with a lexical handle: the old two-arg bareword form
    # (open IN, "<$file") could be tricked by mode characters embedded in
    # the filename and leaked a global filehandle.
    open (my $in, '<', $file) || error ("$file: $!");
    print STDERR "$progname: reading $file...\n" if ($verbose);

    my $body = <$in>;
    close ($in);

    $body =~ s/\r\n/\n/gs;   # Windows Mozilla uses \r\n
    $body =~ s/\r/\n/gs;     # Presumably Mac Mozilla is similarly dumb

    $body =~ s/\\\\/\$5C/gs; # Sometimes backslash is quoted with a
                             # backslash; convert to hex.
    $body =~ s/\\\)/\$29/gs; # close-paren is quoted with a backslash;
                             # convert to hex.
    $body =~ s/\\\n//gs;     # backslash at end of line is continuation.

    ##########################################################################
    # Figure out what we're looking at, and parse it.
    ##########################################################################

    print STDERR "$progname: $file: parsing...\n" if ($verbose);

    # Walk the body with \G-anchored matches; /gc keeps pos() where the
    # last successful match ended instead of resetting it on failure.
    pos($body) = 0;
    my $length = length($body);

    while ( pos($body) < $length ) {

        # Key table
        if ( $body =~ m/\G$key_table_re/gc ) {
            mork_parse_key_table($file, $section, $1);

        # Values
        } elsif ( $body =~ m/\G$value_table_re/gco ) {
            mork_parse_value_table($file, $section, $1);

        # Table
        } elsif ( $body =~ m/\G$table_re/gco ) {
            mork_parse_table($file, $section, $age, $since, $1);

        # Rows (-> table)
        } elsif ( $body =~ m/\G$row_re/gco ) {
            mork_parse_table($file, $section, $age, $since, $1);

        # Section begin
        } elsif ( $body =~ m/\G$section_begin_re/gco ) {
            $section = $1;
            $section_end_re = qr/\@\$\$\}$section\}\@\s*/s;

        # Section end
        } elsif ( $section_end_re && $body =~ m/\G$section_end_re/gc ) {
            $section_end_re = undef;
            $section = "top level";

        # Comment
        } elsif ( $body =~ m/\G$top_level_comment/gco ) {
            # no-op

        } else {
            # $body =~ m/\G (.{0,300}) /gcsx; print "<$1>\n";
            error("$file: $section: Cannot parse");
        }
    }

    if ($section_end_re) {
        error("$file: Unterminated section $section");
    }

    print STDERR "$progname: $file: sorting...\n" if ($verbose);

    # Most recently visited first.
    my @entries = sort { $b->{LastVisitDate} <=>
                         $a->{LastVisitDate} } values(%row_hash);

    print STDERR "$progname: $file: done! ($total total, $skipped skipped)\n"
        if ($verbose);

    # Reset the shared parser state so a subsequent call starts clean.
    # (The old list assignment `(%h, ..., $total, $skipped) = ()` let the
    # first hash slurp the empty list and left both counters undef, which
    # warns under -w on the next run; reset each explicitly instead.)
    %key_table = ();
    %val_table = ();
    %row_hash  = ();
    ($total, $skipped) = (0, 0);

    return \@entries;
}


##########################################################################
# parse a row and column table
##########################################################################

# Parse one rows/table section of the mork file.  Each "[...]" block is
# one row; its cells are resolved through the global %key_table (column
# names) and %val_table (interned values), and the result is merged into
# the global %row_hash keyed by row id.  When an $age limit is active,
# rows last visited before $since are counted in $skipped and dropped.
sub mork_parse_table {
my($file, $section, $age, $since, $table_part) = (@_);

print STDERR "\n" if ($verbose > 3);

# Assumption: no relevant spaces in values in this section
$table_part =~ s/\s+//g;

# print $table_part; #exit(0);

#Grab each complete [...] block
while( $table_part =~ m/\G [^[]* \[ # find a "["
( [^]]+ ) \] # capture up to "]"
/gcx ) {
$_ = $1;

my %hash;
# First piece is the row id; the remaining pieces are the "(cell)" bodies.
my ($id, @cells) = split (m/[()]+/s);

next unless scalar(@cells);

# Trim junk
$id =~ s/^-//; # strip a leading "-" (presumably a row-removal marker -- TODO confirm)
$id =~ s/:.*//; # drop everything from the first ":" onward

# Start from any row already seen with this id so later sections
# update it rather than replace it.
if($row_hash{$id}) {
%hash = ( %{$row_hash{$id}} );
} else {
%hash = ( 'ID' => $id,
'LastVisitDate' => 0 );
}

foreach (@cells) {
next unless $_;

# A cell is "^KEY=literal" or "^KEY^VALREF" -- $which distinguishes them.
my ($keyi, $which, $vali) =
m/^\^ ([-\dA-F]+)
([\^=])
(.*)
$/xi;

error ("$file: unparsable cell: $_\n") unless defined ($vali);

# If the key isn't in the key table, ignore it
#
my $key = $key_table{$keyi};
next unless defined($key);

# "=" means the value is literal; "^" means it is an id into %val_table.
my $val = ($which eq '='
? $vali
: $val_table{$vali});

if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') {
$val = int ($val / 1000000); # we don't need milliseconds, dude.
}

$hash{$key} = $val;
#print "$id: $key -> $val\n";
}


# Age filter.  A missing/zero LastVisitDate falls back to $since itself,
# which is not < $since, so undated rows are always kept.
if ($age && ($hash{LastVisitDate} || $since) < $since) {
print STDERR "$progname: $file: skipping old: " .
"$hash{LastVisitDate} $hash{URL}\n"
if ($verbose > 3);
$skipped++;
next;
}

$total++;
$row_hash{$id} = \%hash;
}
}


##########################################################################
# parse a values table
##########################################################################

# Parse one values-table section: each "(KEY=VALUE)" pair is stored in
# the global %val_table.  Hex-escaped wchar_t text ("$XX") is crudely
# folded down to ASCII, but only when those fields will be displayed.
sub mork_parse_value_table {
    my ($file, $section, $val_part) = @_;

    return unless $val_part;

    # Split out the parenthesized chunks; the capture keeps each pair's text.
    my @chunks = split m/\(([^\)]+)\)/, $val_part;
    undef $val_part;    # raw text no longer needed

    print STDERR "\n" if ($verbose > 3);

    for my $chunk (@chunks) {
        next unless $chunk =~ m/[^\s]/s;    # skip inter-pair filler

        my ($key, $val) = $chunk =~ m/([\dA-F]*)[\t\n ]*=[\t\n ]*(.*)/i;

        unless (defined ($val)) {
            print STDERR "$progname: $file: $section: unparsable val: $chunk\n";
            next;
        }

        # Assume that URLs and LastVisited are never hexilated; so
        # don't bother unhexilating if we won't be using Name, etc.
        if ($show_all_p && $val =~ m/\$/) {
            # Approximate wchar_t -> ASCII and remove NULs
            $val =~ s/\$00//g;                          # faster if we remove these first
            $val =~ s/\$([\dA-F]{2})/chr(hex($1))/ge;   # $XX -> chr(0xXX)
        }

        $val_table{$key} = $val;
        print STDERR "$progname: $file: $section: val $key = \"$val\"\n"
            if ($verbose > 3);
    }
}


##########################################################################
# parse a key table
##########################################################################

# Parse one key-table section: each "(HEXID=NAME)" pair maps a numeric
# cell id to a column name, stored in the global %key_table.
sub mork_parse_key_table {
    my ($file, $section, $key_part) = @_;

    print STDERR "\n" if ($verbose > 3);

    # Strip trailing C++-style comments.
    $key_part =~ s@\s+//.*$@@gm;

    my @chunks = split m/\(([^\)]+)\)/s, $key_part;
    undef $key_part;    # raw text no longer needed

    for my $chunk (@chunks) {
        next unless $chunk =~ m/[^\s]/s;    # skip inter-pair filler

        my ($key, $val) = $chunk =~ m/([\dA-F]+)\s*=\s*(.*)/i;
        error ("$file: $section: unparsable key: $chunk") unless defined ($val);

        # If we're only emitting URLs and dates, don't even bother
        # saving the other fields that we aren't interested in.
        #
        next if (!$show_all_p &&
                 $val ne 'URL' && $val ne 'LastVisitDate' &&
                 $val ne 'VisitCount');

        $key_table{$key} = $val;
        print STDERR "$progname: $file: $section: key $key = \"$val\"\n"
            if ($verbose > 3);
    }
}


# Escape the four HTML-special characters in $text and return the result.
# A single pass over the string is equivalent to the classic four-step
# substitution chain (ampersand handled first) since each character is
# rewritten exactly once.
sub html_quote {
    my ($text) = @_;
    my %entity = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>"])/$entity{$1}/g;
    return $text;
}

# HTML-quote $s and then insert spaces into long unbroken character runs
# so the browser has somewhere to wrap the line.  Each "last unless"
# guard exits its loop as soon as a pass makes no substitution, which
# prevents spinning forever on a run that has no usable break point.
sub html_wrap {
my ($s) = @_;
$s = html_quote ($s);

# while there are non-wrappable chunks of 30 characters,
# insert wrap points at certain punctuation characters every 10 characters.
while ($s =~ m/[^\s]{30}/s) {
last unless ($s =~ s@([^\s]{10})([/;,])([^/\s])@$1$2 $3@gs ||
$s =~ s@([^\s]{10})([-_\$\#?.]|&amp;|%(2F|2C|26))@$1 $2@gs);
}

# if we still have non-wrappable chunks of 40 characters,
# insert wrap points every 30 characters no matter what.
while ($s =~ m/[^\s]{40}/s) {
last unless ($s =~ s@([^\s]{30})@$1 @gs);
}

return $s;
}

# Print the parsed history entries to stdout.
#
#   $results - arrayref of row hashes from mork_parse_file(), already
#              sorted newest-first by the caller.
#   $html_p  - true to emit an HTML table instead of plain text.
#
# When the global $show_all_p is set, every known field of each row is
# printed; otherwise only "date<TAB>count<TAB>url" lines are emitted.
sub format_urls {
    my ($results, $html_p) = @_;

    # Preferred ordering for the well-known fields; any other field sorts
    # after them by its own name.  (Hoisted out of the per-row loop -- the
    # old code rebuilt this constant table once per row.)
    my %key_sort_table = (
        'ID'             => ' 0 ',
        'URL'            => ' 1 ',
        'Name'           => ' 2 ',
        'Hostname'       => ' 3 ',
        'FirstVisitDate' => ' 4 ',
        'LastVisitDate'  => ' 5 '
    );

    print "<TABLE BORDER=0 CELLPADDING=" . ($show_all_p ? "4" : "0") .
          " CELLSPACING=0>\n"
        if ($html_p);

    foreach my $hash (@$results) {

        if ($show_all_p) {
            #
            # Print every field in the hash.
            #
            if ($html_p) {
                print " <TR>\n";
                print " <TD NOWRAP ALIGN=RIGHT VALIGN=TOP>$hash->{ID}&nbsp;</TD>\n";
                print " <TD NOWRAP>\n";
                print " <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>\n";
            }

            foreach my $key (sort { ($key_sort_table{$a} || $a) cmp
                                    ($key_sort_table{$b} || $b)
                                  } (keys(%$hash))) {
                my $val = $hash->{$key};
                if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') {
                    $val = localtime ($val);    # ctime -> human-readable
                }
                if ($html_p) {
                    next if ($key eq 'ID');     # already shown in the left column
                    $key = html_quote ($key);
                    $val = ($key eq 'URL'
                            ? "<A HREF=\"$val\">" . html_wrap ($val) . "</A>"
                            : html_wrap ($val));
                    print " <TR>\n";
                    print " <TD VALIGN=TOP NOWRAP ALIGN=RIGHT>$key: &nbsp;</TD>\n";
                    print " <TD VALIGN=TOP>$val</TD>\n";
                    print " </TR>\n";
                } else {
                    print sprintf ("%14s = %s\n", $key, $val);
                }
            }

            if ($html_p) {
                print " </TABLE>\n";
                print " </TD>\n";
                print " </TR>\n";
            }
            print "\n";

        } else {
            #
            # Print just the URLs and their last-load-times.
            #
            my $url   = $hash->{'URL'};
            my $date  = $hash->{'LastVisitDate'} || 0;
            my $count = $hash->{'VisitCount'} || 1;
            next unless defined ($url);

            if ($html_p) {
                # NOTE(review): "%l" (blank-padded 12-hour clock) is a
                # glibc/BSD strftime extension, not POSIX -- may misformat
                # on other platforms; confirm before relying on it.
                $date = strftime("%d %b %l:%M %p", localtime ($date));
                my $u2 = html_wrap ($url);
                print " <TR>";
                print "<TD VALIGN=TOP ALIGN=RIGHT NOWRAP>";
                print "($count) " if ($count > 1);
                print "$date &nbsp;</TD>";
                print "<TD VALIGN=TOP><A HREF=\"$url\">$u2</A></TD>";
                print "</TR>\n";
            } else {
                print "$date\t$count\t$url\n";
            }
        }
    }

    print "</TABLE>\n" if ($html_p);
}


# Print MSG to stderr, prefixed with the program name, and exit nonzero.
# Takes a lexical copy of the argument: the old `($_) = @_` assigned to
# the global $_ without localizing it, clobbering the caller's $_.
sub error {
    my ($msg) = @_;
    print STDERR "$progname: $msg\n";
    exit 1;
}

# Print a one-line usage summary to stderr and abort.
sub usage {
    print STDERR "usage: $progname [--verbose] [--html] [--age secs] mork-input-file\n"
               . "\t'age' can be of the form '2h', '3d', etc.\n";
    exit 1;
}

# Command-line driver: parse options, convert the --age argument to
# seconds, then parse the file and print the results.
#
#   --verbose / -v / -vv ...  increase verbosity (vv+ prints all fields)
#   --age AGE                 only show entries newer than AGE ("2h", "3d", ...)
#   --html                    emit HTML instead of plain text
sub main {
    my ($file, $age, $html_p);

    while (defined (my $arg = shift @ARGV)) {
        if    ($arg eq "--verbose") { $verbose++; }
        elsif ($arg =~ m/^-v+$/)    { $verbose += length($arg) - 1; }
        elsif ($arg eq "--age")     { $age = shift @ARGV; }
        elsif ($arg eq "--html")    { $html_p = 1; }
        elsif ($arg =~ m/^-./)      { usage(); }
        elsif (!defined($file))     { $file = $arg; }
        else                        { usage(); }
    }

    usage() unless defined($file);

    $show_all_p = ($verbose > 1);

    if ($age) {
        # Unit patterns tried in order: a bare "m" means minutes, so the
        # minutes pattern must be tested before months.
        my @units = (
            [ qr/^(\d+)\s*s(ec(onds?)?)?$/i, 1                  ],
            [ qr/^(\d+)\s*m(in(utes?)?)?$/i, 60                 ],
            [ qr/^(\d+)\s*h(ours?)?$/i,      60 * 60            ],
            [ qr/^(\d+)\s*d(ays?)?$/i,       60 * 60 * 24       ],
            [ qr/^(\d+)\s*w(eeks?)?$/i,      60 * 60 * 24 * 7   ],
            [ qr/^(\d+)\s*m(on(ths?)?)?$/i,  60 * 60 * 24 * 30  ],
            [ qr/^(\d+)\s*y(ears?)?$/i,      60 * 60 * 24 * 365 ],
        );

        my $seconds;
        for my $unit (@units) {
            my ($re, $multiplier) = @$unit;
            if ($age =~ $re) {
                $seconds = $1 * $multiplier;
                last;
            }
        }
        error ("unparsable: --age $age") unless defined ($seconds);
        $age = $seconds;
    }

    my $results = mork_parse_file ($file, $age);
    format_urls ($results, $html_p);
}

# Script entry point.
main();
exit 0;




syntax highlighted by Code2HTML, v. 0.9.1

No comments:

Post a Comment

- the first minus - comments have to be moderated because of the spammers
- the second minus - I am very lazy about moderating comments ... I hardly find the time ...
- the third minus - short links are no good for security ...
- The REAL PLUS: any criticism or positive feedback is better than none, so your comments will be published sooner or later!

Labels

perl (41) Cheat Sheet (25) how-to (24) windows (14) sql server 2008 (13) linux (12) oracle (12) sql (12) Unix (11) cmd windows batch (10) mssql (10) cmd (9) script (9) textpad (9) netezza (8) sql server 2005 (8) cygwin (7) meta data mssql (7) metadata (7) bash (6) code generation (6) Informatica (5) cheatsheet (5) energy (5) tsql (5) utilities (5) excel (4) future (4) generic (4) git cheat sheet (4) html (4) perl modules (4) programs (4) settings (4) sh (4) shortcuts (4) поуки (4) принципи (4) Focus Fusion (3) Solaris (3) cool programs (3) development (3) economy (3) example (3) freeware (3) fusion (3) logging (3) morphus (3) mssql 2005 (3) nuclear (3) nz (3) parse (3) python (3) sftp (3) sofware development (3) source (3) sqlplus (3) table (3) vim (3) .Net (2) C# (2) China (2) GUI (2) Google (2) GoogleCL (2) Solaris Unix (2) architecture (2) ascii (2) awk (2) batch (2) cas (2) chrome extensions (2) code2html (2) columns (2) configuration (2) conversion (2) duplicates (2) excel shortcuts (2) export (2) file (2) free programs (2) informatica sql repository (2) linux cheat sheet (2) mssql 2008 (2) mysql (2) next big future (2) nsis (2) nz netezza cheat sheet (2) nzsql (2) ora (2) prediction (2) publish (2) release management (2) report (2) security (2) single-click (2) sqlserver 2005 (2) sqlserver 2008 (2) src (2) ssh (2) template (2) tools (2) vba (2) video (2) xlt (2) xml (2) youtube videos (2) *nix (1) .vimrc (1) .virmrc vim settings configs (1) BSD license (1) Bulgaria (1) Dallas (1) Database role (1) Dense plasma focus (1) Deployment (1) ERP (1) ExcelToHtml (1) GD (1) GDP (1) HP-UX (1) Hosting (1) IDEA (1) INC (1) IT general (1) ITIL management bullshit-management (1) IZarc (1) Java Web Start (1) JavaScript anchor html jquery (1) Khan Academy (1) LINUX UNIX BASH AND CYGWIN TIPS AND TRICKS (1) Linux Unix rpm cpio build install configure (1) Linux git source build .configure make (1) ListBox (1) MIT HYDROGEN VIRUS (1) OO (1) Obama (1) PowerShell (1) Run-time (1) 
SDL (1) SIWA (1) SOX (1) Scala (1) Services (1) Stacks (1) SubSonic (1) TED (1) abstractions (1) ansible hosts linux bash (1) ansible linux deployment how-to (1) ansible yum pip python (1) apache (1) apache 2.2 (1) application life cycle (1) architecture input output (1) archive (1) arguments (1) avatar (1) aws cheat sheet cli (1) aws cli (1) aws cli amazon cheat sheet (1) aws elb (1) backup (1) bash Linux open-ssh ssh ssh_server ssh_client public-private key authentication (1) bash perl search and replace (1) bash stub (1) bin (1) biofuels (1) biology (1) books (1) browser (1) bubblesort (1) bugs (1) build (1) byte (1) cas_sql_dev (1) chennai (1) chrome (1) class (1) claut (1) cmdow (1) code generation sqlserver (1) command (1) command line (1) conf (1) confluence (1) console (1) convert (1) cool programs windows free freeware (1) copy paste (1) copy-paste (1) csv (1) ctags (1) current local time (1) cygwin X11 port-forwarding mintty xclock Linux Unix X (1) cygwin bash how-to tips_n_tricks (1) cygwin conf how-to (1) data (1) data types (1) db2 cheat sheet (1) db2 starter ibm bash Linux (1) debt (1) diagram (1) dictionaries (1) digital (1) disk (1) disk space (1) documentation (1) dos (1) dubai (1) e-cars (1) electric cars (1) electricity (1) emulate (1) errors (1) exponents (1) export workflow (1) extract (1) fast export (1) fexp (1) file extension (1) file permissions (1) findtag (1) firewall (1) for loop (1) freaky (1) functions (1) fusion research (1) german (1) git gitlab issues handling system (1) google cli (1) google code (1) google command line interface (1) gpg (1) ha (1) head (1) helsinki (1) history (1) hop or flop (1) host-independant (1) how-to Windows cmd time date datetime (1) ibm db2 cognos installation example db deployment provisioning (1) ideas (1) image (1) informatica oracle sql (1) informatica repo sql workflows sessions file source dir (1) informatica source files etl (1) install (1) isg-pub issue-tracker architecture (1) it management best 
practices (1) java (1) jump to (1) keyboard shortcuts (1) ksh (1) level (1) linkedin (1) linux bash ansible hosts (1) linux bash commands (1) linux bash how-to shell expansion (1) linux bash shell grep xargs (1) linux bash tips and t ricks (1) linux bash unix cygwin cheatsheet (1) linux bash user accounts password (1) linux bash xargs space (1) linux cheat-sheet (1) linux cheatsheet cheat-sheet revised how-to (1) linux how-to non-root vim (1) linux ssh hosts parallel subshell bash oneliner (1) london (1) make (1) me (1) metacolumn (1) metadata functions (1) metaphonre (1) method (1) model (1) movie (1) multithreaded (1) mysql cheat sheet (1) mysql how-to table datatypes (1) n900 (1) nano (1) neteza (1) netezza bash linux nps (1) netezza nps (1) netezza nps nzsql (1) netezza nz Linux bash (1) netezza nz bash linux (1) netezza nz nzsql sql (1) netezza nzsql database db sizes (1) non-password (1) nord pol (1) nps backup nzsql schema (1) number formatting (1) nz db size (1) nz table count rows (1) nzsql date timestamp compare bigint to_date to_char now (1) on-lier (1) one-liners (1) one-to-many (1) oneliners (1) open (1) open source (1) openrowset (1) openssl (1) oracle PL/SQL (1) oracle Perl perl (1) oracle installation usability (1) oracle number formatting format-model ora-sql oracle (1) oracle templates create table (1) oracle trigger generic autoincrement (1) oracle vbox virtual box cheat sheet (1) oracle virtual box cheat sheet (1) outlook (1) parser (1) password (1) paths (1) perl @INC compile-time run-time (1) perl disk usage administration Linux Unix (1) perl modules configuration management (1) permissions (1) php (1) picasa (1) platform (1) postgreSQL how-to (1) powerShell cmd cygwin mintty.exe terminal (1) ppm (1) predictions (1) prices (1) principles (1) productivity (1) project (1) prompt (1) proxy account (1) public private key (1) publishing (1) putty (1) qt (1) read file (1) registry (1) relationship (1) repository (1) rm (1) scala ScalaFmt (1) scp (1) 
scripts (1) scsi (1) search and replace (1) sed (1) sendEmail (1) sh stub (1) shortcuts Windows sql developer Oracle (1) sidebar (1) silicon (1) smells (1) smtp (1) software development (1) software procurement (1) sofware (1) sort (1) sql script (1) sql_dev (1) sqlcmd (1) sqlite (1) sqlite3 (1) sshd (1) sshd cygwin (1) stackoverflow (1) stored procedure (1) stub (1) stupidity (1) subroutines (1) svn (1) sysinternals (1) system design (1) tail (1) tar (1) temp table (1) templates (1) teradata (1) terminal (1) test (1) testing (1) theory (1) thorium (1) time (1) tip (1) title (1) tmux .tmux.conf configuration (1) tmux efficiency bash (1) tool (1) ui code prototyping tips and tricks (1) umask Linux Unix bash file permissions chmod (1) url (1) urls (1) user (1) utility (1) utils (1) vb (1) vbox virtual box cheat sheet (1) vim perl regex bash search for string (1) vim recursively hacks (1) vim starter (1) vim-cheat-sheet vim cheat-sheet (1) vimeo (1) visual stuio (1) warsaw (1) wiki (1) wikipedia (1) window (1) windows 7 (1) windows 8 (1) windows programs (1) windows reinstall (1) windows utility batch perl space Windows::Clipboard (1) wisdoms (1) workflow (1) worth-reading (1) wrapper (1) xp_cmdshell (1) xslt (1) youtube (1)

Blog Archive

Translate with Google Translate

My Blog List