#!/usr/bin/perl
# Merge mbox format files, stripping dupes.
#
# Reads the mbox files named on the command line (or standard input when
# none are given), prints every message whose id has not been seen before
# to standard output, and warns on standard error about each duplicate.

use strict;
use warnings;
use Digest::MD5 qw/md5_base64/;

my %ids     = ();    # message ids seen
my $msgno   = 0;     # number, within $msgfile, of the message last returned
my $msgfile = "";    # input file in which that message began

# Read-ahead state private to getmsg.  Kept in separate variables rather
# than as marker strings in the line buffer, so that no input line can be
# mistaken for a marker.
my $nextline;        # "From " line already read that opens the next message
my $nextfile = "";   # input file $nextline was read from
my $started  = 0;    # true once the first input line has been read
my $finished = 0;    # true once all input has been consumed

# Record that a new message begins in $file: restart the numbering when the
# input file changes, then count the message.
sub _start_message {
    my ($file) = @_;
    if ($file ne $msgfile) {
        $msgfile = $file;
        $msgno   = 0;
    }
    $msgno++;
    return;
}

# Getmsg reads an mbox-format message from the input files and returns it,
# or returns "" once the input is exhausted.  It updates $msgfile and $msgno
# to describe the message being returned (not the one whose "From " line
# had to be read ahead to find the end of it).
sub getmsg {
    return "" if $finished;

    my $result = "";
    if (defined $nextline) {
        # The separator found by the previous call opens this message.
        $result = $nextline;
        _start_message($nextfile);
        undef $nextline;
    }

    while (defined(my $line = <>)) {
        if (!$started) {
            # The very first input line always opens the first message,
            # whether or not it looks like a "From " separator.
            $started = 1;
            _start_message($ARGV);
        } elsif ($line =~ /^From /) {
            # Start of the next message: keep it for the next call.
            $nextline = $line;
            $nextfile = $ARGV;
            return $result;
        }
        $result .= $line;
    }

    $finished = 1;
    return $result;
}

# Get the id out of a message.
# Uses the Message-ID: header if available, MD5 digest of the whole message
# if not.  Only the header block (everything before the first blank line)
# is searched, so a Message-ID: line quoted in the body is never used.
sub id {
    my ($msg) = @_;

    my ($header) = $msg =~ /\A(.*?)(?:^\r?\n|\z)/ms;

    # Allow no, one or several blanks (or a folded line) after the colon,
    # require a non-empty value, and drop trailing blanks and the CR of a
    # CRLF line ending so that copies differing only in that compare equal.
    my ($result) =
        $header =~ /^Message-ID:(?:[ \t]|\r?\n[ \t])*(\S.*?)[ \t\r]*$/mi;

    unless (defined($result)) {
        # warn "$msgfile:$msgno:no Message-ID, using MD5\n";
        $result = md5_base64($msg);
    }
    return $result;
}

# Copy every message with a not-yet-seen id to standard output and warn
# about the others.
sub main {
    # length(), not truth: a message is never empty, but it may be "0".
    while (length(my $msg = getmsg())) {
        my $id = id($msg);
        if (exists($ids{$id})) {
            warn "$msgfile:$msgno:duplicate id $id\n";
        } else {
            $ids{$id} = 1;
            print($msg) or die "$0: write failed: $!\n";
        }
    }
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main() unless caller;