#!/usr/bin/perl -w # findimagedupes (name to be changed later) # copyright 2001 rob kudla # licensed under the GNU Public License version 2.0 or later # # strengths: recognizes similar pictures with 98% accuracy when # the pictures actually have unique features; generates # collection files for easy managing of dupes with # gqview (hopefully pixie someday) # weaknesses: lots of false positives on contact sheets and # shots of things like ocean horizons, which all # reduce to basically this: # # 1111111111111111 # 1111111111111111 # 1111111111111111 # 1111111111111111 # 1111111111111111 # 1111111111111111 # 1111111111111111 # 1111111111111111 # 0000000000000000 # 0000000000000000 # 0000000000000000 # 0000000000000000 # 0000000000000000 # 0000000000000000 # 0000000000000000 # 0000000000000000; slows down geometrically with # larger image collections (>18 hours for 25144 # images, ~15 minutes for 2500) # # i've tried moving to an 8x8x8 bit array (twice the size but allows # greyscale comparisons) and it actually causes more false positives, # probably because 8x8 pixels just provides too little detail. use Image::Magick; #use strict; use warnings; use File::Basename; my $VERSION = "0.1.3"; my $prog = $0 . ''; $prog = substr($prog,rindex($prog,'/') + 1) if rindex($prog,'/') >= 0; my $image; sub handleSEGV { die "caught segfault in getfingerprint()\n"; }; # check args my %OPT; while (my $arg = shift) { if ($arg eq '-rescan') { $OPT{'rescan'} = 1; } elsif ($arg eq '-update') { $OPT{'update'} = 1; } elsif ($arg eq '-t') { $OPT{'threshold'} = shift; } elsif ($arg eq '-f') { $OPT{'dbfile'} = shift; } elsif ($arg eq '-?' || $arg eq '-h' || $arg eq '--help') { $OPT{'help'} = 1; } elsif ($arg eq '-d') { $OPT{'scandir'} = shift; } elsif ($arg eq '-v') { $OPT{'viewpgm'} = shift; } elsif ($arg eq '-c') { $OPT{'gqvfile'} = shift; } elsif ($arg eq '-p') { $OPT{'printfp'} = 1; } elsif ($arg eq '-verbose') { $OPT{'verbose'} = 1; } elsif ($arg eq '-include-hidden') { $OPT{'include-hidden'} = 1; } elsif ($arg eq '-g') { $OPT{'guimode'} = 1; } else { # assume it's a filename if (!defined($OPT{'file1'})) { $OPT{'file1'} = $arg; } else { $OPT{'file2'} = $arg; } } } # slap user if only one filename specified $OPT{'help'} = 1 if (defined $OPT{'file1'} && !defined $OPT{'file2'}); # print help message if needed if ($OPT{'help'}) { print "$prog - Copyright 2001 Rob Kudla - http://www.kudla.org/raindog This program is distributed under the terms of the GNU Public License; see the file COPYING for details. Usage: $prog [options] [ ] Options: -rescan = rescan fingerprints of all files in directory -update = like rescan, but do not read Files already in DB File and erase Entries not found any longer in Filesystem -f = use as image fingerprint database -d = scan instead of current directory -t = use as threshold% of similarity (default 90) -v = launch (in bg) to view each set of dupes -c = create GQView collection .gqv of duplicates = diff just those two files, using -v if present (other options ignored if files are specified) -p = only valid when files specified; prints the hex of the actual fingerprint of each file. -g = GUI mode: produce only machine-friendly output. -verbose = Verbose Output\n"; exit 0; } # set up defaults $OPT{'scandir'} = '.' unless defined $OPT{'scandir'}; $OPT{'dbfile'} = 'imagedupes-db.txt' unless defined $OPT{'dbfile'}; $OPT{'dbfile'} = "$OPT{'scandir'}/$OPT{'dbfile'}" unless $OPT{'dbfile'} =~ "/"; $OPT{'threshold'} = 90 unless defined $OPT{'threshold'} && $OPT{'threshold'} > 0; $OPT{'curdir'} = `pwd`; chop $OPT{'curdir'}; $OPT{'update'} ||=0; $OPT{'rescan'} ||=0; $OPT{'include-hidden'} ||= 0; # set up gqvfile if needed if (defined $OPT{'gqvfile'} && defined $OPT{'file1'}) { $OPT{'gqvfile'} = "$OPT{'gqvfile'}.gqv" if $OPT{'gqvfile'} !~ /\.gqv$/; open GQV, ">$OPT{'gqvfile'}"; print GQV "#GQView collection\n#Created with $prog version $VERSION\n"; close GQV; } # set up countbits array my @countbits_arr; for (my $i = 0; $i < 256; $i++) { $countbits_arr[$i] = _countbits (chr($i)); } # get columns if we can my $cols; if (!defined $OPT{'guimode'}) { $cols = `tput cols`; $cols += 0; $cols = 80 if $cols == 0; } # scan files in if the user wants or there's no database now if ( $OPT{'rescan'}>0 || $OPT{'update'}>0 || ( (!-e $OPT{'dbfile'}) || (-s $OPT{'dbfile'} == 0)) && !defined $OPT{'file1'} ) { print "Scanning fingerprints from $OPT{'scandir'} into $OPT{'dbfile'}.\n" if !(defined $OPT{'guimode'}); # get whole tree my $list = `find $OPT{'scandir'} -type f | sort`; # TODO: We should use File::Find here # get imagemagick object $image = Image::Magick->new; my $existing_summs={}; # create dbfile if ( $OPT{'update'} ) { open IMGFP, "<$OPT{'dbfile'}"; open IMGFPO, ">$OPT{'dbfile'}.sik"; while (my $line = ) { my ($file,$sum)=split(":",$line); $existing_summs->{$file}=$sum; if ( -s $file ) { print IMGFPO $line; } else { print "Ignoring $line"; } } close IMGFP; close IMGFPO; unlink "$OPT{'dbfile'}" if -e "$OPT{'dbfile'}"; rename "$OPT{'dbfile'}.sik","$OPT{'dbfile'}"; open IMGFP, ">>$OPT{'dbfile'}"; } else { open IMGFP, ">$OPT{'dbfile'}"; } # put tree into array my @list; if ( $OPT{'include-hidden'} ) { @list = split "\n", $list; } else { @list = grep { $_ !~ m,/\., } split "\n", $list; } # max value for our lame little statusbar my $numfiles = $#list + 1; # traverse the array. foreach $file (@list) { $curfile++; next if $existing_summs->{$file}; # erase current screen line system("tput el") if (!defined $OPT{'guimode'}); # build lame little status bar if (defined $OPT{'guimode'}) { $outputline = "Status::" . sprintf("%04d", $curfile) . "::" . sprintf("%04d", $numfiles) . "::" . sprintf("%03.2f", ($curfile/$numfiles) * 100); print "$outputline\n"; } else { $outputline = "[" . sprintf("%04d", $curfile) . "/" . sprintf("%04d", $numfiles) . "] " . sprintf("%2.0f%%", ($curfile/$numfiles) * 100) . statusbar($curfile, $numfiles) . "100% "; $outputline .= substr($file, 0, $cols - length($outputline) - 1); print "$outputline\n"; } # move cursor back up a line # I couldn't just use \r - it only updated like every 15 iterations # apparently the linux console only refreshes on a \n system("tput cuu1") unless defined $OPT{'guimode'} || defined $OPT{'verbose'}; # check what file thinks the file is $format = ''; $filetype = `file "$file"`; # # check for file types imagemagick is stupid about, which is # # basically anything but a bitmap image # # unless ($file =~ /\.txt$/i || $file =~ /\.html$/i || # $filetype =~ /zip/i || $filetype =~ /mp3/i || # $filetype =~ /link/i || $filetype =~ /rpm/i || # $filetype =~ /execut/i || $filetype =~ /socket/i || # $filetype =~ /pipe/i || $filetype =~ /postscript/i || # $filetype =~ /pdf/i || $filetype =~ /mpeg/i || # $file =~ /\.man$/i || # $filetype =~ /text/i || $file =~ /\.htm$/i ) { # or we could just assume the user has a good magic file. # Of course imagemagick will still crap out on some animated GIFs. # imagedups-db.txt =~ /image/ ..., so we want to avoid that. if ($filetype =~ /\:.*image/i || $filetype =~ /\:.*bitmap/i) { # Adjusted script to work with both old and new perlmagick # Old perlmagick returns a CSV string, new returns an array my @pingstring = $image->Ping($file); if(@pingstring) { if ($pingstring[0] =~ /,/) { @pingstring = split (',', $pingstring[0]); } $format = $pingstring[3]; } # else leave $format == '' } # oh yeah, and just in case a text file slips through (crash!!) if ($format ne '' && $format ne 'TXT') { $img = &getfingerprint($image, $file); # quote percents and colons in our db file. $filename = $file; $filename =~ s/\%/\%25/g; $filename =~ s/\:/\%3A/g; # only save if the image made a valid pbm. if (defined($img) && length($img) > 0) { print IMGFP "$filename:"; for (my $i = 0; $i < length($img); $i++) { # convert each byte of pbm to a hex pair. print IMGFP sprintf("%02x", ord(substr($img,$i,1))); } print IMGFP "\n"; } elsif (!defined($img)) { warn "warning: unable to get fingerprint of $file\n"; } } } close IMGFP; print "\n" if !(defined $OPT{'guimode'}); } # find dupes if (defined $OPT{'file1'}) { # do file1 and file2 $image = Image::Magick->new; my $fp1 = getfingerprint($image, $OPT{'file1'}) or die "fatal: unable to get fingerprint of $OPT{'file1'}\n"; my $fp2 = getfingerprint($image, $OPT{'file2'}) or die "fatal: unable to get fingerprint of $OPT{'file2'}\n"; # xor the two binary strings to find differences $fpdiff = $fp1 ^ $fp2; # print fingerprints if -p specified. if ($OPT{'printfp'}) { my $i; print "$OPT{'file1'}:"; for (my $i = 0; $i < length($fp1); $i++) { # convert each byte of pbm to a hex pair. print sprintf("%02x", ord(substr($fp1,$i,1))); } print "\n"; print "$OPT{'file2'}:"; for (my $i = 0; $i < length($fp2); $i++) { # convert each byte of pbm to a hex pair. print sprintf("%02x", ord(substr($fp2,$i,1))); } print "\n"; print "Difference:"; for (my $i = 0; $i < length($fpdiff); $i++) { # convert each byte of pbm to a hex pair. print sprintf("%02x", ord(substr($fpdiff,$i,1))); } print "\n"; } # how many bits are different? number and % $diffbits = countbits($fpdiff); $diffpct = sprintf("%0.2f",(1-($diffbits/256))*100); if (defined $OPT{'guimode'}) { print "Dupe::$OPT{'file1'}::$OPT{'file2'}::$diffpct\n"; } else { print "$OPT{'file1'} $OPT{'file2'}: seem to be $diffpct\% similar.\n"; } # launch the viewer if the user wanted us to if (defined($OPT{'viewpgm'})) { system("$OPT{'viewpgm'} $key &"); system("$OPT{'viewpgm'} $keys[$j] &"); print "Press enter when done viewing. " if !(defined $OPT{'guimode'}); ; } } else { # do whole tree open IMGFP, "<$OPT{'dbfile'}"; print "Finding duplicates in $OPT{'scandir'}, threshold $OPT{'threshold'}%.\n" if !(defined $OPT{'guimode'}); # load db into hash while ($line = ) { chop $line; ($key,$fp) = split(":",$line); # remember, : and % are escaped $key =~ s/\%3A/\:/g; $key =~ s/\%25/\%/g; $PFP{$key} = pack("H*", $fp); } @keys = keys %PFP; my $i = 0; my $bits_that_can_differ = 256 * (1 - $OPT{'threshold'} / 100 ); # traverse the hash foreach $key (@keys) { # generate lame little status bar if (defined $OPT{'guimode'}) { $outputline = "Status::" . sprintf("%04d", $i) . "::" . sprintf("%04d", $#keys) . "::" . sprintf("%03.2f", ($i/$#keys) * 100); print "$outputline\n"; } else { print "[" . sprintf("%04d", $i) . "/" . sprintf("%04d", $#keys) . "] 0%" . statusbar($i, $#keys) . "100%\n"; } # move the cursor up a line, see -rescan section system("tput cuu1") if (!defined $OPT{'guimode'}); # check remainder of hash for close matches for ( $j = $i + 1; $j <= $#keys; $j++) { # read pbm data for both entries and unhex my $fp1 = $PFP{$key}; my $fp2 = $PFP{$keys[$j]}; # xor the two binary strings to find differences $fpdiff = $fp1 ^ $fp2; # how many bits are different? number and % $diffbits = countbits($fpdiff); if ($diffbits <= $bits_that_can_differ) { $diffpct = sprintf("%0.2f",(1-($diffbits/256))*100); # blank line, we're going to tell the user something if (defined $OPT{'guimode'}) { print "Dupe:\:$key:\:$keys[$j]:\:$diffpct\n"; } else { system("tput el"); print "$key $keys[$j]: seem to be $diffpct\% similar.\n"; } # originally this was a log, now it writes out the gqvfile &difflog($key) if not defined($ALREADYDIFF{$key}); &difflog($keys[$j]) if not defined($ALREADYDIFF{$keys[$j]}); # and makes sure to only write out each file once $ALREADYDIFF{$key} = 1; $ALREADYDIFF{$keys[$j]} = 1; # launch the viewer if the user wanted us to if (defined($OPT{'viewpgm'})) { system("$OPT{'viewpgm'} $key &"); system("$OPT{'viewpgm'} $keys[$j] &"); print "Press enter when done viewing. "; ; } } } $i++; } # write out end of gqvfile, dunno if it's required but GQView does it if (defined $OPT{'gqvfile'}) { open GQV, ">>$OPT{'gqvfile'}"; print GQV "#end\n"; close GQV; } # close db file close IMGFP; print "\n"; } undef $image; sub base2 { # base2: converts binary string to list of 1's and 0's not unlike # pbm used to provide in text mode my $inval = shift; my $outval; for (my $i = 0; $i < length($inval); $i++) { for (my $j = 7; $j >= 0; $j--) { $outval .= (ord(substr($inval,$i,1)) and (2 ** $j) ? 1 : 0); } } $outval; } sub _countbits { # countbits: counts the 1 bits in a binary string (doesn't use base2) my $inval = shift; my $outval = 0; for (my $i = 0; $i < length($inval); $i++) { for (my $j = 7; $j >= 0; $j--) { my $bit = (ord(substr($inval,$i,1)) & (2 ** $j) ? 1 : 0); $outval += $bit; } } $outval; } sub countbits { my $inval = shift; my $outval = 0; for (my $i = 0; $i < length($inval); $i++) { $outval += $countbits_arr[ord(substr($inval,$i,1))]; } $outval; } sub statusbar { # statusbar: prints 0 to 50 dots based on $cur/$fin (arg0/arg1) my $cur = shift; my $fin = shift; my $dots = int(($cur/$fin)*50); my $blks = 50 - $dots; my $outline = ("." x ($dots)) . (" " x $blks); $outline; } sub debuglog { # debuglog: writes to debug log. my $arg = shift; open DEBUGLOG, ">>findimagedupes-debug.txt"; print DEBUGLOG "$arg\n"; close DEBUGLOG; } sub difflog { # difflog: used to be a debug thing, now handles the gqvfile output. return undef if not defined($OPT{'gqvfile'}); my $arg = shift; $arg =~ s/^\.\//$OPT{'curdir'}\//; open DIFFLOG, ">>$OPT{'gqvfile'}"; print DIFFLOG qq^"$arg"\n^; close DIFFLOG; } sub getfingerprint { # here's a good a place as any to document the algorithm. it's not # so much an algorithm as a philosophy, it's kind of too lame to be # an algorithm. suggestions for improvement are very welcome. # 1. read file. # 2. standardize size by resampling to 160x160. # 3. grayscale it. (reducing saturation seems faster than quantize.) # 4. blur it a lot. (gets rid of noise. we're going down 10x more anyway) # adding this nudges down false dupes about 10% and makes marginal # dupes (e.g. big gamma difference) show up about 10% higher. # 5. spread the intensity out as much as possible (normalize.) # 6. make it as contrasty as possible (equalize.) # this is for those real dark pictures that someone has slapped # a pure white logo on. yes, i tested this thoroughly on pr0n! # 7. resample again down to 16x16. I wanted to use a mosaic/pixelate # kind of thing but hopefully imagemagick's resample function works # roughly the same way. # 8. reduce to 1bpp (threshold using defaults) # 9. convert to pbm, er, um, raw mono # 10. save out to database as hex string containing raw image data # 11. when comparing, convert each file pair's thumbprints back to # binary and xor them. # 12. count the 1 bits in the result to approximate similarity. my $image = shift; my $file = shift; my (@blobs, $img); $SIG{SEGV} = \&handleSEGV; my $result = eval { my $error = $image->Read($file); if ( $error ) { warn "Could not read Image $file: '$error'\n"; return undef; }; $#$image = 0; $image->Sample("160x160!"); $image->Modulate(saturation=>-100); $image->Blur(radius=>5); $image->Normalize(); $image->Equalize(); $image->Sample("16x16"); $image->Threshold(); $image->Set(magick=>'mono'); @blobs = $image->ImageToBlob(); if(not defined ($blobs[0])) { warn("got undefined blobs for $file\n"); } else { # we used to discard the pbm header, but now we use raw mono # so we'll discard all but the first 32 bytes $img = substr($blobs[0],0,32); } }; $SIG{SEGV} = 'DEFAULT'; # free image but don't delete object. undef @$image; (defined $result) ? $img: undef; }