#!/usr/bin/perl
#
# linkdups - Find duplicate files in a directory tree and link() them.
#
# Copyright (C) 2000-2005 Steven Pritchard
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# $Id: linkdups,v 1.12 2005/10/23 21:15:06 steve Exp $
#
# Options:
#   -r  recurse into directories named on the command line
#   -v  verbose: print progress and a summary on stdout
#   -D  debug: print per-file details on stderr (also enables -v output)
#   -n  dry run: report duplicates but do not touch any file
#   -t  only compare files that also share the same mtime

use strict;
use warnings;

use Getopt::Std;
use Digest::MD5;
use FileHandle;
use DirHandle;
use File::Basename;

# Option flags.
my %opt;
my $recursive = 0;
my $verbose   = 0;
my $debug     = 0;
my $dryrun    = 0;
my $usetimes  = 0;

# Counters reported in the summary.
my $total    = 0;    # bytes saved (or that would be saved with -n)
my $numtotal = 0;    # files registered in pass 1
my $num      = 0;    # files hashed in pass 2

# Timing state; only populated when -v or -D is given.
my $hires = 0;       # true when Time::HiRes could be loaded
my $start = 0;       # [sec, usec] at start-up when $hires is true
my $pass1;           # timestamp taken at the end of pass 1

# Candidate files from pass 1:
#   without -t:  $sizes{$size}          = [ paths ]
#   with    -t:  $sizes{$size}{$mtime}  = [ paths ]
my %sizes;

# Digest object shared by every check() call.
my $context;

# Refuse to run with an unknown option: this tool modifies files, so a typo
# (for instance a mistyped -n) must not silently fall through to a real run.
getopts('rvDnt', \%opt)
    or die "Usage: " . basename($0) . " [-rvDnt] [file ...]\n";

$recursive = 1 if $opt{'r'};
$verbose   = 1 if $opt{'v'};
$debug     = 1 if $opt{'D'};
$dryrun    = 1 if $opt{'n'};
$usetimes  = 1 if defined $opt{'t'};

if ($verbose or $debug) {
    # "use" runs at compile time and cannot be guarded by eval, so load the
    # module at run time and call its functions fully qualified.
    $hires = eval { require Time::HiRes; 1 } ? 1 : 0;
    $start = [ Time::HiRes::gettimeofday() ] if $hires;
}

if (!@ARGV) {
    # NOTE(review): "-" is presumably meant as "read names from stdin", but
    # as written it is simply lstat()ed like any other file name.
    @ARGV = ($recursive ? "." : "-");
}

# Pass 1 - stat() everything.
for my $file (@ARGV) {
    my @statbuf = lstat($file);
    if (!@statbuf) {
        print STDERR basename($0), ": $file: $!\n";
        next;
    }

    if ($recursive and -d _) {
        recurse_into($file);
    }
    elsif (-f _) {
        register_file($file, @statbuf);
    }
    else {
        # Directories (without -r), symlinks, devices, etc. must never be
        # hashed or replaced by a hard link.
        debug(basename($0), ": '$file' is not a plain file, skipping...\n");
    }
}

if ($verbose or $debug) {
    my $elapsed;
    if ($hires) {
        $pass1   = [ Time::HiRes::gettimeofday() ];
        $elapsed = Time::HiRes::tv_interval($start, $pass1);
    }
    else {
        $pass1   = time();
        $elapsed = $pass1 - $^T;
    }
    info("$numtotal files scanned in $elapsed seconds.\n");
}

$context = Digest::MD5->new;

# Pass 2 - Check for files with the same size, md5sum them and link duplicates.
# Largest sizes are handled first.
for my $size (sort { $b <=> $a } keys %sizes) {
    if ($usetimes) {
        for my $mtime (sort { $a <=> $b } keys %{ $sizes{$size} }) {
            my $group = $sizes{$size}{$mtime};
            next if @$group < 2;
            info(scalar(@$group), " files of size $size modified ",
                scalar(localtime($mtime)), "\n");
            check($group);
        }
    }
    else {
        my $group = $sizes{$size};
        next if @$group < 2;
        info(scalar(@$group), " files of size $size\n");
        check($group);
    }
}

# The summary is only ever printed with -v/-D; guarding it here also avoids
# touching $pass1, which is unset on a quiet run.
if ($verbose or $debug) {
    my $elapsed = $hires ? Time::HiRes::tv_interval($pass1) : time() - $pass1;
    info("$num files checked in ", $elapsed,
        " seconds ($numtotal files scanned).\n");
    info("$total bytes ", ($dryrun ? "would be " : ""), "saved.\n");
}

# register_file($path, @statbuf)
#
# Record a plain file found in pass 1.  @statbuf is the lstat() result for
# $path; the link count (field 3), size (field 7) and mtime (field 9) are
# used.  Files that already have more than one link are reported on stderr.
sub register_file {
    my ($path, @statbuf) = @_;

    if ($statbuf[3] > 1) {
        print STDERR "link count on $path is $statbuf[3]!\n";
    }
    $numtotal++;

    if ($usetimes) {
        push @{ $sizes{ $statbuf[7] }{ $statbuf[9] } }, $path;
    }
    else {
        push @{ $sizes{ $statbuf[7] } }, $path;
    }
    return;
}

# check(\@files)
#
# Hash every file in the group (all of which had the same size in pass 1).
# The first file seen with a given MD5 digest is kept; each later file with
# the same digest is replaced by a hard link to it, unless -n was given.
# Dies if a duplicate cannot be replaced.
sub check {
    my ($files) = @_;
    my %md5;    # digest => first path seen with that digest

    for my $file (@$files) {
        debug("$file\n");
        $num++;

        # Three-argument open: the file name is never parsed for a mode and
        # leading/trailing whitespace in it is preserved.
        my $fd;
        if (!open($fd, '<', $file)) {
            print STDERR basename($0), ": $file: $!\n";
            next;
        }
        binmode($fd);

        $context->reset();
        $context->addfile($fd);
        my $hash = $context->hexdigest();
        debug("$hash $file\n");
        close($fd);

        my $original = $md5{$hash};
        if (!defined $original) {
            $md5{$hash} = $file;
            next;
        }

        my @dup_stat = stat($file);
        if (!@dup_stat) {
            print STDERR basename($0), ": $file: $!\n";
            next;
        }
        my @orig_stat = stat($original);
        if (!@orig_stat) {
            print STDERR basename($0), ": $original: $!\n";
            next;
        }

        # Same device and inode: the two names already share storage, so
        # there is nothing to save and nothing to relink.  String comparison
        # because large inode numbers may be returned as decimal strings.
        if ($dup_stat[0] eq $orig_stat[0] and $dup_stat[1] eq $orig_stat[1]) {
            debug("$file is already linked to $original\n");
            next;
        }

        info("$file is a duplicate of $original");
        my $size = $dup_stat[7];
        if ($size == 0) {
            info(" but it is only $size bytes\n");
            # Every file in this group had the same size in pass 1, so the
            # rest of the group cannot save anything either.
            return;
        }
        info("\n");
        $total += $size;

        # Keep scanning on a dry run so that every duplicate is reported
        # and counted, not just the first one of the group.
        next if $dryrun;

        replace_with_link($original, $file);
    }
    return;
}

# replace_with_link($original, $file)
#
# Replace $file with a hard link to $original.  The link is first created
# under a temporary name in $file's directory and then renamed over $file,
# so $file is never removed unless its replacement already exists.
# Dies if the link or the rename fails.
sub replace_with_link {
    my ($original, $file) = @_;

    # A short name keeps us clear of the file name length limit even when
    # $file itself is close to it.
    my $tmp = dirname($file) . "/.linkdups-$$.tmp";

    debug("linking $original to $tmp...\n");
    link($original, $tmp)
        or die "Can't link $original to $file: $!\n";

    debug("renaming $tmp to $file...\n");
    if (!rename($tmp, $file)) {
        my $err = "$!";
        unlink($tmp);
        die "Can't replace $file with a link to $original: $err\n";
    }
    return;
}

# recurse_into($dir)
#
# Walk $dir depth-first.  Plain files are registered for pass 2,
# subdirectories are descended into, and everything else (including
# symlinks, since lstat() is used) is skipped.
sub recurse_into {
    my ($dir) = @_;

    my $dh;
    if (!opendir($dh, $dir)) {
        warn "Can't open $dir: $!\n";
        return;
    }
    # Read the whole listing up front so the handle is closed before we
    # recurse; only "." and ".." themselves are filtered out.
    my @entries = grep { !/\A\.{1,2}\z/ } readdir($dh);
    closedir($dh);

    for my $entry (@entries) {
        my $path    = "$dir/$entry";
        my @statbuf = lstat($path);
        if (!@statbuf) {
            # The entry vanished or is unreadable; do not test a stale "_".
            print STDERR basename($0), ": $path: $!\n";
            next;
        }

        if (-d _) {
            recurse_into($path);
        }
        elsif (-f _) {
            register_file($path, @statbuf);
        }
        else {
            debug(basename($0), ": '$path' is not a plain file, skipping...\n");
        }
    }
    return;
}

# debug(@message) - print @message on stderr when -D was given.
sub debug {
    print STDERR @_ if $debug;
    return;
}

# info(@message) - print @message on stdout when -v or -D was given.
sub info {
    print @_ if ($verbose or $debug);
    return;
}

# vi: set ai et: