/*
** cmpln v1.1.1
**
** Link identical files
**
** Copyright (C) 6.5.1997 by Andreas Ley <Andreas.Ley@rz.uni-karlsruhe.de>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
**
** This program has been tested on a HP9000/715 with HP-UX A.09.07
** In this environment, neither lint -u nor gcc -Wall -fsyntax-only produce
** any messages. If you encounter any errors or need to make any changes to
** port it to another platform, please contact me.
**
** Version history
**
** Version 1.1.1 - 11.2.2003
**	Check modification time
**
** Version 1.1 - 4.7.1998
**	Redesign of latter part due to filesystem & backup failure :(
**
** Version 1.0 - 6.5.1997
**	Initial version
*/

static char copyright[] = "@(#)Copyright (C) 1997,1998,2003 by Andreas Ley (Andreas.Ley@rz.uni-karlsruhe.de)";
static char sccsid[] = "@(#)cmpln v1.1.1 - Link identical files";


#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <sys/resource.h>


#ifndef FALSE
#define	FALSE	(0)		/* This is the naked Truth */
#define	TRUE	(1)		/* and this is the Light */
#endif

#define	CHUNKSIZE	1024	/* Add this many new file entries to the file
				   table in case it overflows */
#define	BUFFERSIZE	8192	/* Size of the buffer used for file contents
				   comparison. As many buffers get allocated
				   as files can be opened simultaneously. */


#define	coord(x,y)	((x)<(y)?(x)+len*(y):(x)*len+(y))


static char	*image;
static int	debug=0,verbose=FALSE,nonexec=FALSE,symbolic=FALSE;
static enum	{CROSS=0,GET_DEV,DONT_CROSS} xdev=GET_DEV;
static off_t	minsize=1024;
static int	files=0,maxfiles=0,maxopenfiles;
static dev_t	dev=(dev_t)-1;

typedef	struct direntry {
		char		*name;
		struct direntry	*parent;
	} direntry;

typedef struct {
		char		*name;
		direntry	*parent;
		ino_t		inode;
		off_t		size;
		time_t		mtime;
	} fileentry;

static fileentry	*entries=NULL;


/*
**  Qsort command to sort by file size.
*/
static int cmpsize(s1,s2)
fileentry	*s1,*s2;
{
	return(s1->size<s2->size?-1:s1->size>s2->size?1:0);
}


/*
**  Qsort command to sort by file inode.
*/
static int cmpinode(s1,s2)
fileentry	*s1,*s2;
{
	return(s1->inode<s2->inode?-1:s1->inode>s2->inode?1:0);
}


/*
**  Qsort command to sort by file mtime.
*/
static int cmpmtime(s1,s2)
fileentry	*s1,*s2;
{
	return(s1->mtime<s2->mtime?-1:s1->mtime>s2->mtime?1:0);
}



/*
**  Constructs the full path to a direntry.
*/
static char *path(parent)
direntry	*parent;
{
	static char	buffer[MAXPATHLEN+2];
	size_t		len;
	char		*ptr;

	if (debug>3)
		(void)fprintf(stderr,"path(0x%08lx)\n",(long)parent);
	ptr=buffer+sizeof(buffer)-1;
	*ptr='\0';
	while (parent) {
		len=strlen(parent->name);
		ptr-=len+1;
		if (ptr<buffer) {
			(void)fprintf(stderr,"%s: internal error: path buffer overflow\n",image);
			exit(1);
		}
		(void)strncpy(ptr+1,parent->name,len);
		*ptr='/';
		parent=parent->parent;
	}
	return(ptr+1);
}



/*
**  Constructs a relative path to link src to dest
*/
static char *buildsymbolic(src,dest)
char		*src,*dest;
{
	static char	buffer[MAXPATHLEN+1];
	char		*ptr,*sptr,*dptr;

	if (debug>2)
		(void)fprintf(stderr,"buildsymbolic(\"%s\",\"%s\")\n",src,dest);
	for(;;) {
		if (!(sptr=strchr(src,'/')))
			break;
		if (!(dptr=strchr(dest,'/')))
			break;
		if (strncmp(src,dest,(size_t)(sptr-src+1)))
			break;
		src=sptr+1;
		dest=dptr+1;
	}
	if (debug>2)
		(void)fprintf(stderr,"Divergent: \"%s\" \"%s\"\n",src,dest);
	ptr=buffer;
	while ((dptr=strchr(dest,'/'))) {
		if (ptr-buffer+4>(int)sizeof(buffer)) {
			(void)fprintf(stderr,"%s: internal error: symbolic path too long\n",image);
			exit(1);
		}
		(void)strcpy(ptr,"../");
		ptr+=3;
		dest=dptr+1;
	}
	if (ptr-buffer+strlen(src)+1>sizeof(buffer)) {
		(void)fprintf(stderr,"%s: internal error: symbolic path too long\n",image);
		exit(1);
	}
	(void)strcpy(ptr,src);
	if (debug>2)
		(void)fprintf(stderr,"Symbolic path: \"%s\"\n",buffer);
	return(buffer);
}



/*
**  Debug function: list files in table.
*/
static void listfiles(file,end)
int	file,end;
{
	if (debug>2)
		(void)fprintf(stderr,"listfiles(%d,%d)\n",file,end);
	for(;file<end;file++)
		(void)fprintf(stderr,"[%d]%10ld %s %ld %ld\n",file,(long)entries[file].size,path((direntry*)&entries[file]),entries[file].inode,entries[file].mtime);
}



/*
**  Debug function: list matrix.
*/
static void listmatrix(matrix,offset,len)
char	*matrix;
int	offset,len;
{
	int	x,y,size;
	char	buffer[16];

	if (debug>2)
		(void)fprintf(stderr,"listmatrix(0x%08lx,%d)\n",(long)matrix,len);
	(void)sprintf(buffer,"%d",len+offset-1);
	size=strlen(buffer);
	(void)fprintf(stderr,"%*s",size+2,"");
	for(x=0;x<len;x++)
		(void)fprintf(stderr," [%*d]",size,x+offset);
	(void)fprintf(stderr,"\n");
	for(y=0;y<len;y++) {
			(void)fprintf(stderr,"[%*d]",size,y+offset);
		for(x=0;x<len;x++)
			(void)fprintf(stderr,"%*d ",size+2,matrix[coord(x,y)]);
		(void)fprintf(stderr,"\n");
	}
}



/*
** Add a file's data to the table, increasing the table size if required.
*/
static void addfile(name,parent,inode,size,mtime)
char		*name;
direntry	*parent;
ino_t		inode;
off_t		size;
time_t		mtime;
{
	if (debug>2) {
		(void)fprintf(stderr,"addfile(\"%s\",0x%08lx=\"%s\",%ld,%ld,%ld)\n",name,(long)parent,path(parent),inode,size,mtime);
	}
	if (files>=maxfiles) {
		maxfiles+=CHUNKSIZE;
		if (debug)
			(void)fprintf(stderr,"addfile: expand to %d entries\n",maxfiles);
		if (!(entries=realloc((void *)entries,maxfiles*sizeof(fileentry)))) {
			perror(image);
			exit(1);
		}
	}
	entries[files].name=strdup(name);
	entries[files].parent=parent;
	entries[files].inode=inode;
	entries[files].size=size;
	entries[files].mtime=mtime;
	files++;
}



/*
**  Handles a given filename (along with a pointer to it's parent directory).
**  If filename is a directory, it gets searched and cmpln called recursivly.
*/
static void cmpln(name,parent)
char		*name;
direntry	*parent;
{
	struct stat	buf;
	DIR		*dirp;
	struct dirent	*dp;
	direntry	dptr,*subparent=NULL;
	char		*ptr;

	if (debug>2)
		(void)fprintf(stderr,"cmpln(\"%s\",0x%08lx=\"%s\")\n",name,(long)parent,path(parent));

	dptr.name=name;
	dptr.parent=parent;
	ptr=path(&dptr);

	/*
	**  Try to stat file or link, return without error if file can't be
	**  stated - perhaps we just don't have no permission.
	*/
	if (lstat(ptr,&buf)) {
		(void)fprintf(stderr,"%s: can't lstat ",image);
		perror(ptr);
		errno=0;	/* FIXME: exit(1) if != EPERM */
		return;
	}

	/*
	**  Check device, break if we crossed a mountpoint and weren't told
	**  to do so.
	*/
	if (xdev!=CROSS) {
		if (xdev==GET_DEV) {
			xdev=DONT_CROSS;
			dev=buf.st_dev;
		}
		else if (buf.st_dev!=dev)
			return;
	}

	/*
	**  Handle files: just add to file list
	*/
	if (S_ISREG(buf.st_mode)) {
		if (buf.st_size>=minsize)
			addfile(name,parent,buf.st_ino,buf.st_size,buf.st_mtime);
	}

	/*
	**  Handle directories
	*/
	else if (S_ISDIR(buf.st_mode)) {
		if ((dirp=opendir(ptr))) {
			while ((dp=readdir(dirp)))
				if (strcmp(dp->d_name,".")&&strcmp(dp->d_name,"..")) {
					/*
					**  Handle directory entries other than self and parent
					*/
					if (!subparent) {
						/*
						**  Allocate a direntry for the
						**  directory we're in.
						**  FIXME: Memory doesn't get
						**  freed even if subsequent
						**  cmpln calls didn't use it.
						**  FIXME: add 'used' return
						**  code to cmpln.
						*/
						if (!(subparent=malloc(sizeof(direntry)))) {
							perror(image);
							exit(1);
						}
						subparent->name=strdup(name);
						subparent->parent=parent;
					}
					cmpln(dp->d_name,subparent);
				}
			if (errno) {
				/*
				**  Error reading from directory: return without error -
				**  perhaps we just don't have no permission.
				*/
				(void)fprintf(stderr,"%s: can't read directory ",image);
				perror(ptr);
				errno=0;	/* FIXME: exit(1) if != EPERM */
			}
			(void)closedir(dirp);
		}
		else {
			/*
			**  Error opening directory: return without error -
			**  perhaps we just don't have no permission.
			*/
			(void)fprintf(stderr,"%s: can't open directory ",image);
			perror(ptr);
			errno=0;	/* FIXME: exit(1) if != EPERM */
		}
	}
}



/*
**  Check a block of files with identical sizes from the list of files
**  for identical contents.
**  FIXME: Fails if the block is bigger than the maximum files that can be
**  opened.
*/
static void checkblock(start,len)
int	start,len;
{
	int	file,nfile,numfd,*fd;
	char	*ptr,*nptr,*sptr,*matrix,**buffer;
	size_t	retval;
	off_t	size=entries[start].size,expect;

	/*
	**  First pass: Look for duplicate inodes, check each inode only once.
	**  We can't skip duplicate inodes if running cross-device. This could
	**  be optimized by checking st_dev also, but since this is an unusual
	**  case, we save the memory that would be needed to store all the
	**  identical st_devs in the typical case.
	*/
	if (xdev!=CROSS) {
		if (debug>2) {
			(void)fprintf(stderr,"Sorting %d files for duplicate inodes:\n",len);
			listfiles(start,start+len);
		}
		qsort((void *)&entries[start],(size_t)len,sizeof(fileentry),(int (*)())cmpinode);
		if (debug>2) {
			(void)fprintf(stderr,"Sorted %d files for duplicate inodes:\n",len);
			listfiles(start,start+len);
		}
		for(file=start;file<start+len-1;file++) {
			if (debug>2) {
				(void)fprintf(stderr,"Checking %d files for duplicate inodes:\n",len);
				listfiles(file,file+1);
			}
			for(nfile=file+1;nfile<start+len&&entries[file].inode==entries[nfile].inode;nfile++);
			if (nfile>file+1) {
				if (debug>1) {
					(void)fprintf(stderr,"Reduced due to duplicate inode:\n");
					listfiles(file,nfile);
				}
				(void)memmove((void *)&entries[file+1],(void *)&entries[nfile],(start+len-nfile)*sizeof(fileentry));
				len-=nfile-file-1;
			}
		}
		/* All the files were already linked - nothing to do */
		if (len==1)
			return;
	}
	qsort((void *)&entries[start],(size_t)len,sizeof(fileentry),(int (*)())cmpmtime);
	if (debug>1) {
		(void)fprintf(stderr,"Checking %d files for identical contents:\n",len);
		listfiles(start,start+len);
	}

	/*
	**  Second pass: Actually compare files.
	*/

	if (len>maxopenfiles) {
		(void)fprintf(stderr,"%s: can't check more than %d files of %ld bytes simultaneously, reducing table\n",image,maxopenfiles,size);
		len=maxopenfiles;
	}

	/*
	**  Open files
	*/

	if (!(fd=(int *)malloc(len*sizeof(int)))) {
		perror(image);
		exit(1);
	}
	for(file=start;file<start+len;file++) {
		ptr=path((direntry*)&entries[file]);
		if (debug>2)
			(void)fprintf(stderr,"Open: %s\n",ptr);
		if ((fd[file-start]=open(ptr,O_RDONLY))<0) {
			if (errno==EMFILE||errno==ENFILE) {
				len=file-start;
				(void)fprintf(stderr,"%s: can't open more than %d files of %ld bytes simultaneously, reducing table\n",image,len,size);
				break;
			}
			(void)fprintf(stderr,"%s: can't open ",image);
			perror(ptr);
			exit(1);
		}
	}
	numfd=len;

	/*
	**  Allocate file buffers
	*/

	if (!(buffer=(char **)malloc(len*sizeof(char *)))) {
		perror(image);
		exit(1);
	}
	for(file=start;file<start+len;file++) {
		if (!(buffer[file-start]=malloc(BUFFERSIZE))) {
			perror(image);
			exit(1);
		}
	}

	/*
	**  Allocate and initialize identity matrix
	*/

	if (!(matrix=malloc((size_t)(len*len)))) {
		perror(image);
		exit(1);
	}
	(void)memset((void *)matrix,TRUE,(size_t)(len*len));

	/*
	**  Read loop
	*/

	while(numfd>1&&size>0) {
		expect=size<BUFFERSIZE?size:BUFFERSIZE;
		if (debug>2)
			(void)fprintf(stderr,"%ld bytes to go, expecting %ld bytes on the next read\n",size,expect);
		/*
		**  Read from every file, check return value
		*/
		for(file=start;file<start+len;file++)
			if (fd[file-start]>=0) {
				retval=read(fd[file-start],(void *)buffer[file-start],BUFFERSIZE);
				if (debug>2)
					(void)fprintf(stderr,"Read: %d (%s)\n",retval,path((direntry*)&entries[file]));
				if ((int)retval!=expect) {
					(void)fprintf(stderr,"%s: can't read from %s\n",image,path((direntry*)&entries[file]));
					(void)fprintf(stderr,"%s: expected %ld bytes, read %d bytes\n",image,expect,retval);
					exit(1);
				}
			}
		size-=expect;
		/*
		**  Check all open, potentially identical files
		*/
		for(file=start;file<start+len-1;file++)
			if (fd[file-start]>=0)
				for(nfile=file+1;nfile<start+len;nfile++)
					if (fd[nfile-start]>=0&&matrix[coord(file-start,nfile-start)])
						if (memcmp((void *)buffer[file-start],(void *)buffer[nfile-start],expect))
							matrix[coord(file-start,nfile-start)]=FALSE;
		/*
		**  Close files that have no more potentially identical files.
		*/
		for(file=start;file<start+len;file++)
			if (fd[file-start]>=0) {
				for(nfile=start;nfile<start+len;nfile++)
					if (file!=nfile&&fd[nfile-start]>=0&&matrix[coord(file-start,nfile-start)])
						break;
				if (nfile==start+len) {
					if (debug)
						(void)fprintf(stderr,"Close (no more potentially identical files): %s\n",path((direntry*)&entries[file]));
					if (close(fd[file-start])<0) {
						(void)fprintf(stderr,"%s: can't close ",image);
						perror(path((direntry*)&entries[file]));
						exit(1);
					}
					fd[file-start]=-1;
					numfd--;
				}
			}
	}
	if (debug)
		listmatrix(matrix,start,len);

	/*
	**  The set of all files still open must have files with identical
	**  contents. Analyse identity matrix and output link commands.
	*/

	for(file=start;file<start+len-1;file++)
		if (fd[file-start]>=0) {
			if (!(ptr=strdup(path((direntry*)&entries[file])))) {
				perror(image);
				exit(1);
			}
			for(nfile=file+1;nfile<start+len;nfile++) {
				if (fd[nfile-start]>=0&&matrix[coord(file-start,nfile-start)]) {
					if (!(nptr=strdup(path((direntry*)&entries[nfile])))) {
						perror(image);
						exit(1);
					}
					if (symbolic) {
						sptr=buildsymbolic(ptr,nptr);
					}
					if (verbose||nonexec) {
						(void)printf("rm -f '%s'\n",nptr);
						if (symbolic)
							(void)printf("ln -s '%s' '%s'\n",sptr,nptr);
						else
							(void)printf("ln '%s' '%s'\n",ptr,nptr);
					}
					if (!nonexec) {
						(void)remove(nptr);
						if (symbolic) {
							if (symlink(sptr,nptr)) {
								(void)fprintf(stderr,"%s: can't symlink %s to ",image,sptr);
								perror(nptr);
								exit(1);
							}
						}
						else {
							if (link(ptr,nptr)) {
								(void)fprintf(stderr,"%s: can't link %s to ",image,ptr);
								perror(nptr);
								exit(1);
							}
						}
					}
					if (debug>2)
						(void)fprintf(stderr,"Close (already linked): %s\n",nptr);
					if (close(fd[nfile-start])<0) {
						(void)fprintf(stderr,"%s: can't close ",image);
						perror(nptr);
						exit(1);
					}
					fd[nfile-start]=-1;
					numfd--;
					free((void *)nptr);
				}
			}
			if (debug>2)
				(void)fprintf(stderr,"Close (all links done): %s\n",ptr);
			if (close(fd[file-start])<0) {
				(void)fprintf(stderr,"%s: can't close ",image);
				perror(ptr);
				exit(1);
			}
			numfd--;
			free((void *)ptr);
		}

	/*
	**  Free allocated buffers and arrays
	*/

	if (numfd) {
		(void)fprintf(stderr,"%s: internal error: %d files still open\n",image,numfd);
		exit(1);
	}
	free((void *)matrix);
	for(file=start;file<start+len;file++)
		free((void *)buffer[file-start]);
	free((void *)buffer);
	free((void *)fd);
}



static void usage()
{
	(void)fprintf(stderr,"Usage: %s [-v] [-n] [-s] [-x] [-m minsize] filename|directory [...]\n",image);
	(void)fprintf(stderr,"-v  Verbose\n");
	(void)fprintf(stderr,"-n  Don't execute\n");
	(void)fprintf(stderr,"-s  Use symbolic links\n");
	(void)fprintf(stderr,"-x  Don't stop at mountpoints\n");
	(void)fprintf(stderr,"-m  Minimum file size (choose the minimum file size that can't be stored within an inode)\n");
	exit(1);
}


int main(argc,argv)
int	argc;
char	*argv[];
{
	int		c;
	extern char	*optarg;
	extern int	optind;
	struct rlimit	rlp;
	int		from,to;

	if ((image=strrchr(argv[0],'/')))
		image++;
	else
		image=argv[0];

	while ((c=getopt(argc,argv,"Dvnsxm:vh?"))!=EOF)
		switch ((char)c) {
		case 'D':
			debug++;
			break;
		case 'v':
			verbose=TRUE;
			break;
		case 'n':
			nonexec=TRUE;
			break;
		case 's':
			symbolic=TRUE;
			break;
		case 'x':
			xdev=CROSS;
			break;
		case 'm':
			minsize=atol(optarg);
			break;
		case 'h':
			(void)fprintf(stderr,"%s\n",sccsid+4);
			(void)fprintf(stderr,"%s\n",copyright+4);
		/*FALLTHROUGH*/
		case '?':
			usage();
		}

	if (optind==argc)
		usage();

	/*
	**  Set soft limit for max. open files to hard limit.
	*/

	(void)getrlimit(RLIMIT_NOFILE,&rlp);
	rlp.rlim_cur=rlp.rlim_max;
	(void)setrlimit(RLIMIT_NOFILE,&rlp);
	maxopenfiles=rlp.rlim_cur-3;
	if (debug>1)
		(void)fprintf(stderr,"Max. open files: %d\n",maxopenfiles);

	/*
	**  Build table of files.
	*/

	for (;optind<argc;optind++)
		cmpln(argv[optind],(direntry *)NULL);
	if (debug>2) {
		(void)fprintf(stderr,"Unsorted files:\n");
		listfiles(0,files);
	}

	/*
	**  Sort table of files by file size. Only files with identical size
	**  have potentially identical contents.
	*/

	qsort((void *)entries,(size_t)files,sizeof(fileentry),(int (*)())cmpsize);
	if (debug>2) {
		(void)fprintf(stderr,"Files sorted by size:\n");
		listfiles(0,files);
	}

	/*
	**  Build blocks of identical size, compare all files within a block.
	*/

	for(from=0;from<files;from=to) {
		for(to=from+1;to<files&&entries[from].size==entries[to].size;to++);
		if (to>from+1) {
			if (debug>1) {
				(void)fprintf(stderr,"Files with size %ld:\n",(long)entries[from].size);
				listfiles(from,to);
			}
			checkblock(from,to-from);
		}
	}

	return(0);
}

