|
- #!/bin/sh
- # Filename: ckdup.sh
- # Author: Jockey dot Kyd at gmail dot com
- # Date: Mar.18th, 2007
- # Licence: No licence. Feel free to do whatever you want.
- # But NO WARRANTY. Use it at your own risk.
- #
- # This script finds duplicate files by comparing their MD5
- # hashes. Although collisions have been found for the MD5
- # Message-Digest algorithm, I believe it is extremely
- # difficult, if possible at all, to find two different files
- # that have the same hash.
- #
- # The script accepts three options and they should be placed
- # immediately after the script name. For each option except
- # '-h', it's mandatory to specify at least one file to check.
- #
- # -h Print usage information.
- # -r Recursive mode. No effect with '-h'.
- # -v Verbose output to tell which file is being
- # checked. No effect with '-h'.
- #
- # In output, the duplicate files are put together. The delimiter
- # between different groups is a blank line.
- #
- # NOTE: The script will ignore hidden files unless you specify.
- # However, they'll be ignored anyway in recursion. It's
- # NOT recommended to include any hidden file under any
- # circumstances.
- #
- # To avoid converting delimiters into spaces automatically.
- IFS=''
# Print the one-line synopsis to standard error and terminate the script
# with exit status 1.
# Arguments: none.
# Outputs:   "usage: <script> [-hrv] file..." on stderr.
usage ()
{
    # printf '%s' emits "$0" verbatim; echo may interpret backslash
    # sequences in it, depending on which shell /bin/sh is.
    printf 'usage: %s [-hrv] file...\n' "$0" 1>&2
    exit 1
}
# Wrapper that prints one line per file in the format
# '<MD5HASH> <FILENAME>', as needed by genraw().
#
# The hashing tool is chosen at run time:
#   md5 -r   md5(1) on Mac OS X, FreeBSD and OpenBSD
#   md5 -n   md5(1) on NetBSD
#   md5sum   md5sum(1) from FSF on GNU/Linux (used when md5 is absent)
# If none of these fits your system, put your own command here.
#
# Arguments: the files to hash.
# Returns:   the exit status of the hashing tool, or 127 when neither
#            md5 nor md5sum can be found.
genmd5 ()
{
    if command -v md5 >/dev/null 2>&1; then
        case "$(uname -s)" in
            NetBSD )
                md5 -n -- "$@" ;;
            * )
                md5 -r -- "$@" ;;
        esac
    elif command -v md5sum >/dev/null 2>&1; then
        # "--" keeps a file name starting with "-" from being read
        # as an option.
        md5sum -- "$@"
    else
        printf '%s: no MD5 tool found (tried md5 and md5sum).\n' "$0" 1>&2
        return 127
    fi
}
# Helper for genraw(): feed every entry of directory $1 to genraw().
# The entry list lives in this function's positional parameters, so it
# survives the recursive calls even though the script has no local
# variables.
genraw_dir ()
{
    # Drop one trailing slash so the recorded names do not contain "//".
    set -- "${1%/}"/*
    # In an empty directory the pattern matches nothing and is left
    # as-is; there is nothing to check then, and nothing to warn about.
    if [ "$#" -eq 1 ] && [ ! -e "$1" ] && [ ! -L "$1" ]; then
        return 0
    fi
    genraw "$@"
}

# Collect the raw hash records that are processed later.
#
# Every regular file among the arguments is hashed with genmd5() and its
# '<MD5HASH> <FILENAME>' line is put in front of the global variable
# $raw, one record per line. Records are separated by real newline
# characters, so nothing in a file name has to be re-interpreted later.
#
# Globals:   recur   (read)    "true" to descend into directories
#            verbose (read)    "true" to announce each file being hashed
#            raw     (written) accumulated records, newest first
# Arguments: files and directories to examine.
# Outputs:   "Checking: <file>" on stdout in verbose mode; a warning on
#            stderr for every argument that does not exist.
genraw ()
{
    for i in "$@"; do
        if [ ! -e "$i" ]; then
            printf '%s: missing file: %s\n' "$0" "$i" 1>&2
        elif "$recur" && [ -d "$i" ] && [ -x "$i" ]; then
            # The recursion overwrites the global $i; that is harmless
            # because the loop assigns it again before the next use.
            genraw_dir "$i"
        elif [ -f "$i" ]; then
            if "$verbose"; then
                printf 'Checking: %s\n' "$i"
            fi
            # Keep the record only if hashing worked; an empty record
            # would later be counted as a hash of its own.
            if md5line=$(genmd5 "$i") && [ -n "$md5line" ]; then
                raw="$md5line
$raw"
            fi
        fi
    done
}
# Option flags. Each one holds the command name "true" or "false", so it
# can be run directly as a condition (if "$verbose"; then ...).
verbose=false
help=false
recur=false

# The leading ':' in the option string selects getopts' silent mode: an
# unknown option sets $opt to '?', which is treated like '-h' below.
while getopts ':rhv' opt; do
    case "$opt" in
        h | \? )
            help=true ;;
        v )
            verbose=true ;;
        r )
            recur=true ;;
    esac
done

# Discard the parsed options; only the file operands remain in "$@".
shift $((OPTIND - 1))

# usage() does not return, so these two checks never both fire.
if "$help"; then
    usage
fi
if [ "$#" -eq 0 ]; then
    echo "$0"": no files to check." 1>&2
    usage
fi
- # Call the function genraw() to generate the variable.
- # NOTE: If a wildcard cannot be expanded, the script
- # will warn about a missing file. The warning is
- # harmless and does not affect the result, so you
- # can simply ignore it.
genraw "$@"

# Regenerate the variable's value as a whole: a literal "\n" between
# records is turned into a real newline. printf '%b' is used because
# 'echo -e' is not understood by every /bin/sh.
raw=$(printf '%b\n' "$raw")

# This variable contains the MD5 hashes that occur more than once, one
# per line. Blank records are skipped so they can never be counted as a
# hash of their own.
dup=$(printf '%s\n' "$raw" | awk 'NF { print $1 }' | sort | uniq -d)

# If no files are duplicate, exit normally.
if [ -z "$dup" ]; then
    if "$verbose"; then
        # Two blank lines set the result apart from the "Checking:" lines.
        printf '\n\n'
    fi
    echo "No duplicate files found."
    exit 0
fi

if "$verbose"; then
    printf '\n\n'
fi
echo "The following files are duplicate:"

# Print one group per duplicated hash, each preceded by a blank line.
printf '%s\n' "$dup" | while IFS= read -r hash; do
    echo
    # index() compares the hash as plain text and only at the start of
    # the record, so a hash-like string inside a file name cannot match.
    # Only the hash and the separator after it are removed; the file
    # name itself is printed unchanged, inner spaces included.
    printf '%s\n' "$raw" | awk -v h="$hash" '
        index($0, h " ") == 1 {
            sub(/^[^ ]+ +/, "")
            print
        }'
done
exit 0
复制代码 注意:genmd5()需要按照自己系统的实际情况进行修改。我是在Mac OS X上写的,所以默认工具用的是 md5(1) 。
为了排版我把源文件上传在附件中...我是菜鸟,注释写得很糟糕,代码中潜在的bug也找不出来(如果存在的话)...交流为主便不计较这么多。各位多多包涵。:thank
使用示例: |
本帖子中包含更多资源
您需要 登录 才可以下载或查看,没有帐号?注册
x
|