LinuxSir.cn,穿越时空的Linuxsir!

 找回密码
 注册
搜索
热搜: shell linux mysql
查看: 858|回复: 2

[拙作]一个基于md5hash的文件查重器

[复制链接]
发表于 2007-6-7 22:32:09 | 显示全部楼层 |阅读模式
  1. #!/bin/sh
  2. # Filename: ckdup.sh
  3. # Author: Jockey dot Kyd at gmail dot com
  4. # Date: Mar.18th, 2007
  5. # Licence: No licence. Feel free to do whatever you want.
  6. #           But NO WARRANTY. Use it at your own risk.
  7. #
  8. # This script aims to find duplicate files based on their
  9. # MD5 hash. Although collisions have been found for the MD5
  10. # Message-Digest algorithm, I believe that it's extremely
  11. # unlikely for different files to share the same hash, if
  12. # it is possible at all.
  13. #
  14. # The script accepts three options and they should be placed
  15. # immediately after the script name. For each option except
  16. # '-h', it's mandatory to specify at least one file to check.
  17. #
  18. #        -h        Print usage information.
  19. #        -r        Recursive mode. No effect with '-h'.
  20. #        -v        Verbose output to tell which file is being
  21. #                checked. No effect with '-h'.
  22. #
  23. # In output, the duplicate files are put together. The delimiter
  24. # between different groups is a blank line.
  25. #
  26. # NOTE: The script will ignore hidden files unless you specify.
  27. #        However, they'll be ignored anyway in recursion. It's
  28. #        NOT recommended to include any hidden file under any
  29. #        circumstances.
  30. #
  31. # Empty IFS: turn off field splitting so expansions and 'read' keep whitespace in file names intact.
  32. IFS=''
  # Print a one-line usage summary and terminate the script.
  # Arguments: none
  # Outputs:   "usage: <script name> [-hrv] file..." on stderr
  # Exits:     always, with status 1 (never returns to the caller)
  usage ()
  {
          # printf rather than echo: echo's handling of backslashes and
          # leading dashes in "$0" differs between /bin/sh implementations.
          printf 'usage: %s [-hrv] file...\n' "$0" 1>&2
          exit 1
  }
  # Wrapper that prints one line per file in the form
  # '<MD5HASH> <FILENAME>', which is the format genraw() relies on.
  # Arguments: "$@" - files to hash
  # Outputs:   one '<hash> <file>' line per file on stdout
  # Returns:   exit status of the hashing tool, or 127 if none is found
  #
  # The tool is selected at run time, so the script no longer has to be
  # edited per operating system. To use another checksum program,
  # replace the body with a single command that produces the same
  # output format.
  genmd5 ()
  {
          if command -v md5 >/dev/null 2>&1; then
                  case "$(uname -s)" in
                          NetBSD )
                                  # For md5(1) on NetBSD
                                  md5 -n -- "$@" ;;
                          * )
                                  # For md5(1) on Mac OS X, FreeBSD and OpenBSD
                                  md5 -r -- "$@" ;;
                  esac
          elif command -v md5sum >/dev/null 2>&1; then
                  # For md5sum(1) from FSF on GNU/Linux
                  md5sum -- "$@"
          else
                  echo "$0"": neither md5 nor md5sum found in PATH." 1>&2
                  return 127
          fi
  }
  # Walk the given paths and prepend one genmd5() record per regular
  # file to the global variable 'raw'.
  # Globals:   recur   (read)  - "true" to descend into directories
  #            verbose (read)  - "true" to announce each file checked
  #            raw     (write) - new records are prepended to it
  #            i       (write) - loop variable, not local
  # Arguments: "$@" - files and/or directories to examine
  # Outputs:   "Checking: <file>" on stdout when verbose;
  #            "<script>: missing file: <path>" on stderr for paths
  #            that do not exist
  #
  # Records are terminated by the two-character sequence '\n'
  # (backslash, n), not by a real newline; the caller expands it
  # after genraw() returns.
  genraw ()
  {
          for i in "$@"; do
                  if [ ! -e "$i" ]; then
                          echo "$0"": missing file: ""$i" 1>&2
                          continue
                  elif "$recur" && [ -d "$i" ] && [ -x "$i" ]; then
                          # Drop one trailing slash with a parameter
                          # expansion. The former unquoted 'echo $i | sed'
                          # let glob characters in the directory name
                          # expand and let echo misread names like '-n'.
                          i=${i%/}
                          # '*' does not match hidden entries. If nothing
                          # matches, the pattern is passed on literally and
                          # reported as a missing file by the recursive call.
                          genraw "$i"/*
                  elif [ -f "$i" ]; then
                          if "$verbose"; then
                                  echo "Checking: ""$i"
                          fi
                          raw=$(genmd5 "$i")"\n""$raw"
                  fi
          done
  }
  # ---- Main script ---------------------------------------------------
  # Parse the options, collect '<hash> <file>' records through genraw(),
  # then print the files that share a hash, one group per hash, with a
  # blank line before each group.

  # Option flags; each holds the name of the command 'true' or 'false'
  # so it can be executed directly as a condition.
  recur=false
  help=false
  verbose=false
  while getopts ':rhv' opt; do
          case "$opt" in
                  r )
                          recur=true ;;
                  v )
                          verbose=true ;;
                  h | \? )
                          # An unknown option is treated like -h.
                          help=true ;;
          esac
  done
  shift $((OPTIND - 1))
  if "$help"; then
          usage
  elif [ "$#" -eq 0 ]; then
          echo "$0"": no files to check." 1>&2
          usage
  fi
  # Call the function genraw() to generate the variable.
  # NOTE: If a wildcard cannot be expanded, the script prints a
  #        missing-file warning for the literal pattern; it is
  #        harmless and can be ignored.
  genraw "$@"
  # genraw() separates records with a literal '\n'; turn those into
  # real newlines. printf '%b' is used because 'echo -e' is not
  # available in every /bin/sh.
  raw=$(printf '%b' "$raw")
  # This variable contains the MD5 hashes that occur more than once,
  # one per line. 'NF' skips blank records so that an empty string can
  # never be reported as a duplicate hash.
  dup=$(printf '%s\n' "$raw" | awk 'NF { print $1 }' | sort | uniq -c | awk '$1 != 1 { print $2 }')
  # If no files are duplicate, exit normally.
  if [ -z "$dup" ]; then
          if "$verbose"; then
                  printf '\n\n'
          fi
          echo "No duplicate files found."
          exit 0
  fi
  # Two blank lines separate the verbose "Checking:" lines from the report.
  if "$verbose"; then
          printf '\n\n'
  fi
  echo "The following files are duplicate:"
  printf '%s\n' "$dup" | while read -r hash; do
          echo
          # Select records whose first field is exactly this hash. The
          # appended "" forces a string comparison, so hashes made up
          # only of digits are not compared as numbers. Then drop the
          # hash and the whitespace after it, leaving the file name with
          # its internal spacing intact.
          printf '%s\n' "$raw" | awk -v h="$hash" '($1 "") == (h "") {
                  sub(/^[^ \t]+[ \t]+/, "")
                  print
          }'
  done
  exit 0
复制代码
注意:genmd5()需要按照自己系统的实际情况进行修改。我是在Mac OS X上写的,所以默认工具用的是 md5(1) 。
为了排版我把源文件上传在附件中...我是菜鸟,注释写得很糟糕,代码中潜在的bug也找不出来(如果存在的话)...交流为主便不计较这么多。各位多多包涵。:thank
使用示例:

本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有帐号?注册

x
发表于 2007-6-8 04:52:44 | 显示全部楼层
看效果图不错。
建议:可以加上个 SHA1SUM,这样两个一起比较,冲撞率基本应该就是0了。
回复 支持 反对

使用道具 举报

 楼主| 发表于 2007-6-8 12:50:49 | 显示全部楼层
Post by 晨想
建议:可以加上个 SHA1SUM,这样两个一起比较,冲撞率基本应该就是0了。
所以我提供了genmd5()这个函数wrapper供个人修改,除了函数本身名字牵强外,使用任何cksum算法的程序都能放进去(如果要两个cksum算法同时起作用,那么就得一个文件一个文件处理,把两个hash拼接成段,这无关紧要,因为hash不会输出,用户可以忽略内部运行的细节),只要保证cksum程序的输出格式是(没有“<”和“>”符号):
  1. <HASH码> <文件名>
复制代码
这个脚本还能改进很多,譬如前几天我在递归一个很深的目录时想到可以添加一个“-l <num>”选项指定递归的目录深度;作为Gentoo的饭,也可以加一个“-c”选项加入ansi转义码后多色彩输出,会更醒目一些。
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 注册

本版积分规则

快速回复 返回顶部 返回列表