Initial implementation.
This commit is contained in:
commit
c6405f09ed
1 changed files with 133 additions and 0 deletions
133
dups
Executable file
133
dups
Executable file
|
@ -0,0 +1,133 @@
|
||||||
|
#!/bin/bash

# Extra predicates handed to `find` by collect_files; by default anything
# whose path contains "@snapshot" is left out of the scan.
find_args='-not -path *@snapshot*'

# How this script was invoked. The hashing stages re-run the script through
# xargs, once per record, using this name.
PROGNAME=$0

#set -x
# Stop on the first failing command, on any unset variable, and when any
# member of a pipeline fails.
set -o errexit -o nounset -o pipefail
|
||||||
|
|
||||||
|
# debug ARGS...
# Write the arguments to stderr as one diagnostic line, keeping stdout free
# for the records that flow between pipeline stages.
debug() {
    >&2 echo "$@"
}
|
||||||
|
|
||||||
|
# abort ARGS...
# Report a fatal problem on stderr and terminate the current shell with
# exit status 1.
abort() {
    >&2 echo "$@"
    exit 1
}
|
||||||
|
|
||||||
|
# hashfc_short RECORD
#
# RECORD is one line of the form "timestamp size inode nlink hash path".
# Prints the same record with the hash column replaced by the SHA-256 of the
# first 4096 bytes of the file; the path is everything after the fifth
# column, so it may contain blanks.
#
# Returns 0 without output when no record is given, and non-zero when the
# record is malformed or the file cannot be read.
hashfc_short() {
    local record="${1:-}"
    local ts size inode nlink rest fname digest
    # Five blank-separated columns, then the path. Matching by position
    # (rather than searching for the path text) keeps short names such as
    # "1" or "a" from being found inside an earlier column.
    local path_re='^[[:blank:]]*([^[:blank:]]+[[:blank:]]+){5}(.*)$'

    # xargs can call us without a record; "${1:-}" keeps `set -u` quiet.
    if [[ -z "$record" ]]; then
        return 0
    fi

    if [[ ! "$record" =~ $path_re || -z "${BASH_REMATCH[2]}" ]]; then
        echo "hashfc_short: malformed record: $record" >&2
        return 1
    fi
    fname="${BASH_REMATCH[2]}"
    read -r ts size inode nlink rest <<< "$record"

    # "--" protects against file names that start with a dash.
    digest="$(head -c 4096 -- "$fname" | sha256sum -b | awk '{print $1}')" || return
    printf '%s %s %s %s %s %s\n' "$ts" "$size" "$inode" "$nlink" "$digest" "$fname"
}
|
||||||
|
|
||||||
|
# hashfc RECORD
#
# RECORD is one line of the form "timestamp size inode nlink hash path".
# Prints the same record with the hash column replaced by the SHA-256 of the
# whole file; the path is everything after the fifth column, so it may
# contain blanks.
#
# Returns 0 without output when no record is given, and non-zero when the
# record is malformed or the file cannot be read.
hashfc() {
    local record="${1:-}"
    local ts size inode nlink rest fname digest
    # Five blank-separated columns, then the path. Matching by position
    # (rather than searching for the path text) keeps short names from being
    # found inside an earlier column such as the previous hash.
    local path_re='^[[:blank:]]*([^[:blank:]]+[[:blank:]]+){5}(.*)$'

    # xargs can call us without a record; "${1:-}" keeps `set -u` quiet.
    if [[ -z "$record" ]]; then
        return 0
    fi

    if [[ ! "$record" =~ $path_re || -z "${BASH_REMATCH[2]}" ]]; then
        echo "hashfc: malformed record: $record" >&2
        return 1
    fi
    fname="${BASH_REMATCH[2]}"
    read -r ts size inode nlink rest <<< "$record"

    # Feeding the file through a redirection keeps odd file names away from
    # sha256sum's option parser.
    digest="$(sha256sum -b < "$fname" | awk '{print $1}')" || return
    printf '%s %s %s %s %s %s\n' "$ts" "$size" "$inode" "$nlink" "$digest" "$fname"
}
|
||||||
|
|
||||||
|
# collect_files
#
# Reads the starting points for the scan from stdin and prints the stage
# header followed by one record per regular file found:
#
#   timestamp size inode #hard_link hash path
#
# The timestamp and hash columns are emitted as the placeholder 0.
collect_files() {
    local -a extra_args=()

    # Split $find_args into words WITHOUT pathname expansion. Expanding it
    # unquoted would let a pattern such as *@snapshot* be replaced by
    # whatever happens to match in the current directory.
    read -r -a extra_args <<< "$find_args"

    echo "=== COLLECT FILES ==="

    # NOTE: the starting points are deliberately left unquoted, so they are
    # split on whitespace (and glob-expanded) exactly as before; paths that
    # contain blanks are therefore not supported here.
    find $(</dev/stdin) -type f ${extra_args[@]+"${extra_args[@]}"} -printf '0 %s %i %n 0 %p\n'
}
|
||||||
|
|
||||||
|
# filter_inode
#
# Pipeline stage that follows collect_files: checks the incoming stage
# header, prints its own, and keeps a single record per value of the third
# column (the inode number).
filter_inode() {
    read -r header
    if [[ "$header" != "=== COLLECT FILES ===" ]]; then
        abort "Input files didn't come from expected stage; previous stage was $header"
    fi

    echo "=== FILTER INODE ==="
    sort -u -k 3,3
}
|
||||||
|
|
||||||
|
# filter_size
#
# Pipeline stage that follows filter_inode: checks the incoming stage
# header, prints its own, and keeps only the records whose size (column 2)
# is shared with at least one other record. Output is ordered by size.
# Succeeds with just the header when no two files share a size.
filter_size() {
    local header=""

    # `read` fails on empty input; carry on so the check below can report it
    # instead of the stage dying silently under errexit.
    read -r header || true
    if [[ "$header" != "=== FILTER INODE ===" ]]; then
        abort "Input files didn't come from expected stage; previous stage was $header"
    fi
    echo "=== FILTER SIZE ==="

    # Compare column 2 exactly. A regex built from the size could also match
    # the same digits in another column, and awk (unlike grep) does not
    # return a failure status when nothing is selected.
    sort -k2b,2 | awk '
        NF < 2 { next }
        { n++; line[n] = $0; key[n] = $2; seen[$2]++ }
        END {
            for (i = 1; i <= n; i++)
                if (seen[key[i]] > 1)
                    print line[i]
        }
    '
}
|
||||||
|
|
||||||
|
# filter_hashsum_short
#
# Pipeline stage that follows filter_size: checks the incoming stage header,
# prints its own, re-invokes this script once per record to fill in the
# short hash (`"$PROGNAME" hashfc_short RECORD`), and keeps only the records
# whose hash (column 5) is shared with at least one other record.
# Succeeds with just the header when there are no candidates.
filter_hashsum_short() {
    local header=""

    # `read` fails on empty input; carry on so the check below can report it
    # instead of the stage dying silently under errexit.
    read -r header || true
    if [[ "$header" != "=== FILTER SIZE ===" ]]; then
        abort "Input files didn't come from expected stage; previous stage was $header"
    fi
    echo "=== FILTER HASHSUM SHORT ==="

    # -r: do not run the helper at all when there are no records.
    # Column 5 is compared exactly in awk, which also keeps the exit status
    # at zero when nothing is selected.
    xargs -r -n1 -d '\n' "$PROGNAME" hashfc_short \
        | sort -k5b,5 \
        | awk '
            NF < 5 { next }
            { n++; line[n] = $0; key[n] = $5; seen[$5]++ }
            END {
                for (i = 1; i <= n; i++)
                    if (seen[key[i]] > 1)
                        print line[i]
            }
        '
}
|
||||||
|
|
||||||
|
# filter_hashsum
#
# Pipeline stage that follows filter_hashsum_short: checks the incoming
# stage header, prints its own, re-invokes this script once per record to
# fill in the full-file hash (`"$PROGNAME" hashfc RECORD`), and keeps only
# the records whose hash (column 5) is shared with at least one other
# record. Succeeds with just the header when there are no candidates.
filter_hashsum() {
    local header=""

    # `read` fails on empty input; carry on so the check below can report it
    # instead of the stage dying silently under errexit.
    read -r header || true
    if [[ "$header" != "=== FILTER HASHSUM SHORT ===" ]]; then
        abort "Input files didn't come from expected stage; previous stage was $header"
    fi
    echo "=== FILTER HASHSUM ==="

    # -r: do not run the helper at all when there are no records.
    # Column 5 is compared exactly in awk, which also keeps the exit status
    # at zero when nothing is selected.
    xargs -r -n1 -d '\n' "$PROGNAME" hashfc \
        | sort -k5b,5 \
        | awk '
            NF < 5 { next }
            { n++; line[n] = $0; key[n] = $5; seen[$5]++ }
            END {
                for (i = 1; i <= n; i++)
                    if (seen[key[i]] > 1)
                        print line[i]
            }
        '
}
|
||||||
|
|
||||||
|
# filter_sort
#
# Pipeline stage that follows filter_hashsum: checks the incoming stage
# header, prints its own, and orders the records for the replace stage.
#
# replace only compares each record with the one before it, so records with
# the same hash (column 5) must be adjacent. Within one hash the record with
# the most hard links (column 4) comes first and thereby becomes the file
# the others are linked to.
filter_sort() {
    local header=""

    # `read` fails on empty input; carry on so the check below can report it
    # instead of the stage dying silently under errexit.
    read -r header || true
    if [[ "$header" != "=== FILTER HASHSUM ===" ]]; then
        abort "Input files didn't come from expected stage; previous stage was $header"
    fi
    echo "=== FILTER SORT ==="

    # Primary key: hash. Secondary key: link count, numeric, descending.
    sort -k5,5 -k4,4nr
}
|
||||||
|
|
||||||
|
# replace
#
# Final pipeline stage, fed by filter_sort: checks the incoming stage
# header, prints its own, then walks the records in order. The first record
# of every run of equal hashes (column 5) becomes the base file; each later
# record in the run is replaced by a hard link to that base (`ln -f`).
#
# Pairs that live on different devices cannot be hard linked; both records
# are printed on stdout instead and a note goes to stderr.
#
# Environment:
#   DEBUG  optional command prefix for the `ln` call, e.g. DEBUG=echo for a
#          dry run. Unset or empty means: really link.
replace() {
    local header="" line
    local base_line="" base_hash="" base_file=""
    local cur_hash cur_file
    # Four columns, the hash, then the path (which may contain blanks).
    # Matching by position keeps short paths from being found inside an
    # earlier column.
    local record_re='^[[:blank:]]*([^[:blank:]]+[[:blank:]]+){4}([^[:blank:]]+)[[:blank:]]+(.+)$'

    # `read` fails on empty input; carry on so the check below can report it
    # instead of the stage dying silently under errexit.
    read -r header || true
    if [[ "$header" != "=== FILTER SORT ===" ]]; then
        abort "Input files didn't come from expected stage; previous stage was $header"
    fi
    echo "=== REPLACE ==="

    # IFS= keeps leading/trailing blanks that belong to a file name.
    while IFS= read -r line; do
        if [[ ! "$line" =~ $record_re ]]; then
            # Blank or truncated line: nothing that could be linked.
            [[ -n "$line" ]] && debug "Skipping malformed record: $line"
            continue
        fi
        cur_hash="${BASH_REMATCH[2]}"
        cur_file="${BASH_REMATCH[3]}"

        if [[ "$cur_hash" = "0" ]]; then
            abort "Hashes not yet computed"
        fi

        # A new hash starts a new run; remember its first file as the base.
        if [[ "$cur_hash" != "$base_hash" ]]; then
            base_line="$line"
            base_hash="$cur_hash"
            base_file="$cur_file"
            continue
        fi

        if [[ "$(stat -c '%d' -- "$base_file")" != "$(stat -c '%d' -- "$cur_file")" ]]; then
            debug "Files \"$base_file\" and \"$cur_file\" are on different file systems."
            debug "Aborting hard linking"
            printf '%s\n' "$base_line"
            printf '%s\n' "$line"
            continue
        fi

        # ${DEBUG:-} is left unquoted on purpose: empty expands to nothing,
        # and a value such as "echo" turns this into a dry run. The default
        # keeps `set -u` from aborting when DEBUG is not set at all.
        ${DEBUG:-} ln -f -- "$base_file" "$cur_file"
    done
}
|
||||||
|
|
||||||
|
# filter
#
# Chains the narrowing stages in their required order, from cheapest to most
# expensive check: inode, size, short hash, full hash, and finally the sort
# that prepares the records for replace. Reads the collect_files output on
# stdin and writes the surviving records on stdout.
filter() {
    filter_inode |
        filter_size |
        filter_hashsum_short |
        filter_hashsum |
        filter_sort
}
|
||||||
|
|
||||||
|
# all
#
# Runs the complete pipeline: collect the files below the starting points
# read from stdin, narrow them down to duplicates, and hard link them.
all() {
    collect_files |
        filter |
        replace
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#getopt --options s: --longoptions stage:

# Entry point: the command line itself is run as a command, so the first
# argument names the stage (function) to execute and the remaining arguments
# are handed to it. The hashing stages depend on this when they re-invoke
# the script as `"$PROGNAME" hashfc RECORD`.
"${@}"
|
Reference in a new issue