This repository has been archived on 2024-10-30. You can view files and clone it, but cannot push or open issues or pull requests.
dups/dups
2023-10-12 16:42:09 +02:00

133 lines
3.7 KiB
Bash
Executable file

#!/bin/bash
# Extra predicates for find(1); collect_files splits this string on
# whitespace into separate find arguments.
find_args="-not -path *@snapshot*"
# Command used by the hashing stages to re-invoke this script through xargs.
# xargs looks up a command name that contains no slash in PATH only, so a
# bare "$0" (as produced by `bash dups ...`) is anchored to the current
# directory when the script file is found there.
PROGNAME="$0"
if [[ "$PROGNAME" != */* && -f "./$PROGNAME" ]]; then
  PROGNAME="./$PROGNAME"
fi
#set -x
set -euo pipefail
# Print a diagnostic message on stderr.
# Arguments: $@ - message words; they are joined with single spaces.
# printf is used instead of echo so that a message starting with -n, -e or
# -E is printed verbatim instead of being parsed as an echo option.
debug() {
  local IFS=' '
  printf '%s\n' "$*" >&2
}
# Print an error message on stderr and terminate the shell with status 1.
# Arguments: $@ - message words; they are joined with single spaces.
# printf is used instead of echo so that a message starting with -n, -e or
# -E is printed verbatim instead of being parsed as an echo option.
abort() {
  local IFS=' '
  printf '%s\n' "$*" >&2
  exit 1
}
# Hash the first 4096 bytes of the file named in one pipeline record.
# Arguments: $1 - record "timestamp size inode nlink hash path". The path is
#                 everything after the fifth field and may contain blanks.
# Outputs:   the record on stdout with the hash field replaced by the
#            SHA-256 digest of the file's first 4096 bytes. Nothing is
#            printed when the argument is missing or empty (xargs may run
#            the command without any argument).
# Returns:   1 if the record has fewer than six fields.
hashfc_short() {
  local record="${1:-}"
  local field='([^[:space:]]+)[[:space:]]+'
  # Four captured fields, the old hash field (skipped), then the path.
  # The path is taken positionally; searching the record for the path's
  # first word would hit an earlier field when the path is e.g. "0".
  local record_re="^[[:space:]]*${field}${field}${field}${field}[^[:space:]]+[[:space:]]+(.+)\$"
  local stamp size inode nlink path digest

  if [[ -z "$record" ]]; then
    return 0
  fi
  if ! [[ "$record" =~ $record_re ]]; then
    printf 'hashfc_short: malformed record: %s\n' "$record" >&2
    return 1
  fi
  stamp="${BASH_REMATCH[1]}"
  size="${BASH_REMATCH[2]}"
  inode="${BASH_REMATCH[3]}"
  nlink="${BASH_REMATCH[4]}"
  path="${BASH_REMATCH[5]}"

  digest="$(head -c 4096 -- "$path" | sha256sum -b - | awk '{print $1}')"
  printf '%s %s %s %s %s %s\n' "$stamp" "$size" "$inode" "$nlink" "$digest" "$path"
}
# Hash the complete contents of the file named in one pipeline record.
# Arguments: $1 - record "timestamp size inode nlink hash path". The path is
#                 everything after the fifth field and may contain blanks.
# Outputs:   the record on stdout with the hash field replaced by the
#            SHA-256 digest of the whole file. Nothing is printed when the
#            argument is missing or empty (xargs may run the command
#            without any argument).
# Returns:   1 if the record has fewer than six fields.
hashfc() {
  local record="${1:-}"
  local field='([^[:space:]]+)[[:space:]]+'
  # Four captured fields, the old hash field (skipped), then the path.
  # The path is taken positionally; searching the record for the path's
  # first word would hit an earlier field when the path is e.g. "0".
  local record_re="^[[:space:]]*${field}${field}${field}${field}[^[:space:]]+[[:space:]]+(.+)\$"
  local stamp size inode nlink path digest

  if [[ -z "$record" ]]; then
    return 0
  fi
  if ! [[ "$record" =~ $record_re ]]; then
    printf 'hashfc: malformed record: %s\n' "$record" >&2
    return 1
  fi
  stamp="${BASH_REMATCH[1]}"
  size="${BASH_REMATCH[2]}"
  inode="${BASH_REMATCH[3]}"
  nlink="${BASH_REMATCH[4]}"
  path="${BASH_REMATCH[5]}"

  digest="$(sha256sum -b - < "$path" | awk '{print $1}')"
  printf '%s %s %s %s %s %s\n' "$stamp" "$size" "$inode" "$nlink" "$digest" "$path"
}
# First pipeline stage: list every regular file below the given start paths.
# Globals:   find_args (read) - extra find predicates, split on whitespace
# Inputs:    stdin - whitespace-separated start paths for find
# Outputs:   the stage header, then one record per file:
#            "0 <size> <inode> <hard link count> 0 <path>"
collect_files() {
  local -a roots=() extra=()
  # timestamp size inode #hard_link hash path
  #find $(< "$infile") -type f $find_args -printf '0 %s %i %n %p\n' | awk "$filename_to_base64_awk"
  echo "=== COLLECT FILES ==="
  # read -a splits on whitespace but, unlike an unquoted expansion, does not
  # glob-expand the words. With -d '' read consumes all of stdin and reports
  # end of input with a non-zero status, hence the || true.
  IFS=$' \t\n' read -r -d '' -a roots || true
  # Same for the predicates: a pattern such as *@snapshot* must reach find
  # literally instead of being expanded against the current directory.
  IFS=$' \t\n' read -r -a extra <<< "${find_args:-}" || true
  find ${roots[@]+"${roots[@]}"} -type f ${extra[@]+"${extra[@]}"} \
    -printf '0 %s %i %n 0 %p\n'
}
# Stage 2: keep a single record per inode number (field 3), so records that
# share an inode number, such as hard links to one file, collapse into one.
# Inputs:  stdin - output of collect_files, header line first
# Outputs: the stage header followed by the surviving records
filter_inode() {
  read -r header
  if [[ "$header" != "=== COLLECT FILES ===" ]]; then
    abort "Input files didn't come from expected stage; previous stage was $header"
  fi
  printf '%s\n' "=== FILTER INODE ==="
  sort -u -k 3,3
}
# Stage 3: keep only records whose size (field 2) occurs more than once.
# Inputs:  stdin - output of filter_inode, header line first
# Outputs: the stage header followed by the surviving records, ordered by
#          the size field as a string
# Returns: 0 even when no size is duplicated (only the header is printed).
filter_size() {
  local header
  # At end of input read fails; fall through so the header check below
  # reports the problem instead of the shell exiting silently under set -e.
  read -r header || true
  if [[ "$header" != "=== FILTER INODE ===" ]]; then
    abort "Input files didn't come from expected stage; previous stage was $header"
  fi
  echo "=== FILTER SIZE ==="
  # Count each size value and print the records whose value was seen more
  # than once. Comparing field 2 exactly avoids matching the same number in
  # a later field (inode, link count), and an empty result is not an error.
  sort -k2b,2 | awk '
    NF { rec[++n] = $0; size[n] = $2; seen[$2]++ }
    END {
      for (i = 1; i <= n; i++)
        if (seen[size[i]] > 1) print rec[i]
    }'
}
# Stage 4: hash the first block of every candidate and keep only records
# whose short hash (field 5) occurs more than once.
# Globals: PROGNAME (read) - command re-invoked as "$PROGNAME hashfc_short
#          <record>" once per input record
# Inputs:  stdin - output of filter_size, header line first
# Outputs: the stage header followed by the surviving records, ordered by
#          hash
# Returns: 0 when no hash is duplicated; the pipeline status when hashing
#          fails
filter_hashsum_short() {
  local header hashes
  # At end of input read fails; fall through so the header check below
  # reports the problem instead of the shell exiting silently under set -e.
  read -r header || true
  if [[ "$header" != "=== FILTER SIZE ===" ]]; then
    abort "Input files didn't come from expected stage; previous stage was $header"
  fi
  echo "=== FILTER HASHSUM SHORT ==="
  # -r: with no candidates left, do not run the helper at all (without it
  # xargs runs the command once with no record).
  hashes="$(xargs -r -n1 -d '\n' "$PROGNAME" hashfc_short | sort -k5b,5)" || return
  # Print the records whose hash was seen more than once. Field 5 is
  # compared exactly, and an empty result is not an error.
  awk '
    NF { rec[++n] = $0; hash[n] = $5; seen[$5]++ }
    END {
      for (i = 1; i <= n; i++)
        if (seen[hash[i]] > 1) print rec[i]
    }' <<< "$hashes"
}
# Stage 5: hash every remaining candidate completely and keep only records
# whose full hash (field 5) occurs more than once.
# Globals: PROGNAME (read) - command re-invoked as "$PROGNAME hashfc
#          <record>" once per input record
# Inputs:  stdin - output of filter_hashsum_short, header line first
# Outputs: the stage header followed by the surviving records, ordered by
#          hash
# Returns: 0 when no hash is duplicated; the pipeline status when hashing
#          fails
filter_hashsum() {
  local header hashes
  # At end of input read fails; fall through so the header check below
  # reports the problem instead of the shell exiting silently under set -e.
  read -r header || true
  if [[ "$header" != "=== FILTER HASHSUM SHORT ===" ]]; then
    abort "Input files didn't come from expected stage; previous stage was $header"
  fi
  echo "=== FILTER HASHSUM ==="
  # -r: with no candidates left, do not run the helper at all (without it
  # xargs runs the command once with no record).
  hashes="$(xargs -r -n1 -d '\n' "$PROGNAME" hashfc | sort -k5b,5)" || return
  # Print the records whose hash was seen more than once. Field 5 is
  # compared exactly, and an empty result is not an error.
  awk '
    NF { rec[++n] = $0; hash[n] = $5; seen[$5]++ }
    END {
      for (i = 1; i <= n; i++)
        if (seen[hash[i]] > 1) print rec[i]
    }' <<< "$hashes"
}
# Stage 6: order the records so that equal hashes are adjacent and, within
# one hash, the record with the most hard links comes first.
# Inputs:  stdin - output of filter_hashsum, header line first
# Outputs: the stage header followed by the sorted records
filter_sort() {
  local header
  # At end of input read fails; fall through so the header check below
  # reports the problem instead of the shell exiting silently under set -e.
  read -r header || true
  if [[ "$header" != "=== FILTER HASHSUM ===" ]]; then
    abort "Input files didn't come from expected stage; previous stage was $header"
  fi
  echo "=== FILTER SORT ==="
  # Primary key: the hash (field 5), which keeps duplicates together for the
  # replace stage. Secondary key: link count (field 4), numeric, descending.
  # Sorting on the link count alone would interleave different hashes.
  LC_ALL=C sort -k5,5 -k4,4nr
}
# Final stage: replace duplicate files by hard links to the first file of
# each run of records with the same hash.
# Globals: DEBUG (read, optional) - command prefix placed in front of ln;
#          for example DEBUG=echo prints the ln commands instead of
#          running them. Unset or empty runs ln directly.
# Inputs:  stdin - output of filter_sort, header line first; records with
#          equal hashes must be adjacent
# Outputs: the stage header; for every pair that lies on different file
#          systems the two records are printed instead of being linked
replace() {
  local header line cur_hash cur_file
  local base_line="" base_hash="" base_file=""
  local skip='[^[:space:]]+[[:space:]]+'
  # Skip four fields, capture the hash (field 5) and the rest as the path.
  local record_re="^[[:space:]]*${skip}${skip}${skip}${skip}([^[:space:]]+)[[:space:]]+(.+)\$"

  # At end of input read fails; fall through so the header check below
  # reports the problem instead of the shell exiting silently under set -e.
  read -r header || true
  if [[ "$header" != "=== FILTER SORT ===" ]]; then
    abort "Input files didn't come from expected stage; previous stage was $header"
  fi
  echo "=== REPLACE ==="

  # IFS= keeps leading and trailing blanks that belong to the path.
  while IFS= read -r line; do
    if [[ -z "$line" ]]; then
      continue
    fi
    if ! [[ "$line" =~ $record_re ]]; then
      abort "Malformed record: $line"
    fi
    cur_hash="${BASH_REMATCH[1]}"
    cur_file="${BASH_REMATCH[2]}"
    if [[ "$cur_hash" == "0" ]]; then
      abort "Hashes not yet computed"
    fi
    # A new hash starts a new run; its first record becomes the link source.
    if [[ "$cur_hash" != "$base_hash" ]]; then
      base_line="$line"
      base_hash="$cur_hash"
      base_file="$cur_file"
      continue
    fi
    # Hard links cannot span file systems; compare the device numbers.
    if [[ "$(stat -c '%d' -- "$base_file")" != "$(stat -c '%d' -- "$cur_file")" ]]; then
      debug "Files \"$base_file\" and \"$cur_file\" are on different file systems."
      debug "Aborting hard linking"
      printf '%s\n' "$base_line"
      printf '%s\n' "$line"
      continue
    fi
    # DEBUG is expanded unquoted on purpose so it can hold a command prefix;
    # the :- default keeps an unset DEBUG from tripping set -u.
    ${DEBUG:-} ln -f -- "$base_file" "$cur_file"
  done
}
# Chain all filter stages: inode, size, short hash, full hash, final sort.
# Reads the output of collect_files on stdin and writes the sorted list of
# duplicate candidates to stdout.
filter() {
  filter_inode |
    filter_size |
    filter_hashsum_short |
    filter_hashsum |
    filter_sort
}
# Run the complete pipeline: collect the files named on stdin, filter them
# down to duplicates and hand the result to the replace stage.
all() {
  collect_files |
    filter |
    replace
}
#getopt --options s: --longoptions stage:
# Entry point: run the function named by the first argument and pass the
# remaining arguments to it.
# Only functions defined in this shell are accepted; a bare "$@" would also
# execute any external command named on the command line.
# With no arguments nothing is run, as before.
main() {
  if (( $# == 0 )); then
    return 0
  fi
  if ! declare -F -- "$1" > /dev/null; then
    printf '%s: unknown stage: %s\n' "$0" "$1" >&2
    exit 2
  fi
  "$@"
}
main "$@"