Implement file deduplication at scale
Company: Anthropic
Role: Software Engineer
Category: Coding & Algorithms
Difficulty: Medium
Interview Round: Onsite
Quick Answer: This question evaluates a candidate's ability to design and implement a scalable file deduplication system: choosing a hashing strategy, batching disk I/O efficiently, parallelizing work across cores or machines, chunking large files, and performing safe filesystem operations such as hard-linking.
Constraints
- 0 <= len(files) <= 200000
- 0 <= total number of characters across all chunks <= 1000000
- Each `path` is unique
- `mode` is either `'report'` or `'link'`
Examples
Input: ([('/a', 1, 100, ['hello', 'world']), ('/b', 1, 101, ['helloworld']), ('/c', 2, 200, ['hello', 'world']), ('/d', 1, 102, ['hello', 'there'])], 'link')
Expected Output: ([['/a', '/b', '/c']], [('/b', '/a')])
Explanation: Files /a, /b, and /c have identical content even though /a and /b use different chunk boundaries. Only /b can be hard-linked to /a because /c is on a different device.
Input: ([('/e', 1, 10, []), ('/f', 1, 11, []), ('/g', 1, 20, ['x']), ('/h', 1, 20, ['x']), ('/i', 1, 21, ['x'])], 'link')
Expected Output: ([['/e', '/f'], ['/g', '/h', '/i']], [('/f', '/e'), ('/i', '/g')])
Explanation: Empty files /e and /f are duplicates, so /f can link to /e. Files /g, /h, and /i are duplicates; /h already shares /g's inode, so only /i needs a hard-link operation.
Input: ([('/m1', 1, 1, ['PPPPPPPPPPPPPPPP', 'A', 'SSSSSSSSSSSSSSSS']), ('/m2', 1, 2, ['PPPPPPPPPPPPPPPPB', 'SSSSSSSSSSSSSSSS']), ('/m3', 1, 3, ['PPPPPPPPPPPPPPPPASSSSSSSSSSSSSSSS'])], 'report')
Expected Output: ([['/m1', '/m3']], [])
Explanation: All three files have the same size and the same first and last 16 characters, so a cheap partial fingerprint alone would collide. Full hashing and final verification show that only /m1 and /m3 are truly identical.
Input: ([], 'report')
Expected Output: ([], [])
Explanation: With no files, there are no duplicate groups and no link operations.
Input: ([('/z2', 2, 2, ['dup']), ('/z1', 1, 1, ['dup']), ('/z3', 1, 3, ['dup']), ('/z4', 2, 4, ['dup'])], 'link')
Expected Output: ([['/z1', '/z2', '/z3', '/z4']], [('/z3', '/z1'), ('/z4', '/z2')])
Explanation: All four files are duplicates, but hard links must be planned per device. On device 1, /z1 is the canonical target; on device 2, /z2 is the canonical target.
Solution
def solution(files, mode):
    import hashlib
    from collections import defaultdict

    if mode not in ('report', 'link'):
        raise ValueError("mode must be 'report' or 'link'")

    PREVIEW = 16
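    # Helpers for the cheap filters: total content length and small
    # prefix/suffix previews, computed without materializing full content.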
    def content_size(chunks):
        total = 0
        for chunk in chunks:
            total += len(chunk)
        return total
    def first_k(chunks, k):
        parts = []
        remaining = k
        for chunk in chunks:
            if remaining == 0:
                break
            if not chunk:
                continue
            piece = chunk[:remaining]
            parts.append(piece)
            remaining -= len(piece)
        return ''.join(parts)
    def last_k(chunks, k):
        parts = []
        remaining = k
        for chunk in reversed(chunks):
            if remaining == 0:
                break
            if not chunk:
                continue
            if len(chunk) <= remaining:
                parts.append(chunk)
                remaining -= len(chunk)
            else:
                parts.append(chunk[-remaining:])
                remaining = 0
        return ''.join(reversed(parts))
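    # Streaming hash over the chunks; identical content produces the same
    # digest regardless of where the chunk boundaries fall.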
    def full_digest(chunks):
        h = hashlib.blake2b(digest_size=32)
        for chunk in chunks:
            if chunk:
                h.update(chunk.encode('utf-8'))
        return h.hexdigest()
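    # Byte-for-byte comparison of two chunked streams, used as the final
    # guard against hash collisions and differing chunk boundaries.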
    def same_content(chunks_a, chunks_b):
        ia = ib = 0
        oa = ob = 0
        while True:
            while ia < len(chunks_a) and oa >= len(chunks_a[ia]):
                ia += 1
                oa = 0
            while ib < len(chunks_b) and ob >= len(chunks_b[ib]):
                ib += 1
                ob = 0
            if ia == len(chunks_a) or ib == len(chunks_b):
                break
            a = chunks_a[ia]
            b = chunks_b[ib]
            take = min(len(a) - oa, len(b) - ob)
            if a[oa:oa + take] != b[ob:ob + take]:
                return False
            oa += take
            ob += take
        while ia < len(chunks_a) and oa >= len(chunks_a[ia]):
            ia += 1
            oa = 0
        while ib < len(chunks_b) and ob >= len(chunks_b[ib]):
            ib += 1
            ob = 0
        return ia == len(chunks_a) and ib == len(chunks_b)
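    # Normalize each input tuple into a record with a precomputed size.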
    records = []
    for path, device_id, inode_id, chunks in files:
        records.append({
            'path': path,
            'device': device_id,
            'inode': inode_id,
            'chunks': chunks,
            'size': content_size(chunks)
        })

    size_groups = defaultdict(list)
    for record in records:
        size_groups[record['size']].append(record)
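    # Tiered filtering: size buckets, then prefix/suffix fingerprints,
    # then full digests, then exact content comparison.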
    verified_groups = []
    for group in size_groups.values():
        if len(group) < 2:
            continue
        partial_groups = defaultdict(list)
        for record in group:
            key = (
                record['size'],
                first_k(record['chunks'], PREVIEW),
                last_k(record['chunks'], PREVIEW)
            )
            partial_groups[key].append(record)
        for partial_group in partial_groups.values():
            if len(partial_group) < 2:
                continue
            digest_groups = defaultdict(list)
            for record in partial_group:
                digest_groups[full_digest(record['chunks'])].append(record)
            for digest_group in digest_groups.values():
                if len(digest_group) < 2:
                    continue
                collision_safe_groups = []
                for record in digest_group:
                    placed = False
                    for bucket in collision_safe_groups:
                        if same_content(record['chunks'], bucket[0]['chunks']):
                            bucket.append(record)
                            placed = True
                            break
                    if not placed:
                        collision_safe_groups.append([record])
                for bucket in collision_safe_groups:
                    if len(bucket) >= 2:
                        verified_groups.append(bucket)
    groups = []
    for group in verified_groups:
        paths = sorted(record['path'] for record in group)
        groups.append(paths)
    groups.sort(key=lambda g: g[0])
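    # Plan hard links per device: files on different devices cannot share
    # an inode, and files already on the canonical inode need no link.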
    links = []
    if mode == 'link':
        for group in verified_groups:
            by_device = defaultdict(list)
            for record in group:
                by_device[record['device']].append(record)
            for device_records in by_device.values():
                if len(device_records) < 2:
                    continue
                device_records.sort(key=lambda record: record['path'])
                canonical = device_records[0]
                for record in device_records[1:]:
                    if record['inode'] != canonical['inode']:
                        links.append((record['path'], canonical['path']))
        links.sort()
    return groups, links

Time complexity: O(B + N log N), where B is the total number of characters across all chunks and N is the number of files. Space complexity: O(N).
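As a quick sanity check, calling solution on the first example reproduces the expected output:

files = [
    ('/a', 1, 100, ['hello', 'world']),
    ('/b', 1, 101, ['helloworld']),
    ('/c', 2, 200, ['hello', 'world']),
    ('/d', 1, 102, ['hello', 'there'])
]
print(solution(files, 'link'))
# ([['/a', '/b', '/c']], [('/b', '/a')])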
Hints
- Use increasingly expensive filters: size, then a small content-based prefix/suffix fingerprint, then a full streaming hash.
- A matching full hash is not enough for absolute correctness. Compare the actual streamed contents inside a full-hash bucket to handle collisions and different chunk boundaries.
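For context beyond the exercise, here is a minimal sketch of how the same size-then-digest filtering might look against a real filesystem. It assumes ordinary local files and uses only standard-library calls (os.stat, hashlib, os.link); the helper names dedupe_dir and stream_digest are illustrative, not part of the problem.

import hashlib
import os
from collections import defaultdict

def stream_digest(path, block_size=1 << 20):
    # Hash the file in fixed-size binary blocks so large files
    # never have to fit in memory. (Illustrative helper.)
    h = hashlib.blake2b(digest_size=32)
    with open(path, 'rb') as f:
        while True:
            block = f.read(block_size)
            if not block:
                break
            h.update(block)
    return h.hexdigest()

def dedupe_dir(root):
    # Bucket by (device, size) first; only same-size files on the
    # same device are candidates for hard-linking.
    by_size = defaultdict(list)
    for dirpath, _, names in os.walk(root):
        for name in names:
            path = os.path.join(dirpath, name)
            st = os.stat(path)
            by_size[(st.st_dev, st.st_size)].append((path, st.st_ino))
    for candidates in by_size.values():
        if len(candidates) < 2:
            continue
        by_digest = defaultdict(list)
        for path, ino in candidates:
            by_digest[stream_digest(path)].append((path, ino))
        for entries in by_digest.values():
            entries.sort()
            canonical_path, canonical_ino = entries[0]
            for path, ino in entries[1:]:
                if ino == canonical_ino:
                    continue  # already hard-linked to the canonical file
                os.replace(path, path + '.dedupe.bak')   # keep a fallback copy
                os.link(canonical_path, path)            # point at the canonical inode
                os.remove(path + '.dedupe.bak')

The rename-link-remove sequence keeps a recoverable copy of the duplicate at every step, so an interrupted run leaves the data reachable under either the original name or the .dedupe.bak fallback.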