I. File Deduplication (文件去重)#

1. Problem Statement (题目描述)#

Given a Directory Tree (目录树), find and group files with identical Byte Content (字节内容) and output duplicate file paths where group size ≥ 2.

Task Description (任务说明):

Input: A root directory (根目录) containing files and subdirectories
Definition: Duplicate Files (重复文件) have exactly the same byte content
Output: Groups of file paths (文件路径组), each group contains identical files
Format: One line per group, paths separated by spaces (空格分隔路径)

Example (示例):

/a/1.txt content "hello"
/b/2.txt content "hello"
/c/3.txt content "world"

Output:

/a/1.txt /b/2.txt

2. Core Approach (核心思路)#

1) Directory Traversal (目录遍历)#

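Use DFS/BFS (深度优先/广度优先搜索) to visit every file in the directory tree. If a file cannot be opened (for example, because of missing permissions or a broken symbolic link), report the error and skip that file instead of aborting the whole scan.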

2) Hashing Files (文件哈希)#

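Use a Hash Function (哈希函数) to convert file content into a Hash Value (哈希值) for quick comparison. Equal hash values imply identical content only up to hash collisions; when an exact guarantee is required, confirm each candidate group with a byte-by-byte comparison.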

3) Grouping Duplicates (分组重复文件)#

Use a Hash Map (哈希表) to map hash -> list of file paths and collect duplicates.

3. Code Implementation (代码实现)#

1) Python Example (可独立运行)#

1
import os
2
import hashlib
3

4
def get_file_hash(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file's bytes.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so a positive ``chunk_size`` bounds how much of the file is held
    in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read. Must not be 0.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is 0.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size == 0:
        # read(0) always returns b"", so the loop would end immediately and
        # every file would receive the digest of empty input.
        raise ValueError("chunk_size must not be 0")

    hasher = hashlib.md5()  # MD5 hash function
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
13

14
def find_duplicates(root_dir):
    """Print every group of files under ``root_dir`` that share a content hash.

    Walks the directory tree, hashes each file with ``get_file_hash`` and
    groups the paths by digest. Each group with at least two paths is printed
    on its own line, with the paths separated by single spaces.

    Files that cannot be opened or read are reported on stderr and skipped,
    so one unreadable file does not abort the whole scan and the error
    report does not mix with the group output on stdout.

    Args:
        root_dir: Directory whose tree is scanned.
    """
    import sys  # local import: only needed for the error report below

    hash_map = {}  # digest -> list of file paths

    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                file_hash = get_file_hash(file_path)
            except OSError as e:
                # OSError covers missing files and permission errors alike.
                print(f"无法读取文件内容: {file_path}, 错误: {e}", file=sys.stderr)
                continue
            hash_map.setdefault(file_hash, []).append(file_path)

    for paths in hash_map.values():
        if len(paths) >= 2:
            print(" ".join(paths))
29

30
if __name__ == "__main__":
    import sys

    # Scan the directory named on the command line; with no argument, fall
    # back to ./example_dir, which must already exist and contain test files.
    find_duplicates(sys.argv[1] if len(sys.argv) > 1 else "./example_dir")

1
import os
2
import hashlib
3
from collections import defaultdict
4

5

6
def get_file_hash(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file's bytes.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so a positive ``chunk_size`` bounds how much of the file is held
    in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read. Must not be 0.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is 0.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size == 0:
        # read(0) always returns b"", so the loop would end immediately and
        # every file would receive the digest of empty input.
        raise ValueError("chunk_size must not be 0")

    hasher = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
15

16

17
def find_duplicates(root_dir):
    """Print groups of duplicate files under ``root_dir`` using size, then hash.

    Stage 1 groups every file by its size. Stage 2 hashes only the files
    whose size is shared with at least one other file, because files of
    different sizes cannot have identical content. Each group of two or more
    files with the same size and digest is printed on one line, paths
    separated by single spaces. If no such group exists, a "no duplicates
    found" message is printed instead.

    Files whose size or content cannot be read are reported and skipped.

    Args:
        root_dir: Directory whose tree is scanned.
    """
    size_map = defaultdict(list)   # file size -> list of file paths
    hash_map = defaultdict(list)   # (file size, digest) -> list of file paths

    # 1) Walk the tree and group every file by size.
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                file_size = os.path.getsize(file_path)
            except OSError as e:
                # PermissionError is a subclass of OSError, so this covers both.
                print(f"无法读取文件大小: {file_path}, 错误: {e}")
            else:
                size_map[file_size].append(file_path)

    # 2) Hash only the files that share their size with another file.
    for file_size, paths in size_map.items():
        if len(paths) < 2:
            continue

        for file_path in paths:
            try:
                file_hash = get_file_hash(file_path)
            except OSError as e:
                print(f"无法读取文件内容: {file_path}, 错误: {e}")
            else:
                # Keying on the size as well keeps a hash collision between
                # files of different sizes from merging them into one group.
                hash_map[(file_size, file_hash)].append(file_path)

    # 3) Print the groups that really contain duplicates.
    found = False
    for paths in hash_map.values():
        if len(paths) >= 2:
            found = True
            print(" ".join(paths))

    if not found:
        print("没有找到重复文件")
52

53

54
if __name__ == "__main__":
    import sys

    # Scan the directory named on the command line; with no argument, fall
    # back to ./example_dir.
    find_duplicates(sys.argv[1] if len(sys.argv) > 1 else "./example_dir")

1
import os
2
import hashlib
3
from collections import defaultdict
4
from concurrent.futures import ThreadPoolExecutor, as_completed
5

6

7
def get_file_hash(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file's bytes.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so a positive ``chunk_size`` bounds how much of the file is held
    in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read. Must not be 0.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is 0.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size == 0:
        # read(0) always returns b"", so the loop would end immediately and
        # every file would receive the digest of empty input.
        raise ValueError("chunk_size must not be 0")

    hasher = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
16

17

18
def hash_file_worker(file_path):
    """Hash one file and report a read failure as a value instead of raising.

    Args:
        file_path: Path of the file to hash.

    Returns:
        A ``(file_path, file_hash, error)`` tuple. On success ``error`` is
        ``None``; if the file could not be read, ``file_hash`` is ``None``
        and ``error`` is the caught ``OSError``.
    """
    try:
        return file_path, get_file_hash(file_path), None
    except OSError as e:
        # PermissionError is a subclass of OSError, so one clause covers both.
        return file_path, None, e
24

25

26
def find_duplicates(root_dir, max_workers=8):
    """Print groups of duplicate files under ``root_dir``, hashing in threads.

    Stage 1 groups every file by size. Stage 2 hashes, on a thread pool, only
    the files whose size is shared with at least one other file. Files with
    the same size and digest form one group; groups with at least two paths
    are printed as numbered lists, one path per line. If there is no such
    group, a "no duplicates found" message is printed.

    Files whose size or content cannot be read are reported and skipped.

    Args:
        root_dir: Directory whose tree is scanned.
        max_workers: Maximum number of hashing threads.
    """
    size_map = defaultdict(list)  # file size -> list of file paths

    # 1) Group every file by size.
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                file_size = os.path.getsize(file_path)
            except OSError as e:
                print(f"无法读取文件大小: {file_path}, 错误: {e}")
            else:
                size_map[file_size].append(file_path)

    # 2) Only files that share their size with another file can be duplicates.
    candidate_sizes = []
    candidate_paths = []
    for file_size, paths in size_map.items():
        if len(paths) < 2:
            continue
        for file_path in paths:
            candidate_sizes.append(file_size)
            candidate_paths.append(file_path)

    hash_map = defaultdict(list)  # (file size, digest) -> list of file paths

    if candidate_paths:
        # One pool serves all candidates, so small size groups do not each pay
        # for pool start-up and do not limit how many files hash in parallel.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # map() yields results in submission order, which keeps the order
            # of paths inside each printed group deterministic.
            results = executor.map(hash_file_worker, candidate_paths)
            for file_size, (file_path, file_hash, error) in zip(candidate_sizes, results):
                if error is not None:
                    print(f"无法读取文件内容: {file_path}, 错误: {error}")
                    continue
                hash_map[(file_size, file_hash)].append(file_path)

    # 3) Keep only the groups that really contain duplicates.
    duplicate_groups = [group for group in hash_map.values() if len(group) >= 2]

    # 4) Print the result.
    if duplicate_groups:
        print("找到重复文件：")
        for i, group in enumerate(duplicate_groups, 1):
            print(f"\n第{i}组重复文件：")
            for path in group:
                print(path)
    else:
        print("没有找到重复文件")
72

73

74
if __name__ == "__main__":
    import sys

    # Scan the directory named on the command line; with no argument, fall
    # back to ./example_dir.
    find_duplicates(
        sys.argv[1] if len(sys.argv) > 1 else "./example_dir",
        max_workers=8,
    )

4. Complexity Analysis (复杂度分析)#

1) Time Complexity (时间复杂度)#

The Time Complexity (时间复杂度) is $O(N \cdot S)$, where $N$ is the number of files and $S$ is the average file size, because in the worst case every byte of every file is read once.

2) Space Complexity (空间复杂度)#

The Space Complexity (空间复杂度) is $O(N)$ due to storing hash mappings.

5. Optimization Strategies (优化策略)#

1) I/O Bound Optimization (I/O瓶颈优化)#

Reduce Disk I/O (磁盘读写) by filtering files using File Size (文件大小) before hashing.

2) CPU Bound Optimization (CPU瓶颈优化)#

Reduce Hash Computation (哈希计算) cost by using faster hash functions or parallel processing.

II. Detect Duplicate Files (文件去重-按大小+哈希)#

1. Problem Statement (题目描述)#

Given File Metadata (文件元数据) and a Content Reading Interface (文件读取接口), detect Duplicate Files (重复文件) where two files are duplicates iff their contents are identical (内容完全相同).

Requirements (要求):

Input: A list `files`; each entry contains:
- path (路径)
- size (文件大小，字节)
Helper Functions (辅助函数):
- read_file(path) -> bytes (读取文件内容)
- hash_bytes(data) -> str (计算哈希值)
Output: List[List[str]], each group contains duplicate file paths (每组至少2个文件)

Constraints (约束):

Number of Files (文件数量) up to $10^6$
Files may be very large (大文件GB级)
Need Streaming Processing (流式处理) to avoid loading entire file into memory

Optimization Requirement (优化要求):

Stage 1: Group by File Size (按文件大小分组)
Stage 2: For same size files, compute Content Hash (内容哈希)

Example (示例):

Input:

[a.txt size=3 content=abc,
 b.txt size=3 content=abc,
 c.txt size=3 content=abd,
 d.txt size=10 content=0123456789]

Output:

[[a.txt, b.txt]]

2. Core Idea (核心思路)#

1) Two-Stage Filtering (两阶段过滤)#

First use File Size (文件大小) to prune candidates, then use Hash Function (哈希函数) to confirm duplicates.

2) Performance Insight (性能关键点)#

This approach reduces expensive I/O (磁盘读取) and Hash Computation (哈希计算).

3. Algorithm Steps (算法步骤)#

1) Step Flow (步骤流程)#

Build Size Map (大小映射): file size -> list of paths
Keep only the size groups that contain at least 2 paths
For each remaining group, compute the Hash (计算哈希) of every file
Build Hash Map (哈希映射): hash -> list of paths
Collect the hash groups that contain at least 2 paths

4. Code Implementation (代码实现)#

1) Python Example (可独立运行)#

1
import hashlib
2
from collections import defaultdict
3

4
# Mock read_file (模拟读取函数)
5
def read_file(path):
    """Mock reader: return the fixed byte content registered for ``path``.

    Args:
        path: One of the demo file names listed in the table below.

    Returns:
        The content of that demo file as ``bytes``.

    Raises:
        KeyError: If ``path`` is not one of the demo files.
    """
    data_map = {
        "a.txt": b"abc",
        "b.txt": b"abc",
        "c.txt": b"abd",
        "d.txt": b"0123456789",
    }
    return data_map[path]
13

14
def hash_bytes(data):
    """Return the SHA-256 digest of ``data`` as a hexadecimal string.

    Args:
        data: The bytes to hash.
    """
    return hashlib.sha256(data).hexdigest()
16

17
def find_duplicates(files):
    """Return groups of paths whose files have identical content.

    Candidates are first narrowed by the declared ``size`` of each entry;
    only paths that share a size with another path are read via
    ``read_file`` and hashed via ``hash_bytes``.

    Args:
        files: Iterable of dicts, each with a ``"path"`` and a ``"size"`` key.

    Returns:
        A list of groups; each group is a list of at least two paths whose
        content hashes are equal.
    """
    # Stage 1: bucket the paths by declared file size.
    by_size = defaultdict(list)
    for entry in files:
        by_size[entry["size"]].append(entry["path"])

    # Stage 2: inside each bucket with 2+ paths, bucket again by content hash.
    duplicates = []
    for same_size in by_size.values():
        if len(same_size) >= 2:
            by_digest = defaultdict(list)
            for candidate in same_size:
                digest = hash_bytes(read_file(candidate))
                by_digest[digest].append(candidate)

            duplicates.extend(
                group for group in by_digest.values() if len(group) >= 2
            )

    return duplicates
42

43
if __name__ == "__main__":
    # Demo metadata: three files of size 3 and one of size 10.
    files = [
        {"path": name, "size": size}
        for name, size in (
            ("a.txt", 3),
            ("b.txt", 3),
            ("c.txt", 3),
            ("d.txt", 10),
        )
    ]

    duplicate_groups = find_duplicates(files)
    print(duplicate_groups)

5. Complexity Analysis (复杂度分析)#

1) Time Complexity (时间复杂度)#

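The Time Complexity (时间复杂度) is $O(N + K \cdot S)$, where $N$ is the total number of files, $K$ is the number of candidate files (those sharing a size with at least one other file), and $S$ is the average size of a candidate file.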

2) Space Complexity (空间复杂度)#

The Space Complexity (空间复杂度) is $O(N)$ for storing mappings.

6. System Design Discussion (系统设计讨论)#

1) Large File Handling (大文件处理)#

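Use Streaming Hashing (流式哈希) to process files in chunks to avoid Memory Overflow (内存溢出). Note that the sample code above calls read_file(path), which returns the whole content at once; for GB-sized files, replace it with a chunked reader that updates the hash incrementally.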

2) I/O Bound Optimization (I/O瓶颈优化)#

Use Concurrent I/O (并发I/O) and Batch Processing (批处理) to reduce disk latency.

3) CPU Bound Optimization (CPU瓶颈优化)#

Use Parallel Hashing (并行哈希) with Multi-processing (多进程) to speed up computation.

4) Real-time Detection (实时检测)#

Use File System Watcher (文件系统监听器) and Incremental Indexing (增量索引) to detect duplicates dynamically.

III. Find Duplicate Files by Content (按内容查找重复文件)#

1. Problem Statement (题目描述)#

Given a Directory Structure (目录结构), find all files with duplicate content where file content can be compared by a Hash String (哈希字符串).

Requirements (要求):

Input: A list of strings `paths`; each string contains:
- Directory Path (目录路径)
- File Name (文件名)
- File Content (文件内容)
Output: List[List[str]], each group contains file paths with identical content (内容相同的文件路径分组)

Example (示例):

Input:

[
    "root/a 1.txt(abcd) 2.txt(efgh)",
    "root/c 3.txt(abcd)",
    "root/c/d 4.txt(efgh)",
    "root 4.txt(1234)"
]

Output:

[
    ["root/a/1.txt", "root/c/3.txt"],
    ["root/a/2.txt", "root/c/d/4.txt"]
]

Constraints (约束):

Each input string length (每个输入字符串长度) is less than $300$
Number of files (文件数量) is less than $10^4$

Extra Example (额外示例):

Input:

["root/a 1.txt(abcd) 2.txt(efgh)"]

Output:

[]

2. Core Idea (核心思路)#

1) Hash Map Grouping (哈希表分组)#

Use a Hash Map (哈希表) to map Content (内容) to Full Paths (完整路径), because the same content should belong to the same group.

2) String Parsing (字符串解析)#

Split each record into Directory (目录) and File Info (文件信息), then extract File Name (文件名) and Content (内容) from each file token.

3. Algorithm Steps (算法步骤)#

1) Step Flow (步骤流程)#

Traverse each path string
Split it by spaces into Directory (目录) and File Entries (文件项)
For each file entry, parse File Name (文件名) and Content (内容)
Build Full Path (完整路径)
Store it in Hash Map (哈希表): content -> list of full paths
Return groups whose size is at least 2

4. Code Implementation (代码实现)#

1) Python Example (可独立运行)#

1
from collections import defaultdict
2

3

4
def find_duplicate(paths):
    """Group full file paths by identical content.

    Each record has the form ``"dir name1(content1) name2(content2) ..."``:
    a directory followed by whitespace-separated file entries, each carrying
    its content in parentheses.

    Args:
        paths: Iterable of record strings.

    Returns:
        A list of groups; each group is a list of at least two full paths
        (``dir/name``) whose content strings are equal.
    """
    content_map = defaultdict(list)  # content -> list of full paths

    for record in paths:
        # split() without an argument collapses runs of whitespace, so doubled
        # or trailing spaces do not produce empty tokens.
        parts = record.split()
        if not parts:
            continue  # blank record: nothing to parse
        directory = parts[0]

        for file_info in parts[1:]:
            left = file_info.find("(")
            right = file_info.rfind(")")
            if left <= 0 or right < left:
                # Not of the form name(content); skip it rather than record a
                # made-up path with empty name or content.
                continue

            file_name = file_info[:left]
            content = file_info[left + 1:right]
            content_map[content].append(directory + "/" + file_name)

    return [group for group in content_map.values() if len(group) >= 2]
22

23

24
if __name__ == "__main__":
    sample_records = [
        "root/a 1.txt(abcd) 2.txt(efgh)",
        "root/c 3.txt(abcd)",
        "root/c/d 4.txt(efgh)",
        "root 4.txt(1234)",
    ]
    single_record = ["root/a 1.txt(abcd) 2.txt(efgh)"]

    # Print the grouping for the main example, then for the no-duplicate case.
    for demo_input in (sample_records, single_record):
        print(find_duplicate(demo_input))

1
from collections import defaultdict
2
from concurrent.futures import ThreadPoolExecutor
3

4

5
def parse_record(record):
    """Parse one record into a mapping from content to full file paths.

    A record has the form ``"dir name1(content1) name2(content2) ..."``:
    a directory followed by whitespace-separated file entries, each carrying
    its content in parentheses.

    Args:
        record: The record string to parse.

    Returns:
        A ``defaultdict(list)`` mapping each content string to the list of
        full paths (``dir/name``) in this record that have that content.
    """
    local_map = defaultdict(list)  # content -> list of full paths

    # split() without an argument collapses runs of whitespace, so doubled or
    # trailing spaces do not produce empty tokens.
    parts = record.split()
    if not parts:
        return local_map  # blank record: nothing to parse
    directory = parts[0]

    for file_info in parts[1:]:
        left = file_info.find("(")
        right = file_info.rfind(")")
        if left <= 0 or right < left:
            # Not of the form name(content); skip it rather than record a
            # made-up path with empty name or content.
            continue

        file_name = file_info[:left]
        content = file_info[left + 1:right]
        local_map[content].append(directory + "/" + file_name)

    return local_map
22

23

24
def find_duplicate(paths, max_workers=4):
    """Group full file paths by identical content, parsing records in threads.

    Each record is parsed by ``parse_record`` on a thread pool; the
    per-record mappings are then merged in input order.

    Args:
        paths: Iterable of record strings.
        max_workers: Maximum number of parsing threads.

    Returns:
        A list of groups; each group is a list of at least two full paths
        that share the same content string.
    """
    # Parse on worker threads and collect the per-record maps in input order.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        per_record_maps = list(pool.map(parse_record, paths))

    # Merge the per-record maps into one content -> paths mapping.
    merged = defaultdict(list)
    for record_map in per_record_maps:
        for content, full_paths in record_map.items():
            merged[content].extend(full_paths)

    return [group for group in merged.values() if len(group) >= 2]
37

38

39
if __name__ == "__main__":
    sample_records = [
        "root/a 1.txt(abcd) 2.txt(efgh)",
        "root/c 3.txt(abcd)",
        "root/c/d 4.txt(efgh)",
        "root 4.txt(1234)",
    ]
    single_record = ["root/a 1.txt(abcd) 2.txt(efgh)"]

    # Main example with an explicit worker count, then the no-duplicate case
    # with the default worker count.
    print(find_duplicate(sample_records, max_workers=4))
    print(find_duplicate(single_record))

5. Complexity Analysis (复杂度分析)#

1) Time Complexity (时间复杂度)#

The Time Complexity (时间复杂度) is $O(N \cdot K)$, where $N$ is the number of files and $K$ is the average cost of parsing one file entry (proportional to the length of its name and content).

2) Space Complexity (空间复杂度)#

The Space Complexity (空间复杂度) is $O(N \cdot K)$, because we store the Content (内容) and File Paths (文件路径) of every file in a Hash Map (哈希表).

6. Interview Notes (面试要点)#

1) Why Hash Map (为什么用哈希表)#

A Hash Map (哈希表) is the most direct way to group files by the same Content (内容).

2) Why Not Compare Every Pair (为什么不两两比较)#

Pairwise Comparison (两两比较) is too slow at $O(N^2)$, so grouping by key is the standard optimization.

3) Edge Case (边界情况)#

If every file has unique content, the answer is an empty list because no group has at least two files.