File size: 5,258 Bytes
92ea780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""Deduplication module for EEE validation pipeline.

Two-level dedup:
- Exact duplicates: SHA256 hash of entire file content
- Near duplicates: SHA256 hash of content minus timestamps/UUIDs
"""

import hashlib
import json
import logging
from dataclasses import dataclass, field
from typing import Any

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError

logger = logging.getLogger(__name__)

DATASET_REPO_ID = "evaleval/EEE_datastore"
MANIFEST_PATH = "manifest.json"

# Fields to strip for near-duplicate fingerprinting
FINGERPRINT_STRIP_FIELDS = {
    "retrieved_timestamp",
    "evaluation_id",
    "evaluation_timestamp",
}


def compute_sha256(content: bytes) -> str:
    """Return the hex-encoded SHA256 digest of *content*."""
    hasher = hashlib.sha256()
    hasher.update(content)
    return hasher.hexdigest()


def _strip_fields(data: dict[str, Any], fields_to_strip: set[str]) -> dict[str, Any]:
    """Recursively strip specified fields from a dict for fingerprinting."""
    result = {}
    for key, value in data.items():
        if key in fields_to_strip:
            continue
        if isinstance(value, dict):
            result[key] = _strip_fields(value, fields_to_strip)
        elif isinstance(value, list):
            result[key] = [
                _strip_fields(item, fields_to_strip) if isinstance(item, dict) else item
                for item in value
            ]
        else:
            result[key] = value
    return result


def compute_fingerprint(content: bytes) -> str:
    """Compute a near-duplicate fingerprint by hashing content minus timestamps/UUIDs.

    Parses *content* as JSON, strips the volatile fields listed in
    FINGERPRINT_STRIP_FIELDS, and hashes a canonical serialization so two
    submissions differing only in those fields fingerprint identically.

    Falls back to the full-content SHA256 (i.e. near-dup detection degrades
    to exact-dup detection) when the content is not valid JSON, or when the
    top-level JSON value is not an object.
    """
    try:
        data = json.loads(content)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Not parseable JSON at all -- hash the raw bytes instead.
        return compute_sha256(content)

    # Bug fix: a top-level array or scalar parses fine but would crash
    # _strip_fields (it calls .items()); treat it like unparseable content.
    if not isinstance(data, dict):
        return compute_sha256(content)

    stripped = _strip_fields(data, FINGERPRINT_STRIP_FIELDS)
    # Deterministic serialization: sorted keys, ASCII-only escapes.
    canonical = json.dumps(stripped, sort_keys=True, ensure_ascii=True).encode()
    return compute_sha256(canonical)


@dataclass
class DedupResult:
    """Results of deduplication check for a single file."""
    # Repo-relative path of the checked file.
    file_path: str
    # SHA256 hex digest of the raw file bytes (exact-duplicate key).
    sha256: str
    # Near-duplicate fingerprint: content hash with volatile fields stripped
    # (equals sha256 for non-.json files).
    fingerprint: str
    # Manifest path of the file this one exactly duplicates, or None.
    exact_duplicate_of: str | None = None
    # Manifest path of the file this one nearly duplicates, or None.
    # Only set when exact_duplicate_of is None.
    near_duplicate_of: str | None = None


@dataclass
class DedupReport:
    """Aggregated dedup report across all checked files."""
    results: list[DedupResult] = field(default_factory=list)

    @property
    def has_exact_duplicates(self) -> bool:
        """Whether any checked file exactly duplicates a manifest entry."""
        for result in self.results:
            if result.exact_duplicate_of is not None:
                return True
        return False

    @property
    def has_near_duplicates(self) -> bool:
        """Whether any checked file nearly duplicates a manifest entry."""
        for result in self.results:
            if result.near_duplicate_of is not None:
                return True
        return False


def load_manifest(api: HfApi) -> dict[str, Any]:
    """Download and parse manifest.json from the dataset repo's main branch.

    Args:
        api: HfApi client. Currently unused (hf_hub_download manages its own
            HTTP session) but kept so the call signature stays stable.

    Returns:
        The parsed manifest dict. On any failure -- missing file or repo,
        network errors, malformed JSON -- returns the empty manifest
        ``{"files": {}}`` so dedup degrades gracefully instead of blocking
        the pipeline.
    """
    try:
        manifest_file = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=MANIFEST_PATH,
            repo_type="dataset",
            revision="main",
        )
        # Explicit encoding: manifest.json is UTF-8 regardless of the
        # platform's default locale encoding.
        with open(manifest_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except (EntryNotFoundError, RepositoryNotFoundError):
        logger.warning("manifest.json not found in %s, using empty manifest", DATASET_REPO_ID)
        return {"files": {}}
    except Exception:
        # Deliberate broad catch: dedup is best-effort, so log with traceback
        # and continue with an empty manifest rather than failing validation.
        logger.exception("Failed to load manifest.json")
        return {"files": {}}


def check_duplicates(
    file_paths: list[str],
    file_contents: dict[str, bytes],
    manifest: dict[str, Any],
) -> DedupReport:
    """Check files against the manifest for exact and near duplicates.

    Args:
        file_paths: Repo-relative paths of files to check (e.g. "data/gsm8k/.../abc.json")
        file_contents: Map of repo-relative path -> raw file bytes
        manifest: Parsed manifest.json with "files" key

    Returns:
        A DedupReport containing one DedupResult per file whose bytes were
        present in *file_contents*; paths with no content are skipped.
    """
    report = DedupReport()
    manifest_files = manifest.get("files", {})

    # Build reverse lookups from the manifest. The manifest comes from a
    # remote repo, so use .get and skip malformed entries (missing hash
    # fields) instead of letting a KeyError abort the whole check.
    sha256_to_path: dict[str, str] = {}
    fingerprint_to_path: dict[str, str] = {}
    for path, entry in manifest_files.items():
        entry_sha = entry.get("sha256")
        if entry_sha:
            sha256_to_path[entry_sha] = path
        entry_fp = entry.get("fingerprint")
        if entry_fp:
            fingerprint_to_path[entry_fp] = path

    for file_path in file_paths:
        content = file_contents.get(file_path)
        if content is None:
            # No bytes supplied for this path; nothing to hash.
            continue

        sha256 = compute_sha256(content)

        # Only .json files get the JSON-aware fingerprint; for other formats
        # (e.g. .jsonl) near-dup detection collapses to exact-dup detection.
        if file_path.endswith(".json"):
            fingerprint = compute_fingerprint(content)
        else:
            fingerprint = sha256  # For JSONL, fingerprint == sha256

        result = DedupResult(
            file_path=file_path,
            sha256=sha256,
            fingerprint=fingerprint,
        )

        # Exact duplicate: identical bytes under a different manifest path.
        if sha256 in sha256_to_path and sha256_to_path[sha256] != file_path:
            result.exact_duplicate_of = sha256_to_path[sha256]

        # Near duplicate: only reported when not already an exact duplicate.
        if (
            result.exact_duplicate_of is None
            and fingerprint in fingerprint_to_path
            and fingerprint_to_path[fingerprint] != file_path
        ):
            result.near_duplicate_of = fingerprint_to_path[fingerprint]

        report.results.append(result)

    return report