"""Decontamination — detect eval-set leakage via 13-gram overlap.

A stable hash (8-byte blake2b) is used so that the build step (faz1_03) and
the filter step (faz1_01) produce the same result in different processes
(Python's built-in hash() is randomized per process and cannot be used).
"""
from __future__ import annotations

import gzip
import hashlib
import pickle
import re
from typing import Iterable

# NOTE: tokenization (this pattern plus lowercasing in ``words``) defines the
# input of every stored hash; changing either invalidates saved indexes.
_WORD = re.compile(r"\w+", re.UNICODE)
N = 13  # 13-gram (GPT-3 / literature standard; long enough to keep false positives low)


def words(text: str) -> list[str]:
    """Return the lowercased ``\\w+`` tokens of *text*, in order."""
    return _WORD.findall(text.lower())


def _h(gram: str) -> int:
    """Return a process-independent 64-bit hash of *gram*.

    The value is the 8-byte blake2b digest of the UTF-8 encoded string,
    interpreted as a big-endian unsigned integer.
    """
    digest = hashlib.blake2b(gram.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, "big")


def ngram_hashes(text: str, n: int = N) -> Iterable[int]:
    """Return the hashes of every consecutive *n*-word window of *text*.

    Words are produced by ``words`` and joined with single spaces before
    hashing. A text with fewer than *n* words produces no hashes.

    Raises:
        ValueError: if *n* is smaller than 1. (n == 0 would hash the empty
            string repeatedly, making unrelated texts appear to overlap.)
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    tokens = words(text)
    window_count = len(tokens) - n + 1  # <= 0 when the text is too short
    return (_h(" ".join(tokens[i:i + n])) for i in range(window_count))


class Decontaminator:
    """Set of n-gram hashes used to test other texts for overlap.

    Attributes:
        n: window size, in words, used for both indexing and lookup.
        grams: hashes of every n-gram added so far.
    """

    def __init__(self, n: int = N):
        """Create an empty index for *n*-word windows.

        Raises:
            ValueError: if *n* is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n}")
        self.n = n
        self.grams: set[int] = set()

    def add_text(self, text: str) -> int:
        """Index *text* and return how many previously unseen n-grams it added."""
        before = len(self.grams)
        self.grams.update(ngram_hashes(text, self.n))
        return len(self.grams) - before

    def is_contaminated(self, text: str) -> bool:
        """Return True if any n-gram of *text* is present in the index.

        An empty index never reports contamination; the early return also
        avoids tokenizing and hashing *text* for nothing.
        """
        if not self.grams:
            return False
        # any() stops at the first hit, so the remaining n-grams are not hashed.
        return any(h in self.grams for h in ngram_hashes(text, self.n))

    def save(self, path: str) -> None:
        """Write ``{"n": ..., "grams": ...}`` to *path* as a gzip-compressed pickle."""
        with gzip.open(path, "wb") as f:
            pickle.dump({"n": self.n, "grams": self.grams}, f, protocol=4)

    @classmethod
    def load(cls, path: str) -> "Decontaminator":
        """Rebuild an index from a file written by ``save``.

        SECURITY: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.

        Raises:
            ValueError: if the stored ``n`` is smaller than 1.
        """
        with gzip.open(path, "rb") as f:
            payload = pickle.load(f)  # trusted input only, see docstring
        obj = cls(payload["n"])
        obj.grams = payload["grams"]
        return obj