"""Decontamination — detect eval-set leakage via 13-gram overlap.

A stable hash (blake2b, 8-byte) is used so that build (faz1_03) and filter (faz1_01)
produce the same result in different processes (Python's hash() is randomized and
cannot be used).
"""
| from __future__ import annotations |
| import re, gzip, pickle, hashlib |
| from typing import Iterable |
|
|
# Tokenizer pattern: one token per maximal run of word characters.
_WORD = re.compile(r"\w+", flags=re.UNICODE)

# Default n-gram length, in words.
N = 13
|
|
|
|
def words(text: str) -> list[str]:
    """Split *text* into lowercase word tokens.

    The text is lowercased with ``str.lower`` and every match of the
    module-level ``_WORD`` pattern becomes one token, in order of appearance.

    Args:
        text: Raw input text.

    Returns:
        The list of matched tokens; empty when nothing matches.
    """
    lowered = text.lower()
    return [match.group() for match in _WORD.finditer(lowered)]
|
|
|
|
def _h(gram: str) -> int:
    """Return a process-independent integer hash of *gram*.

    The string is UTF-8 encoded, hashed with blake2b using an 8-byte digest,
    and the digest is read as an unsigned big-endian integer.

    Args:
        gram: The string to hash.

    Returns:
        The 8-byte digest interpreted as a non-negative integer.
    """
    encoded = gram.encode("utf-8")
    digest = hashlib.blake2b(encoded, digest_size=8).digest()
    return int.from_bytes(digest, "big")
|
|
|
|
def ngram_hashes(text: str, n: int = N) -> Iterable[int]:
    """Yield the stable hash of every word n-gram in *text*.

    The text is tokenized with ``words``; each window of *n* consecutive
    tokens is joined with single spaces and hashed with ``_h``. Texts with
    fewer than *n* tokens yield nothing.

    Args:
        text: Raw input text.
        n: Window size in words; must be at least 1.

    Yields:
        One integer hash per window, in order of appearance (duplicates are
        not removed).

    Raises:
        ValueError: If *n* is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A window of zero (or negative) words would hash empty or arbitrary
    # slices instead of real n-grams, so refuse it outright.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    tokens = words(text)
    for start in range(len(tokens) - n + 1):
        yield _h(" ".join(tokens[start:start + n]))
|
|
|
|
class Decontaminator:
    """Detect eval-set leakage by exact word n-gram overlap.

    Reference texts are registered with ``add_text``; candidate texts are
    then checked with ``is_contaminated``. Only the n-gram hashes produced
    by ``ngram_hashes`` are stored, not the texts themselves.
    """

    def __init__(self, n: int = N):
        """Create an empty index.

        Args:
            n: Number of words per n-gram; must be at least 1.

        Raises:
            ValueError: If *n* is smaller than 1.
        """
        # A window of fewer than one word is meaningless; fail fast instead
        # of silently building a useless index.
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        # Window size used for both indexing and lookup.
        self.n = n
        # Hashes of every n-gram seen so far.
        self.grams: set[int] = set()

    def add_text(self, text: str) -> int:
        """Index all n-grams of *text*.

        Args:
            text: Reference text whose n-grams should be remembered.

        Returns:
            The number of hashes that were not already in the index.
        """
        before = len(self.grams)
        self.grams.update(ngram_hashes(text, self.n))
        return len(self.grams) - before

    def is_contaminated(self, text: str) -> bool:
        """Return True if any n-gram of *text* is present in the index.

        An empty index short-circuits to False without tokenizing *text*.
        """
        if not self.grams:
            return False
        known = self.grams
        # any() stops at the first matching n-gram.
        return any(h in known for h in ngram_hashes(text, self.n))

    def save(self, path: str) -> None:
        """Write the index to *path* as a gzip-compressed pickle.

        The payload is a dict with keys ``"n"`` and ``"grams"``.
        """
        payload = {"n": self.n, "grams": self.grams}
        with gzip.open(path, "wb") as f:
            pickle.dump(payload, f, protocol=4)

    @classmethod
    def load(cls, path: str) -> "Decontaminator":
        """Rebuild an index from a file written by ``save``.

        Args:
            path: Path to the gzip-compressed pickle.

        Returns:
            A new instance carrying the stored ``n`` and hash set.

        Raises:
            KeyError: If the unpickled mapping lacks ``"n"`` or ``"grams"``.
            ValueError: If the stored ``n`` is smaller than 1.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load index files that come from a trusted source.
        with gzip.open(path, "rb") as f:
            payload = pickle.load(f)
        obj = cls(payload["n"])
        # The stored object is reused as-is (no copy).
        obj.grams = payload["grams"]
        return obj
|
|