smartcore-v1 / code /kod /decontam.py
kdirgul's picture
kod (data hariç) Colab için
9aed7c4 verified
Raw
History Blame Contribute Delete
1.71 kB
"""Decontamination — detect eval-set leakage via 13-gram overlap.
A stable hash (blake2b, 8-byte digest) is used so that the build step (faz1_03)
and the filter step (faz1_01), which run in different processes, produce the
same result (Python's built-in hash() is randomized and cannot be used).
"""
from __future__ import annotations
import re, gzip, pickle, hashlib
from typing import Iterable
# Matches runs of word characters; used to split text into word tokens.
_WORD = re.compile(r"\w+", re.UNICODE)
# Default n-gram length: 13-gram (GPT-3 / literature standard; long enough to
# keep false positives low).
N = 13
def words(text: str) -> list[str]:
    """Split *text* into lowercase word tokens.

    The text is lowercased with ``str.lower`` and every match of the
    module-level ``_WORD`` pattern is returned, in order of appearance.
    """
    lowered = text.lower()
    tokens = _WORD.findall(lowered)
    return tokens
def _h(gram: str) -> int:
    """Return a stable 64-bit integer hash of *gram*.

    The UTF-8 bytes of ``gram`` are hashed with BLAKE2b using an 8-byte
    digest, and that digest is interpreted as a big-endian unsigned integer.
    Unlike the built-in ``hash()``, the result does not depend on per-process
    hash randomization.
    """
    payload = gram.encode("utf-8")
    digest = hashlib.blake2b(payload, digest_size=8).digest()
    return int.from_bytes(digest, byteorder="big")
def ngram_hashes(text: str, n: int = N) -> Iterable[int]:
    """Yield the hash of every window of *n* consecutive words in *text*.

    The text is tokenized with ``words``; each window of ``n`` tokens is
    joined with single spaces and hashed with ``_h``. Windows are produced
    left to right, advancing one token at a time. A text with fewer than
    ``n`` tokens yields nothing.
    """
    tokens = words(text)
    window_count = len(tokens) - n + 1
    start = 0
    while start < window_count:
        window = tokens[start:start + n]
        yield _h(" ".join(window))
        start += 1
class Decontaminator:
    """Collection of n-gram hashes used to flag overlapping text.

    Reference text is registered with ``add_text``; ``is_contaminated`` then
    reports whether another text shares at least one n-gram hash with it.
    The state can be written to and restored from a gzip-compressed pickle.
    """

    def __init__(self, n: int = N):
        """Create an empty instance that works on n-grams of length *n*."""
        self.n = n
        # Hashes of every n-gram registered so far.
        self.grams: set[int] = set()

    def add_text(self, text: str) -> int:
        """Register the n-gram hashes of *text*.

        Returns the number of hashes that were not already present.
        """
        size_before = len(self.grams)
        for gram_hash in ngram_hashes(text, self.n):
            self.grams.add(gram_hash)
        return len(self.grams) - size_before

    def is_contaminated(self, text: str) -> bool:
        """Return True if any n-gram hash of *text* is already registered.

        An instance with no registered hashes always returns False.
        """
        known = self.grams
        if not known:
            return False
        # any() stops at the first matching n-gram.
        return any(
            gram_hash in known for gram_hash in ngram_hashes(text, self.n)
        )

    def save(self, path: str):
        """Write ``n`` and the hash set to *path* as a gzip-compressed pickle."""
        with gzip.open(path, "wb") as out:
            state = {"n": self.n, "grams": self.grams}
            pickle.dump(state, out, protocol=4)

    @classmethod
    def load(cls, path: str) -> "Decontaminator":
        """Rebuild an instance from a file written by ``save``.

        NOTE: this unpickles the file contents, and unpickling can execute
        arbitrary code. Only load files that come from a trusted source.
        """
        with gzip.open(path, "rb") as src:
            state = pickle.load(src)
        restored = cls(state["n"])
        restored.grams = state["grams"]
        return restored