Plaiglab / scripts /test_normalize.py
SanidhyaDhangar's picture
PlaigLab — Hugging Face Space (Docker) clean deploy
ebebfe8
Raw
History Blame Contribute Delete
1.19 kB
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from plagdetect.normalize import SPACE_VARIANTS, deobfuscate
# Show the code points spelled out in the SPACE_VARIANTS pattern text.
# Leading/trailing square brackets are stripped and "-" is skipped
# (presumably range separators of a character class; confirm against
# plagdetect.normalize).
pattern_body = SPACE_VARIANTS.pattern.strip("[]")
listed_codepoints = ["U+%04X" % ord(ch) for ch in pattern_body if ch != "-"]
print("space pattern codepoints:", listed_codepoints)
t = ("word word word word word "
"“quote” ‘s’ non‑breaking em—dash")
# Normalize the sample text; deobfuscate returns a pair and only the cleaned
# string (first element) is inspected by the assertions that follow.
clean, _ = deobfuscate(t)
# repr() keeps any surviving non-ASCII characters distinguishable in the log.
print(f"{clean!r}")
assert " " not in clean and " " not in clean, "space variants not folded"
# Curly double and single quotes in the sample must come back as ASCII quotes.
for folded_quote in ('"quote"', "'s'"):
    assert folded_quote in clean, "quotes not folded"

# The hyphen variant in the sample must come back as an ASCII hyphen-minus.
hyphen_folded = "non-breaking" in clean
assert hyphen_folded, "hyphen variant not folded"

# The em dash is expected to survive normalization untouched.
em_dash_kept = "—" in clean
assert em_dash_kept, "em dash should be PRESERVED (style signal)"
# Adversarial input: look-alike (homoglyph) letters inside words plus zero-width characters.
bad = ("The rаpid grоwth of mаchine leаrning​ "
"hаs​ transformed​ modern​ reseаrch.")
# Run the spoofed sentence through the normalizer. `rep` is the report dict
# returned alongside the cleaned text.
clean, rep = deobfuscate(bad)

# Each signal is asserted separately, with a message, so a failure names the
# check that broke instead of raising a bare AssertionError.
assert rep["spoof_suspected"], "spoofing not flagged on homoglyph/zero-width input"
assert rep["homoglyph_count"] >= 5, (
    f"expected at least 5 homoglyphs, got {rep['homoglyph_count']}"
)
assert clean.startswith("The rapid growth of machine learning"), (
    f"homoglyphs not folded back to plain letters: {clean!r}"
)

print("zero-width kinds:", rep["zero_width_kinds"])

# Guard the [0] lookup so an empty example list fails as a test assertion
# rather than as an IndexError.
examples = rep["homoglyph_examples"]
assert examples, "homoglyph_examples is empty despite a non-zero homoglyph count"
ex = examples[0]
print("example:", ex["codepoint"], ex["name"], "->", ex["folded_to"])

# Only reached when every assertion above held.
print("ALL NORMALIZE TESTS PASS")