"""Smoke checks for ``plagdetect.normalize.deobfuscate``.

The script prints a few diagnostics, asserts on the folded output of two
hand-written samples, and prints a final banner when every assertion held.

The string literals below carry non-ASCII and invisible characters on
purpose; copy them verbatim and do not retype or "clean up" their whitespace.
"""
import os
import sys

# Put the parent of this file's directory first on sys.path so the
# ``plagdetect`` import below resolves (presumably the repository root --
# confirm against the project layout).
_this_file = os.path.abspath(__file__)
_parent_of_script_dir = os.path.dirname(os.path.dirname(_this_file))
sys.path.insert(0, _parent_of_script_dir)

from plagdetect.normalize import SPACE_VARIANTS, deobfuscate

# Diagnostic: list the characters of the SPACE_VARIANTS pattern as U+XXXX
# labels, ignoring the surrounding brackets and any "-" characters.
_pattern_chars = SPACE_VARIANTS.pattern.strip("[]")
_codepoint_labels = [
    f"U+{cp:04X}" for cp in map(ord, _pattern_chars) if cp != ord("-")
]
print("space pattern codepoints:", _codepoint_labels)

# Sample 1: typographic variants (spaces, curly quotes, a hyphen variant,
# and an em dash).
typographic_sample = (
    "word word word word word "
    "“quote” ‘s’ non‑breaking em—dash"
)
folded, _ = deobfuscate(typographic_sample)
print(repr(folded))

# NOTE(review): the two one-character literals below are meant to be
# non-ASCII space variants that must be absent after folding; they are
# indistinguishable from a plain space on screen -- confirm the exact
# characters survived any reformatting of this file.
assert not any(variant in folded for variant in (" ", " ")), "space variants not folded"
assert all(plain in folded for plain in ('"quote"', "'s'")), "quotes not folded"
assert "non-breaking" in folded, "hyphen variant not folded"
assert "—" in folded, "em dash should be PRESERVED (style signal)"

# adversarial: homoglyphs + zero width
spoofed_sample = (
    "The rаpid grоwth of mаchine leаrning​ "
    "hаs​ transformed​ modern​ reseаrch."
)
unspoofed, report = deobfuscate(spoofed_sample)

# Two consecutive asserts keep the original short-circuit order: the count
# is only looked up once the spoof flag has been found truthy.
assert report["spoof_suspected"]
assert report["homoglyph_count"] >= 5
assert unspoofed.startswith("The rapid growth of machine learning")

print("zero-width kinds:", report["zero_width_kinds"])
first_example = report["homoglyph_examples"][0]
print(
    "example:",
    first_example["codepoint"],
    first_example["name"],
    "->",
    first_example["folded_to"],
)

print("ALL NORMALIZE TESTS PASS")