Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from plagdetect.normalize import SPACE_VARIANTS, deobfuscate | |
| print("space pattern codepoints:", | |
| [f"U+{ord(c):04X}" for c in SPACE_VARIANTS.pattern.strip("[]") if c != "-"]) | |
| # Folding check: run a sample with typographic quotes, a hyphen variant and an | |
| # em dash through deobfuscate() and assert on the cleaned text. | |
| # NOTE(review): the gaps between the "word" tokens (and the two literals asserted | |
| # absent below) are presumably distinct Unicode space variants that render like a | |
| # plain space - confirm the exact codepoints before retyping these lines. | |
| t = ("word word word word word " | |
| "“quote” ‘s’ non‑breaking em—dash") | |
| # deobfuscate() returns a pair; the second element is not needed for this check. | |
| clean, _ = deobfuscate(t) | |
| print(repr(clean)) | |
| # Neither space-variant literal may survive in the cleaned text. | |
| assert " " not in clean and " " not in clean, "space variants not folded" | |
| # Curly double/single quotes are expected to come back as ASCII " and '. | |
| assert '"quote"' in clean and "'s'" in clean, "quotes not folded" | |
| # The hyphen variant inside "non‑breaking" is expected to become an ASCII "-". | |
| assert "non-breaking" in clean, "hyphen variant not folded" | |
| # Unlike the hyphen variant, the em dash must pass through unchanged. | |
| assert "—" in clean, "em dash should be PRESERVED (style signal)" | |
| # adversarial: homoglyphs + zero width | |
| # The sentence below uses look-alike non-Latin letters in place of Latin ones | |
| # (e.g. inside "rapid", "growth", "machine"). | |
| # NOTE(review): any zero-width characters in the literal are invisible in most | |
| # editors - confirm they are still present before editing these two lines. | |
| bad = ("The rаpid grоwth of mаchine leаrning " | |
| "hаs transformed modern reseаrch.") | |
| # Rebinds `clean`; `rep` is the second return value, read below as a dict with the | |
| # keys spoof_suspected, homoglyph_count, zero_width_kinds and homoglyph_examples. | |
| clean, rep = deobfuscate(bad) | |
| # The input must be flagged as a spoof and at least five homoglyphs counted. | |
| assert rep["spoof_suspected"] and rep["homoglyph_count"] >= 5 | |
| # The cleaned text must begin with the plain-ASCII spelling of the sentence. | |
| assert clean.startswith("The rapid growth of machine learning") | |
| # Zero-width findings are only printed here, not asserted. | |
| print("zero-width kinds:", rep["zero_width_kinds"]) | |
| # Show the first reported homoglyph; each example is read via the keys | |
| # codepoint, name and folded_to. | |
| ex = rep["homoglyph_examples"][0] | |
| print("example:", ex["codepoint"], ex["name"], "->", ex["folded_to"]) | |
| # Reached only when no assert above failed (asserts are skipped under python -O). | |
| print("ALL NORMALIZE TESTS PASS") | |