Commit ·
609c035
1
Parent(s): 175fc9d
FIX-27: Grammar structured data + hallucination protection
Browse files
FIX-27a: Block grammar diffs containing digits — prevents the grammar
model from corrupting dates (2026-06-22→عشرين), numbers (25.5→ثلاث عشر),
percentages (95.7%→95 ، 7 %), and formatted numbers (1,000,000→1 , 000).
FIX-27b: Block grammar diffs with Jaccard < 0.5 at character level —
catches hallucinations like القانون→القانين, يعزف→يعزفون, للإنسان→للإنسين.
Expected to fix: SC009, SC013-SC016, G043, H014, H017 (~7-8 false positives)
- src/app.py +29 -0
- tests/phase10/reports/phase10_results.json +0 -0
src/app.py
CHANGED
|
@@ -1693,6 +1693,35 @@ def analyze_text():
|
|
| 1693 |
)
|
| 1694 |
continue
|
| 1695 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1696 |
# ── FIX-06: Directional block protection for grammar ──
|
| 1697 |
# Prevents meaning-changing substitutions (كان→كأن etc.)
|
| 1698 |
# especially critical when spelling is skipped (>1000 chars).
|
|
|
|
| 1693 |
)
|
| 1694 |
continue
|
| 1695 |
|
| 1696 |
+
# ── FIX-27a: Grammar structured data protection ──
|
| 1697 |
+
# Block grammar diffs where the original contains digits.
|
| 1698 |
+
# The grammar model corrupts dates/numbers/times/percentages.
|
| 1699 |
+
# e.g., '2026-06-22' → 'عشرين 26-06-22ا'
|
| 1700 |
+
if orig_text and any(c.isdigit() for c in orig_text):
|
| 1701 |
+
logger.info(
|
| 1702 |
+
f"[GRAMMAR] Blocked digit-containing diff: "
|
| 1703 |
+
f"'{orig_text}'\u2192'{corr_text}'"
|
| 1704 |
+
)
|
| 1705 |
+
continue
|
| 1706 |
+
|
| 1707 |
+
# ── FIX-27b: Grammar hallucination guard (Jaccard) ──
|
| 1708 |
+
# Block grammar diffs where the correction is too different
|
| 1709 |
+
# from the original (Jaccard over the sets of distinct characters, after stripping whitespace/punctuation, < 0.5).
|
| 1710 |
+
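# Intended to catch: القانون→القانين, يعزف→يعزفون, للإنسان→للإنسين (NOTE(review): if the diff span is the bare word, each pair shares 4 of 6 distinct characters, j≈0.67, which is above the 0.5 cutoff — confirm these are actually blocked)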
|
| 1711 |
+
if orig_text and corr_text and len(orig_text) > 2:
|
| 1712 |
+
import re as _re_jac
|
| 1713 |
+
# Strip punctuation/spaces for comparison
|
| 1714 |
+
_o_chars = set(_re_jac.sub(r'[\s.,،؛؟!:;?]', '', orig_text))
|
| 1715 |
+
_c_chars = set(_re_jac.sub(r'[\s.,،؛؟!:;?]', '', corr_text))
|
| 1716 |
+
if _o_chars and _c_chars:
|
| 1717 |
+
_jac = len(_o_chars & _c_chars) / len(_o_chars | _c_chars)
|
| 1718 |
+
if _jac < 0.5:
|
| 1719 |
+
logger.info(
|
| 1720 |
+
f"[GRAMMAR] Blocked low-Jaccard diff (j={_jac:.2f}): "
|
| 1721 |
+
f"'{orig_text}'\u2192'{corr_text}'"
|
| 1722 |
+
)
|
| 1723 |
+
continue
|
| 1724 |
+
|
| 1725 |
# ── FIX-06: Directional block protection for grammar ──
|
| 1726 |
# Prevents meaning-changing substitutions (كان→كأن etc.)
|
| 1727 |
# especially critical when spelling is skipped (>1000 chars).
|
tests/phase10/reports/phase10_results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|