Mohamed Atef committed on
Commit ·
c830869
1
Parent(s): 38a1924
Phase 13: Fix punctuation terminal injection filter (context-aware Rule 0)
Browse files
- src/app.py +2 -2
- src/nlp/punctuation/punctuation_rules.py +36 -9
src/app.py
CHANGED
|
@@ -2186,7 +2186,7 @@ def analyze_text():
|
|
| 2186 |
f"(locked by {owner}[{ls}:{le}])"
|
| 2187 |
)
|
| 2188 |
# Punctuation safety layer: reject non-punctuation changes
|
| 2189 |
-
if not validate_punctuation_diff(d):
|
| 2190 |
logger.info(
|
| 2191 |
f"[PUNC-SAFETY] Rejected diff [{d['start']}:{d['end']}] "
|
| 2192 |
f"'{d.get('original','')}' → '{d.get('correction','')}' — not a safe punctuation change"
|
|
@@ -2215,7 +2215,7 @@ def analyze_text():
|
|
| 2215 |
|
| 2216 |
# FIX-05: Rebuild punctuation text from accepted diffs only
|
| 2217 |
_safe_punc = ctx.current_text
|
| 2218 |
-
_punc_accepted = [d for d in diffs if validate_punctuation_diff(d)]
|
| 2219 |
for _pd in sorted(_punc_accepted, key=lambda x: x['start'], reverse=True):
|
| 2220 |
_safe_punc = (_safe_punc[:_pd['start']] +
|
| 2221 |
_pd['correction'] +
|
|
|
|
| 2186 |
f"(locked by {owner}[{ls}:{le}])"
|
| 2187 |
)
|
| 2188 |
# Punctuation safety layer: reject non-punctuation changes
|
| 2189 |
+
if not validate_punctuation_diff(d, full_text=ctx.current_text):
|
| 2190 |
logger.info(
|
| 2191 |
f"[PUNC-SAFETY] Rejected diff [{d['start']}:{d['end']}] "
|
| 2192 |
f"'{d.get('original','')}' → '{d.get('correction','')}' — not a safe punctuation change"
|
|
|
|
| 2215 |
|
| 2216 |
# FIX-05: Rebuild punctuation text from accepted diffs only
|
| 2217 |
_safe_punc = ctx.current_text
|
| 2218 |
+
_punc_accepted = [d for d in diffs if validate_punctuation_diff(d, full_text=ctx.current_text)]
|
| 2219 |
for _pd in sorted(_punc_accepted, key=lambda x: x['start'], reverse=True):
|
| 2220 |
_safe_punc = (_safe_punc[:_pd['start']] +
|
| 2221 |
_pd['correction'] +
|
src/nlp/punctuation/punctuation_rules.py
CHANGED
|
@@ -89,13 +89,14 @@ def _normalize_for_comparison(text: str) -> str:
|
|
| 89 |
return text
|
| 90 |
|
| 91 |
|
| 92 |
-
def validate_punctuation_diff(diff: dict) -> bool:
|
| 93 |
"""
|
| 94 |
Return True ONLY if the diff is a safe punctuation-only change.
|
| 95 |
|
| 96 |
ALLOWED:
|
| 97 |
- Inserting 1 punctuation mark (short text) or 1–3 (long text)
|
| 98 |
- Replacing one punctuation mark with another
|
|
|
|
| 99 |
|
| 100 |
REJECTED:
|
| 101 |
- Adding/deleting/duplicating Arabic words
|
|
@@ -104,15 +105,21 @@ def validate_punctuation_diff(diff: dict) -> bool:
|
|
| 104 |
- Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
|
| 105 |
- Short text (≤2 words): delta > 1
|
| 106 |
- Any diff: delta > MAX_PUNCT_DELTA
|
| 107 |
-
- Adding terminal punctuation to
|
|
|
|
| 108 |
"""
|
| 109 |
original = diff.get('original', '')
|
| 110 |
correction = diff.get('correction', '')
|
| 111 |
|
| 112 |
-
# ── Rule 0 (FIX-01): Reject terminal punctuation injection
|
| 113 |
# PuncAra-v1 unconditionally adds . or ؟ to every sentence.
|
| 114 |
# This rule catches the pattern: "word" → "word." / "word؟" / "word،"
|
| 115 |
# where the ONLY change is appending 1-2 terminal punctuation marks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
TERMINAL_PUNCT = set('.,،؛؟!:;?!')
|
| 117 |
orig_stripped = original.rstrip()
|
| 118 |
corr_stripped = correction.rstrip()
|
|
@@ -131,12 +138,32 @@ def validate_punctuation_diff(diff: dict) -> bool:
|
|
| 131 |
corr_no_punct = re.sub(r'[.,،؛؟!:;?!]+$', '', correction)
|
| 132 |
if _normalize_for_comparison(orig_no_punct.replace(' ', '')) == \
|
| 133 |
_normalize_for_comparison(corr_no_punct.replace(' ', '')):
|
| 134 |
-
# This is a pure terminal-punctuation addition
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
)
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
# ── Rule 0b (Batch 4): Reject punct insertion when original has no punctuation ──
|
| 142 |
# If the original text has zero Arabic punctuation and the correction
|
|
|
|
| 89 |
return text
|
| 90 |
|
| 91 |
|
| 92 |
+
def validate_punctuation_diff(diff: dict, full_text: str = '') -> bool:
|
| 93 |
"""
|
| 94 |
Return True ONLY if the diff is a safe punctuation-only change.
|
| 95 |
|
| 96 |
ALLOWED:
|
| 97 |
- Inserting 1 punctuation mark (short text) or 1–3 (long text)
|
| 98 |
- Replacing one punctuation mark with another
|
| 99 |
+
- Adding terminal punctuation to sentences (3+ words) that lack it
|
| 100 |
|
| 101 |
REJECTED:
|
| 102 |
- Adding/deleting/duplicating Arabic words
|
|
|
|
| 105 |
- Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
|
| 106 |
- Short text (≤2 words): delta > 1
|
| 107 |
- Any diff: delta > MAX_PUNCT_DELTA
|
| 108 |
+
- Adding terminal punctuation to short fragments (≤2 words) (FIX-01)
|
| 109 |
+
- Adding terminal punctuation when text already ends with punct
|
| 110 |
"""
|
| 111 |
original = diff.get('original', '')
|
| 112 |
correction = diff.get('correction', '')
|
| 113 |
|
| 114 |
+
# ── Rule 0 (FIX-01): Reject terminal punctuation injection ──
|
| 115 |
# PuncAra-v1 unconditionally adds . or ؟ to every sentence.
|
| 116 |
# This rule catches the pattern: "word" → "word." / "word؟" / "word،"
|
| 117 |
# where the ONLY change is appending 1-2 terminal punctuation marks.
|
| 118 |
+
#
|
| 119 |
+
# Phase 13: Allow terminal punct for multi-word sentences (3+ words)
|
| 120 |
+
# that don't already end with punctuation. Only block for:
|
| 121 |
+
# - Short fragments (≤2 words in full text)
|
| 122 |
+
# - Text that already has terminal punctuation
|
| 123 |
TERMINAL_PUNCT = set('.,،؛؟!:;?!')
|
| 124 |
orig_stripped = original.rstrip()
|
| 125 |
corr_stripped = correction.rstrip()
|
|
|
|
| 138 |
corr_no_punct = re.sub(r'[.,،؛؟!:;?!]+$', '', correction)
|
| 139 |
if _normalize_for_comparison(orig_no_punct.replace(' ', '')) == \
|
| 140 |
_normalize_for_comparison(corr_no_punct.replace(' ', '')):
|
| 141 |
+
# This is a pure terminal-punctuation addition.
|
| 142 |
+
# Decide whether to allow based on full text context.
|
| 143 |
+
_full_word_count = len(re.findall(
|
| 144 |
+
r'[\u0600-\u06FFa-zA-Z]+', full_text
|
| 145 |
+
)) if full_text else 0
|
| 146 |
+
_full_already_has_terminal = bool(
|
| 147 |
+
re.search(r'[.،؛؟!?!][\s]*$', full_text)
|
| 148 |
+
) if full_text else False
|
| 149 |
+
# Also check for ellipsis (... at end)
|
| 150 |
+
_full_has_ellipsis = full_text.rstrip().endswith('...') if full_text else False
|
| 151 |
+
|
| 152 |
+
if _full_word_count >= 3 and not _full_already_has_terminal and not _full_has_ellipsis:
|
| 153 |
+
# Multi-word sentence without terminal punct → ALLOW
|
| 154 |
+
logger.info(
|
| 155 |
+
f"[PUNC-SAFETY] Allowed terminal punct for sentence "
|
| 156 |
+
f"({_full_word_count} words): "
|
| 157 |
+
f"'{original}' → '{correction}'"
|
| 158 |
+
)
|
| 159 |
+
# Fall through to remaining rules (don't return yet)
|
| 160 |
+
else:
|
| 161 |
+
# Short fragment OR already has terminal punct → REJECT
|
| 162 |
+
logger.info(
|
| 163 |
+
f"[PUNC-SAFETY] Rejected terminal punct injection: "
|
| 164 |
+
f"'{original}' → '{correction}'"
|
| 165 |
+
)
|
| 166 |
+
return False
|
| 167 |
|
| 168 |
# ── Rule 0b (Batch 4): Reject punct insertion when original has no punctuation ──
|
| 169 |
# If the original text has zero Arabic punctuation and the correction
|