Mohamed Atef committed on
Commit
c830869
·
1 Parent(s): 38a1924

Phase 13: Fix punctuation terminal injection filter (context-aware Rule 0)

Browse files
src/app.py CHANGED
@@ -2186,7 +2186,7 @@ def analyze_text():
2186
  f"(locked by {owner}[{ls}:{le}])"
2187
  )
2188
  # Punctuation safety layer: reject non-punctuation changes
2189
- if not validate_punctuation_diff(d):
2190
  logger.info(
2191
  f"[PUNC-SAFETY] Rejected diff [{d['start']}:{d['end']}] "
2192
  f"'{d.get('original','')}' → '{d.get('correction','')}' — not a safe punctuation change"
@@ -2215,7 +2215,7 @@ def analyze_text():
2215
 
2216
  # FIX-05: Rebuild punctuation text from accepted diffs only
2217
  _safe_punc = ctx.current_text
2218
- _punc_accepted = [d for d in diffs if validate_punctuation_diff(d)]
2219
  for _pd in sorted(_punc_accepted, key=lambda x: x['start'], reverse=True):
2220
  _safe_punc = (_safe_punc[:_pd['start']] +
2221
  _pd['correction'] +
 
2186
  f"(locked by {owner}[{ls}:{le}])"
2187
  )
2188
  # Punctuation safety layer: reject non-punctuation changes
2189
+ if not validate_punctuation_diff(d, full_text=ctx.current_text):
2190
  logger.info(
2191
  f"[PUNC-SAFETY] Rejected diff [{d['start']}:{d['end']}] "
2192
  f"'{d.get('original','')}' → '{d.get('correction','')}' — not a safe punctuation change"
 
2215
 
2216
  # FIX-05: Rebuild punctuation text from accepted diffs only
2217
  _safe_punc = ctx.current_text
2218
+ _punc_accepted = [d for d in diffs if validate_punctuation_diff(d, full_text=ctx.current_text)]
2219
  for _pd in sorted(_punc_accepted, key=lambda x: x['start'], reverse=True):
2220
  _safe_punc = (_safe_punc[:_pd['start']] +
2221
  _pd['correction'] +
src/nlp/punctuation/punctuation_rules.py CHANGED
@@ -89,13 +89,14 @@ def _normalize_for_comparison(text: str) -> str:
89
  return text
90
 
91
 
92
- def validate_punctuation_diff(diff: dict) -> bool:
93
  """
94
  Return True ONLY if the diff is a safe punctuation-only change.
95
 
96
  ALLOWED:
97
  - Inserting 1 punctuation mark (short text) or 1–3 (long text)
98
  - Replacing one punctuation mark with another
 
99
 
100
  REJECTED:
101
  - Adding/deleting/duplicating Arabic words
@@ -104,15 +105,21 @@ def validate_punctuation_diff(diff: dict) -> bool:
104
  - Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
105
  - Short text (≤2 words): delta > 1
106
  - Any diff: delta > MAX_PUNCT_DELTA
107
- - Adding terminal punctuation to a single word (FIX-01)
 
108
  """
109
  original = diff.get('original', '')
110
  correction = diff.get('correction', '')
111
 
112
- # ── Rule 0 (FIX-01): Reject terminal punctuation injection on single words ──
113
  # PuncAra-v1 unconditionally adds . or ؟ to every sentence.
114
  # This rule catches the pattern: "word" → "word." / "word؟" / "word،"
115
  # where the ONLY change is appending 1-2 terminal punctuation marks.
 
 
 
 
 
116
  TERMINAL_PUNCT = set('.,،؛؟!:;?!')
117
  orig_stripped = original.rstrip()
118
  corr_stripped = correction.rstrip()
@@ -131,12 +138,32 @@ def validate_punctuation_diff(diff: dict) -> bool:
131
  corr_no_punct = re.sub(r'[.,،؛؟!:;?!]+$', '', correction)
132
  if _normalize_for_comparison(orig_no_punct.replace(' ', '')) == \
133
  _normalize_for_comparison(corr_no_punct.replace(' ', '')):
134
- # This is a pure terminal-punctuation addition — reject
135
- logger.info(
136
- f"[PUNC-SAFETY] Rejected terminal punct injection: "
137
- f"'{original}' → '{correction}'"
138
- )
139
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  # ── Rule 0b (Batch 4): Reject punct insertion when original has no punctuation ──
142
  # If the original text has zero Arabic punctuation and the correction
 
89
  return text
90
 
91
 
92
+ def validate_punctuation_diff(diff: dict, full_text: str = '') -> bool:
93
  """
94
  Return True ONLY if the diff is a safe punctuation-only change.
95
 
96
  ALLOWED:
97
  - Inserting 1 punctuation mark (short text) or 1–3 (long text)
98
  - Replacing one punctuation mark with another
99
+ - Adding terminal punctuation to sentences (3+ words) that lack it
100
 
101
  REJECTED:
102
  - Adding/deleting/duplicating Arabic words
 
105
  - Punctuation spam: delta/word_count > 0.5 (multi-word diffs)
106
  - Short text (≤2 words): delta > 1
107
  - Any diff: delta > MAX_PUNCT_DELTA
108
+ - Adding terminal punctuation to short fragments (≤2 words) (FIX-01)
109
+ - Adding terminal punctuation when text already ends with punct
110
  """
111
  original = diff.get('original', '')
112
  correction = diff.get('correction', '')
113
 
114
+ # ── Rule 0 (FIX-01): Reject terminal punctuation injection ──
115
  # PuncAra-v1 unconditionally adds . or ؟ to every sentence.
116
  # This rule catches the pattern: "word" → "word." / "word؟" / "word،"
117
  # where the ONLY change is appending 1-2 terminal punctuation marks.
118
+ #
119
+ # Phase 13: Allow terminal punct for multi-word sentences (3+ words)
120
+ # that don't already end with punctuation. Only block for:
121
+ # - Short fragments (≤2 words in full text)
122
+ # - Text that already has terminal punctuation
123
  TERMINAL_PUNCT = set('.,،؛؟!:;?!')
124
  orig_stripped = original.rstrip()
125
  corr_stripped = correction.rstrip()
 
138
  corr_no_punct = re.sub(r'[.,،؛؟!:;?!]+$', '', correction)
139
  if _normalize_for_comparison(orig_no_punct.replace(' ', '')) == \
140
  _normalize_for_comparison(corr_no_punct.replace(' ', '')):
141
+ # This is a pure terminal-punctuation addition.
142
+ # Decide whether to allow based on full text context.
143
+ _full_word_count = len(re.findall(
144
+ r'[\u0600-\u06FFa-zA-Z]+', full_text
145
+ )) if full_text else 0
146
+ _full_already_has_terminal = bool(
147
+ re.search(r'[.،؛؟!?!][\s]*$', full_text)
148
+ ) if full_text else False
149
+ # Also check for ellipsis (... at end)
150
+ _full_has_ellipsis = full_text.rstrip().endswith('...') if full_text else False
151
+
152
+ if _full_word_count >= 3 and not _full_already_has_terminal and not _full_has_ellipsis:
153
+ # Multi-word sentence without terminal punct → ALLOW
154
+ logger.info(
155
+ f"[PUNC-SAFETY] Allowed terminal punct for sentence "
156
+ f"({_full_word_count} words): "
157
+ f"'{original}' → '{correction}'"
158
+ )
159
+ # Fall through to remaining rules (don't return yet)
160
+ else:
161
+ # Short fragment OR already has terminal punct → REJECT
162
+ logger.info(
163
+ f"[PUNC-SAFETY] Rejected terminal punct injection: "
164
+ f"'{original}' → '{correction}'"
165
+ )
166
+ return False
167
 
168
  # ── Rule 0b (Batch 4): Reject punct insertion when original has no punctuation ──
169
  # If the original text has zero Arabic punctuation and the correction