youssefreda9 committed on
Commit
f5d7f0e
·
1 Parent(s): 4616185

FIX-36: Merge punctuation into grammar when they overlap same span

Browse files

When grammar fixes المعلمون→المعلمين at [18:26] and punctuation
wants المعلمون→المعلمين. at the same [18:26], the overlap resolver
used to drop the punctuation patch entirely. It now detects that the
punctuation correction equals the grammar correction plus one trailing
punctuation character, and merges the two into a single suggestion:
المعلمون→المعلمين. (trailing period included)

This preserves both the grammar fix AND the terminal period.

Files changed (1) hide show
  1. src/nlp/correction_patch.py +38 -7
src/nlp/correction_patch.py CHANGED
@@ -102,18 +102,23 @@ class PatchSet:
102
  2. Spelling + Punctuation patches from different stages always coexist
103
  (they're compatible: one fixes the word, the other adds punct)
104
  3. Same-stage overlaps are always resolved (higher confidence wins)
 
105
  """
106
  sorted_patches = sorted(
107
  self.patches,
108
  key=lambda p: (-p.priority, -p.confidence, p.start_original, p.id)
109
  )
110
 
111
- claimed_ranges = [] # list of (start, end, stage)
112
  resolved = []
113
 
 
 
 
114
  for patch in sorted_patches:
115
  has_substantial_overlap = False
116
- for cs, ce, claimed_stage in claimed_ranges:
 
117
  # Check if there's any overlap at all
118
  if patch.start_original < ce and patch.end_original > cs:
119
  # ── Phase 14: Cross-stage compatibility ──
@@ -126,6 +131,28 @@ class PatchSet:
126
  if frozenset({patch.stage, claimed_stage}) in _compatible_pair:
127
  continue # Compatible stages — allow coexistence
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  # Calculate overlap amount
130
  overlap_start = max(patch.start_original, cs)
131
  overlap_end = min(patch.end_original, ce)
@@ -137,16 +164,20 @@ class PatchSet:
137
  overlap_ratio = overlap_width / smaller_width
138
  if overlap_ratio > 0.5:
139
  has_substantial_overlap = True
 
140
  break
141
 
142
  if not has_substantial_overlap:
 
143
  resolved.append(patch)
144
- claimed_ranges.append((patch.start_original, patch.end_original, patch.stage))
145
  else:
146
- logger.info(
147
- f"[OVERLAP] Dropped {patch.stage} [{patch.start_original}:{patch.end_original}] "
148
- f"'{patch.original}' — conflicts with higher-priority span"
149
- )
 
 
150
 
151
  dropped = len(self.patches) - len(resolved)
152
  if dropped > 0:
 
102
  2. Spelling + Punctuation patches from different stages always coexist
103
  (they're compatible: one fixes the word, the other adds punct)
104
  3. Same-stage overlaps are always resolved (higher confidence wins)
105
+ 4. FIX-36: Grammar + Punctuation — merge trailing punct into grammar
106
  """
107
  sorted_patches = sorted(
108
  self.patches,
109
  key=lambda p: (-p.priority, -p.confidence, p.start_original, p.id)
110
  )
111
 
112
+ claimed_ranges = [] # list of (start, end, stage, patch_index)
113
  resolved = []
114
 
115
+ # FIX-36: Punctuation chars that can be merged into grammar corrections
116
+ _PUNCT_CHARS = set('.,،؛;:!؟?')
117
+
118
  for patch in sorted_patches:
119
  has_substantial_overlap = False
120
+ overlapping_resolved_idx = None
121
+ for ci, (cs, ce, claimed_stage, res_idx) in enumerate(claimed_ranges):
122
  # Check if there's any overlap at all
123
  if patch.start_original < ce and patch.end_original > cs:
124
  # ── Phase 14: Cross-stage compatibility ──
 
131
  if frozenset({patch.stage, claimed_stage}) in _compatible_pair:
132
  continue # Compatible stages — allow coexistence
133
 
134
+ # ── FIX-36: Grammar + Punctuation merge ──
135
+ # When punctuation adds a trailing character to a grammar
136
+ # correction at the same span, merge instead of dropping.
137
+ if (patch.stage == 'punctuation' and claimed_stage == 'grammar'
138
+ and patch.start_original == cs and patch.end_original == ce):
139
+ # Check if punctuation correction = grammar correction + punct char
140
+ grammar_patch = resolved[res_idx]
141
+ punc_correction = patch.replacement
142
+ gram_correction = grammar_patch.replacement
143
+ if (len(punc_correction) == len(gram_correction) + 1
144
+ and punc_correction.startswith(gram_correction)
145
+ and punc_correction[-1] in _PUNCT_CHARS):
146
+ # Merge: append the trailing punct to grammar correction
147
+ grammar_patch.replacement = punc_correction
148
+ logger.info(
149
+ f"[OVERLAP] Merged punctuation into grammar "
150
+ f"[{cs}:{ce}]: '{grammar_patch.original}' → "
151
+ f"'{grammar_patch.replacement}'"
152
+ )
153
+ has_substantial_overlap = True # Don't add separately
154
+ break
155
+
156
  # Calculate overlap amount
157
  overlap_start = max(patch.start_original, cs)
158
  overlap_end = min(patch.end_original, ce)
 
164
  overlap_ratio = overlap_width / smaller_width
165
  if overlap_ratio > 0.5:
166
  has_substantial_overlap = True
167
+ overlapping_resolved_idx = res_idx
168
  break
169
 
170
  if not has_substantial_overlap:
171
+ res_idx = len(resolved)
172
  resolved.append(patch)
173
+ claimed_ranges.append((patch.start_original, patch.end_original, patch.stage, res_idx))
174
  else:
175
+ # Only log "Dropped" if we didn't merge
176
+ if overlapping_resolved_idx is not None or patch.stage != 'punctuation':
177
+ logger.info(
178
+ f"[OVERLAP] Dropped {patch.stage} [{patch.start_original}:{patch.end_original}] "
179
+ f"'{patch.original}' — conflicts with higher-priority span"
180
+ )
181
 
182
  dropped = len(self.patches) - len(resolved)
183
  if dropped > 0: