youssefreda9 committed on
Commit
f4cabc9
·
1 Parent(s): e653046

fix: Handle N-to-M word replacements in the analyze post-filter. Each original word is now processed individually against the corrected segment; 1-to-1 small edits and 1-to-N word splits within multi-word blocks are accepted. Previously, all multi-word blocks were silently rejected.

Browse files
Files changed (1) hide show
  1. src/app.py +54 -1
src/app.py CHANGED
@@ -756,7 +756,60 @@ def analyze_text():
756
  else:
757
  new_words.append(current_text[start_idx:end_idx])
758
  else:
759
- new_words.extend([current_text[orig_word_positions[idx][1]:orig_word_positions[idx][2]] for idx in range(i1, i2)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
  elif tag == 'delete':
761
  for idx in range(i1, i2):
762
  new_words.append(current_text[orig_word_positions[idx][1]:orig_word_positions[idx][2]])
 
756
  else:
757
  new_words.append(current_text[start_idx:end_idx])
758
  else:
759
+ # N→M replacement: process each original word individually
760
+ # Build a mapping by trying to match original words to corrected words
761
+ corr_joined = " ".join(c_segment)
762
+ ci = 0 # cursor into c_segment
763
+ for oi in range(i1, i2):
764
+ o_word = orig_word_strings[oi]
765
+ o_start = orig_word_positions[oi][1]
766
+ o_end = orig_word_positions[oi][2]
767
+
768
+ if ci < len(c_segment):
769
+ c_word = c_segment[ci]
770
+ # Check if this is a 1→1 small edit
771
+ if _is_small_spelling_change(o_word, c_word):
772
+ new_words.append(c_word)
773
+ suggestions.append({
774
+ 'start': o_start,
775
+ 'end': o_end,
776
+ 'original': o_word,
777
+ 'correction': c_word,
778
+ 'type': 'spelling',
779
+ })
780
+ ci += 1
781
+ # Check if this is a 1→N word split
782
+ elif len(o_word) >= 5 and ci + 1 < len(c_segment):
783
+ # Try to consume multiple corrected words for this one original word
784
+ split_parts = [c_segment[ci]]
785
+ temp_ci = ci + 1
786
+ joined = c_segment[ci]
787
+ while temp_ci < len(c_segment) and len(joined) < len(o_word) + 2:
788
+ joined += c_segment[temp_ci]
789
+ split_parts.append(c_segment[temp_ci])
790
+ temp_ci += 1
791
+ # Check if the joined parts roughly match the original
792
+ corr_str = " ".join(split_parts)
793
+ joined_no_space = "".join(split_parts)
794
+ dist = _levenshtein(o_word, joined_no_space)
795
+ if dist <= 3 and len(split_parts) > 1:
796
+ new_words.append(corr_str)
797
+ suggestions.append({
798
+ 'start': o_start,
799
+ 'end': o_end,
800
+ 'original': o_word,
801
+ 'correction': corr_str,
802
+ 'type': 'spelling',
803
+ })
804
+ ci = temp_ci
805
+ else:
806
+ new_words.append(current_text[o_start:o_end])
807
+ ci += 1
808
+ else:
809
+ new_words.append(current_text[o_start:o_end])
810
+ ci += 1
811
+ else:
812
+ new_words.append(current_text[o_start:o_end])
813
  elif tag == 'delete':
814
  for idx in range(i1, i2):
815
  new_words.append(current_text[orig_word_positions[idx][1]:orig_word_positions[idx][2]])