Commit ·
f4cabc9
1
Parent(s): e653046
fix: Handle N-to-M word replacements in the analyze post-filter. Process each original word individually against the corrected segment; accept 1-to-1 small edits and 1-to-N word splits within multi-word blocks. Previously, all multi-word blocks were silently rejected.
Browse files- src/app.py +54 -1
src/app.py
CHANGED
|
@@ -756,7 +756,60 @@ def analyze_text():
|
|
| 756 |
else:
|
| 757 |
new_words.append(current_text[start_idx:end_idx])
|
| 758 |
else:
|
| 759 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
elif tag == 'delete':
|
| 761 |
for idx in range(i1, i2):
|
| 762 |
new_words.append(current_text[orig_word_positions[idx][1]:orig_word_positions[idx][2]])
|
|
|
|
| 756 |
else:
|
| 757 |
new_words.append(current_text[start_idx:end_idx])
|
| 758 |
else:
|
| 759 |
+
# N→M replacement: process each original word individually
|
| 760 |
+
# Build a mapping by trying to match original words to corrected words
|
| 761 |
+
corr_joined = " ".join(c_segment)
|
| 762 |
+
ci = 0 # cursor into c_segment
|
| 763 |
+
for oi in range(i1, i2):
|
| 764 |
+
o_word = orig_word_strings[oi]
|
| 765 |
+
o_start = orig_word_positions[oi][1]
|
| 766 |
+
o_end = orig_word_positions[oi][2]
|
| 767 |
+
|
| 768 |
+
if ci < len(c_segment):
|
| 769 |
+
c_word = c_segment[ci]
|
| 770 |
+
# Check if this is a 1→1 small edit
|
| 771 |
+
if _is_small_spelling_change(o_word, c_word):
|
| 772 |
+
new_words.append(c_word)
|
| 773 |
+
suggestions.append({
|
| 774 |
+
'start': o_start,
|
| 775 |
+
'end': o_end,
|
| 776 |
+
'original': o_word,
|
| 777 |
+
'correction': c_word,
|
| 778 |
+
'type': 'spelling',
|
| 779 |
+
})
|
| 780 |
+
ci += 1
|
| 781 |
+
# Check if this is a 1→N word split
|
| 782 |
+
elif len(o_word) >= 5 and ci + 1 < len(c_segment):
|
| 783 |
+
# Try to consume multiple corrected words for this one original word
|
| 784 |
+
split_parts = [c_segment[ci]]
|
| 785 |
+
temp_ci = ci + 1
|
| 786 |
+
joined = c_segment[ci]
|
| 787 |
+
while temp_ci < len(c_segment) and len(joined) < len(o_word) + 2:
|
| 788 |
+
joined += c_segment[temp_ci]
|
| 789 |
+
split_parts.append(c_segment[temp_ci])
|
| 790 |
+
temp_ci += 1
|
| 791 |
+
# Check if the joined parts roughly match the original
|
| 792 |
+
corr_str = " ".join(split_parts)
|
| 793 |
+
joined_no_space = "".join(split_parts)
|
| 794 |
+
dist = _levenshtein(o_word, joined_no_space)
|
| 795 |
+
if dist <= 3 and len(split_parts) > 1:
|
| 796 |
+
new_words.append(corr_str)
|
| 797 |
+
suggestions.append({
|
| 798 |
+
'start': o_start,
|
| 799 |
+
'end': o_end,
|
| 800 |
+
'original': o_word,
|
| 801 |
+
'correction': corr_str,
|
| 802 |
+
'type': 'spelling',
|
| 803 |
+
})
|
| 804 |
+
ci = temp_ci
|
| 805 |
+
else:
|
| 806 |
+
new_words.append(current_text[o_start:o_end])
|
| 807 |
+
ci += 1
|
| 808 |
+
else:
|
| 809 |
+
new_words.append(current_text[o_start:o_end])
|
| 810 |
+
ci += 1
|
| 811 |
+
else:
|
| 812 |
+
new_words.append(current_text[o_start:o_end])
|
| 813 |
elif tag == 'delete':
|
| 814 |
for idx in range(i1, i2):
|
| 815 |
new_words.append(current_text[orig_word_positions[idx][1]:orig_word_positions[idx][2]])
|