Commit ·
79407d3
1
Parent(s): e68c40c
fix: re-label grammar spelling fixes + handle prefix in hamza whitelist
Browse files- src/app.py +79 -1
- src/nlp/spelling/araspell_rules.py +19 -2
src/app.py
CHANGED
|
@@ -767,6 +767,79 @@ def _is_small_spelling_change(orig_word, corr_word):
|
|
| 767 |
return dist <= 3 and (dist / max_len) <= 0.5
|
| 768 |
|
| 769 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
@app.route('/api/analyze', methods=['POST'])
|
| 771 |
def analyze_text():
|
| 772 |
"""
|
|
@@ -985,8 +1058,13 @@ def analyze_text():
|
|
| 985 |
f"'{d.get('original','')}' — locked by previous stage"
|
| 986 |
)
|
| 987 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 988 |
ctx.add_patch(
|
| 989 |
-
|
| 990 |
d['correction'], confidence=1.0
|
| 991 |
)
|
| 992 |
ctx.mutate_text(corrected_grammar, OffsetMapper)
|
|
|
|
| 767 |
return dist <= 3 and (dist / max_len) <= 0.5
|
| 768 |
|
| 769 |
|
| 770 |
+
def _is_spelling_only_change(original: str, correction: str) -> bool:
|
| 771 |
+
"""
|
| 772 |
+
Detect if a grammar model's correction is actually a spelling/orthographic fix
|
| 773 |
+
(hamza, ه→ة, ا→أ, etc.) rather than a true grammar change.
|
| 774 |
+
|
| 775 |
+
Used to re-label grammar patches as 'spelling' for correct UI icons.
|
| 776 |
+
"""
|
| 777 |
+
if not original or not correction:
|
| 778 |
+
return False
|
| 779 |
+
|
| 780 |
+
# Normalize: strip diacritics for comparison
|
| 781 |
+
import re as _re
|
| 782 |
+
strip_diacritics = lambda t: _re.sub(r'[\u064B-\u065F\u0670]', '', t)
|
| 783 |
+
o = strip_diacritics(original)
|
| 784 |
+
c = strip_diacritics(correction)
|
| 785 |
+
|
| 786 |
+
if o == c:
|
| 787 |
+
return True # Only diacritical difference
|
| 788 |
+
|
| 789 |
+
# Check word-by-word for single-word changes
|
| 790 |
+
o_words = o.split()
|
| 791 |
+
c_words = c.split()
|
| 792 |
+
|
| 793 |
+
if len(o_words) != len(c_words):
|
| 794 |
+
return False # Word count changed = grammar (word split/merge)
|
| 795 |
+
|
| 796 |
+
all_spelling = True
|
| 797 |
+
for ow, cw in zip(o_words, c_words):
|
| 798 |
+
if ow == cw:
|
| 799 |
+
continue
|
| 800 |
+
if _is_orthographic_variant(ow, cw):
|
| 801 |
+
continue
|
| 802 |
+
all_spelling = False
|
| 803 |
+
break
|
| 804 |
+
|
| 805 |
+
return all_spelling
|
| 806 |
+
|
| 807 |
+
|
| 808 |
+
def _is_orthographic_variant(word1: str, word2: str) -> bool:
|
| 809 |
+
"""
|
| 810 |
+
Check if two words differ only by common Arabic orthographic variations:
|
| 811 |
+
- Hamza placement: ا↔أ↔إ↔آ, ى↔ي, ه↔ة
|
| 812 |
+
- These are spelling differences, not grammar.
|
| 813 |
+
"""
|
| 814 |
+
if len(word1) != len(word2):
|
| 815 |
+
# Allow ه→ة at end (same length since both are 1 char)
|
| 816 |
+
# But also allow small length diffs for hamza additions
|
| 817 |
+
if abs(len(word1) - len(word2)) > 1:
|
| 818 |
+
return False
|
| 819 |
+
# Check if only difference is a trailing ة↔ه
|
| 820 |
+
if (word1[:-1] == word2[:-1] and
|
| 821 |
+
{word1[-1], word2[-1]} <= {'ه', 'ة'}):
|
| 822 |
+
return True
|
| 823 |
+
return False
|
| 824 |
+
|
| 825 |
+
# Same length: check char-by-char
|
| 826 |
+
SPELLING_EQUIVALENCES = {
|
| 827 |
+
frozenset({'ا', 'أ'}), frozenset({'ا', 'إ'}), frozenset({'ا', 'آ'}),
|
| 828 |
+
frozenset({'أ', 'إ'}), frozenset({'أ', 'آ'}), frozenset({'إ', 'آ'}),
|
| 829 |
+
frozenset({'ى', 'ي'}), frozenset({'ه', 'ة'}),
|
| 830 |
+
frozenset({'ؤ', 'و'}), frozenset({'ئ', 'ي'}), frozenset({'ئ', 'ء'}),
|
| 831 |
+
}
|
| 832 |
+
diff_count = 0
|
| 833 |
+
for c1, c2 in zip(word1, word2):
|
| 834 |
+
if c1 == c2:
|
| 835 |
+
continue
|
| 836 |
+
if frozenset({c1, c2}) in SPELLING_EQUIVALENCES:
|
| 837 |
+
diff_count += 1
|
| 838 |
+
else:
|
| 839 |
+
return False # Non-orthographic difference = grammar
|
| 840 |
+
return diff_count > 0 # At least one orthographic difference
|
| 841 |
+
|
| 842 |
+
|
| 843 |
@app.route('/api/analyze', methods=['POST'])
|
| 844 |
def analyze_text():
|
| 845 |
"""
|
|
|
|
| 1058 |
f"'{d.get('original','')}' — locked by previous stage"
|
| 1059 |
)
|
| 1060 |
continue
|
| 1061 |
+
# Re-label: if grammar's change is purely orthographic
|
| 1062 |
+
# (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
|
| 1063 |
+
stage_label = 'grammar'
|
| 1064 |
+
if _is_spelling_only_change(d.get('original', ''), d.get('correction', '')):
|
| 1065 |
+
stage_label = 'spelling'
|
| 1066 |
ctx.add_patch(
|
| 1067 |
+
stage_label, d['start'], d['end'],
|
| 1068 |
d['correction'], confidence=1.0
|
| 1069 |
)
|
| 1070 |
ctx.mutate_text(corrected_grammar, OffsetMapper)
|
src/nlp/spelling/araspell_rules.py
CHANGED
|
@@ -172,11 +172,17 @@ class AraSpellPostProcessor:
|
|
| 172 |
result.append(word)
|
| 173 |
return ' '.join(result)
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
@staticmethod
|
| 176 |
def fix_common_hamza(text: str) -> str:
|
| 177 |
"""
|
| 178 |
Fix common hamza placement errors using a whitelist.
|
| 179 |
-
|
|
|
|
| 180 |
"""
|
| 181 |
words = text.split()
|
| 182 |
result = []
|
|
@@ -184,7 +190,18 @@ class AraSpellPostProcessor:
|
|
| 184 |
# Check exact match first
|
| 185 |
if word in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 186 |
result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
result.append(word)
|
| 189 |
return ' '.join(result)
|
| 190 |
|
|
|
|
| 172 |
result.append(word)
|
| 173 |
return ' '.join(result)
|
| 174 |
|
| 175 |
+
# Attached prefixes that can precede hamza-whitelist words
|
| 176 |
+
# Ordered longest-first so وال is tried before و
|
| 177 |
+
HAMZA_PREFIXES = ['وبال', 'فبال', 'وال', 'بال', 'فال', 'كال', 'ول', 'فل',
|
| 178 |
+
'وب', 'فب', 'وك', 'فك', 'و', 'ف', 'ب', 'ك', 'ل']
|
| 179 |
+
|
| 180 |
@staticmethod
|
| 181 |
def fix_common_hamza(text: str) -> str:
|
| 182 |
"""
|
| 183 |
Fix common hamza placement errors using a whitelist.
|
| 184 |
+
Also handles prefixed words: و/ف/ب/ك/ل + whitelist word.
|
| 185 |
+
e.g. واصدقائي → وأصدقائي, بالاسعار → بالأسعار
|
| 186 |
"""
|
| 187 |
words = text.split()
|
| 188 |
result = []
|
|
|
|
| 190 |
# Check exact match first
|
| 191 |
if word in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 192 |
result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
|
| 193 |
+
continue
|
| 194 |
+
|
| 195 |
+
# Try stripping common prefixes and looking up the remainder
|
| 196 |
+
fixed = False
|
| 197 |
+
for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
|
| 198 |
+
if word.startswith(prefix) and len(word) > len(prefix) + 1:
|
| 199 |
+
remainder = word[len(prefix):]
|
| 200 |
+
if remainder in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 201 |
+
result.append(prefix + AraSpellPostProcessor.HAMZA_WHITELIST[remainder])
|
| 202 |
+
fixed = True
|
| 203 |
+
break
|
| 204 |
+
if not fixed:
|
| 205 |
result.append(word)
|
| 206 |
return ' '.join(result)
|
| 207 |
|