youssefreda9 committed on
Commit
79407d3
·
1 Parent(s): e68c40c

fix: re-label grammar spelling fixes + handle prefix in hamza whitelist

Browse files
Files changed (2) hide show
  1. src/app.py +79 -1
  2. src/nlp/spelling/araspell_rules.py +19 -2
src/app.py CHANGED
@@ -767,6 +767,79 @@ def _is_small_spelling_change(orig_word, corr_word):
767
  return dist <= 3 and (dist / max_len) <= 0.5
768
 
769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
770
  @app.route('/api/analyze', methods=['POST'])
771
  def analyze_text():
772
  """
@@ -985,8 +1058,13 @@ def analyze_text():
985
  f"'{d.get('original','')}' — locked by previous stage"
986
  )
987
  continue
 
 
 
 
 
988
  ctx.add_patch(
989
- 'grammar', d['start'], d['end'],
990
  d['correction'], confidence=1.0
991
  )
992
  ctx.mutate_text(corrected_grammar, OffsetMapper)
 
767
  return dist <= 3 and (dist / max_len) <= 0.5
768
 
769
 
770
# Code points removed before comparing an original with its correction:
# U+064B..U+065F plus U+0670 (the same set the previous regex
# r'[\u064B-\u065F\u0670]' matched). Built once instead of on every call.
_DIACRITIC_STRIP_TABLE = dict.fromkeys([*range(0x064B, 0x0660), 0x0670])


def _is_spelling_only_change(original: str, correction: str) -> bool:
    """
    Tell whether a grammar-stage correction is really a spelling fix.

    Used to re-label grammar patches as 'spelling' for correct UI icons.

    Args:
        original: Text span as it appeared before the correction.
        correction: Replacement text proposed for that span.

    Returns:
        True when, after diacritics are stripped, the two texts are equal,
        or when they have the same number of whitespace-separated words and
        every differing word pair is accepted by
        ``_is_orthographic_variant``. False when either argument is empty,
        when the word counts differ (word split/merge = grammar), or when
        any word pair differs in a non-orthographic way.
    """
    if not original or not correction:
        return False

    # Normalize: diacritics never count as a difference.
    stripped_original = original.translate(_DIACRITIC_STRIP_TABLE)
    stripped_correction = correction.translate(_DIACRITIC_STRIP_TABLE)
    if stripped_original == stripped_correction:
        return True  # Only diacritical difference (or no difference at all)

    original_words = stripped_original.split()
    corrected_words = stripped_correction.split()
    if len(original_words) != len(corrected_words):
        return False  # Word count changed = grammar (word split/merge)

    # Identical words are skipped (short-circuit) before the helper is
    # consulted; every remaining pair must be an orthographic variant.
    return all(
        before == after or _is_orthographic_variant(before, after)
        for before, after in zip(original_words, corrected_words)
    )
806
+
807
+
808
# Unordered character pairs treated as the same letter written differently.
# Membership is pairwise, NOT transitive: e.g. ئ↔ي and ئ↔ء are listed but
# ي↔ء is not, so that pair still counts as a real difference.
_ORTHOGRAPHIC_PAIRS = frozenset({
    frozenset({'ا', 'أ'}), frozenset({'ا', 'إ'}), frozenset({'ا', 'آ'}),
    frozenset({'أ', 'إ'}), frozenset({'أ', 'آ'}), frozenset({'إ', 'آ'}),
    frozenset({'ى', 'ي'}), frozenset({'ه', 'ة'}),
    frozenset({'ؤ', 'و'}), frozenset({'ئ', 'ي'}), frozenset({'ئ', 'ء'}),
})


def _is_orthographic_variant(word1: str, word2: str) -> bool:
    """
    Check if two words differ only by common Arabic orthographic variations:
    - Hamza/alef forms: ا↔أ↔إ↔آ, ؤ↔و, ئ↔ي, ئ↔ء
    - ى↔ي and ه↔ة
    These are spelling differences, not grammar.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        True when both words have the same length, differ in at least one
        position, and every differing position is one of the listed pairs.
        False otherwise, including for identical words and for words of
        different length.
    """
    # Words of different length are never variants. The earlier code had a
    # "trailing ة↔ه" check here, but it compared word1[:-1] with word2[:-1]
    # for strings of unequal length, which can never match; its only
    # observable effect was an IndexError when one word was empty.
    # NOTE(review): the old comment mentioned allowing one-character length
    # differences for hamza additions; no such rule was ever implemented,
    # so none is added here. Confirm whether it is wanted.
    if len(word1) != len(word2):
        return False

    found_variation = False
    for left, right in zip(word1, word2):
        if left == right:
            continue
        if frozenset((left, right)) not in _ORTHOGRAPHIC_PAIRS:
            return False  # Non-orthographic difference = grammar
        found_variation = True
    return found_variation  # At least one orthographic difference
841
+
842
+
843
  @app.route('/api/analyze', methods=['POST'])
844
  def analyze_text():
845
  """
 
1058
  f"'{d.get('original','')}' — locked by previous stage"
1059
  )
1060
  continue
1061
+ # Re-label: if grammar's change is purely orthographic
1062
+ # (hamza, ه→ة, etc.), tag it as 'spelling' for correct UI icon
1063
+ stage_label = 'grammar'
1064
+ if _is_spelling_only_change(d.get('original', ''), d.get('correction', '')):
1065
+ stage_label = 'spelling'
1066
  ctx.add_patch(
1067
+ stage_label, d['start'], d['end'],
1068
  d['correction'], confidence=1.0
1069
  )
1070
  ctx.mutate_text(corrected_grammar, OffsetMapper)
src/nlp/spelling/araspell_rules.py CHANGED
@@ -172,11 +172,17 @@ class AraSpellPostProcessor:
172
  result.append(word)
173
  return ' '.join(result)
174
 
 
 
 
 
 
175
  @staticmethod
176
  def fix_common_hamza(text: str) -> str:
177
  """
178
  Fix common hamza placement errors using a whitelist.
179
- These are the most frequent informal Arabic spelling mistakes.
 
180
  """
181
  words = text.split()
182
  result = []
@@ -184,7 +190,18 @@ class AraSpellPostProcessor:
184
  # Check exact match first
185
  if word in AraSpellPostProcessor.HAMZA_WHITELIST:
186
  result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
187
- else:
 
 
 
 
 
 
 
 
 
 
 
188
  result.append(word)
189
  return ' '.join(result)
190
 
 
172
  result.append(word)
173
  return ' '.join(result)
174
 
175
+ # Attached prefixes that can precede hamza-whitelist words
176
+ # Ordered longest-first so وال is tried before و
177
+ HAMZA_PREFIXES = ['وبال', 'فبال', 'وال', 'بال', 'فال', 'كال', 'ول', 'فل',
178
+ 'وب', 'فب', 'وك', 'فك', 'و', 'ف', 'ب', 'ك', 'ل']
179
+
180
  @staticmethod
181
  def fix_common_hamza(text: str) -> str:
182
  """
183
  Fix common hamza placement errors using a whitelist.
184
+ Also handles prefixed words: و/ف/ب/ك/ل + whitelist word.
185
+ e.g. واصدقائي → وأصدقائي, بالاسعار → بالأسعار
186
  """
187
  words = text.split()
188
  result = []
 
190
  # Check exact match first
191
  if word in AraSpellPostProcessor.HAMZA_WHITELIST:
192
  result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
193
+ continue
194
+
195
+ # Try stripping common prefixes and looking up the remainder
196
+ fixed = False
197
+ for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
198
+ if word.startswith(prefix) and len(word) > len(prefix) + 1:
199
+ remainder = word[len(prefix):]
200
+ if remainder in AraSpellPostProcessor.HAMZA_WHITELIST:
201
+ result.append(prefix + AraSpellPostProcessor.HAMZA_WHITELIST[remainder])
202
+ fixed = True
203
+ break
204
+ if not fixed:
205
  result.append(word)
206
  return ' '.join(result)
207