youssefreda9 committed on
Commit
01b11d4
·
1 Parent(s): 61cee96

revert: restore backend/model files to cf07939 state, keep UI/UX changes only

Browse files
src/app.py CHANGED
@@ -6,9 +6,6 @@ Provides API endpoints for the Bayan web application.
6
  import os
7
  import logging
8
  import time
9
- import hashlib
10
- from collections import OrderedDict
11
- from functools import wraps
12
  from flask import Flask, request, jsonify, Response
13
  from flask_cors import CORS
14
  from pathlib import Path
@@ -79,119 +76,6 @@ MAX_TEXT_LENGTH = 5000 # Maximum characters for input text
79
  MAX_SUMMARY_LENGTH = 512 # Maximum tokens for summary
80
  MIN_TEXT_LENGTH = 10 # Minimum characters for summarization
81
 
82
- # ── Response Cache (P3) ──
83
- # LRU cache for /api/analyze: hash(text) → (response_dict, timestamp)
84
- _ANALYZE_CACHE_MAX = 500
85
- _ANALYZE_CACHE_TTL = 300 # 5 minutes
86
- _analyze_cache = OrderedDict()
87
-
88
- # ── Rate Limiter (P3) ──
89
- _RATE_LIMIT_MAX = 30 # requests per window
90
- _RATE_LIMIT_WINDOW = 60 # seconds
91
- _rate_limit_store = {} # ip → [(timestamp, ...)]
92
-
93
- # ── Ta Marbuta Dictionary (P2) ──
94
- # Common words where ه at the end should be ة
95
- _TA_MARBUTA_DICT = {
96
- 'المدرسه': 'المدرسة', 'الجامعه': 'الجامعة', 'المكتبه': 'المكتبة',
97
- 'الحياه': 'الحياة', 'الصلاه': 'الصلاة', 'الزكاه': 'الزكاة',
98
- 'القراءه': 'القراءة', 'الكتابه': 'الكتابة', 'المعرفه': 'المعرفة',
99
- 'الثقافه': 'الثقافة', 'السياسه': 'السياسة', 'الاقتصاديه': 'الاقتصادية',
100
- 'العربيه': 'العربية', 'الاسلاميه': 'الإسلامية', 'التربيه': 'التربية',
101
- 'الشريعه': 'الشريعة', 'الدوله': 'الدولة', 'الحكومه': 'الحكومة',
102
- 'المدينه': 'المدينة', 'القريه': 'القرية', 'الغرفه': 'الغرفة',
103
- 'السياره': 'السيارة', 'الطاوله': 'الطاولة', 'الرساله': 'الرسالة',
104
- 'المقاله': 'المقالة', 'الصحيفه': 'الصحيفة', 'الجريده': 'الجريدة',
105
- 'القصه': 'القصة', 'الروايه': 'الرواية', 'اللغه': 'اللغة',
106
- 'الفكره': 'الفكرة', 'الخطوه': 'الخطوة', 'المرحله': 'المرحلة',
107
- 'النتيجه': 'النتيجة', 'المشكله': 'المشكلة', 'الطريقه': 'الطريقة',
108
- 'الحاله': 'الحالة', 'الصوره': 'الصورة', 'القوه': 'القوة',
109
- 'الوحده': 'الوحدة', 'العلاقه': 'العلاقة', 'التجربه': 'التجربة',
110
- 'الحركه': 'الحركة', 'السلطه': 'السلطة', 'المنطقه': 'المنطقة',
111
- 'الساعه': 'الساعة', 'اللحظه': 'اللحظة', 'الفتره': 'الفترة',
112
- 'الاداره': 'الإدارة', 'البيئه': 'البيئة', 'الماده': 'المادة',
113
- 'الاسره': 'الأسرة', 'العائله': 'العائلة', 'الشركه': 'الشركة',
114
- 'المؤسسه': 'المؤسسة', 'المنظمه': 'المنظمة', 'الجمعيه': 'الجمعية',
115
- 'الوزاره': 'الوزارة', 'السفاره': 'السفارة', 'القياده': 'القيادة',
116
- 'الزياره': 'الزيارة', 'المحاوله': 'المحاولة', 'الدراسه': 'الدراسة',
117
- 'الممارسه': 'الممارسة', 'المتابعه': 'المتابعة', 'الخدمه': 'الخدمة',
118
- 'التقنيه': 'التقنية', 'الهندسه': 'الهندسة', 'الفلسفه': 'الفلسفة',
119
- 'مدرسه': 'مدرسة', 'جامعه': 'جامعة', 'مكتبه': 'مكتبة',
120
- 'حياه': 'حياة', 'صلاه': 'صلاة', 'زكاه': 'زكاة',
121
- 'لغه': 'لغة', 'قصه': 'قصة', 'فكره': 'فكرة',
122
- 'خطوه': 'خطوة', 'صوره': 'صورة', 'قوه': 'قوة',
123
- 'سياره': 'سيارة', 'رساله': 'رسالة', 'ساعه': 'ساعة',
124
- 'غرفه': 'غرفة', 'شركه': 'شركة', 'دوله': 'دولة',
125
- }
126
-
127
-
128
- def _fix_ta_marbuta(text):
129
- """Fix common ه→ة errors at pipeline level using dictionary lookup."""
130
- words = text.split()
131
- fixed_words = []
132
- changes = []
133
- pos = 0
134
- for word in words:
135
- start = text.find(word, pos)
136
- end = start + len(word)
137
- # Check bare word
138
- if word in _TA_MARBUTA_DICT:
139
- fixed_words.append(_TA_MARBUTA_DICT[word])
140
- changes.append({'start': start, 'end': end, 'original': word, 'correction': _TA_MARBUTA_DICT[word]})
141
- # Check word ending in ه that should be ة (pattern match)
142
- elif word.endswith('ه') and len(word) >= 3:
143
- candidate = word[:-1] + 'ة'
144
- if candidate in _TA_MARBUTA_DICT.values():
145
- fixed_words.append(candidate)
146
- changes.append({'start': start, 'end': end, 'original': word, 'correction': candidate})
147
- else:
148
- fixed_words.append(word)
149
- else:
150
- fixed_words.append(word)
151
- pos = end
152
- return ' '.join(fixed_words), changes
153
-
154
-
155
- def _check_rate_limit(ip):
156
- """Check if IP has exceeded rate limit. Returns True if allowed."""
157
- now = time.time()
158
- if ip not in _rate_limit_store:
159
- _rate_limit_store[ip] = []
160
- # Clean old entries
161
- _rate_limit_store[ip] = [t for t in _rate_limit_store[ip] if now - t < _RATE_LIMIT_WINDOW]
162
- if len(_rate_limit_store[ip]) >= _RATE_LIMIT_MAX:
163
- return False
164
- _rate_limit_store[ip].append(now)
165
- return True
166
-
167
-
168
- def _get_cache_key(text):
169
- """Generate cache key from text."""
170
- return hashlib.md5(text.encode('utf-8')).hexdigest()
171
-
172
-
173
- def _get_cached_response(text):
174
- """Get cached response if exists and not expired."""
175
- key = _get_cache_key(text)
176
- if key in _analyze_cache:
177
- data, ts = _analyze_cache[key]
178
- if time.time() - ts < _ANALYZE_CACHE_TTL:
179
- _analyze_cache.move_to_end(key)
180
- return data
181
- else:
182
- del _analyze_cache[key]
183
- return None
184
-
185
-
186
- def _set_cached_response(text, response_data):
187
- """Store response in cache."""
188
- key = _get_cache_key(text)
189
- _analyze_cache[key] = (response_data, time.time())
190
- # Evict oldest if over limit
191
- while len(_analyze_cache) > _ANALYZE_CACHE_MAX:
192
- _analyze_cache.popitem(last=False)
193
-
194
-
195
  # Global model instances
196
  summarization_model = None
197
  spelling_model = None
@@ -1033,12 +917,6 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
1033
  ('ء', 'أ'), ('أ', 'ء'), # standalone hamza ↔ hamza on alef
1034
  ('ء', 'ؤ'), ('ؤ', 'ء'), # standalone hamza ↔ hamza on waw
1035
  ('ء', 'ئ'), ('ئ', 'ء'), # standalone hamza ↔ hamza on ya
1036
- # Common Arabic letter confusions (sound-alike pairs)
1037
- ('ص', 'س'), ('س', 'ص'), # emphatic/plain sibilant (المدرصة→المدرسة)
1038
- ('ض', 'ظ'), ('ظ', 'ض'), # emphatic pair confusion
1039
- ('ذ', 'ز'), ('ز', 'ذ'), # voiced fricatives
1040
- ('ث', 'س'), ('س', 'ث'), # voiceless fricatives
1041
- ('ط', 'ت'), ('ت', 'ط'), # emphatic/plain stop
1042
  }
1043
  # Check every character pair — reject if ANY non-orthographic change
1044
  if len(orig_word) != len(corr_word):
@@ -1191,14 +1069,6 @@ def _is_orthographic_variant(word1: str, word2: str) -> bool:
1191
 
1192
  @app.route('/api/analyze', methods=['POST'])
1193
  def analyze_text():
1194
- # ── Rate Limiting (P3) ──
1195
- client_ip = request.headers.get('X-Forwarded-For', request.remote_addr)
1196
- if not _check_rate_limit(client_ip):
1197
- return jsonify({
1198
- 'error': 'Rate limit exceeded. Please wait before making more requests.',
1199
- 'status': 'error'
1200
- }), 429
1201
-
1202
  """
1203
  Perform sequential analysis (Spelling -> Grammar -> Punctuation)
1204
  and return word-level suggestions with offsets.
@@ -1220,12 +1090,6 @@ def analyze_text():
1220
  if not text:
1221
  return jsonify({'error': 'Text is required', 'status': 'error'}), 400
1222
 
1223
- # ── Cache Check (P3) ──
1224
- cached = _get_cached_response(text)
1225
- if cached:
1226
- logger.info(f"[ANALYZE] Cache hit for text (len={len(text)})")
1227
- return jsonify(cached)
1228
-
1229
  # Reject inputs that are predominantly non-Arabic (code, markup, etc.)
1230
  arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
1231
  alpha_chars = len(re.findall(r'[a-zA-Z\u0600-\u06FF]', text))
@@ -1494,22 +1358,6 @@ def analyze_text():
1494
  except Exception as e:
1495
  logger.error(f"[ANALYZE] Hamza fix failed: {type(e).__name__}: {e}")
1496
 
1497
- # ── Ta Marbuta fix pass (P2) ──
1498
- # Catches common ه→ة errors like المدرسه→المدرسة at pipeline level.
1499
- try:
1500
- ta_fixed, ta_changes = _fix_ta_marbuta(current_text)
1501
- if ta_fixed != current_text:
1502
- for tc in ta_changes:
1503
- ctx.add_patch(
1504
- 'spelling', tc['start'], tc['end'],
1505
- tc['correction'], confidence=0.95,
1506
- )
1507
- logger.info(f"[TA-MARBUTA] '{tc['original']}' → '{tc['correction']}'")
1508
- ctx.mutate_text(ta_fixed, OffsetMapper)
1509
- current_text = ctx.current_text
1510
- except Exception as e:
1511
- logger.error(f"[ANALYZE] Ta Marbuta fix failed: {type(e).__name__}: {e}")
1512
-
1513
  # 2. Grammar (runs on spelling-corrected text — word-level dependency)
1514
  try:
1515
  t0 = time.time()
@@ -1618,22 +1466,6 @@ def analyze_text():
1618
  stage_label = 'grammar'
1619
  if _is_spelling_only_change(orig_text, corr_text):
1620
  stage_label = 'spelling'
1621
-
1622
- # ── Directional blocks for grammar (mirrors spelling filter) ──
1623
- # Prevents grammar from making meaning-changing corrections
1624
- # like كان→كأن ("was" → "as if").
1625
- _GRAMMAR_BLOCKS = {
1626
- 'كان': {'كأن'}, 'كأن': {'كان'},
1627
- 'هذه': {'هذة'}, 'هذا': {'هذة', 'هذه'},
1628
- 'إلى': {'على', 'علي'}, 'على': {'إلى', 'علي'},
1629
- 'لكن': {'لاكن'}, 'ذلك': {'ذالك'},
1630
- }
1631
- if corr_text in _GRAMMAR_BLOCKS.get(orig_text, set()):
1632
- logger.info(
1633
- f"[GRAMMAR] Blocked directional: '{orig_text}'→'{corr_text}'"
1634
- )
1635
- continue
1636
-
1637
  ctx.add_patch(
1638
  stage_label, d['start'], d['end'],
1639
  corr_text, confidence=1.0
@@ -1670,13 +1502,6 @@ def analyze_text():
1670
  from nlp.punctuation.punctuation_service import get_punctuation_model
1671
  punc_checker = get_punctuation_model()
1672
  corrected_punc = punc_checker.correct(ctx.current_text)
1673
- # ── Post-process: strip duplicate trailing punctuation ──
1674
- # Model sometimes turns "..." into "...." or "." into ".."
1675
- import re as _punc_re
1676
- # Collapse non-dot duplicate punctuation: ,, → , ;; → ; etc.
1677
- corrected_punc = _punc_re.sub(r'([،؛:!?؟])\1+', r'\1', corrected_punc)
1678
- # Collapse 4+ dots into ellipsis (3 dots), preserve intentional ...
1679
- corrected_punc = _punc_re.sub(r'\.{4,}', '...', corrected_punc)
1680
  timing_ms['punctuation_ms'] = int((time.time() - t0) * 1000)
1681
  logger.info(f"[ANALYZE] Step 3: Punctuation done in {timing_ms['punctuation_ms']}ms")
1682
  if corrected_punc != ctx.current_text:
@@ -1703,20 +1528,6 @@ def analyze_text():
1703
  f"'{d.get('original','')}' \u2192 '{d.get('correction','')}' "
1704
  f"(locked by {owner}[{ls}:{le}])"
1705
  )
1706
- # ── Mid-word split guard ──
1707
- # Reject punctuation diffs where the original is NOT a complete
1708
- # word — i.e., the character after the diff end is still Arabic.
1709
- # This catches cases like الدفتر being split into الدفت.ر
1710
- d_end = d['end']
1711
- if d_end < len(ctx.current_text):
1712
- next_ch = ctx.current_text[d_end]
1713
- if '\u0600' <= next_ch <= '\u06FF':
1714
- logger.info(
1715
- f"[PUNC-SAFETY] Rejected mid-word split [{d['start']}:{d_end}] "
1716
- f"'{d.get('original','')}' → '{d.get('correction','')}' "
1717
- f"(next char '{next_ch}' is Arabic — word was split)"
1718
- )
1719
- continue
1720
  # Punctuation safety layer: reject non-punctuation changes
1721
  if not validate_punctuation_diff(d):
1722
  logger.info(
@@ -1724,21 +1535,6 @@ def analyze_text():
1724
  f"'{d.get('original','')}' → '{d.get('correction','')}' — not a safe punctuation change"
1725
  )
1726
  continue
1727
- # ── Duplicate punctuation guard ──
1728
- # Reject corrections that just append punctuation to already-punctuated text
1729
- # e.g. "الحديقة." → "الحديقة.." or "..." → "...."
1730
- import re as _re2
1731
- orig_txt = d.get('original', '')
1732
- corr_txt = d.get('correction', '')
1733
- _PUNC_CHARS = set('.,،؛:!?؟…。')
1734
- if orig_txt and corr_txt and len(corr_txt) > len(orig_txt):
1735
- suffix_added = corr_txt[len(orig_txt):]
1736
- if all(c in _PUNC_CHARS for c in suffix_added) and orig_txt[-1] in _PUNC_CHARS:
1737
- logger.info(
1738
- f"[PUNC-DUP] Rejected duplicate punctuation [{d['start']}:{d['end']}] "
1739
- f"'{orig_txt}' → '{corr_txt}' — already has punctuation"
1740
- )
1741
- continue
1742
  ctx.add_patch(
1743
  'punctuation', d['start'], d['end'],
1744
  d['correction'], confidence=0.8
@@ -1809,10 +1605,6 @@ def analyze_text():
1809
  if stage_errors:
1810
  response_data['warnings'] = stage_errors
1811
 
1812
- # ── Cache Store (P3) ──
1813
- if response_status == 'success':
1814
- _set_cached_response(text, response_data)
1815
-
1816
  return jsonify(response_data)
1817
 
1818
  except Exception as e:
@@ -1825,52 +1617,6 @@ def analyze_text():
1825
  }), 500
1826
 
1827
 
1828
- @app.route('/api/feedback', methods=['POST'])
1829
- def submit_feedback():
1830
- """Accept user feedback on correction suggestions."""
1831
- try:
1832
- if not request.is_json:
1833
- return jsonify({'error': 'Request must be JSON', 'status': 'error'}), 400
1834
-
1835
- data = request.get_json()
1836
- suggestion_id = data.get('suggestion_id', '')
1837
- helpful = data.get('helpful', None)
1838
- text = data.get('text', '')[:200] # Truncate for safety
1839
- original = data.get('original', '')[:100]
1840
- correction = data.get('correction', '')[:100]
1841
-
1842
- if helpful is None:
1843
- return jsonify({'error': 'helpful field is required', 'status': 'error'}), 400
1844
-
1845
- # Log feedback (simple file-based for now)
1846
- feedback_entry = {
1847
- 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
1848
- 'suggestion_id': suggestion_id,
1849
- 'helpful': helpful,
1850
- 'original': original,
1851
- 'correction': correction,
1852
- 'text_snippet': text,
1853
- 'ip': request.headers.get('X-Forwarded-For', request.remote_addr),
1854
- }
1855
- logger.info(f"[FEEDBACK] {feedback_entry}")
1856
-
1857
- # Append to feedback log file
1858
- try:
1859
- feedback_dir = Path(__file__).parent.parent / 'logs'
1860
- feedback_dir.mkdir(exist_ok=True)
1861
- with open(feedback_dir / 'feedback.jsonl', 'a', encoding='utf-8') as f:
1862
- import json
1863
- f.write(json.dumps(feedback_entry, ensure_ascii=False) + '\n')
1864
- except Exception as log_err:
1865
- logger.warning(f"[FEEDBACK] Could not write to file: {log_err}")
1866
-
1867
- return jsonify({'status': 'success', 'message': 'شكراً لملاحظاتك!'})
1868
-
1869
- except Exception as e:
1870
- logger.error(f"[FEEDBACK] Error: {e}")
1871
- return jsonify({'error': 'Failed to submit feedback', 'status': 'error'}), 500
1872
-
1873
-
1874
  @app.errorhandler(404)
1875
  def not_found(error):
1876
  """Handle 404 errors."""
 
6
  import os
7
  import logging
8
  import time
 
 
 
9
  from flask import Flask, request, jsonify, Response
10
  from flask_cors import CORS
11
  from pathlib import Path
 
76
  MAX_SUMMARY_LENGTH = 512 # Maximum tokens for summary
77
  MIN_TEXT_LENGTH = 10 # Minimum characters for summarization
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # Global model instances
80
  summarization_model = None
81
  spelling_model = None
 
917
  ('ء', 'أ'), ('أ', 'ء'), # standalone hamza ↔ hamza on alef
918
  ('ء', 'ؤ'), ('ؤ', 'ء'), # standalone hamza ↔ hamza on waw
919
  ('ء', 'ئ'), ('ئ', 'ء'), # standalone hamza ↔ hamza on ya
 
 
 
 
 
 
920
  }
921
  # Check every character pair — reject if ANY non-orthographic change
922
  if len(orig_word) != len(corr_word):
 
1069
 
1070
  @app.route('/api/analyze', methods=['POST'])
1071
  def analyze_text():
 
 
 
 
 
 
 
 
1072
  """
1073
  Perform sequential analysis (Spelling -> Grammar -> Punctuation)
1074
  and return word-level suggestions with offsets.
 
1090
  if not text:
1091
  return jsonify({'error': 'Text is required', 'status': 'error'}), 400
1092
 
 
 
 
 
 
 
1093
  # Reject inputs that are predominantly non-Arabic (code, markup, etc.)
1094
  arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
1095
  alpha_chars = len(re.findall(r'[a-zA-Z\u0600-\u06FF]', text))
 
1358
  except Exception as e:
1359
  logger.error(f"[ANALYZE] Hamza fix failed: {type(e).__name__}: {e}")
1360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1361
  # 2. Grammar (runs on spelling-corrected text — word-level dependency)
1362
  try:
1363
  t0 = time.time()
 
1466
  stage_label = 'grammar'
1467
  if _is_spelling_only_change(orig_text, corr_text):
1468
  stage_label = 'spelling'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1469
  ctx.add_patch(
1470
  stage_label, d['start'], d['end'],
1471
  corr_text, confidence=1.0
 
1502
  from nlp.punctuation.punctuation_service import get_punctuation_model
1503
  punc_checker = get_punctuation_model()
1504
  corrected_punc = punc_checker.correct(ctx.current_text)
 
 
 
 
 
 
 
1505
  timing_ms['punctuation_ms'] = int((time.time() - t0) * 1000)
1506
  logger.info(f"[ANALYZE] Step 3: Punctuation done in {timing_ms['punctuation_ms']}ms")
1507
  if corrected_punc != ctx.current_text:
 
1528
  f"'{d.get('original','')}' \u2192 '{d.get('correction','')}' "
1529
  f"(locked by {owner}[{ls}:{le}])"
1530
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1531
  # Punctuation safety layer: reject non-punctuation changes
1532
  if not validate_punctuation_diff(d):
1533
  logger.info(
 
1535
  f"'{d.get('original','')}' → '{d.get('correction','')}' — not a safe punctuation change"
1536
  )
1537
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1538
  ctx.add_patch(
1539
  'punctuation', d['start'], d['end'],
1540
  d['correction'], confidence=0.8
 
1605
  if stage_errors:
1606
  response_data['warnings'] = stage_errors
1607
 
 
 
 
 
1608
  return jsonify(response_data)
1609
 
1610
  except Exception as e:
 
1617
  }), 500
1618
 
1619
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1620
  @app.errorhandler(404)
1621
  def not_found(error):
1622
  """Handle 404 errors."""
src/nlp/punctuation/punctuation_rules.py CHANGED
@@ -137,19 +137,5 @@ def validate_punctuation_diff(diff: dict) -> bool:
137
  if punct_delta > MAX_PUNCT_DELTA:
138
  return False
139
 
140
- # ── Rule 6: Reject mid-word punctuation insertion ──
141
- # If the correction ends with a punctuation mark followed by nothing,
142
- # but the original word is a PREFIX of a longer word in context,
143
- # this indicates mid-word split (e.g. الدفت→الدفت. when word was الدفتر).
144
- # Detect by checking if correction has punctuation NOT at word boundary.
145
- for pc in ARABIC_PUNCT_CHARS:
146
- if pc in correction:
147
- # Check if punctuation is followed by an Arabic letter (mid-word)
148
- idx = correction.find(pc)
149
- if idx >= 0 and idx < len(correction) - 1:
150
- next_char = correction[idx + 1]
151
- if '\u0600' <= next_char <= '\u06FF':
152
- return False
153
-
154
  return True
155
 
 
137
  if punct_delta > MAX_PUNCT_DELTA:
138
  return False
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  return True
141
 
src/nlp/spelling/araspell_rules.py CHANGED
@@ -184,16 +184,6 @@ class AraSpellPostProcessor:
184
  Also handles prefixed words: و/ف/ب/ك/ل + whitelist word.
185
  e.g. واصدقائي → وأصدقائي, بالاسعار → بالأسعار
186
  """
187
- # Words that must NOT be decomposed by prefix stripping
188
- # كان (was) ≠ ك+أن, بان (appeared) ≠ ب+أن, لان (softened) ≠ ل+أن, فان (van) ≠ ف+أن
189
- HAMZA_PREFIX_BLACKLIST = {
190
- 'كان', 'كانت', 'كانوا', 'كانا',
191
- 'بان', 'بانت', 'بانوا',
192
- 'لان', 'لانت',
193
- 'فان', 'فانت',
194
- 'وان', 'وانت',
195
- 'كانه', 'كانها', 'كانهم',
196
- }
197
  words = text.split()
198
  result = []
199
  for word in words:
@@ -202,11 +192,6 @@ class AraSpellPostProcessor:
202
  result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
203
  continue
204
 
205
- # Skip words in the blacklist — they are valid as-is
206
- if word in HAMZA_PREFIX_BLACKLIST:
207
- result.append(word)
208
- continue
209
-
210
  # Try stripping common prefixes and looking up the remainder
211
  fixed = False
212
  for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
 
184
  Also handles prefixed words: و/ف/ب/ك/ل + whitelist word.
185
  e.g. واصدقائي → وأصدقائي, بالاسعار → بالأسعار
186
  """
 
 
 
 
 
 
 
 
 
 
187
  words = text.split()
188
  result = []
189
  for word in words:
 
192
  result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
193
  continue
194
 
 
 
 
 
 
195
  # Try stripping common prefixes and looking up the remainder
196
  fixed = False
197
  for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
tests/test_bug_fixes.py CHANGED
@@ -657,186 +657,5 @@ class TestSuffixCorruption(unittest.TestCase):
657
  "Verb+pronoun كتبته→كتبتة must be blocked")
658
 
659
 
660
- # ═══════════════════════════════════════════════════════════════
661
- # P2: Ta Marbuta Fix Tests
662
- # ═══════════════════════════════════════════════════════════════
663
- class TestTaMarbutaFix(unittest.TestCase):
664
- """Tests for the _fix_ta_marbuta pipeline function."""
665
-
666
- @classmethod
667
- def setUpClass(cls):
668
- from app import _fix_ta_marbuta, _TA_MARBUTA_DICT
669
- cls.fix = staticmethod(_fix_ta_marbuta)
670
- cls.dict = _TA_MARBUTA_DICT
671
-
672
- def test_basic_fix(self):
673
- """المدرسه should be corrected to المدرسة."""
674
- result, changes = self.fix('ذهبت الى المدرسه')
675
- self.assertIn('المدرسة', result)
676
- self.assertEqual(len(changes), 1)
677
- self.assertEqual(changes[0]['original'], 'المدرسه')
678
- self.assertEqual(changes[0]['correction'], 'المدرسة')
679
-
680
- def test_multiple_fixes(self):
681
- """Multiple ta marbuta errors in one sentence."""
682
- result, changes = self.fix('الحياه في المدينه جميله')
683
- self.assertIn('الحياة', result)
684
- self.assertIn('المدينة', result)
685
- self.assertGreaterEqual(len(changes), 2)
686
-
687
- def test_no_false_positives(self):
688
- """Words ending in ه that are NOT ta marbuta should be left alone."""
689
- result, changes = self.fix('الله أكبر')
690
- self.assertEqual(result, 'الله أكبر')
691
- self.assertEqual(len(changes), 0)
692
-
693
- def test_correct_text_untouched(self):
694
- """Already correct text should not be changed."""
695
- result, changes = self.fix('ذهبت إلى المدرسة')
696
- self.assertEqual(result, 'ذهبت إلى المدرسة')
697
- self.assertEqual(len(changes), 0)
698
-
699
- def test_without_alef_lam(self):
700
- """Bare words without ال should also be fixed."""
701
- result, changes = self.fix('هذه مدرسه كبيره')
702
- self.assertIn('مدرسة', result)
703
-
704
- def test_dict_coverage(self):
705
- """Dictionary should have significant coverage."""
706
- self.assertGreater(len(self.dict), 50)
707
-
708
-
709
- # ═══════════════════════════════════════════════════════════════
710
- # P2: Hamza Whitelist Tests
711
- # ═══════════════════════════════════════════════════════════════
712
- class TestHamzaWhitelist(unittest.TestCase):
713
- """Tests for hamza fix function."""
714
-
715
- @classmethod
716
- def setUpClass(cls):
717
- try:
718
- from nlp.spelling.araspell_rules import AraSpellPostProcessor
719
- cls.fix = staticmethod(AraSpellPostProcessor.fix_common_hamza)
720
- cls.available = True
721
- except Exception:
722
- cls.available = False
723
-
724
- def test_anta_fix(self):
725
- """انت should become أنت."""
726
- if not self.available:
727
- self.skipTest("AraSpellPostProcessor not available")
728
- result = self.fix('انت طالب')
729
- self.assertIn('أنت', result)
730
-
731
- def test_ana_fix(self):
732
- """انا should become أنا."""
733
- if not self.available:
734
- self.skipTest("AraSpellPostProcessor not available")
735
- result = self.fix('انا ذاهب')
736
- self.assertIn('أنا', result)
737
-
738
- def test_alaan_fix(self):
739
- """الان should become الآن."""
740
- if not self.available:
741
- self.skipTest("AraSpellPostProcessor not available")
742
- result = self.fix('اذهب الان')
743
- self.assertIn('الآن', result)
744
-
745
- def test_correct_hamza_untouched(self):
746
- """Already correct hamza should not be changed."""
747
- if not self.available:
748
- self.skipTest("AraSpellPostProcessor not available")
749
- result = self.fix('أنت ذاهب إلى المدرسة')
750
- self.assertEqual(result, 'أنت ذاهب إلى المدرسة')
751
-
752
-
753
- # ═══════════════════════════════════════════════════════════════
754
- # P3: Caching & Rate Limiting Tests
755
- # ═══════════════════════════════════════════════════════════════
756
- class TestCachingAndRateLimiting(unittest.TestCase):
757
- """Tests for response caching and rate limiting."""
758
-
759
- @classmethod
760
- def setUpClass(cls):
761
- from app import (
762
- _get_cache_key, _get_cached_response,
763
- _set_cached_response, _check_rate_limit,
764
- _analyze_cache, _rate_limit_store
765
- )
766
- cls._get_cache_key = staticmethod(_get_cache_key)
767
- cls._get_cached = staticmethod(_get_cached_response)
768
- cls._set_cached = staticmethod(_set_cached_response)
769
- cls._check_rate = staticmethod(_check_rate_limit)
770
- cls._cache = _analyze_cache
771
- cls._rate_store = _rate_limit_store
772
-
773
- def setUp(self):
774
- self._cache.clear()
775
- self._rate_store.clear()
776
-
777
- def test_cache_key_deterministic(self):
778
- """Same text should produce same cache key."""
779
- key1 = self._get_cache_key('مرحبا')
780
- key2 = self._get_cache_key('مرحبا')
781
- self.assertEqual(key1, key2)
782
-
783
- def test_cache_key_different(self):
784
- """Different texts should produce different keys."""
785
- key1 = self._get_cache_key('مرحبا')
786
- key2 = self._get_cache_key('أهلا')
787
- self.assertNotEqual(key1, key2)
788
-
789
- def test_cache_store_and_retrieve(self):
790
- """Cached response should be retrievable."""
791
- data = {'original': 'test', 'corrected': 'test', 'suggestions': []}
792
- self._set_cached('مرحبا', data)
793
- result = self._get_cached('مرحبا')
794
- self.assertIsNotNone(result)
795
- self.assertEqual(result['original'], 'test')
796
-
797
- def test_cache_miss(self):
798
- """Non-cached text should return None."""
799
- result = self._get_cached('نص جديد')
800
- self.assertIsNone(result)
801
-
802
- def test_rate_limit_allows(self):
803
- """First request should be allowed."""
804
- self.assertTrue(self._check_rate('127.0.0.1'))
805
-
806
- def test_rate_limit_blocks(self):
807
- """Should block after exceeding limit."""
808
- for _ in range(30):
809
- self._check_rate('test_ip')
810
- self.assertFalse(self._check_rate('test_ip'))
811
-
812
-
813
- # ═══════════════════════════════════════════════════════════════
814
- # P2: Grammar Splitting Tests
815
- # ═══════════════════════════════════════════════════════════════
816
- class TestGrammarSplitting(unittest.TestCase):
817
- """Tests for grammar multi-word diff splitting logic."""
818
-
819
- def test_split_logic(self):
820
- """Multi-word grammar diffs should be split into individual words."""
821
- # Simulate the splitting logic from analyze_text
822
- orig_text = 'الي المدرسه الاستاذ'
823
- corr_text = 'إلى المدرسة الأستاذ'
824
- orig_words = orig_text.split()
825
- corr_words = corr_text.split()
826
-
827
- self.assertEqual(len(orig_words), len(corr_words))
828
-
829
- diffs = []
830
- for ow, cw in zip(orig_words, corr_words):
831
- if ow != cw:
832
- diffs.append({'original': ow, 'correction': cw})
833
-
834
- self.assertEqual(len(diffs), 3)
835
- self.assertEqual(diffs[0]['original'], 'الي')
836
- self.assertEqual(diffs[0]['correction'], 'إلى')
837
- self.assertEqual(diffs[1]['original'], 'المدرسه')
838
- self.assertEqual(diffs[1]['correction'], 'المدرسة')
839
-
840
-
841
  if __name__ == '__main__':
842
  unittest.main()
 
657
  "Verb+pronoun كتبته→كتبتة must be blocked")
658
 
659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
  if __name__ == '__main__':
661
  unittest.main()