VanguardAI committed on
Commit
9d3935e
·
verified ·
1 Parent(s): 18785f1

Create arabic_connector.py

Browse files
Files changed (1) hide show
  1. arabic_connector.py +489 -0
arabic_connector.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Arabic OCR Text Correction Module
3
+
4
+ This module provides comprehensive post-processing and correction for Arabic OCR output
5
+ using dictionary-based fuzzy matching, context-aware selection, and linguistic knowledge.
6
+
7
+ Author: AI Assistant
8
+ License: MIT
9
+ """
10
+
11
+ import os
12
+ import json
13
+ import re
14
+ import pickle
15
+ from typing import List, Dict, Tuple, Optional, Set
16
+ from collections import defaultdict, Counter
17
+ from pathlib import Path
18
+
19
+ import requests
20
+ from rapidfuzz import fuzz, process
21
+ import pyarabic.araby as araby
22
+ from camel_tools.utils.normalize import normalize_unicode, normalize_alef_maksura_ar, normalize_alef_ar, normalize_teh_marbuta_ar
23
+
24
+
25
class ArabicTextCorrector:
    """
    Dictionary-based corrector for Arabic OCR output.

    Words are normalized, looked up in a frequency dictionary, and, when not
    found, replaced by the best fuzzy match. Candidate ranking mixes string
    similarity, word frequency and a bigram-based context score.

    Attributes:
        cache_dir: Directory holding the pickled resource caches.
        dictionary: Set of normalized known words.
        word_frequencies: Normalized word -> frequency count.
        bigrams: (word1, word2) -> score used for context ranking.
        trigrams: Reserved; no method in this class populates it.
        letter_similarity: Map of visually confusable letters. It is built at
            construction time but not consulted by any method of this class.
    """

    def __init__(self, cache_dir: str = "./arabic_resources"):
        """
        Initialize the Arabic text corrector.

        Creates the cache directory if needed and then loads the language
        resources (from cache, or by downloading them).

        Args:
            cache_dir: Directory to cache downloaded resources
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so that a nested cache path works on the first run.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Core data structures
        self.dictionary: Set[str] = set()
        self.word_frequencies: Dict[str, int] = {}
        self.bigrams: Dict[Tuple[str, str], int] = defaultdict(int)
        self.trigrams: Dict[Tuple[str, str, str], int] = defaultdict(int)

        # Arabic letter similarity map for OCR error patterns
        self.letter_similarity = self._build_letter_similarity_map()

        # Load resources
        self._load_or_download_resources()

    def _build_letter_similarity_map(self) -> Dict[str, List[str]]:
        """
        Build a map of commonly confused Arabic letters in OCR.

        Returns:
            Dictionary mapping each letter to similar-looking letters
        """
        return {
            'ب': ['ت', 'ث', 'ن', 'ي'],
            'ت': ['ب', 'ث', 'ن'],
            'ث': ['ب', 'ت', 'ن'],
            'ج': ['ح', 'خ'],
            'ح': ['ج', 'خ'],
            'خ': ['ج', 'ح'],
            'د': ['ذ'],
            'ذ': ['د'],
            'ر': ['ز'],
            'ز': ['ر'],
            'س': ['ش'],
            'ش': ['س'],
            'ص': ['ض'],
            'ض': ['ص'],
            'ط': ['ظ'],
            'ظ': ['ط'],
            'ع': ['غ'],
            'غ': ['ع'],
            'ف': ['ق'],
            'ق': ['ف'],
            'ك': ['گ'],
            'ل': ['لا'],
            'ن': ['ب', 'ت', 'ث', 'ي'],
            'ه': ['ة'],
            'ة': ['ه'],
            'و': ['ؤ'],
            'ي': ['ئ', 'ى', 'ب', 'ت', 'ن'],
            'ى': ['ي', 'ئ'],
            'ا': ['أ', 'إ', 'آ'],
            'أ': ['ا', 'إ', 'آ'],
            'إ': ['ا', 'أ', 'آ'],
            'آ': ['ا', 'أ', 'إ'],
        }

    def _load_or_download_resources(self):
        """
        Load Arabic language resources from the cache, or download them.

        When all three cache files exist they are unpickled and used. If any
        of them is missing or unreadable, the word list is downloaded, the
        n-gram model is rebuilt and the result is written back to the cache.
        The fallback dictionary (used when the download fails) is never
        cached, so the next start retries the download.
        """
        dict_file = self.cache_dir / "arabic_dictionary.pkl"
        freq_file = self.cache_dir / "word_frequencies.pkl"
        ngram_file = self.cache_dir / "ngrams.pkl"

        if all(path.exists() for path in (dict_file, freq_file, ngram_file)):
            print("📚 Loading cached Arabic resources...")
            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. Only point cache_dir at a location you control.
            try:
                with open(dict_file, 'rb') as f:
                    cached_dictionary = pickle.load(f)
                with open(freq_file, 'rb') as f:
                    cached_frequencies = pickle.load(f)
                with open(ngram_file, 'rb') as f:
                    ngram_data = pickle.load(f)
                cached_bigrams = ngram_data['bigrams']
                cached_trigrams = ngram_data['trigrams']
            except Exception as e:
                # Best effort: any unreadable cache falls through to a fresh
                # download. Nothing was assigned yet, so no partial state leaks.
                print(f"⚠️ Error loading cache: {e}. Downloading fresh...")
            else:
                self.dictionary = cached_dictionary
                self.word_frequencies = cached_frequencies
                self.bigrams = cached_bigrams
                self.trigrams = cached_trigrams
                print(f"✅ Loaded {len(self.dictionary)} Arabic words")
                return

        print("📥 Downloading Arabic language resources...")
        downloaded = self._download_arabic_wordlist()
        self._build_ngram_models()

        # Only an explicit False (fallback in use) suppresses caching, so an
        # override that returns None keeps the original caching behaviour.
        if downloaded is False:
            print("⚠️ Fallback dictionary in use; not caching so the download is retried next time.")
        else:
            # Cache for future use
            print("💾 Caching resources for faster startup...")
            with open(dict_file, 'wb') as f:
                pickle.dump(self.dictionary, f)
            with open(freq_file, 'wb') as f:
                pickle.dump(self.word_frequencies, f)
            with open(ngram_file, 'wb') as f:
                pickle.dump({'bigrams': dict(self.bigrams), 'trigrams': dict(self.trigrams)}, f)

        print(f"✅ Resources ready: {len(self.dictionary)} words loaded")

    def _download_arabic_wordlist(self):
        """
        Download and process an Arabic word frequency list.

        The list is the 50k-entry Arabic file of the hermitdave/FrequencyWords
        GitHub repository; each line is expected to hold a word followed by
        its count. Words are normalized before being stored, and counts of
        surface forms that normalize to the same key are added together.

        Returns:
            True if words were obtained from the download, False if the
            request failed (or yielded no usable word) and the fallback
            dictionary was installed instead.
        """
        url = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/ar/ar_50k.txt"

        try:
            print(f" Downloading from {url}...")
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f" ⚠️ Download failed: {e}")
            print(" Using fallback: basic Arabic word set...")
            self._create_fallback_dictionary()
            return False

        # Collect into a local map first so that repeated calls do not keep
        # inflating the stored counts.
        merged: Dict[str, int] = {}
        for line in response.text.strip().split('\n'):
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            try:
                freq = int(parts[1])
            except ValueError:
                freq = 1

            # Normalize and add to dictionary
            normalized = self.normalize_text(parts[0])
            if normalized and self._is_valid_arabic_word(normalized):
                merged[normalized] = merged.get(normalized, 0) + freq

        if not merged:
            print(" ⚠️ Download failed: no usable words in response")
            print(" Using fallback: basic Arabic word set...")
            self._create_fallback_dictionary()
            return False

        self.dictionary.update(merged)
        self.word_frequencies.update(merged)
        print(f" ✓ Downloaded {len(self.dictionary)} words")
        return True

    def _create_fallback_dictionary(self):
        """Create a basic fallback dictionary with common Arabic words.

        Every word is normalized and stored with a fixed frequency of 1000.
        """
        # Common Arabic words as fallback
        # NOTE(review): the list contains spelling variants such as 'الى',
        # 'اللذي' and 'اللتي' — confirm they are meant to be accepted.
        common_words = [
            'في', 'من', 'على', 'إلى', 'هذا', 'هذه', 'ذلك', 'التي', 'الذي', 'كان',
            'أن', 'قد', 'لا', 'ما', 'هو', 'هي', 'كل', 'عن', 'أو', 'إن',
            'بعد', 'قبل', 'عند', 'الى', 'اللذي', 'اللتي', 'والتي', 'والذي',
            'كانت', 'يكون', 'تكون', 'مع', 'بين', 'خلال', 'أيضا', 'حيث',
            'عليها', 'عليه', 'منها', 'منه', 'فيها', 'فيه', 'بها', 'به',
            'لها', 'له', 'لهم', 'لهن', 'عام', 'سنة', 'يوم', 'شهر',
        ]

        for word in common_words:
            normalized = self.normalize_text(word)
            self.dictionary.add(normalized)
            self.word_frequencies[normalized] = 1000

    def _build_ngram_models(self):
        """
        Build the bigram table from the word frequency data.

        No text corpus is available here, so the "bigrams" are synthetic:
        words are ranked by descending frequency and each pair of neighbours
        in that ranking becomes a bigram whose value is the smaller of the
        two frequencies. Trigrams are not built.
        """
        print(" Building n-gram language models...")

        # Simple approach: use word frequencies to build basic n-grams
        # In a production system, you'd build this from a large corpus
        ranked = sorted(self.word_frequencies.items(), key=lambda x: x[1], reverse=True)

        # Pair every ranked entry with its successor.
        for (word1, freq1), (word2, freq2) in zip(ranked, ranked[1:]):
            self.bigrams[(word1, word2)] = min(freq1, freq2)

        print(f" ✓ Built {len(self.bigrams)} bigrams")

    def _is_valid_arabic_word(self, word: str) -> bool:
        """
        Check if a word is valid Arabic (contains Arabic letters).

        A word qualifies when it has at least two characters and at least 70%
        of them fall in the Unicode range U+0600-U+06FF. That range also
        holds Arabic punctuation and Arabic-Indic digits, which therefore
        count as Arabic here.

        Args:
            word: Word to validate

        Returns:
            True if the word passes the length and ratio checks, False otherwise
        """
        if not word or len(word) < 2:
            return False

        arabic_count = sum(1 for c in word if '\u0600' <= c <= '\u06FF')
        return arabic_count >= len(word) * 0.7  # At least 70% Arabic characters

    def normalize_text(self, text: str) -> str:
        """
        Normalize Arabic text for better matching.

        Strips diacritics, applies the camel-tools Unicode, alef, alef-maksura
        and teh-marbuta normalizers, and collapses runs of whitespace.

        Args:
            text: Input Arabic text

        Returns:
            Normalized text ("" for empty input)
        """
        if not text:
            return ""

        # Remove diacritics (tashkeel)
        text = araby.strip_diacritics(text)

        # Normalize using camel-tools
        text = normalize_unicode(text)
        text = normalize_alef_ar(text)
        text = normalize_alef_maksura_ar(text)
        text = normalize_teh_marbuta_ar(text)

        # Remove extra whitespace
        return ' '.join(text.split())

    def get_word_candidates(self, word: str, max_candidates: int = 5, max_distance: int = 3) -> List[Tuple[str, float, int]]:
        """
        Get candidate corrections for a word using fuzzy matching.

        Args:
            word: Input word to correct
            max_candidates: Maximum number of candidates to return
            max_distance: Maximum edit distance to consider

        Returns:
            List of (candidate, similarity_score, edit_distance) tuples, best
            first. A dictionary hit yields the single entry
            (normalized_word, 100.0, 0); an invalid word yields [].
        """
        if not word or not self._is_valid_arabic_word(word):
            return []

        normalized_word = self.normalize_text(word)

        # Exact match - high confidence
        if normalized_word in self.dictionary:
            return [(normalized_word, 100.0, 0)]

        # Over-fetch from rapidfuzz because the edit-distance filter below
        # discards some of the matches.
        matches = process.extract(
            normalized_word,
            self.dictionary,
            scorer=fuzz.ratio,
            limit=max_candidates * 3,
        )

        candidates = []
        for match_word, similarity, _ in matches:
            edit_dist = self._calculate_edit_distance(normalized_word, match_word)
            if edit_dist > max_distance:
                continue

            # Frequent words get up to 20 bonus points; the total is capped
            # below 100 so a fuzzy match never looks like an exact one.
            freq_bonus = min(20, self.word_frequencies.get(match_word, 0) / 1000)
            adjusted_score = min(99.9, similarity + freq_bonus)
            candidates.append((match_word, adjusted_score, edit_dist))

        # Sort by score, then by frequency
        candidates.sort(key=lambda x: (x[1], self.word_frequencies.get(x[0], 0)), reverse=True)

        return candidates[:max_candidates]

    def _calculate_edit_distance(self, word1: str, word2: str) -> int:
        """
        Calculate Levenshtein edit distance between two words.

        Uses the two-row dynamic programming formulation with unit cost for
        insertion, deletion and substitution.

        Args:
            word1: First word
            word2: Second word

        Returns:
            Edit distance
        """
        # Keep the longer word in the outer loop (the distance is symmetric).
        if len(word1) < len(word2):
            word1, word2 = word2, word1

        if len(word2) == 0:
            return len(word1)

        previous_row = list(range(len(word2) + 1))
        for i, c1 in enumerate(word1):
            current_row = [i + 1]
            for j, c2 in enumerate(word2):
                # Cost of insertions, deletions, or substitutions
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def get_bigram_score(self, word1: str, word2: str) -> float:
        """
        Get bigram score for a word pair.

        The stored value of the pair is scaled against the largest value in
        the bigram table.

        Args:
            word1: First word
            word2: Second word

        Returns:
            Bigram score (0-100); 0.0 when the pair is unknown
        """
        pair = (word1, word2)
        if pair not in self.bigrams:
            return 0.0

        # Normalize to 0-100 scale
        max_freq = max(self.bigrams.values())
        if max_freq <= 0:
            # Avoid dividing by zero when every stored value is 0.
            return 0.0
        return (self.bigrams[pair] / max_freq) * 100

    def correct_word_with_context(
        self,
        word: str,
        prev_word: Optional[str] = None,
        next_word: Optional[str] = None
    ) -> Tuple[str, float, List[Tuple[str, float]]]:
        """
        Correct a word using context-aware selection.

        Args:
            word: Word to correct
            prev_word: Previous word in sequence (for context)
            next_word: Next word in sequence (for context)

        Returns:
            Tuple of (best_correction, confidence_score, all_candidates),
            where all_candidates is a list of (candidate, score) pairs sorted
            best first. Without candidates the original word is returned with
            confidence 0.0 and an empty list.
        """
        candidates = self.get_word_candidates(word)

        if not candidates:
            # No candidates found - return original with low confidence
            return (word, 0.0, [])

        # Exact match case (edit distance 0). The candidates are reduced to
        # (word, score) pairs so both return paths share one shape.
        if candidates[0][2] == 0:
            return (candidates[0][0], 100.0, [(cand, score) for cand, score, _ in candidates])

        # The neighbours do not change per candidate: normalize them once.
        prev_normalized = self.normalize_text(prev_word) if prev_word else None
        next_normalized = self.normalize_text(next_word) if next_word else None

        scored_candidates = []
        for candidate_word, base_score, _ in candidates:
            context_score = 0.0

            # Consider previous word context
            if prev_normalized is not None:
                context_score += self.get_bigram_score(prev_normalized, candidate_word) * 0.3

            # Consider next word context
            if next_normalized is not None:
                context_score += self.get_bigram_score(candidate_word, next_normalized) * 0.3

            # Final score: weighted mix of base similarity (which already
            # carries the frequency bonus) and context.
            final_score = base_score * 0.6 + context_score * 0.4
            scored_candidates.append((candidate_word, final_score))

        # Sort by final score
        scored_candidates.sort(key=lambda x: x[1], reverse=True)

        best_word, best_score = scored_candidates[0]
        return (best_word, best_score, scored_candidates)

    def correct_text(self, text: str) -> Dict[str, object]:
        """
        Correct an entire text with word-level tracking.

        The text is split into runs of characters in U+0600-U+06FF and runs
        of other non-space characters. Tokens that pass the Arabic-word check
        are corrected with their Arabic neighbours as context; all other
        tokens are passed through unchanged. The tokens are re-joined with
        single spaces, so the original spacing is not preserved.

        Args:
            text: Input Arabic text

        Returns:
            Dictionary containing:
                - original: Original text
                - corrected: Corrected text
                - words: List of word correction details
                - overall_confidence: Average confidence over the Arabic
                  words that were actually scored (0.0 if there were none)
                - corrections_made: Number of words that were changed
        """
        if not text:
            return {
                'original': '',
                'corrected': '',
                'words': [],
                'overall_confidence': 0.0,
                'corrections_made': 0
            }

        # Split into words while preserving punctuation
        words = re.findall(r'[\u0600-\u06FF]+|[^\u0600-\u06FF\s]+', text)

        # Validity is needed for each token and for its neighbours.
        is_arabic = [self._is_valid_arabic_word(token) for token in words]

        corrected_words = []
        word_details = []
        total_confidence = 0.0
        scored_count = 0
        correction_count = 0

        for i, word in enumerate(words):
            if not is_arabic[i]:
                # Non-Arabic word (punctuation, numbers, etc.)
                corrected_words.append(word)
                word_details.append({
                    'original': word,
                    'corrected': word,
                    'confidence': 100.0,
                    'candidates': [],
                    'changed': False
                })
                continue

            # Get context
            prev_word = words[i - 1] if i > 0 and is_arabic[i - 1] else None
            next_word = words[i + 1] if i < len(words) - 1 and is_arabic[i + 1] else None

            # Correct with context
            corrected, confidence, candidates = self.correct_word_with_context(word, prev_word, next_word)

            corrected_words.append(corrected)
            total_confidence += confidence
            scored_count += 1

            changed = (self.normalize_text(word) != self.normalize_text(corrected))
            if changed:
                correction_count += 1

            word_details.append({
                'original': word,
                'corrected': corrected,
                'confidence': round(confidence, 1),
                'candidates': [(c[0], round(c[1], 1)) for c in candidates[:5]],
                'changed': changed
            })

        # Average over the scored words only: pass-through tokens contribute
        # nothing to total_confidence and must not dilute the mean.
        overall_confidence = total_confidence / scored_count if scored_count else 0.0

        return {
            'original': text,
            'corrected': ' '.join(corrected_words),
            'words': word_details,
            'overall_confidence': round(overall_confidence, 1),
            'corrections_made': correction_count
        }
473
+
474
+
475
# Module-wide corrector, created on first request so that the language
# resources are loaded only once per process.
_corrector_instance = None


def get_corrector() -> ArabicTextCorrector:
    """
    Return the shared ArabicTextCorrector, building it on the first call.

    Returns:
        The module-level ArabicTextCorrector instance
    """
    global _corrector_instance
    if _corrector_instance is not None:
        return _corrector_instance

    _corrector_instance = ArabicTextCorrector()
    return _corrector_instance
489
+