aboalaa147 committed on
Commit
b0b0b0f
·
verified ·
1 Parent(s): 93057e6

Create arabic_aligner.py

Browse files
Files changed (1) hide show
  1. arabic_aligner.py +333 -0
arabic_aligner.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Tuple, Dict
3
+ from dataclasses import dataclass
4
+ from enum import Enum
5
+
6
class ErrorType(Enum):
    """Outcome assigned to each aligned word pair.

    The string values are what ``ArabicAligner.print_results`` upper-cases
    and prints as the error type.
    """

    # Same base letters and identical diacritics.
    MATCH = "match"
    # A different word appears where the reference word was expected.
    SUBSTITUTION = "substitution"
    # The user text contains a word that has no counterpart in the reference.
    INSERTION = "insertion"
    # A reference word has no counterpart in the user text.
    DELETION = "deletion"
    # Same base letters, but the diacritics differ.
    DIACRITIC_ERROR = "diacritic_error"
12
+
13
@dataclass
class AlignmentError:
    """One entry of an alignment report (including perfect matches).

    Attributes:
        error_type: Classification of this aligned pair.
        position: Index of the reference word this entry refers to. For an
            insertion it is the index of the next reference word, because
            insertions do not consume a reference position.
        user_word: Word taken from the user text; empty for a deletion.
        reference_word: Word taken from the reference text; empty for an
            insertion.
        details: Human-readable explanation of the entry.
    """

    error_type: ErrorType
    position: int
    user_word: str
    reference_word: str
    details: str = ""
20
+
21
class ArabicAligner:
    """Word-level aligner for Arabic text with diacritic-aware error reports.

    The user text is aligned against a reference text with a classic
    edit-distance dynamic programme that ignores diacritics; words whose base
    letters agree are then compared mark by mark.
    """

    # Arabic diacritics: tanween, short vowels, shadda, sukun and the
    # combining marks U+0653..U+0658.
    DIACRITICS = '\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0657\u0658'
    DIACRITIC_PATTERN = f'[{DIACRITICS}]'

    # Single-pass character normalization. Code points are written as escapes
    # so the table survives any re-encoding of this source file.
    _NORMALIZATION_TABLE = str.maketrans({
        '\u0640': None,      # tatweel (kashida) is dropped
        '\u0625': '\u0627',  # alef with hamza below -> alef
        '\u0623': '\u0627',  # alef with hamza above -> alef
        '\u0622': '\u0627',  # alef with madda above -> alef
        '\u0671': '\u0627',  # alef wasla -> alef
        '\u0624': '\u0621',  # waw with hamza above -> hamza
        '\u0626': '\u0621',  # yeh with hamza above -> hamza
        '\u0629': '\u0647',  # teh marbuta -> heh
    })

    # A token is a run of word characters or Arabic-block characters (the
    # range keeps combining diacritics, which \w does not match), excluding
    # Arabic punctuation that lives inside that range: comma, date separator,
    # semicolon, triple dot, question mark, percent sign, star and full stop.
    _TOKEN_PATTERN = re.compile(
        r'(?:(?![\u060C\u060D\u061B\u061E\u061F\u066A\u066D\u06D4])'
        r'[\w\u0600-\u06FF])+'
    )

    def __init__(self):
        # Both matrices are filled by compute_alignment and read back by
        # traceback_alignment.
        self.alignment_matrix = None
        self.backtrack_matrix = None

    def normalize_text(self, text: str) -> str:
        """Return *text* with letter variants unified and whitespace collapsed.

        Tatweel is removed, alef variants become plain alef, hamza carried on
        waw or yeh becomes a bare hamza, teh marbuta becomes heh, and every
        run of whitespace becomes a single space with none at either end.
        """
        text = text.translate(self._NORMALIZATION_TABLE)
        # split()/join both collapses inner runs and trims the ends.
        return ' '.join(text.split())

    def remove_diacritics(self, text: str) -> str:
        """Return *text* with every character listed in DIACRITICS removed."""
        return re.sub(self.DIACRITIC_PATTERN, '', text)

    def extract_diacritics(self, word: str) -> List[Tuple[int, str]]:
        """List the diacritics of *word* together with the letter they follow.

        Returns:
            ``(position, mark)`` pairs in reading order, where ``position``
            is the number of non-diacritic characters seen before the mark
            (so 1 for a mark on the first letter, 0 for a mark that precedes
            every letter).
        """
        diacritics = []
        pos = 0
        for char in word:
            if char in self.DIACRITICS:
                diacritics.append((pos, char))
            else:
                pos += 1
        return diacritics

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into words on whitespace and punctuation.

        Diacritics stay attached to their word; Arabic punctuation such as
        the comma or question mark is treated as a separator.
        """
        return self._TOKEN_PATTERN.findall(text)

    def compute_alignment(self, user_words: List[str], ref_words: List[str]) -> Tuple[List[List[int]], List[List[str]]]:
        """Fill the word-level edit-distance tables for two word lists.

        Words are compared with diacritics removed. Every substitution,
        insertion and deletion costs 1. The resulting tables are also stored
        on the instance for traceback_alignment.

        Returns:
            ``(dp, backtrack)`` where ``dp[i][j]`` is the edit distance
            between the first ``i`` user words and the first ``j`` reference
            words, and ``backtrack[i][j]`` is one of ``'MATCH'``,
            ``'SUBST'``, ``'DEL'``, ``'INS'`` (``''`` at the origin).
        """
        # Strip diacritics once per word instead of once per table cell.
        user_clean = [self.remove_diacritics(w) for w in user_words]
        ref_clean = [self.remove_diacritics(w) for w in ref_words]
        m, n = len(user_words), len(ref_words)

        dp = [[0] * (n + 1) for _ in range(m + 1)]
        backtrack = [[''] * (n + 1) for _ in range(m + 1)]

        # First column: only user words left, so every step is an insertion.
        for i in range(1, m + 1):
            dp[i][0] = i
            backtrack[i][0] = 'INS'
        # First row: only reference words left, so every step is a deletion.
        for j in range(1, n + 1):
            dp[0][j] = j
            backtrack[0][j] = 'DEL'

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if user_clean[i - 1] == ref_clean[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1]
                    backtrack[i][j] = 'MATCH'
                    continue

                subst_cost = dp[i - 1][j - 1] + 1
                del_cost = dp[i][j - 1] + 1   # reference word missing from user
                ins_cost = dp[i - 1][j] + 1   # extra word in user text
                min_cost = min(subst_cost, del_cost, ins_cost)
                dp[i][j] = min_cost

                # Ties are resolved in the order SUBST, DEL, INS.
                if min_cost == subst_cost:
                    backtrack[i][j] = 'SUBST'
                elif min_cost == del_cost:
                    backtrack[i][j] = 'DEL'
                else:
                    backtrack[i][j] = 'INS'

        self.alignment_matrix = dp
        self.backtrack_matrix = backtrack
        return dp, backtrack

    def traceback_alignment(self, user_words: List[str], ref_words: List[str]) -> List[Tuple[str, int, int]]:
        """Walk the stored backtrack table and return the aligned operations.

        Returns:
            ``(operation, user_idx, ref_idx)`` tuples in reading order; the
            index that does not apply is ``-1`` (``user_idx`` for ``'DEL'``,
            ``ref_idx`` for ``'INS'``).

        Raises:
            ValueError: If compute_alignment has not been called, if the
                stored table was built for word lists of a different size, or
                if the table contains an unknown operation.
        """
        if self.backtrack_matrix is None:
            raise ValueError("Must call compute_alignment first")

        matrix = self.backtrack_matrix
        i, j = len(user_words), len(ref_words)
        # A table left over from another pair of texts would yield a
        # meaningless alignment (or an IndexError), so reject it up front.
        if len(matrix) != i + 1 or len(matrix[0]) != j + 1:
            raise ValueError("Backtrack matrix does not match the given word lists; call compute_alignment first")

        alignments = []
        while i > 0 or j > 0:
            operation = matrix[i][j]
            if operation in ('MATCH', 'SUBST'):
                alignments.append((operation, i - 1, j - 1))
                i -= 1
                j -= 1
            elif operation == 'DEL':
                alignments.append(('DEL', -1, j - 1))
                j -= 1
            elif operation == 'INS':
                alignments.append(('INS', i - 1, -1))
                i -= 1
            else:
                # Without this guard an unknown cell would loop forever.
                raise ValueError(f"Unexpected backtrack operation {operation!r} at ({i}, {j})")

        alignments.reverse()
        return alignments

    @staticmethod
    def _group_diacritics(diacritics: List[Tuple[int, str]]) -> Dict[int, str]:
        """Map each letter position to all marks on it, in their original order."""
        grouped: Dict[int, str] = {}
        for pos, mark in diacritics:
            grouped[pos] = grouped.get(pos, '') + mark
        return grouped

    def compare_diacritics(self, user_word: str, ref_word: str) -> Tuple[bool, str]:
        """Compare the diacritics of two words that share the same base letters.

        Returns:
            ``(True, "Perfect match")`` when the marks are identical,
            ``(False, "Base words don't match")`` when the letters differ,
            otherwise ``(False, details)`` where *details* lists every
            position with an extra, missing or wrong diacritic, joined by
            ``"; "``.
        """
        if self.remove_diacritics(user_word) != self.remove_diacritics(ref_word):
            return False, "Base words don't match"

        user_diacs = self.extract_diacritics(user_word)
        ref_diacs = self.extract_diacritics(ref_word)
        if user_diacs == ref_diacs:
            return True, "Perfect match"

        # Group per letter so stacked marks (e.g. shadda + fatha) are compared
        # as a whole instead of the last mark silently overwriting the others.
        user_dict = self._group_diacritics(user_diacs)
        ref_dict = self._group_diacritics(ref_diacs)

        errors = []
        for pos in sorted(set(user_dict) | set(ref_dict)):
            if pos not in ref_dict:
                errors.append(f"Extra diacritic '{user_dict[pos]}' at position {pos}")
            elif pos not in user_dict:
                errors.append(f"Missing diacritic '{ref_dict[pos]}' at position {pos}")
            elif user_dict[pos] != ref_dict[pos]:
                errors.append(f"Wrong diacritic at position {pos}: '{user_dict[pos]}' should be '{ref_dict[pos]}'")

        return False, "; ".join(errors)

    def align_and_compare(self, user_text: str, reference_text: str) -> Dict:
        """Align *user_text* against *reference_text* and report every entry.

        Returns:
            A dict with the keys ``'user_words'``, ``'reference_words'``,
            ``'alignments'``, ``'errors'`` (one AlignmentError per aligned
            operation, matches included), ``'edit_distance'`` and
            ``'statistics'``. ``statistics['accuracy']`` is the percentage of
            reference words not affected by an error, never below 0, and 0
            when the reference is empty.
        """
        # Normalize, then tokenize both sides.
        user_words = self.tokenize(self.normalize_text(user_text))
        ref_words = self.tokenize(self.normalize_text(reference_text))

        dp, _ = self.compute_alignment(user_words, ref_words)
        alignments = self.traceback_alignment(user_words, ref_words)

        errors = []
        ref_position = 0  # index of the reference word currently being consumed

        for operation, user_idx, ref_idx in alignments:
            if operation == 'MATCH':
                # Base letters agree; the diacritics still have to be checked.
                user_word = user_words[user_idx]
                ref_word = ref_words[ref_idx]
                is_match, details = self.compare_diacritics(user_word, ref_word)
                errors.append(AlignmentError(
                    error_type=ErrorType.MATCH if is_match else ErrorType.DIACRITIC_ERROR,
                    position=ref_position,
                    user_word=user_word,
                    reference_word=ref_word,
                    details="Perfect match" if is_match else details,
                ))
                ref_position += 1

            elif operation == 'SUBST':
                errors.append(AlignmentError(
                    error_type=ErrorType.SUBSTITUTION,
                    position=ref_position,
                    user_word=user_words[user_idx],
                    reference_word=ref_words[ref_idx],
                    details="Word substituted",
                ))
                ref_position += 1

            elif operation == 'DEL':
                errors.append(AlignmentError(
                    error_type=ErrorType.DELETION,
                    position=ref_position,
                    user_word="",
                    reference_word=ref_words[ref_idx],
                    details="Word deleted from user text",
                ))
                ref_position += 1

            elif operation == 'INS':
                # Insertions do not consume a reference word, so the position
                # is left unchanged.
                errors.append(AlignmentError(
                    error_type=ErrorType.INSERTION,
                    position=ref_position,
                    user_word=user_words[user_idx],
                    reference_word="",
                    details="Word inserted in user text",
                ))

        total_errors = sum(1 for e in errors if e.error_type != ErrorType.MATCH)
        diacritic_errors = sum(1 for e in errors if e.error_type == ErrorType.DIACRITIC_ERROR)
        word_errors = sum(
            1 for e in errors
            if e.error_type in (ErrorType.SUBSTITUTION, ErrorType.INSERTION, ErrorType.DELETION)
        )

        if ref_words:
            # Insertions can push the error count above the reference length;
            # clamp so the percentage never drops below zero.
            accuracy = max(0.0, (len(ref_words) - total_errors) / len(ref_words) * 100)
        else:
            accuracy = 0

        return {
            'user_words': user_words,
            'reference_words': ref_words,
            'alignments': alignments,
            'errors': errors,
            'edit_distance': dp[-1][-1],
            'statistics': {
                'total_reference_words': len(ref_words),
                'total_user_words': len(user_words),
                'total_errors': total_errors,
                'word_level_errors': word_errors,
                'diacritic_errors': diacritic_errors,
                'accuracy': accuracy,
            },
        }

    def print_results(self, results: Dict):
        """Print a report for a result dict produced by align_and_compare.

        Perfect matches are skipped in the detailed section, but they still
        count towards the bracketed entry number.
        """
        print("=" * 80)
        print("ARABIC TEXT ALIGNMENT ANALYSIS")
        print("=" * 80)

        print(f"\nUser Text Words: {len(results['user_words'])}")
        print(f"Reference Text Words: {len(results['reference_words'])}")
        print(f"Edit Distance: {results['edit_distance']}")

        print("\n" + "-" * 80)
        print("STATISTICS")
        print("-" * 80)
        stats = results['statistics']
        print(f"Total Errors: {stats['total_errors']}")
        print(f" - Word-level Errors: {stats['word_level_errors']}")
        print(f" - Diacritic Errors: {stats['diacritic_errors']}")
        print(f"Accuracy: {stats['accuracy']:.2f}%")

        print("\n" + "-" * 80)
        print("DETAILED ERRORS")
        print("-" * 80)

        for i, error in enumerate(results['errors'], 1):
            if error.error_type == ErrorType.MATCH:
                continue

            print(f"\n[{i}] Position: {error.position}")
            print(f" Type: {error.error_type.value.upper()}")

            if error.error_type == ErrorType.INSERTION:
                print(f" User: '{error.user_word}' (extra word)")
                print(" Expected: [nothing]")
            elif error.error_type == ErrorType.DELETION:
                print(" User: [missing]")
                print(f" Expected: '{error.reference_word}'")
            elif error.error_type in (ErrorType.SUBSTITUTION, ErrorType.DIACRITIC_ERROR):
                print(f" User: '{error.user_word}'")
                print(f" Expected: '{error.reference_word}'")

            print(f" Details: {error.details}")