File size: 12,228 Bytes
b0b0b0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
import re
from typing import List, Tuple, Dict
from dataclasses import dataclass
from enum import Enum

class ErrorType(Enum):
    MATCH = "match"
    SUBSTITUTION = "substitution"
    INSERTION = "insertion"
    DELETION = "deletion"
    DIACRITIC_ERROR = "diacritic_error"

@dataclass
class AlignmentError:
    error_type: ErrorType
    position: int
    user_word: str
    reference_word: str
    details: str = ""

class ArabicAligner:
    # Arabic diacritics
    DIACRITICS = '\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0657\u0658'
    DIACRITIC_PATTERN = f'[{DIACRITICS}]'

    def __init__(self):
        self.alignment_matrix = None
        self.backtrack_matrix = None

    def normalize_text(self, text: str) -> str:
        """Normalize Arabic text: remove extra spaces, normalize characters"""
        # Remove tatweel (ـ)
        text = text.replace('\u0640', '')

        # Normalize Alef variations to plain Alef
        text = re.sub('[إأآٱ]', 'ا', text)

        # Normalize Hamza variations
        text = re.sub('[ؤئ]', 'ء', text)

        # Normalize Teh Marbuta
        text = re.sub('ة', 'ه', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        return text.strip()

    def remove_diacritics(self, text: str) -> str:
        """Remove all diacritics from Arabic text"""
        return re.sub(self.DIACRITIC_PATTERN, '', text)

    def extract_diacritics(self, word: str) -> List[Tuple[int, str]]:
        """Extract diacritics and their positions from a word"""
        diacritics = []
        pos = 0
        for i, char in enumerate(word):
            if char in self.DIACRITICS:
                diacritics.append((pos, char))
            else:
                pos += 1
        return diacritics

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into words"""
        # Split by whitespace and punctuation
        words = re.findall(r'[\w\u0600-\u06FF]+', text)
        return [w for w in words if w.strip()]

    def compute_alignment(self, user_words: List[str], ref_words: List[str]) -> Tuple[List[List[int]], List[List[str]]]:
        """
        Compute word-level alignment using dynamic programming (edit distance).
        Returns the cost matrix and backtrack matrix.
        """
        m, n = len(user_words), len(ref_words)

        # Initialize matrices
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        backtrack = [['' for _ in range(n + 1)] for _ in range(m + 1)]

        # Initialize base cases
        for i in range(m + 1):
            dp[i][0] = i
            if i > 0:
                backtrack[i][0] = 'INS'

        for j in range(n + 1):
            dp[0][j] = j
            if j > 0:
                backtrack[0][j] = 'DEL'

        backtrack[0][0] = ''

        # Fill the DP table
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                # Remove diacritics for comparison
                user_clean = self.remove_diacritics(user_words[i-1])
                ref_clean = self.remove_diacritics(ref_words[j-1])

                if user_clean == ref_clean:
                    # Match (cost 0)
                    dp[i][j] = dp[i-1][j-1]
                    backtrack[i][j] = 'MATCH'
                else:
                    # Substitution
                    subst_cost = dp[i-1][j-1] + 1
                    # Deletion from reference
                    del_cost = dp[i][j-1] + 1
                    # Insertion to user
                    ins_cost = dp[i-1][j] + 1

                    min_cost = min(subst_cost, del_cost, ins_cost)
                    dp[i][j] = min_cost

                    if min_cost == subst_cost:
                        backtrack[i][j] = 'SUBST'
                    elif min_cost == del_cost:
                        backtrack[i][j] = 'DEL'
                    else:
                        backtrack[i][j] = 'INS'

        self.alignment_matrix = dp
        self.backtrack_matrix = backtrack

        return dp, backtrack

    def traceback_alignment(self, user_words: List[str], ref_words: List[str]) -> List[Tuple[str, int, int]]:
        """
        Traceback through the alignment to get aligned pairs.
        Returns list of (operation, user_idx, ref_idx) tuples.
        """
        if self.backtrack_matrix is None:
            raise ValueError("Must call compute_alignment first")

        alignments = []
        i, j = len(user_words), len(ref_words)

        while i > 0 or j > 0:
            operation = self.backtrack_matrix[i][j]

            if operation == 'MATCH':
                alignments.append(('MATCH', i-1, j-1))
                i -= 1
                j -= 1
            elif operation == 'SUBST':
                alignments.append(('SUBST', i-1, j-1))
                i -= 1
                j -= 1
            elif operation == 'DEL':
                alignments.append(('DEL', -1, j-1))
                j -= 1
            elif operation == 'INS':
                alignments.append(('INS', i-1, -1))
                i -= 1

        return list(reversed(alignments))

    def compare_diacritics(self, user_word: str, ref_word: str) -> Tuple[bool, str]:
        """
        Compare diacritics between two words (after confirming base match).
        Returns (is_match, details_string)
        """
        user_clean = self.remove_diacritics(user_word)
        ref_clean = self.remove_diacritics(ref_word)

        if user_clean != ref_clean:
            return False, "Base words don't match"

        user_diacs = self.extract_diacritics(user_word)
        ref_diacs = self.extract_diacritics(ref_word)

        if user_diacs == ref_diacs:
            return True, "Perfect match"

        # Detailed comparison
        user_dict = {pos: diac for pos, diac in user_diacs}
        ref_dict = {pos: diac for pos, diac in ref_diacs}

        errors = []
        all_positions = sorted(set(user_dict.keys()) | set(ref_dict.keys()))

        for pos in all_positions:
            if pos in user_dict and pos not in ref_dict:
                errors.append(f"Extra diacritic '{user_dict[pos]}' at position {pos}")
            elif pos not in user_dict and pos in ref_dict:
                errors.append(f"Missing diacritic '{ref_dict[pos]}' at position {pos}")
            elif user_dict[pos] != ref_dict[pos]:
                errors.append(f"Wrong diacritic at position {pos}: '{user_dict[pos]}' should be '{ref_dict[pos]}'")

        return False, "; ".join(errors)

    def align_and_compare(self, user_text: str, reference_text: str) -> Dict:
        """
        Main function: align texts and detect all errors.
        """
        # Step 1: Normalize
        user_normalized = self.normalize_text(user_text)
        ref_normalized = self.normalize_text(reference_text)

        # Step 2: Tokenize
        user_words = self.tokenize(user_normalized)
        ref_words = self.tokenize(ref_normalized)

        # Step 3: Compute alignment
        dp, backtrack = self.compute_alignment(user_words, ref_words)

        # Step 4: Traceback and identify errors
        alignments = self.traceback_alignment(user_words, ref_words)

        errors = []
        ref_position = 0

        for operation, user_idx, ref_idx in alignments:
            if operation == 'MATCH':
                # Check diacritics for matched words
                user_word = user_words[user_idx]
                ref_word = ref_words[ref_idx]

                is_match, details = self.compare_diacritics(user_word, ref_word)

                if is_match:
                    errors.append(AlignmentError(
                        error_type=ErrorType.MATCH,
                        position=ref_position,
                        user_word=user_word,
                        reference_word=ref_word,
                        details="Perfect match"
                    ))
                else:
                    errors.append(AlignmentError(
                        error_type=ErrorType.DIACRITIC_ERROR,
                        position=ref_position,
                        user_word=user_word,
                        reference_word=ref_word,
                        details=details
                    ))
                ref_position += 1

            elif operation == 'SUBST':
                errors.append(AlignmentError(
                    error_type=ErrorType.SUBSTITUTION,
                    position=ref_position,
                    user_word=user_words[user_idx],
                    reference_word=ref_words[ref_idx],
                    details=f"Word substituted"
                ))
                ref_position += 1

            elif operation == 'DEL':
                errors.append(AlignmentError(
                    error_type=ErrorType.DELETION,
                    position=ref_position,
                    user_word="",
                    reference_word=ref_words[ref_idx],
                    details=f"Word deleted from user text"
                ))
                ref_position += 1

            elif operation == 'INS':
                errors.append(AlignmentError(
                    error_type=ErrorType.INSERTION,
                    position=ref_position,
                    user_word=user_words[user_idx],
                    reference_word="",
                    details=f"Word inserted in user text"
                ))

        # Compile results
        total_errors = sum(1 for e in errors if e.error_type != ErrorType.MATCH)
        diacritic_errors = sum(1 for e in errors if e.error_type == ErrorType.DIACRITIC_ERROR)
        word_errors = sum(1 for e in errors if e.error_type in [ErrorType.SUBSTITUTION, ErrorType.INSERTION, ErrorType.DELETION])

        return {
            'user_words': user_words,
            'reference_words': ref_words,
            'alignments': alignments,
            'errors': errors,
            'edit_distance': dp[-1][-1],
            'statistics': {
                'total_reference_words': len(ref_words),
                'total_user_words': len(user_words),
                'total_errors': total_errors,
                'word_level_errors': word_errors,
                'diacritic_errors': diacritic_errors,
                'accuracy': (len(ref_words) - total_errors) / len(ref_words) * 100 if ref_words else 0
            }
        }

    def print_results(self, results: Dict):
        """Print formatted results"""
        print("=" * 80)
        print("ARABIC TEXT ALIGNMENT ANALYSIS")
        print("=" * 80)

        print(f"\nUser Text Words: {len(results['user_words'])}")
        print(f"Reference Text Words: {len(results['reference_words'])}")
        print(f"Edit Distance: {results['edit_distance']}")

        print("\n" + "-" * 80)
        print("STATISTICS")
        print("-" * 80)
        stats = results['statistics']
        print(f"Total Errors: {stats['total_errors']}")
        print(f"  - Word-level Errors: {stats['word_level_errors']}")
        print(f"  - Diacritic Errors: {stats['diacritic_errors']}")
        print(f"Accuracy: {stats['accuracy']:.2f}%")

        print("\n" + "-" * 80)
        print("DETAILED ERRORS")
        print("-" * 80)

        for i, error in enumerate(results['errors'], 1):
            if error.error_type == ErrorType.MATCH:
                continue  # Skip perfect matches in detailed output

            print(f"\n[{i}] Position: {error.position}")
            print(f"    Type: {error.error_type.value.upper()}")

            if error.error_type == ErrorType.INSERTION:
                print(f"    User: '{error.user_word}' (extra word)")
                print(f"    Expected: [nothing]")
            elif error.error_type == ErrorType.DELETION:
                print(f"    User: [missing]")
                print(f"    Expected: '{error.reference_word}'")
            elif error.error_type == ErrorType.SUBSTITUTION:
                print(f"    User: '{error.user_word}'")
                print(f"    Expected: '{error.reference_word}'")
            elif error.error_type == ErrorType.DIACRITIC_ERROR:
                print(f"    User: '{error.user_word}'")
                print(f"    Expected: '{error.reference_word}'")

            print(f"    Details: {error.details}")