recitation-segmenter-app-API

Runtime error

App Files Files Community

aboalaa147 committed on Dec 19, 2025

Commit

b0b0b0f

verified ·

1 Parent(s): 93057e6

Create arabic_aligner.py

Browse files

Files changed (1) hide show

arabic_aligner.py +333 -0

arabic_aligner.py ADDED Viewed

	@@ -0,0 +1,333 @@

+import re
+from typing import List, Tuple, Dict
+from dataclasses import dataclass
+from enum import Enum
+class ErrorType(Enum):
+    """Classification assigned to each aligned word position."""
+    # Base words are equal and their diacritics are equal too.
+    MATCH = "match"
+    # A user word is aligned with a different reference word.
+    SUBSTITUTION = "substitution"
+    # A user word has no counterpart in the reference.
+    INSERTION = "insertion"
+    # A reference word has no counterpart in the user text.
+    DELETION = "deletion"
+    # Base words are equal once diacritics are stripped, but the diacritics differ.
+    DIACRITIC_ERROR = "diacritic_error"
+@dataclass
+class AlignmentError:
+    """One entry of the alignment report (ErrorType.MATCH entries are recorded too)."""
+    # Category of this entry.
+    error_type: ErrorType
+    # Running index into the reference words; for an insertion this is the
+    # index of the next reference word, since insertions do not advance it.
+    position: int
+    # Word taken from the user text ("" for a deletion).
+    user_word: str
+    # Word taken from the reference text ("" for an insertion).
+    reference_word: str
+    # Free-form, human-readable explanation.
+    details: str = ""
+class ArabicAligner:
+    # Arabic diacritics: the combining marks U+064B through U+0658
+    # (tanwin forms, fatha, damma, kasra, shadda, sukun, maddah, hamza above/below, ...).
+    # NOTE(review): U+0670 (superscript alef) and the U+06D6-U+06ED Quranic
+    # annotation marks are not listed, so they are handled as base characters
+    # everywhere this constant is used -- confirm that this is intended.
+    DIACRITICS = '\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0657\u0658'
+    # Regex character class that matches any single character of DIACRITICS.
+    DIACRITIC_PATTERN = f'[{DIACRITICS}]'
+    def __init__(self):
+        self.alignment_matrix = None
+        self.backtrack_matrix = None
+    def normalize_text(self, text: str) -> str:
+        """Normalize Arabic text: remove extra spaces, normalize characters"""
+        # Remove tatweel (ـ)
+        text = text.replace('\u0640', '')
+        # Normalize Alef variations to plain Alef
+        text = re.sub('[إأآٱ]', 'ا', text)
+        # Normalize Hamza variations
+        text = re.sub('[ؤئ]', 'ء', text)
+        # Normalize Teh Marbuta
+        text = re.sub('ة', 'ه', text)
+        # Remove extra whitespace
+        text = ' '.join(text.split())
+        return text.strip()
+    def remove_diacritics(self, text: str) -> str:
+        """Remove all diacritics from Arabic text"""
+        return re.sub(self.DIACRITIC_PATTERN, '', text)
+    def extract_diacritics(self, word: str) -> List[Tuple[int, str]]:
+        """Extract diacritics and their positions from a word"""
+        diacritics = []
+        pos = 0
+        for i, char in enumerate(word):
+            if char in self.DIACRITICS:
+                diacritics.append((pos, char))
+            else:
+                pos += 1
+        return diacritics
+    def tokenize(self, text: str) -> List[str]:
+        """Tokenize text into words"""
+        # Split by whitespace and punctuation
+        words = re.findall(r'[\w\u0600-\u06FF]+', text)
+        return [w for w in words if w.strip()]
+    def compute_alignment(self, user_words: List[str], ref_words: List[str]) -> Tuple[List[List[int]], List[List[str]]]:
+        """
+        Compute word-level alignment using dynamic programming (edit distance).
+        Returns the cost matrix and backtrack matrix.
+        """
+        m, n = len(user_words), len(ref_words)
+        # Initialize matrices
+        dp = [[0] * (n + 1) for _ in range(m + 1)]
+        backtrack = [['' for _ in range(n + 1)] for _ in range(m + 1)]
+        # Initialize base cases
+        for i in range(m + 1):
+            dp[i][0] = i
+            if i > 0:
+                backtrack[i][0] = 'INS'
+        for j in range(n + 1):
+            dp[0][j] = j
+            if j > 0:
+                backtrack[0][j] = 'DEL'
+        backtrack[0][0] = ''
+        # Fill the DP table
+        for i in range(1, m + 1):
+            for j in range(1, n + 1):
+                # Remove diacritics for comparison
+                user_clean = self.remove_diacritics(user_words[i-1])
+                ref_clean = self.remove_diacritics(ref_words[j-1])
+                if user_clean == ref_clean:
+                    # Match (cost 0)
+                    dp[i][j] = dp[i-1][j-1]
+                    backtrack[i][j] = 'MATCH'
+                else:
+                    # Substitution
+                    subst_cost = dp[i-1][j-1] + 1
+                    # Deletion from reference
+                    del_cost = dp[i][j-1] + 1
+                    # Insertion to user
+                    ins_cost = dp[i-1][j] + 1
+                    min_cost = min(subst_cost, del_cost, ins_cost)
+                    dp[i][j] = min_cost
+                    if min_cost == subst_cost:
+                        backtrack[i][j] = 'SUBST'
+                    elif min_cost == del_cost:
+                        backtrack[i][j] = 'DEL'
+                    else:
+                        backtrack[i][j] = 'INS'
+        self.alignment_matrix = dp
+        self.backtrack_matrix = backtrack
+        return dp, backtrack
+    def traceback_alignment(self, user_words: List[str], ref_words: List[str]) -> List[Tuple[str, int, int]]:
+        """
+        Traceback through the alignment to get aligned pairs.
+        Returns list of (operation, user_idx, ref_idx) tuples.
+        """
+        if self.backtrack_matrix is None:
+            raise ValueError("Must call compute_alignment first")
+        alignments = []
+        i, j = len(user_words), len(ref_words)
+        while i > 0 or j > 0:
+            operation = self.backtrack_matrix[i][j]
+            if operation == 'MATCH':
+                alignments.append(('MATCH', i-1, j-1))
+                i -= 1
+                j -= 1
+            elif operation == 'SUBST':
+                alignments.append(('SUBST', i-1, j-1))
+                i -= 1
+                j -= 1
+            elif operation == 'DEL':
+                alignments.append(('DEL', -1, j-1))
+                j -= 1
+            elif operation == 'INS':
+                alignments.append(('INS', i-1, -1))
+                i -= 1
+        return list(reversed(alignments))
+    def compare_diacritics(self, user_word: str, ref_word: str) -> Tuple[bool, str]:
+        """
+        Compare diacritics between two words (after confirming base match).
+        Returns (is_match, details_string)
+        """
+        user_clean = self.remove_diacritics(user_word)
+        ref_clean = self.remove_diacritics(ref_word)
+        if user_clean != ref_clean:
+            return False, "Base words don't match"
+        user_diacs = self.extract_diacritics(user_word)
+        ref_diacs = self.extract_diacritics(ref_word)
+        if user_diacs == ref_diacs:
+            return True, "Perfect match"
+        # Detailed comparison
+        user_dict = {pos: diac for pos, diac in user_diacs}
+        ref_dict = {pos: diac for pos, diac in ref_diacs}
+        errors = []
+        all_positions = sorted(set(user_dict.keys()) | set(ref_dict.keys()))
+        for pos in all_positions:
+            if pos in user_dict and pos not in ref_dict:
+                errors.append(f"Extra diacritic '{user_dict[pos]}' at position {pos}")
+            elif pos not in user_dict and pos in ref_dict:
+                errors.append(f"Missing diacritic '{ref_dict[pos]}' at position {pos}")
+            elif user_dict[pos] != ref_dict[pos]:
+                errors.append(f"Wrong diacritic at position {pos}: '{user_dict[pos]}' should be '{ref_dict[pos]}'")
+        return False, "; ".join(errors)
+    def align_and_compare(self, user_text: str, reference_text: str) -> Dict:
+        """
+        Main function: align texts and detect all errors.
+        """
+        # Step 1: Normalize
+        user_normalized = self.normalize_text(user_text)
+        ref_normalized = self.normalize_text(reference_text)
+        # Step 2: Tokenize
+        user_words = self.tokenize(user_normalized)
+        ref_words = self.tokenize(ref_normalized)
+        # Step 3: Compute alignment
+        dp, backtrack = self.compute_alignment(user_words, ref_words)
+        # Step 4: Traceback and identify errors
+        alignments = self.traceback_alignment(user_words, ref_words)
+        errors = []
+        ref_position = 0
+        for operation, user_idx, ref_idx in alignments:
+            if operation == 'MATCH':
+                # Check diacritics for matched words
+                user_word = user_words[user_idx]
+                ref_word = ref_words[ref_idx]
+                is_match, details = self.compare_diacritics(user_word, ref_word)
+                if is_match:
+                    errors.append(AlignmentError(
+                        error_type=ErrorType.MATCH,
+                        position=ref_position,
+                        user_word=user_word,
+                        reference_word=ref_word,
+                        details="Perfect match"
+                    ))
+                else:
+                    errors.append(AlignmentError(
+                        error_type=ErrorType.DIACRITIC_ERROR,
+                        position=ref_position,
+                        user_word=user_word,
+                        reference_word=ref_word,
+                        details=details
+                    ))
+                ref_position += 1
+            elif operation == 'SUBST':
+                errors.append(AlignmentError(
+                    error_type=ErrorType.SUBSTITUTION,
+                    position=ref_position,
+                    user_word=user_words[user_idx],
+                    reference_word=ref_words[ref_idx],
+                    details=f"Word substituted"
+                ))
+                ref_position += 1
+            elif operation == 'DEL':
+                errors.append(AlignmentError(
+                    error_type=ErrorType.DELETION,
+                    position=ref_position,
+                    user_word="",
+                    reference_word=ref_words[ref_idx],
+                    details=f"Word deleted from user text"
+                ))
+                ref_position += 1
+            elif operation == 'INS':
+                errors.append(AlignmentError(
+                    error_type=ErrorType.INSERTION,
+                    position=ref_position,
+                    user_word=user_words[user_idx],
+                    reference_word="",
+                    details=f"Word inserted in user text"
+                ))
+        # Compile results
+        total_errors = sum(1 for e in errors if e.error_type != ErrorType.MATCH)
+        diacritic_errors = sum(1 for e in errors if e.error_type == ErrorType.DIACRITIC_ERROR)
+        word_errors = sum(1 for e in errors if e.error_type in [ErrorType.SUBSTITUTION, ErrorType.INSERTION, ErrorType.DELETION])
+        return {
+            'user_words': user_words,
+            'reference_words': ref_words,
+            'alignments': alignments,
+            'errors': errors,
+            'edit_distance': dp[-1][-1],
+            'statistics': {
+                'total_reference_words': len(ref_words),
+                'total_user_words': len(user_words),
+                'total_errors': total_errors,
+                'word_level_errors': word_errors,
+                'diacritic_errors': diacritic_errors,
+                'accuracy': (len(ref_words) - total_errors) / len(ref_words) * 100 if ref_words else 0
+            }
+        }
+    def print_results(self, results: Dict):
+        """Print a formatted report for a result dict to stdout.
+
+        Reads the 'user_words', 'reference_words', 'edit_distance',
+        'statistics' and 'errors' keys of ``results`` (the keys produced by
+        align_and_compare) and returns nothing.
+        """
+        # Header and word counts.
+        print("=" * 80)
+        print("ARABIC TEXT ALIGNMENT ANALYSIS")
+        print("=" * 80)
+        print(f"\nUser Text Words: {len(results['user_words'])}")
+        print(f"Reference Text Words: {len(results['reference_words'])}")
+        print(f"Edit Distance: {results['edit_distance']}")
+        # Aggregate statistics.
+        print("\n" + "-" * 80)
+        print("STATISTICS")
+        print("-" * 80)
+        stats = results['statistics']
+        print(f"Total Errors: {stats['total_errors']}")
+        print(f"  - Word-level Errors: {stats['word_level_errors']}")
+        print(f"  - Diacritic Errors: {stats['diacritic_errors']}")
+        print(f"Accuracy: {stats['accuracy']:.2f}%")
+        # One entry per non-matching alignment position.
+        print("\n" + "-" * 80)
+        print("DETAILED ERRORS")
+        print("-" * 80)
+        # i counts every entry, skipped matches included, so the printed
+        # [i] labels are not necessarily consecutive.
+        for i, error in enumerate(results['errors'], 1):
+            if error.error_type == ErrorType.MATCH:
+                continue  # Skip perfect matches in detailed output
+            print(f"\n[{i}] Position: {error.position}")
+            print(f"    Type: {error.error_type.value.upper()}")
+            # Insertions and deletions have one empty side, shown as a placeholder.
+            if error.error_type == ErrorType.INSERTION:
+                print(f"    User: '{error.user_word}' (extra word)")
+                print(f"    Expected: [nothing]")
+            elif error.error_type == ErrorType.DELETION:
+                print(f"    User: [missing]")
+                print(f"    Expected: '{error.reference_word}'")
+            elif error.error_type == ErrorType.SUBSTITUTION:
+                print(f"    User: '{error.user_word}'")
+                print(f"    Expected: '{error.reference_word}'")
+            elif error.error_type == ErrorType.DIACRITIC_ERROR:
+                print(f"    User: '{error.user_word}'")
+                print(f"    Expected: '{error.reference_word}'")
+            print(f"    Details: {error.details}")