Spaces:
Runtime error
Runtime error
| import re | |
| from typing import List, Tuple, Dict | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| class ErrorType(Enum): | |
| MATCH = "match" | |
| SUBSTITUTION = "substitution" | |
| INSERTION = "insertion" | |
| DELETION = "deletion" | |
| DIACRITIC_ERROR = "diacritic_error" | |
| class AlignmentError: | |
| error_type: ErrorType | |
| position: int | |
| user_word: str | |
| reference_word: str | |
| details: str = "" | |
| class ArabicAligner: | |
| # Arabic diacritics | |
| DIACRITICS = '\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0657\u0658' | |
| DIACRITIC_PATTERN = f'[{DIACRITICS}]' | |
| def __init__(self): | |
| self.alignment_matrix = None | |
| self.backtrack_matrix = None | |
| def normalize_text(self, text: str) -> str: | |
| """Normalize Arabic text: remove extra spaces, normalize characters""" | |
| # Remove tatweel (ـ) | |
| text = text.replace('\u0640', '') | |
| # Normalize Alef variations to plain Alef | |
| text = re.sub('[إأآٱ]', 'ا', text) | |
| # Normalize Hamza variations | |
| text = re.sub('[ؤئ]', 'ء', text) | |
| # Normalize Teh Marbuta | |
| text = re.sub('ة', 'ه', text) | |
| # Remove extra whitespace | |
| text = ' '.join(text.split()) | |
| return text.strip() | |
| def remove_diacritics(self, text: str) -> str: | |
| """Remove all diacritics from Arabic text""" | |
| return re.sub(self.DIACRITIC_PATTERN, '', text) | |
| def extract_diacritics(self, word: str) -> List[Tuple[int, str]]: | |
| """Extract diacritics and their positions from a word""" | |
| diacritics = [] | |
| pos = 0 | |
| for i, char in enumerate(word): | |
| if char in self.DIACRITICS: | |
| diacritics.append((pos, char)) | |
| else: | |
| pos += 1 | |
| return diacritics | |
| def tokenize(self, text: str) -> List[str]: | |
| """Tokenize text into words""" | |
| # Split by whitespace and punctuation | |
| words = re.findall(r'[\w\u0600-\u06FF]+', text) | |
| return [w for w in words if w.strip()] | |
| def compute_alignment(self, user_words: List[str], ref_words: List[str]) -> Tuple[List[List[int]], List[List[str]]]: | |
| """ | |
| Compute word-level alignment using dynamic programming (edit distance). | |
| Returns the cost matrix and backtrack matrix. | |
| """ | |
| m, n = len(user_words), len(ref_words) | |
| # Initialize matrices | |
| dp = [[0] * (n + 1) for _ in range(m + 1)] | |
| backtrack = [['' for _ in range(n + 1)] for _ in range(m + 1)] | |
| # Initialize base cases | |
| for i in range(m + 1): | |
| dp[i][0] = i | |
| if i > 0: | |
| backtrack[i][0] = 'INS' | |
| for j in range(n + 1): | |
| dp[0][j] = j | |
| if j > 0: | |
| backtrack[0][j] = 'DEL' | |
| backtrack[0][0] = '' | |
| # Fill the DP table | |
| for i in range(1, m + 1): | |
| for j in range(1, n + 1): | |
| # Remove diacritics for comparison | |
| user_clean = self.remove_diacritics(user_words[i-1]) | |
| ref_clean = self.remove_diacritics(ref_words[j-1]) | |
| if user_clean == ref_clean: | |
| # Match (cost 0) | |
| dp[i][j] = dp[i-1][j-1] | |
| backtrack[i][j] = 'MATCH' | |
| else: | |
| # Substitution | |
| subst_cost = dp[i-1][j-1] + 1 | |
| # Deletion from reference | |
| del_cost = dp[i][j-1] + 1 | |
| # Insertion to user | |
| ins_cost = dp[i-1][j] + 1 | |
| min_cost = min(subst_cost, del_cost, ins_cost) | |
| dp[i][j] = min_cost | |
| if min_cost == subst_cost: | |
| backtrack[i][j] = 'SUBST' | |
| elif min_cost == del_cost: | |
| backtrack[i][j] = 'DEL' | |
| else: | |
| backtrack[i][j] = 'INS' | |
| self.alignment_matrix = dp | |
| self.backtrack_matrix = backtrack | |
| return dp, backtrack | |
| def traceback_alignment(self, user_words: List[str], ref_words: List[str]) -> List[Tuple[str, int, int]]: | |
| """ | |
| Traceback through the alignment to get aligned pairs. | |
| Returns list of (operation, user_idx, ref_idx) tuples. | |
| """ | |
| if self.backtrack_matrix is None: | |
| raise ValueError("Must call compute_alignment first") | |
| alignments = [] | |
| i, j = len(user_words), len(ref_words) | |
| while i > 0 or j > 0: | |
| operation = self.backtrack_matrix[i][j] | |
| if operation == 'MATCH': | |
| alignments.append(('MATCH', i-1, j-1)) | |
| i -= 1 | |
| j -= 1 | |
| elif operation == 'SUBST': | |
| alignments.append(('SUBST', i-1, j-1)) | |
| i -= 1 | |
| j -= 1 | |
| elif operation == 'DEL': | |
| alignments.append(('DEL', -1, j-1)) | |
| j -= 1 | |
| elif operation == 'INS': | |
| alignments.append(('INS', i-1, -1)) | |
| i -= 1 | |
| return list(reversed(alignments)) | |
| def compare_diacritics(self, user_word: str, ref_word: str) -> Tuple[bool, str]: | |
| """ | |
| Compare diacritics between two words (after confirming base match). | |
| Returns (is_match, details_string) | |
| """ | |
| user_clean = self.remove_diacritics(user_word) | |
| ref_clean = self.remove_diacritics(ref_word) | |
| if user_clean != ref_clean: | |
| return False, "Base words don't match" | |
| user_diacs = self.extract_diacritics(user_word) | |
| ref_diacs = self.extract_diacritics(ref_word) | |
| if user_diacs == ref_diacs: | |
| return True, "Perfect match" | |
| # Detailed comparison | |
| user_dict = {pos: diac for pos, diac in user_diacs} | |
| ref_dict = {pos: diac for pos, diac in ref_diacs} | |
| errors = [] | |
| all_positions = sorted(set(user_dict.keys()) | set(ref_dict.keys())) | |
| for pos in all_positions: | |
| if pos in user_dict and pos not in ref_dict: | |
| errors.append(f"Extra diacritic '{user_dict[pos]}' at position {pos}") | |
| elif pos not in user_dict and pos in ref_dict: | |
| errors.append(f"Missing diacritic '{ref_dict[pos]}' at position {pos}") | |
| elif user_dict[pos] != ref_dict[pos]: | |
| errors.append(f"Wrong diacritic at position {pos}: '{user_dict[pos]}' should be '{ref_dict[pos]}'") | |
| return False, "; ".join(errors) | |
| def align_and_compare(self, user_text: str, reference_text: str) -> Dict: | |
| """ | |
| Main function: align texts and detect all errors. | |
| """ | |
| # Step 1: Normalize | |
| user_normalized = self.normalize_text(user_text) | |
| ref_normalized = self.normalize_text(reference_text) | |
| # Step 2: Tokenize | |
| user_words = self.tokenize(user_normalized) | |
| ref_words = self.tokenize(ref_normalized) | |
| # Step 3: Compute alignment | |
| dp, backtrack = self.compute_alignment(user_words, ref_words) | |
| # Step 4: Traceback and identify errors | |
| alignments = self.traceback_alignment(user_words, ref_words) | |
| errors = [] | |
| ref_position = 0 | |
| for operation, user_idx, ref_idx in alignments: | |
| if operation == 'MATCH': | |
| # Check diacritics for matched words | |
| user_word = user_words[user_idx] | |
| ref_word = ref_words[ref_idx] | |
| is_match, details = self.compare_diacritics(user_word, ref_word) | |
| if is_match: | |
| errors.append(AlignmentError( | |
| error_type=ErrorType.MATCH, | |
| position=ref_position, | |
| user_word=user_word, | |
| reference_word=ref_word, | |
| details="Perfect match" | |
| )) | |
| else: | |
| errors.append(AlignmentError( | |
| error_type=ErrorType.DIACRITIC_ERROR, | |
| position=ref_position, | |
| user_word=user_word, | |
| reference_word=ref_word, | |
| details=details | |
| )) | |
| ref_position += 1 | |
| elif operation == 'SUBST': | |
| errors.append(AlignmentError( | |
| error_type=ErrorType.SUBSTITUTION, | |
| position=ref_position, | |
| user_word=user_words[user_idx], | |
| reference_word=ref_words[ref_idx], | |
| details=f"Word substituted" | |
| )) | |
| ref_position += 1 | |
| elif operation == 'DEL': | |
| errors.append(AlignmentError( | |
| error_type=ErrorType.DELETION, | |
| position=ref_position, | |
| user_word="", | |
| reference_word=ref_words[ref_idx], | |
| details=f"Word deleted from user text" | |
| )) | |
| ref_position += 1 | |
| elif operation == 'INS': | |
| errors.append(AlignmentError( | |
| error_type=ErrorType.INSERTION, | |
| position=ref_position, | |
| user_word=user_words[user_idx], | |
| reference_word="", | |
| details=f"Word inserted in user text" | |
| )) | |
| # Compile results | |
| total_errors = sum(1 for e in errors if e.error_type != ErrorType.MATCH) | |
| diacritic_errors = sum(1 for e in errors if e.error_type == ErrorType.DIACRITIC_ERROR) | |
| word_errors = sum(1 for e in errors if e.error_type in [ErrorType.SUBSTITUTION, ErrorType.INSERTION, ErrorType.DELETION]) | |
| return { | |
| 'user_words': user_words, | |
| 'reference_words': ref_words, | |
| 'alignments': alignments, | |
| 'errors': errors, | |
| 'edit_distance': dp[-1][-1], | |
| 'statistics': { | |
| 'total_reference_words': len(ref_words), | |
| 'total_user_words': len(user_words), | |
| 'total_errors': total_errors, | |
| 'word_level_errors': word_errors, | |
| 'diacritic_errors': diacritic_errors, | |
| 'accuracy': (len(ref_words) - total_errors) / len(ref_words) * 100 if ref_words else 0 | |
| } | |
| } | |
| def print_results(self, results: Dict): | |
| """Print formatted results""" | |
| print("=" * 80) | |
| print("ARABIC TEXT ALIGNMENT ANALYSIS") | |
| print("=" * 80) | |
| print(f"\nUser Text Words: {len(results['user_words'])}") | |
| print(f"Reference Text Words: {len(results['reference_words'])}") | |
| print(f"Edit Distance: {results['edit_distance']}") | |
| print("\n" + "-" * 80) | |
| print("STATISTICS") | |
| print("-" * 80) | |
| stats = results['statistics'] | |
| print(f"Total Errors: {stats['total_errors']}") | |
| print(f" - Word-level Errors: {stats['word_level_errors']}") | |
| print(f" - Diacritic Errors: {stats['diacritic_errors']}") | |
| print(f"Accuracy: {stats['accuracy']:.2f}%") | |
| print("\n" + "-" * 80) | |
| print("DETAILED ERRORS") | |
| print("-" * 80) | |
| for i, error in enumerate(results['errors'], 1): | |
| if error.error_type == ErrorType.MATCH: | |
| continue # Skip perfect matches in detailed output | |
| print(f"\n[{i}] Position: {error.position}") | |
| print(f" Type: {error.error_type.value.upper()}") | |
| if error.error_type == ErrorType.INSERTION: | |
| print(f" User: '{error.user_word}' (extra word)") | |
| print(f" Expected: [nothing]") | |
| elif error.error_type == ErrorType.DELETION: | |
| print(f" User: [missing]") | |
| print(f" Expected: '{error.reference_word}'") | |
| elif error.error_type == ErrorType.SUBSTITUTION: | |
| print(f" User: '{error.user_word}'") | |
| print(f" Expected: '{error.reference_word}'") | |
| elif error.error_type == ErrorType.DIACRITIC_ERROR: | |
| print(f" User: '{error.user_word}'") | |
| print(f" Expected: '{error.reference_word}'") | |
| print(f" Details: {error.details}") |