recitation-segmenter-app-API

Runtime error

App Files Files Community

recitation-segmenter-app-API / arabic_aligner.py

aboalaa147

Create arabic_aligner.py

b0b0b0f verified 21 days ago

raw

history blame contribute delete

12.2 kB

	import re
	from typing import List, Tuple, Dict
	from dataclasses import dataclass
	from enum import Enum

	class ErrorType(Enum):
	MATCH = "match"
	SUBSTITUTION = "substitution"
	INSERTION = "insertion"
	DELETION = "deletion"
	DIACRITIC_ERROR = "diacritic_error"

	@dataclass
	class AlignmentError:
	error_type: ErrorType
	position: int
	user_word: str
	reference_word: str
	details: str = ""

	class ArabicAligner:
	# Arabic diacritics
	DIACRITICS = '\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0657\u0658'
	DIACRITIC_PATTERN = f'[{DIACRITICS}]'

	def __init__(self):
	self.alignment_matrix = None
	self.backtrack_matrix = None

	def normalize_text(self, text: str) -> str:
	"""Normalize Arabic text: remove extra spaces, normalize characters"""
	# Remove tatweel (ـ)
	text = text.replace('\u0640', '')

	# Normalize Alef variations to plain Alef
	text = re.sub('[إأآٱ]', 'ا', text)

	# Normalize Hamza variations
	text = re.sub('[ؤئ]', 'ء', text)

	# Normalize Teh Marbuta
	text = re.sub('ة', 'ه', text)

	# Remove extra whitespace
	text = ' '.join(text.split())

	return text.strip()

	def remove_diacritics(self, text: str) -> str:
	"""Remove all diacritics from Arabic text"""
	return re.sub(self.DIACRITIC_PATTERN, '', text)

	def extract_diacritics(self, word: str) -> List[Tuple[int, str]]:
	"""Extract diacritics and their positions from a word"""
	diacritics = []
	pos = 0
	for i, char in enumerate(word):
	if char in self.DIACRITICS:
	diacritics.append((pos, char))
	else:
	pos += 1
	return diacritics

	def tokenize(self, text: str) -> List[str]:
	"""Tokenize text into words"""
	# Split by whitespace and punctuation
	words = re.findall(r'[\w\u0600-\u06FF]+', text)
	return [w for w in words if w.strip()]

	def compute_alignment(self, user_words: List[str], ref_words: List[str]) -> Tuple[List[List[int]], List[List[str]]]:
	"""
	Compute word-level alignment using dynamic programming (edit distance).
	Returns the cost matrix and backtrack matrix.
	"""
	m, n = len(user_words), len(ref_words)

	# Initialize matrices
	dp = [[0] * (n + 1) for _ in range(m + 1)]
	backtrack = [['' for _ in range(n + 1)] for _ in range(m + 1)]

	# Initialize base cases
	for i in range(m + 1):
	dp[i][0] = i
	if i > 0:
	backtrack[i][0] = 'INS'

	for j in range(n + 1):
	dp[0][j] = j
	if j > 0:
	backtrack[0][j] = 'DEL'

	backtrack[0][0] = ''

	# Fill the DP table
	for i in range(1, m + 1):
	for j in range(1, n + 1):
	# Remove diacritics for comparison
	user_clean = self.remove_diacritics(user_words[i-1])
	ref_clean = self.remove_diacritics(ref_words[j-1])

	if user_clean == ref_clean:
	# Match (cost 0)
	dp[i][j] = dp[i-1][j-1]
	backtrack[i][j] = 'MATCH'
	else:
	# Substitution
	subst_cost = dp[i-1][j-1] + 1
	# Deletion from reference
	del_cost = dp[i][j-1] + 1
	# Insertion to user
	ins_cost = dp[i-1][j] + 1

	min_cost = min(subst_cost, del_cost, ins_cost)
	dp[i][j] = min_cost

	if min_cost == subst_cost:
	backtrack[i][j] = 'SUBST'
	elif min_cost == del_cost:
	backtrack[i][j] = 'DEL'
	else:
	backtrack[i][j] = 'INS'

	self.alignment_matrix = dp
	self.backtrack_matrix = backtrack

	return dp, backtrack

	def traceback_alignment(self, user_words: List[str], ref_words: List[str]) -> List[Tuple[str, int, int]]:
	"""
	Traceback through the alignment to get aligned pairs.
	Returns list of (operation, user_idx, ref_idx) tuples.
	"""
	if self.backtrack_matrix is None:
	raise ValueError("Must call compute_alignment first")

	alignments = []
	i, j = len(user_words), len(ref_words)

	while i > 0 or j > 0:
	operation = self.backtrack_matrix[i][j]

	if operation == 'MATCH':
	alignments.append(('MATCH', i-1, j-1))
	i -= 1
	j -= 1
	elif operation == 'SUBST':
	alignments.append(('SUBST', i-1, j-1))
	i -= 1
	j -= 1
	elif operation == 'DEL':
	alignments.append(('DEL', -1, j-1))
	j -= 1
	elif operation == 'INS':
	alignments.append(('INS', i-1, -1))
	i -= 1

	return list(reversed(alignments))

	def compare_diacritics(self, user_word: str, ref_word: str) -> Tuple[bool, str]:
	"""
	Compare diacritics between two words (after confirming base match).
	Returns (is_match, details_string)
	"""
	user_clean = self.remove_diacritics(user_word)
	ref_clean = self.remove_diacritics(ref_word)

	if user_clean != ref_clean:
	return False, "Base words don't match"

	user_diacs = self.extract_diacritics(user_word)
	ref_diacs = self.extract_diacritics(ref_word)

	if user_diacs == ref_diacs:
	return True, "Perfect match"

	# Detailed comparison
	user_dict = {pos: diac for pos, diac in user_diacs}
	ref_dict = {pos: diac for pos, diac in ref_diacs}

	errors = []
	all_positions = sorted(set(user_dict.keys()) \| set(ref_dict.keys()))

	for pos in all_positions:
	if pos in user_dict and pos not in ref_dict:
	errors.append(f"Extra diacritic '{user_dict[pos]}' at position {pos}")
	elif pos not in user_dict and pos in ref_dict:
	errors.append(f"Missing diacritic '{ref_dict[pos]}' at position {pos}")
	elif user_dict[pos] != ref_dict[pos]:
	errors.append(f"Wrong diacritic at position {pos}: '{user_dict[pos]}' should be '{ref_dict[pos]}'")

	return False, "; ".join(errors)

	def align_and_compare(self, user_text: str, reference_text: str) -> Dict:
	"""
	Main function: align texts and detect all errors.
	"""
	# Step 1: Normalize
	user_normalized = self.normalize_text(user_text)
	ref_normalized = self.normalize_text(reference_text)

	# Step 2: Tokenize
	user_words = self.tokenize(user_normalized)
	ref_words = self.tokenize(ref_normalized)

	# Step 3: Compute alignment
	dp, backtrack = self.compute_alignment(user_words, ref_words)

	# Step 4: Traceback and identify errors
	alignments = self.traceback_alignment(user_words, ref_words)

	errors = []
	ref_position = 0

	for operation, user_idx, ref_idx in alignments:
	if operation == 'MATCH':
	# Check diacritics for matched words
	user_word = user_words[user_idx]
	ref_word = ref_words[ref_idx]

	is_match, details = self.compare_diacritics(user_word, ref_word)

	if is_match:
	errors.append(AlignmentError(
	error_type=ErrorType.MATCH,
	position=ref_position,
	user_word=user_word,
	reference_word=ref_word,
	details="Perfect match"
	))
	else:
	errors.append(AlignmentError(
	error_type=ErrorType.DIACRITIC_ERROR,
	position=ref_position,
	user_word=user_word,
	reference_word=ref_word,
	details=details
	))
	ref_position += 1

	elif operation == 'SUBST':
	errors.append(AlignmentError(
	error_type=ErrorType.SUBSTITUTION,
	position=ref_position,
	user_word=user_words[user_idx],
	reference_word=ref_words[ref_idx],
	details=f"Word substituted"
	))
	ref_position += 1

	elif operation == 'DEL':
	errors.append(AlignmentError(
	error_type=ErrorType.DELETION,
	position=ref_position,
	user_word="",
	reference_word=ref_words[ref_idx],
	details=f"Word deleted from user text"
	))
	ref_position += 1

	elif operation == 'INS':
	errors.append(AlignmentError(
	error_type=ErrorType.INSERTION,
	position=ref_position,
	user_word=user_words[user_idx],
	reference_word="",
	details=f"Word inserted in user text"
	))

	# Compile results
	total_errors = sum(1 for e in errors if e.error_type != ErrorType.MATCH)
	diacritic_errors = sum(1 for e in errors if e.error_type == ErrorType.DIACRITIC_ERROR)
	word_errors = sum(1 for e in errors if e.error_type in [ErrorType.SUBSTITUTION, ErrorType.INSERTION, ErrorType.DELETION])

	return {
	'user_words': user_words,
	'reference_words': ref_words,
	'alignments': alignments,
	'errors': errors,
	'edit_distance': dp[-1][-1],
	'statistics': {
	'total_reference_words': len(ref_words),
	'total_user_words': len(user_words),
	'total_errors': total_errors,
	'word_level_errors': word_errors,
	'diacritic_errors': diacritic_errors,
	'accuracy': (len(ref_words) - total_errors) / len(ref_words) * 100 if ref_words else 0
	}
	}

	def print_results(self, results: Dict):
	"""Print formatted results"""
	print("=" * 80)
	print("ARABIC TEXT ALIGNMENT ANALYSIS")
	print("=" * 80)

	print(f"\nUser Text Words: {len(results['user_words'])}")
	print(f"Reference Text Words: {len(results['reference_words'])}")
	print(f"Edit Distance: {results['edit_distance']}")

	print("\n" + "-" * 80)
	print("STATISTICS")
	print("-" * 80)
	stats = results['statistics']
	print(f"Total Errors: {stats['total_errors']}")
	print(f" - Word-level Errors: {stats['word_level_errors']}")
	print(f" - Diacritic Errors: {stats['diacritic_errors']}")
	print(f"Accuracy: {stats['accuracy']:.2f}%")

	print("\n" + "-" * 80)
	print("DETAILED ERRORS")
	print("-" * 80)

	for i, error in enumerate(results['errors'], 1):
	if error.error_type == ErrorType.MATCH:
	continue # Skip perfect matches in detailed output

	print(f"\n[{i}] Position: {error.position}")
	print(f" Type: {error.error_type.value.upper()}")

	if error.error_type == ErrorType.INSERTION:
	print(f" User: '{error.user_word}' (extra word)")
	print(f" Expected: [nothing]")
	elif error.error_type == ErrorType.DELETION:
	print(f" User: [missing]")
	print(f" Expected: '{error.reference_word}'")
	elif error.error_type == ErrorType.SUBSTITUTION:
	print(f" User: '{error.user_word}'")
	print(f" Expected: '{error.reference_word}'")
	elif error.error_type == ErrorType.DIACRITIC_ERROR:
	print(f" User: '{error.user_word}'")
	print(f" Expected: '{error.reference_word}'")

	print(f" Details: {error.details}")