# Global text alignment of raw ASR output against the canonical Fatiha word list.
| import json | |
| import re | |
| from difflib import SequenceMatcher | |
# Input: canonical surah JSON ("ayahs" -> list of word strings per ayah).
CANON_PATH = "data/fatiha_canonical.json"
# Input: raw ASR transcript text.
ASR_TEXT_PATH = "output/asr_raw.txt"
# Output: alignment report written by main().
OUT_PATH = "output/text_alignment_global.json"
# Arabic combining marks (harakat U+064B-U+0652, superscript alif U+0670,
# madda/hamza marks U+0653-U+0655) stripped during normalization.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character, removed outright.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Return *s* normalized for fuzzy Arabic comparison.

    Drops tatweel and combining diacritics, folds hamza-carrying alifs to
    bare alif, alif maqsura to ya, ta marbuta to ha, and collapses runs of
    whitespace to single spaces.
    """
    # Fold variant letterforms and drop tatweel in one translate pass; the
    # mapped characters are disjoint, so order relative to the diacritic
    # strip below does not matter.
    folded = s.translate(str.maketrans(
        {"\u0640": None, "أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه"}
    ))
    # Strip harakat / superscript alif / hamza combining marks
    # (same character class as the module-level ARABIC_DIACRITICS pattern).
    stripped = re.sub(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]", "", folded)
    return re.sub(r"\s+", " ", stripped).strip()
def tokenize(s: str):
    """Split *s* into Arabic word tokens; non-Arabic characters act as separators."""
    # Anything outside the basic Arabic block (U+0600-U+06FF) becomes a
    # space, then runs of whitespace collapse to single separators.
    cleaned = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if not cleaned:
        return []
    return cleaned.split(" ")
def sim(a, b) -> float:
    """Similarity ratio in [0, 1] between sequences *a* and *b* (difflib)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def build_canon_words(canon: dict) -> list:
    """Flatten the canonical surah JSON into one record per canonical word.

    Each record carries the ayah number, the original word, and its
    normalized form (via normalize_ar) used for fuzzy matching.
    """
    return [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]


def global_align(canon_words, asr_tokens, similarity):
    """Needleman-Wunsch global alignment of canonical words against ASR tokens.

    Parameters:
        canon_words: list of dicts each carrying at least a "norm" key.
        asr_tokens: list of normalized ASR token strings.
        similarity: callable (str, str) -> float in [0, 1].

    Returns the alignment in canonical order as a list of dicts:
    {"canon": record, "asr_token": token-or-None, "score": rounded float,
    "match": bool}. ASR tokens skipped by the alignment produce no entry.
    """
    GAP = -0.45             # penalty for leaving a word/token unmatched
    MATCH_THRESHOLD = 0.72  # similarity needed to count as a match
    n, m = len(canon_words), len(asr_tokens)

    def match_score(i, j):
        # Center similarity around 0.75 so weak matches score below zero
        # and are out-competed by gap moves.
        return (similarity(canon_words[i]["norm"], asr_tokens[j]) - 0.75) * 2.0

    # DP score table plus backtrack moves:
    # 'D' diagonal (pair word/token), 'U' up (skip canonical word),
    # 'L' left (skip ASR token).
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    bt = [[None] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + GAP
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + GAP
        bt[0][j] = 'L'
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + GAP
            left = dp[i][j - 1] + GAP
            best = max(diag, up, left)
            dp[i][j] = best
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Backtrack from the bottom-right corner to recover aligned pairs.
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            cw = canon_words[i - 1]
            tok = asr_tokens[j - 1]
            s = similarity(cw["norm"], tok)
            aligned.append({
                "canon": cw,
                "asr_token": tok,
                "score": round(float(s), 3),
                "match": bool(s >= MATCH_THRESHOLD),
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # Canonical word with no ASR counterpart.
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            i -= 1
        else:  # 'L': extra ASR token, not reported in the output
            j -= 1
    aligned.reverse()
    return aligned


def main():
    """Align the raw ASR transcript against the canonical text and write a JSON report."""
    # Context managers close the handles deterministically (the original
    # left both input files and the output file open until GC).
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()

    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)
    canon_words = build_canon_words(canon)

    aligned = global_align(canon_words, asr_tokens, sim)

    total = len(canon_words)
    # Every aligned entry carries a canon record, so counting "match" directly
    # is equivalent to the original's `a["canon"] and a["match"]` test.
    matches = sum(1 for a in aligned if a["match"])
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": total - matches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    # Aligned entries always have a canon record, so the original's
    # `canon is None` skip was dead code; a plain slice is equivalent.
    for a in aligned[:8]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
# Script entry point: run the alignment only when executed directly.
if __name__ == "__main__":
    main()