"""Globally align ASR output tokens against canonical word lists.

Reads the canonical JSON from ``CANON_PATH`` and the raw ASR transcript from
``ASR_TEXT_PATH``, normalizes both, aligns the two word sequences with a
global (end-to-end) dynamic-programming alignment and writes the alignment
plus summary statistics to ``OUT_PATH``.
"""

import json
import re
from difflib import SequenceMatcher

CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment_global.json"

ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
TATWEEL = "\u0640"

# --- Alignment scoring parameters ---
GAP_PENALTY = -0.45      # cost of skipping a canonical word or an ASR token
SIM_CENTER = 0.75        # similarity at which a pairing scores exactly 0
SIM_SCALE = 2.0          # scale applied to (similarity - SIM_CENTER)
MATCH_THRESHOLD = 0.72   # similarity at/above which a pair is flagged "match"

# Patterns hoisted to module level so they are compiled once.
_WHITESPACE_RUN = re.compile(r"\s+")
_NON_ARABIC = re.compile(r"[^\u0600-\u06FF\s]")

# Single-pass letter folding applied by normalize_ar().
# NOTE(review): U+0671 (alef wasla) is NOT folded here; if the canonical data
# uses it, those words will compare as different from plain alef -- confirm.
_LETTER_FOLD = str.maketrans({
    TATWEEL: None,        # drop elongation character
    "\u0623": "\u0627",   # alef with hamza above -> bare alef
    "\u0625": "\u0627",   # alef with hamza below -> bare alef
    "\u0622": "\u0627",   # alef with madda       -> bare alef
    "\u0649": "\u064A",   # alef maksura          -> yeh
    "\u0629": "\u0647",   # teh marbuta           -> heh
})


def normalize_ar(s: str) -> str:
    """Return *s* with diacritics/tatweel removed and letter variants folded.

    Alef variants become bare alef, alef maksura becomes yeh, teh marbuta
    becomes heh; runs of whitespace collapse to one space and the result is
    stripped.
    """
    s = ARABIC_DIACRITICS.sub("", s)
    s = s.translate(_LETTER_FOLD)
    return _WHITESPACE_RUN.sub(" ", s).strip()


def tokenize(s: str):
    """Split *s* into tokens made of characters in U+0600..U+06FF.

    Every character outside that range (other than whitespace) is treated as
    a separator. Returns an empty list when nothing remains.
    """
    s = _NON_ARABIC.sub(" ", s)
    s = _WHITESPACE_RUN.sub(" ", s).strip()
    return s.split(" ") if s else []


def sim(a, b) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of *a* and *b* (0.0-1.0)."""
    return SequenceMatcher(None, a, b).ratio()


def _flatten_canon(canon):
    """Flatten ``canon["ayahs"]`` into one ordered list of word records."""
    return [
        {"ayah": ayah["ayah"], "word": word, "norm": normalize_ar(word)}
        for ayah in canon["ayahs"]
        for word in ayah["words"]
    ]


def _global_align(canon_words, asr_tokens):
    """Align canonical words to ASR tokens end-to-end; return alignment rows.

    One row is produced per canonical word, in order. A word paired with a
    token carries that token and its similarity; a skipped word carries
    ``asr_token=None`` and score 0.0. ASR tokens that are skipped produce no
    row.
    """
    n = len(canon_words)
    m = len(asr_tokens)

    # Memoize similarity per (normalized word, token) pair: the same pair is
    # needed by the DP fill and again by the backtrack, and repeated words
    # or tokens would otherwise be re-compared from scratch.
    sim_cache = {}

    def similarity(i, j):
        key = (canon_words[i]["norm"], asr_tokens[j])
        if key not in sim_cache:
            sim_cache[key] = sim(key[0], key[1])
        return sim_cache[key]

    # dp[i][j]: best score aligning the first i words with the first j tokens.
    # bt[i][j]: move taken to reach the cell -- 'D' diag, 'U' up, 'L' left.
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    bt = [[None] * (m + 1) for _ in range(n + 1)]

    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + GAP_PENALTY
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + GAP_PENALTY
        bt[0][j] = 'L'

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            pair_score = (similarity(i - 1, j - 1) - SIM_CENTER) * SIM_SCALE
            diag = dp[i - 1][j - 1] + pair_score
            up = dp[i - 1][j] + GAP_PENALTY
            left = dp[i][j - 1] + GAP_PENALTY
            best = max(diag, up, left)
            dp[i][j] = best
            # Ties prefer the diagonal, then skipping the canonical word.
            if best == diag:
                bt[i][j] = 'D'
            elif best == up:
                bt[i][j] = 'U'
            else:
                bt[i][j] = 'L'

    # Walk back from the bottom-right corner, collecting rows in reverse.
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            s = similarity(i - 1, j - 1)
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": asr_tokens[j - 1],
                "score": round(float(s), 3),
                "match": bool(s >= MATCH_THRESHOLD),
            })
            i -= 1
            j -= 1
        elif move == 'U':
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            i -= 1
        else:
            # 'L': ASR token skipped, nothing recorded for it.
            j -= 1

    aligned.reverse()
    return aligned


def main():
    """Run the alignment pipeline and write the JSON report to ``OUT_PATH``."""
    # Context managers guarantee the handles are closed (and the output is
    # flushed) instead of relying on garbage collection.
    with open(CANON_PATH, encoding="utf-8") as fh:
        canon = json.load(fh)
    with open(ASR_TEXT_PATH, encoding="utf-8") as fh:
        raw = fh.read().strip()

    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)
    canon_words = _flatten_canon(canon)

    aligned = _global_align(canon_words, asr_tokens)

    total = len(canon_words)
    matches = sum(1 for row in aligned if row["match"])
    mismatches = total - matches

    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }

    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    # Every row carries a canonical word, so the first 8 rows are the preview.
    for row in aligned[:8]:
        print("-", row["canon"]["word"], "=>", row["asr_token"],
              "score", row["score"], "match", row["match"])


if __name__ == "__main__":
    main()