# Global text alignment of raw ASR output against the canonical Fatiha word list.
| import json | |
| import re | |
| from difflib import SequenceMatcher | |
# Input: canonical surah JSON ("ayahs" -> list of word strings per ayah).
CANON_PATH = "data/fatiha_canonical.json"
# Input: raw ASR transcript text.
ASR_TEXT_PATH = "output/asr_raw.txt"
# Output: alignment report written by main().
OUT_PATH = "output/text_alignment_global.json"
# Arabic combining marks (harakat U+064B-U+0652, superscript alif U+0670,
# madda/hamza marks U+0653-U+0655) stripped during normalization.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character, removed outright.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Return *s* normalized for fuzzy Arabic comparison.

    Drops tatweel and combining diacritics, folds hamza-carrying alifs to
    bare alif, alif maqsura to ya, ta marbuta to ha, and collapses runs of
    whitespace to single spaces.
    """
    # Fold variant letterforms and drop tatweel in one translate pass; the
    # mapped characters are disjoint, so order relative to the diacritic
    # strip below does not matter.
    folded = s.translate(str.maketrans(
        {"\u0640": None, "أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه"}
    ))
    # Strip harakat / superscript alif / hamza combining marks
    # (same character class as the module-level ARABIC_DIACRITICS pattern).
    stripped = re.sub(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]", "", folded)
    return re.sub(r"\s+", " ", stripped).strip()
def tokenize(s: str):
    """Split *s* into Arabic word tokens; non-Arabic characters act as separators."""
    # Anything outside the basic Arabic block (U+0600-U+06FF) becomes a
    # space, then runs of whitespace collapse to single separators.
    cleaned = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if not cleaned:
        return []
    return cleaned.split(" ")
def sim(a, b) -> float:
    """Similarity ratio in [0, 1] between sequences *a* and *b* (difflib)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def build_canon_words(canon: dict) -> list:
    """Flatten the canonical surah JSON into one record per canonical word.

    Each record carries the ayah number, the original word, and its
    normalized form (via normalize_ar) used for fuzzy matching.
    """
    return [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]


def global_align(canon_words, asr_tokens, similarity):
    """Needleman-Wunsch global alignment of canonical words against ASR tokens.

    Parameters:
        canon_words: list of dicts each carrying at least a "norm" key.
        asr_tokens: list of normalized ASR token strings.
        similarity: callable (str, str) -> float in [0, 1].

    Returns the alignment in canonical order as a list of dicts:
    {"canon": record, "asr_token": token-or-None, "score": rounded float,
    "match": bool}. ASR tokens skipped by the alignment produce no entry.
    """
    GAP = -0.45             # penalty for leaving a word/token unmatched
    MATCH_THRESHOLD = 0.72  # similarity needed to count as a match
    n, m = len(canon_words), len(asr_tokens)

    def match_score(i, j):
        # Center similarity around 0.75 so weak matches score below zero
        # and are out-competed by gap moves.
        return (similarity(canon_words[i]["norm"], asr_tokens[j]) - 0.75) * 2.0

    # DP score table plus backtrack moves:
    # 'D' diagonal (pair word/token), 'U' up (skip canonical word),
    # 'L' left (skip ASR token).
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    bt = [[None] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + GAP
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + GAP
        bt[0][j] = 'L'
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + GAP
            left = dp[i][j - 1] + GAP
            best = max(diag, up, left)
            dp[i][j] = best
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Backtrack from the bottom-right corner to recover aligned pairs.
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            cw = canon_words[i - 1]
            tok = asr_tokens[j - 1]
            s = similarity(cw["norm"], tok)
            aligned.append({
                "canon": cw,
                "asr_token": tok,
                "score": round(float(s), 3),
                "match": bool(s >= MATCH_THRESHOLD),
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # Canonical word with no ASR counterpart.
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            i -= 1
        else:  # 'L': extra ASR token, not reported in the output
            j -= 1
    aligned.reverse()
    return aligned


def main():
    """Align the raw ASR transcript against the canonical text and write a JSON report."""
    # Context managers close the handles deterministically (the original
    # left both input files and the output file open until GC).
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()

    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)
    canon_words = build_canon_words(canon)

    aligned = global_align(canon_words, asr_tokens, sim)

    total = len(canon_words)
    # Every aligned entry carries a canon record, so counting "match" directly
    # is equivalent to the original's `a["canon"] and a["match"]` test.
    matches = sum(1 for a in aligned if a["match"])
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": total - matches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    # Aligned entries always have a canon record, so the original's
    # `canon is None` skip was dead code; a plain slice is equivalent.
    for a in aligned[:8]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
# Script entry point: run the alignment only when executed directly.
if __name__ == "__main__":
    main()