"""Align canonical Fatiha words against a raw ASR transcript.

Loads the canonical surah word list (JSON) and the raw ASR text, normalizes
both into a comparable Arabic form, then greedily aligns each canonical word
to the best-matching ASR token inside a small forward-moving window.
Writes a JSON report with per-word scores and summary statistics.
"""
import json
import re
from difflib import SequenceMatcher

CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment.json"

# --- Normalization helpers ---
# Harakat (fathatan..sukun), superscript alef, and hamza/madda combining marks.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")  # harakat etc.
TATWEEL = "\u0640"

# Minimum similarity ratio for a canonical word / ASR token pair to count
# as a match (MVP threshold).
MATCH_THRESHOLD = 0.75


def normalize_ar(s: str) -> str:
    """Return *s* with tatweel and diacritics stripped and letter variants unified.

    Collapses hamza-carrier alef forms to bare alef, alef maqsura to yaa,
    and taa marbuta to haa so ASR output and canonical text compare equal.
    """
    s = s.replace(TATWEEL, "")
    s = ARABIC_DIACRITICS.sub("", s)
    # normalize common variants
    s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    s = s.replace("ى", "ي")
    s = s.replace("ة", "ه")
    s = re.sub(r"\s+", " ", s).strip()
    return s


def tokenize(s: str) -> list[str]:
    """Split *s* into whitespace-delimited tokens of Arabic-block characters.

    Everything outside the U+0600–U+06FF Arabic block is treated as a
    separator; returns [] for an effectively empty string.
    """
    # keep Arabic letters and spaces only
    s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s.split(" ") if s else []


def sim(a: str, b: str) -> float:
    """Return the SequenceMatcher similarity ratio of *a* and *b* in [0, 1]."""
    return SequenceMatcher(None, a, b).ratio()


def main():
    """Run the end-to-end alignment and write the JSON report to OUT_PATH."""
    # Context managers so file handles are closed deterministically
    # (the original bare open() calls leaked them).
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)

    # Load ASR raw text (we will create it in 14.2)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    # Canonical tokens (word-level) from JSON
    canon_words = []
    for ay in canon["ayahs"]:
        for w in ay["words"]:
            canon_words.append({
                "ayah": ay["ayah"],
                "word": w,
                "norm": normalize_ar(w),
            })

    # Greedy alignment: for each canonical word, find best match in a moving
    # window of ASR tokens. The pointer only moves forward, so alignment
    # preserves word order; a word with no candidate left is a mismatch.
    aligned = []
    j = 0
    WINDOW = 6
    for cw in canon_words:
        best = None
        best_j = None
        for k in range(j, min(len(asr_tokens), j + WINDOW)):
            score = sim(cw["norm"], asr_tokens[k])
            if (best is None) or (score > best):
                best = score
                best_j = k

        if best is None:
            # Window is empty: ASR ran out of tokens before the canon did.
            aligned.append({
                "canon": cw,
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            continue

        token = asr_tokens[best_j]
        match = best >= MATCH_THRESHOLD  # MVP threshold
        aligned.append({
            "canon": cw,
            "asr_token": token,
            "score": round(float(best), 3),
            "match": bool(match),
        })
        # advance pointer to keep order
        j = best_j + 1

    # Summaries
    total = len(aligned)
    matches = sum(1 for a in aligned if a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }

    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 5 alignments:")
    for a in aligned[:5]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])


if __name__ == "__main__":
    main()