# Align raw ASR output for Surah Al-Fatiha against the canonical word list.
| import json | |
| import re | |
| from difflib import SequenceMatcher | |
# Input/output paths: canonical word-level JSON, raw ASR transcript, report.
CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment.json"
# --- Normalization helpers --- |
# Harakat (U+064B..U+0652), superscript alef (U+0670) and hamza marks
# (U+0653..U+0655) — all stripped during normalization.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]") # harakat etc.
# Kashida/tatweel elongation character, removed before matching.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy matching.

    Strips tatweel and diacritics, unifies hamza-carrying alef forms,
    alef maqsura and ta marbuta, and collapses whitespace.
    """
    bare = ARABIC_DIACRITICS.sub("", s.replace(TATWEEL, ""))
    # Unify common orthographic variants to a single canonical letter.
    for variant, canonical in (
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    ):
        bare = bare.replace(variant, canonical)
    return re.sub(r"\s+", " ", bare).strip()
def tokenize(s: str):
    """Split *s* into word tokens, keeping only Arabic-block characters."""
    # Replace anything outside the Arabic Unicode block (U+0600-U+06FF)
    # or whitespace with a space, then collapse whitespace runs.
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    collapsed = re.sub(r"\s+", " ", arabic_only).strip()
    if not collapsed:
        return []
    return collapsed.split(" ")
def sim(a, b) -> float:
    """Return a similarity ratio in [0.0, 1.0] between two sequences."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def main():
    """Align canonical Fatiha words against a raw ASR transcript.

    Reads CANON_PATH (word-level canonical JSON) and ASR_TEXT_PATH (raw ASR
    text), greedily matches each canonical word to its best ASR token inside
    a small forward-moving window, and writes an alignment report with
    summary stats to OUT_PATH.
    """
    # Use context managers so file handles are closed deterministically
    # (the original relied on the GC to close all three files).
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    # Load ASR raw text (we will create it in 14.2).
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()

    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    # Canonical tokens (word-level) flattened from the JSON structure.
    canon_words = [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]

    # Greedy alignment: for each canonical word, find the best match in a
    # moving window of ASR tokens. Pointer j only advances, keeping the
    # alignment monotonic (in transcript order).
    aligned = []
    j = 0
    WINDOW = 6
    for cw in canon_words:
        best = None
        best_j = None
        for k in range(j, min(len(asr_tokens), j + WINDOW)):
            score = sim(cw["norm"], asr_tokens[k])
            if best is None or score > best:
                best = score
                best_j = k
        if best is None:
            # ASR tokens exhausted: no candidate remains in the window.
            aligned.append({
                "canon": cw,
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            continue
        aligned.append({
            "canon": cw,
            "asr_token": asr_tokens[best_j],
            "score": round(float(best), 3),
            "match": bool(best >= 0.75),  # MVP threshold
        })
        # Advance pointer to keep order.
        j = best_j + 1

    # Summaries.
    total = len(aligned)
    matches = sum(1 for a in aligned if a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 5 alignments:")
    for a in aligned[:5]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
# Script entry point: run the alignment only when executed directly.
if __name__ == "__main__":
    main()