# iRecite-MVP-API / step14_align_text_to_canonical.py
# Author: didodev — commit 4ca6263 ("Deploy iRecite MVP API (Docker + FastAPI)")
import json
import re
from difflib import SequenceMatcher
CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment.json"
# --- Normalization helpers ---
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]") # harakat etc.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
s = s.replace(TATWEEL, "")
s = re.sub(ARABIC_DIACRITICS, "", s)
# normalize common variants
s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
s = s.replace("ى", "ي")
s = s.replace("ة", "ه")
s = re.sub(r"\s+", " ", s).strip()
return s
def tokenize(s: str):
# keep Arabic letters and spaces only
s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s.split(" ") if s else []
def sim(a, b) -> float:
return SequenceMatcher(None, a, b).ratio()
def main():
canon = json.load(open(CANON_PATH, encoding="utf-8"))
# Load ASR raw text (we will create it in 14.2)
raw = open(ASR_TEXT_PATH, encoding="utf-8").read().strip()
raw_n = normalize_ar(raw)
asr_tokens = tokenize(raw_n)
# Canonical tokens (word-level) from JSON
canon_words = []
for ay in canon["ayahs"]:
for w in ay["words"]:
canon_words.append({
"ayah": ay["ayah"],
"word": w,
"norm": normalize_ar(w)
})
# Greedy alignment: for each canonical word, find best match in a moving window of ASR tokens
aligned = []
j = 0
WINDOW = 6
for i, cw in enumerate(canon_words):
best = None
best_j = None
for k in range(j, min(len(asr_tokens), j + WINDOW)):
score = sim(cw["norm"], asr_tokens[k])
if (best is None) or (score > best):
best = score
best_j = k
if best is None:
aligned.append({
"canon": cw,
"asr_token": None,
"score": 0.0,
"match": False
})
continue
token = asr_tokens[best_j]
match = best >= 0.75 # MVP threshold
aligned.append({
"canon": cw,
"asr_token": token,
"score": round(float(best), 3),
"match": bool(match)
})
# advance pointer to keep order
j = best_j + 1
# Summaries
total = len(aligned)
matches = sum(1 for a in aligned if a["match"])
mismatches = total - matches
out = {
"asr_raw": raw,
"asr_normalized": raw_n,
"stats": {
"canonical_words": total,
"matches": matches,
"mismatches": mismatches,
"match_rate": round(matches / total, 3) if total else 0.0
},
"alignment": aligned
}
json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
print("OK ✅ wrote", OUT_PATH)
print("Match rate:", out["stats"]["match_rate"])
print("First 5 alignments:")
for a in aligned[:5]:
print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
if __name__ == "__main__":
main()