# iRecite-MVP-API / step15_global_word_alignment.py
# Author: didodev — commit 4ca6263 "Deploy iRecite MVP API (Docker + FastAPI)"
import json
import re
from difflib import SequenceMatcher
# Input/output locations, relative to the repo root.
CANON_PATH = "data/fatiha_canonical.json"  # canonical Surah Al-Fatiha, word-by-word per ayah
ASR_TEXT_PATH = "output/asr_raw.txt"  # raw ASR transcript to align against the canon
OUT_PATH = "output/text_alignment_global.json"  # alignment report written by main()
# Arabic combining marks stripped during normalization: tanwin/harakat/shadda/
# sukun (U+064B–U+0652), superscript alef (U+0670), and madda/hamza marks
# (U+0653–U+0655).
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
TATWEEL = "\u0640"

# Single-character orthographic folding, applied after diacritics are removed:
# tatweel deleted; alef variants folded to bare alef; alef maqsura -> ya;
# ta marbuta -> ha. One str.translate pass instead of chained .replace() calls.
_CHAR_FOLD = str.maketrans({
    TATWEEL: None,
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ٱ": "ا",  # alef wasla — ubiquitous in Quranic orthography (e.g. ٱللَّه)
    "ى": "ي",
    "ة": "ه",
})


def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy word comparison.

    Strips diacritics and tatweel, folds alef/ya/ta-marbuta variants to a
    canonical form, and collapses whitespace runs to single spaces.

    Fix over the previous version: alef wasla (U+0671) is now folded to plain
    alef too, so canonical Quranic spellings line up with typical ASR output.
    """
    s = ARABIC_DIACRITICS.sub("", s)
    s = s.translate(_CHAR_FOLD)
    return re.sub(r"\s+", " ", s).strip()
def tokenize(s: str):
    """Split *s* into Arabic tokens, discarding all non-Arabic characters."""
    # Keep only the Arabic Unicode block and whitespace; everything else
    # becomes a separator. str.split() with no argument then collapses any
    # run of whitespace and drops leading/trailing separators in one step.
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    return arabic_only.split()
def sim(a, b) -> float:
    """Return difflib's similarity ratio, in [0.0, 1.0], for the two strings."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()
def main():
    """Globally align canonical Fatiha words to ASR tokens and write a report.

    Reads the canonical surah (CANON_PATH) and the raw ASR transcript
    (ASR_TEXT_PATH), runs a Needleman-Wunsch-style global alignment over
    normalized words, and writes per-word match results plus summary stats
    to OUT_PATH. Fixes over the previous version: all three file handles are
    now closed via context managers, and two dead/redundant checks on
    a["canon"] (which is never None in `aligned`) were removed.
    """
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    # Flatten the surah into word records, keeping the ayah number and a
    # normalized form for fuzzy comparison.
    canon_words = [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]

    # --- Global alignment DP (Needleman-Wunsch style) ---
    n = len(canon_words)
    m = len(asr_tokens)
    GAP = -0.45  # penalty for skipping a canonical word or an ASR token

    def match_score(i, j):
        # Reward similarity, centered around 0.75 so that only reasonably
        # close pairs score positively (>0 means "good match").
        s = sim(canon_words[i]["norm"], asr_tokens[j])
        return (s - 0.75) * 2.0

    # dp[i][j] = best score aligning the first i canonical words with the
    # first j ASR tokens; bt records the move that achieved it
    # ('D' diag / 'U' up / 'L' left). Ties deliberately prefer D, then U.
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    bt = [[None] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + GAP
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + GAP
        bt[0][j] = 'L'
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + GAP
            left = dp[i][j - 1] + GAP
            best = max(diag, up, left)
            dp[i][j] = best
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Backtrack from (n, m) to recover the aligned pairs in order.
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            cw = canon_words[i - 1]
            tok = asr_tokens[j - 1]
            s = sim(cw["norm"], tok)
            aligned.append({
                "canon": cw,
                "asr_token": tok,
                "score": round(float(s), 3),
                "match": bool(s >= 0.72),
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # Canonical word with no ASR counterpart (omitted in recitation).
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            i -= 1
        else:  # 'L': extra ASR token with no canonical counterpart; dropped
            j -= 1
    aligned.reverse()

    total = len(canon_words)
    # Every entry in `aligned` carries a canon record ('L' moves append
    # nothing above), so counting matches only needs the "match" flag.
    matches = sum(1 for a in aligned if a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }
    # Close the output handle deterministically instead of leaking it.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    for a in aligned[:8]:
        print("-", a["canon"]["word"], "=>", a["asr_token"],
              "score", a["score"], "match", a["match"])


if __name__ == "__main__":
    main()