# Align raw ASR output against the canonical Al-Fatiha text (global alignment).
import json
import re
from difflib import SequenceMatcher
# Input: canonical Al-Fatiha word list (per-ayah JSON) — presumably produced
# by an upstream preparation step; confirm against the data pipeline.
CANON_PATH = "data/fatiha_canonical.json"
# Input: raw ASR transcript text.
ASR_TEXT_PATH = "output/asr_raw.txt"
# Output: JSON report with the per-word alignment and summary stats.
OUT_PATH = "output/text_alignment_global.json"
# Arabic diacritics: tashkeel U+064B–U+0652, superscript alef U+0670,
# madda/hamza marks U+0653–U+0655.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy comparison.

    Strips tatweel and diacritics, unifies alef / ya / ta-marbuta variants,
    and collapses whitespace. Canonical words and ASR output must both pass
    through this function so they compare consistently.
    """
    s = s.replace("\u0640", "")  # tatweel (kashida) elongation
    # Tashkeel U+064B-U+0652, superscript alef U+0670, madda/hamza U+0653-U+0655.
    s = re.sub(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]", "", s)
    # Unify alef variants (أ إ آ -> ا). Also folds U+0671 alef wasla (ٱ),
    # used by Uthmani Quranic orthography (e.g. "ٱلله") but absent from
    # typical ASR output -- without this, such words could never match.
    s = re.sub(r"[\u0623\u0625\u0622\u0671]", "\u0627", s)
    s = s.replace("\u0649", "\u064A")  # alef maqsura -> ya
    s = s.replace("\u0629", "\u0647")  # ta marbuta -> ha
    return re.sub(r"\s+", " ", s).strip()
def tokenize(s: str):
    """Return the Arabic word tokens of *s*; non-Arabic chars act as separators."""
    # Replace everything outside the Arabic block (U+0600-U+06FF) with a
    # space, then let str.split() do the collapsing and stripping in one step.
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    return arabic_only.split()
def sim(a, b) -> float:
    """Similarity ratio in [0, 1] between two strings (difflib's gestalt match)."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()
def _load_canon_words(path):
    """Load the canonical surah JSON and flatten it into per-word records.

    Each record carries the ayah number, the original word and its
    normalized form (the form used for similarity scoring).
    """
    with open(path, encoding="utf-8") as f:
        canon = json.load(f)
    words = []
    for ay in canon["ayahs"]:
        for w in ay["words"]:
            words.append({
                "ayah": ay["ayah"],
                "word": w,
                "norm": normalize_ar(w),
            })
    return words


def _global_align(canon_words, asr_tokens, gap=-0.45, match_threshold=0.72):
    """Needleman-Wunsch global alignment of canonical words vs. ASR tokens.

    gap: additive penalty for skipping one canonical word or one ASR token.
    match_threshold: similarity above which an aligned pair counts as a match.
    Returns the alignment in canonical order: skipped ASR tokens produce no
    entry, skipped canonical words get asr_token=None.
    """
    n = len(canon_words)
    m = len(asr_tokens)

    def match_score(i, j):
        # Center similarity around 0.75 so weak pairs score negative and the
        # DP prefers a gap over a bad substitution.
        return (sim(canon_words[i]["norm"], asr_tokens[j]) - 0.75) * 2.0

    # dp[i][j] = best score aligning first i canonical words to first j tokens.
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    # Backtrack moves: 'D' diagonal (pair), 'U' up (skip canonical word),
    # 'L' left (skip ASR token).
    bt = [[None] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + gap
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + gap
        bt[0][j] = 'L'
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + gap
            left = dp[i][j - 1] + gap
            best = max(diag, up, left)
            dp[i][j] = best
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Walk the backtrack matrix from the bottom-right corner.
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            cw = canon_words[i - 1]
            tok = asr_tokens[j - 1]
            s = sim(cw["norm"], tok)
            aligned.append({
                "canon": cw,
                "asr_token": tok,
                "score": round(float(s), 3),
                "match": bool(s >= match_threshold),
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # Canonical word with no ASR counterpart (deletion).
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            i -= 1
        else:  # 'L': spurious ASR token, dropped from the report
            j -= 1
    aligned.reverse()
    return aligned


def main():
    """Align the raw ASR transcript to the canonical text and write a JSON report."""
    canon_words = _load_canon_words(CANON_PATH)
    # Previously file handles were opened without `with` and never closed;
    # context managers guarantee close (and flush, for the output file).
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    aligned = _global_align(canon_words, asr_tokens)

    total = len(canon_words)
    # Every aligned entry carries a canon record; the extra truthiness check
    # is kept only as cheap defence against malformed entries.
    matches = sum(1 for a in aligned if a["canon"] and a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    shown = 0
    for a in aligned:
        if a["canon"] is None:
            continue
        print("-", a["canon"]["word"], "=>", a["asr_token"],
              "score", a["score"], "match", a["match"])
        shown += 1
        if shown >= 8:
            break


if __name__ == "__main__":
    main()