File size: 4,016 Bytes
4ca6263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import json
import re
from difflib import SequenceMatcher

# Input: canonical Fatiha text as JSON (per-ayah word lists; see main()).
CANON_PATH = "data/fatiha_canonical.json"
# Input: raw ASR transcript as plain UTF-8 text.
ASR_TEXT_PATH = "output/asr_raw.txt"
# Output: global word-alignment report written by main().
OUT_PATH = "output/text_alignment_global.json"

# Combining marks stripped during normalization: Arabic short vowels,
# tanween, shadda, sukun (U+064B..U+0652), superscript alef (U+0670),
# and maddah / hamza-above / hamza-below marks (U+0653..U+0655).
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character, removed during normalization.
TATWEEL = "\u0640"

def normalize_ar(s: str) -> str:
    """Return *s* normalized for fuzzy comparison of Arabic text.

    Drops tatweel and diacritics, folds hamza-carrying alef variants to
    bare alef, maps alef maqsura to ya and ta marbuta to ha, and squeezes
    whitespace runs to single spaces.
    """
    # Fold letter variants and drop tatweel in one translate() pass
    # (order relative to diacritic stripping is irrelevant: the two
    # character sets are disjoint).
    folded = s.translate(str.maketrans({
        TATWEEL: None,
        "أ": "ا", "إ": "ا", "آ": "ا",  # hamza-carrying alefs -> bare alef
        "ى": "ي",                       # alef maqsura -> ya
        "ة": "ه",                       # ta marbuta -> ha
    }))
    folded = ARABIC_DIACRITICS.sub("", folded)  # strip combining marks
    return re.sub(r"\s+", " ", folded).strip()  # squeeze whitespace

def tokenize(s: str):
    """Split *s* into Arabic word tokens, discarding non-Arabic characters.

    Any character outside the Arabic Unicode block (latin letters, digits,
    punctuation, whitespace) acts purely as a token separator.
    """
    return re.findall(r"[\u0600-\u06FF]+", s)

def sim(a, b) -> float:
    """Return the difflib similarity ratio (0.0..1.0) between two strings."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

def main():
    """Globally align canonical Fatiha words against a raw ASR transcript.

    Reads the canonical word list (CANON_PATH) and the ASR text
    (ASR_TEXT_PATH), runs a Needleman-Wunsch-style global alignment over
    normalized tokens, and writes per-word alignment records plus summary
    statistics to OUT_PATH.
    """
    # Open all files with context managers so handles are closed promptly
    # (the original leaked three open file objects).
    with open(CANON_PATH, encoding="utf-8") as fh:
        canon = json.load(fh)
    with open(ASR_TEXT_PATH, encoding="utf-8") as fh:
        raw = fh.read().strip()
    raw_n = normalize_ar(raw)

    asr_tokens = tokenize(raw_n)

    # Flatten the per-ayah structure into one ordered list of word records.
    canon_words = []
    for ay in canon["ayahs"]:
        for w in ay["words"]:
            canon_words.append({
                "ayah": ay["ayah"],
                "word": w,
                "norm": normalize_ar(w)
            })

    # --- Global alignment DP ---
    n = len(canon_words)
    m = len(asr_tokens)

    # scoring
    GAP = -0.45  # penalty for skipping a token/word

    def match_score(i, j):
        # Reward similarity, centered on 0.75: pairs above the threshold
        # contribute positively, pairs below it negatively.
        s = sim(canon_words[i]["norm"], asr_tokens[j])
        return (s - 0.75) * 2.0  # >0 is good match

    # DP matrices. dp holds the best cumulative score; bt the backtrack
    # move: 'D' diagonal (pair word with token), 'U' up (skip canonical
    # word), 'L' left (skip ASR token).
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    bt = [[None] * (m + 1) for _ in range(n + 1)]

    # Edge initialization: aligning a prefix against nothing costs one
    # GAP per skipped element.
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + GAP
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + GAP
        bt[0][j] = 'L'

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + GAP
            left = dp[i][j - 1] + GAP
            best = max(diag, up, left)
            dp[i][j] = best
            # Tie-break deliberately prefers pairing ('D'), then skipping
            # a canonical word ('U').
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Backtrack from the corner to recover alignment pairs (built in
    # reverse, flipped below). The edge rows/columns are fully populated
    # with 'U'/'L', so the walk always terminates at (0, 0).
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            cw = canon_words[i - 1]
            tok = asr_tokens[j - 1]
            s = sim(cw["norm"], tok)
            aligned.append({
                "canon": cw,
                "asr_token": tok,
                "score": round(float(s), 3),
                # 0.72 is the accept threshold for calling a pair a match.
                "match": bool(s >= 0.72)
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # Canonical word with no ASR counterpart (deletion).
            cw = canon_words[i - 1]
            aligned.append({
                "canon": cw,
                "asr_token": None,
                "score": 0.0,
                "match": False
            })
            i -= 1
        else:  # 'L'
            # Spurious ASR token (insertion) — not recorded in the output.
            j -= 1

    aligned.reverse()

    total = len(canon_words)
    matches = sum(1 for a in aligned if a["canon"] and a["match"])
    mismatches = total - matches

    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": mismatches,
            # Guard against an empty canonical word list.
            "match_rate": round(matches / total, 3) if total else 0.0
        },
        "alignment": aligned
    }

    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    shown = 0
    for a in aligned:
        if a["canon"] is None:
            continue
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
        shown += 1
        if shown >= 8:
            break

if __name__ == "__main__":
    main()