File size: 3,224 Bytes
4ca6263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
import re
from difflib import SequenceMatcher

# Input/output locations for the alignment step (paths relative to repo root).
CANON_PATH = "data/fatiha_canonical.json"  # canonical Fatiha word list (JSON: ayahs -> words)
ASR_TEXT_PATH = "output/asr_raw.txt"  # raw ASR transcript (plain UTF-8 text)
OUT_PATH = "output/text_alignment.json"  # alignment report written by main()

# --- Normalization helpers ---
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")  # harakat etc.
TATWEEL = "\u0640"

# One-pass character fold: drop tatweel, collapse hamza-carrying alef forms
# to bare alef, alef maqsura to ya, and ta marbuta to ha.
_CHAR_FOLD = str.maketrans({
    TATWEEL: None,
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ى": "ي",
    "ة": "ه",
})

def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy comparison.

    Strips diacritics, folds common letter variants to a canonical form,
    and collapses all whitespace runs to single spaces (trimmed at the ends).
    """
    stripped = ARABIC_DIACRITICS.sub("", s)
    folded = stripped.translate(_CHAR_FOLD)
    return " ".join(folded.split())

def tokenize(s: str):
    """Split *s* into tokens of consecutive Arabic-block characters.

    Every character outside U+0600–U+06FF (whitespace included) acts as a
    separator, so each token is a maximal Arabic run. Returns [] when the
    string contains no Arabic characters.
    """
    return re.findall(r"[\u0600-\u06FF]+", s)

def sim(a, b) -> float:
    """Return a similarity ratio in [0, 1] between two sequences (difflib)."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

def main():
    """Align canonical Fatiha words against a raw ASR transcript.

    Reads the canonical word structure from CANON_PATH and the raw ASR text
    from ASR_TEXT_PATH, greedily aligns each canonical word to the
    best-matching ASR token inside a forward-moving window, then writes a
    JSON report (raw/normalized text, match stats, per-word alignment) to
    OUT_PATH and prints a short summary.
    """
    # Use context managers so file handles are closed deterministically
    # (the originals leaked three open handles).
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)

    # Load ASR raw text (we will create it in 14.2)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)

    asr_tokens = tokenize(raw_n)

    # Canonical tokens (word-level) from JSON: one record per word, keeping
    # both the original surface form and its normalized form for matching.
    canon_words = [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]

    # Greedy alignment: for each canonical word, find the best match in a
    # moving window of ASR tokens. The pointer j only moves forward, so
    # matched tokens are consumed in order.
    aligned = []
    j = 0  # index of the first not-yet-consumed ASR token
    WINDOW = 6

    for cw in canon_words:
        best = None  # best similarity score seen in the window
        best_j = None  # index of the ASR token achieving `best`
        for k in range(j, min(len(asr_tokens), j + WINDOW)):
            score = sim(cw["norm"], asr_tokens[k])
            if (best is None) or (score > best):
                best = score
                best_j = k

        if best is None:
            # Window was empty (ASR ran out of tokens): record the canonical
            # word as unmatched and keep the pointer where it is.
            aligned.append({
                "canon": cw,
                "asr_token": None,
                "score": 0.0,
                "match": False
            })
            continue

        token = asr_tokens[best_j]
        match = best >= 0.75  # MVP threshold

        aligned.append({
            "canon": cw,
            "asr_token": token,
            "score": round(float(best), 3),
            "match": bool(match)
        })

        # advance pointer past the consumed token to keep order
        j = best_j + 1

    # Summaries
    total = len(aligned)
    matches = sum(1 for a in aligned if a["match"])
    mismatches = total - matches

    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "matches": matches,
            "mismatches": mismatches,
            # Guard against an empty canonical list.
            "match_rate": round(matches / total, 3) if total else 0.0
        },
        "alignment": aligned
    }

    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 5 alignments:")
    for a in aligned[:5]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])

# Script entry point: run the alignment only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()