# Align raw ASR output against the canonical Al-Fatiha text (global alignment).
import json
import re
from difflib import SequenceMatcher
# Input: canonical Al-Fatiha word list (per-ayah JSON) — presumably produced
# by an upstream preparation step; confirm against the data pipeline.
CANON_PATH = "data/fatiha_canonical.json"
# Input: raw ASR transcript text.
ASR_TEXT_PATH = "output/asr_raw.txt"
# Output: JSON report with the per-word alignment and summary stats.
OUT_PATH = "output/text_alignment_global.json"
# Arabic diacritics: tashkeel U+064B–U+0652, superscript alef U+0670,
# madda/hamza marks U+0653–U+0655.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy comparison.

    Strips tatweel and diacritics, unifies alef / ya / ta-marbuta variants,
    and collapses whitespace. Canonical words and ASR output must both pass
    through this function so they compare consistently.
    """
    s = s.replace("\u0640", "")  # tatweel (kashida) elongation
    # Tashkeel U+064B-U+0652, superscript alef U+0670, madda/hamza U+0653-U+0655.
    s = re.sub(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]", "", s)
    # Unify alef variants (أ إ آ -> ا). Also folds U+0671 alef wasla (ٱ),
    # used by Uthmani Quranic orthography (e.g. "ٱلله") but absent from
    # typical ASR output -- without this, such words could never match.
    s = re.sub(r"[\u0623\u0625\u0622\u0671]", "\u0627", s)
    s = s.replace("\u0649", "\u064A")  # alef maqsura -> ya
    s = s.replace("\u0629", "\u0647")  # ta marbuta -> ha
    return re.sub(r"\s+", " ", s).strip()
def tokenize(s: str):
    """Return the Arabic word tokens of *s*; non-Arabic chars act as separators."""
    # Replace everything outside the Arabic block (U+0600-U+06FF) with a
    # space, then let str.split() do the collapsing and stripping in one step.
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    return arabic_only.split()
def sim(a, b) -> float:
    """Similarity ratio in [0, 1] between two strings (difflib's gestalt match)."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()
def _load_canon_words(path):
    """Load the canonical surah JSON and flatten it into per-word records.

    Each record carries the ayah number, the original word and its
    normalized form (the form used for similarity scoring).
    """
    with open(path, encoding="utf-8") as f:
        canon = json.load(f)
    words = []
    for ay in canon["ayahs"]:
        for w in ay["words"]:
            words.append({
                "ayah": ay["ayah"],
                "word": w,
                "norm": normalize_ar(w),
            })
    return words


def _global_align(canon_words, asr_tokens, gap=-0.45, match_threshold=0.72):
    """Needleman-Wunsch global alignment of canonical words vs. ASR tokens.

    gap: additive penalty for skipping one canonical word or one ASR token.
    match_threshold: similarity above which an aligned pair counts as a match.
    Returns the alignment in canonical order: skipped ASR tokens produce no
    entry, skipped canonical words get asr_token=None.
    """
    n = len(canon_words)
    m = len(asr_tokens)

    def match_score(i, j):
        # Center similarity around 0.75 so weak pairs score negative and the
        # DP prefers a gap over a bad substitution.
        return (sim(canon_words[i]["norm"], asr_tokens[j]) - 0.75) * 2.0

    # dp[i][j] = best score aligning first i canonical words to first j tokens.
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    # Backtrack moves: 'D' diagonal (pair), 'U' up (skip canonical word),
    # 'L' left (skip ASR token).
    bt = [[None] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + gap
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + gap
        bt[0][j] = 'L'
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + gap
            left = dp[i][j - 1] + gap
            best = max(diag, up, left)
            dp[i][j] = best
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Walk the backtrack matrix from the bottom-right corner.
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            cw = canon_words[i - 1]
            tok = asr_tokens[j - 1]
            s = sim(cw["norm"], tok)
            aligned.append({
                "canon": cw,
                "asr_token": tok,
                "score": round(float(s), 3),
                "match": bool(s >= match_threshold),
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # Canonical word with no ASR counterpart (deletion).
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False,
            })
            i -= 1
        else:  # 'L': spurious ASR token, dropped from the report
            j -= 1
    aligned.reverse()
    return aligned


def main():
    """Align the raw ASR transcript to the canonical text and write a JSON report."""
    canon_words = _load_canon_words(CANON_PATH)
    # Previously file handles were opened without `with` and never closed;
    # context managers guarantee close (and flush, for the output file).
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    aligned = _global_align(canon_words, asr_tokens)

    total = len(canon_words)
    # Every aligned entry carries a canon record; the extra truthiness check
    # is kept only as cheap defence against malformed entries.
    matches = sum(1 for a in aligned if a["canon"] and a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0,
        },
        "alignment": aligned,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    shown = 0
    for a in aligned:
        if a["canon"] is None:
            continue
        print("-", a["canon"]["word"], "=>", a["asr_token"],
              "score", a["score"], "match", a["match"])
        shown += 1
        if shown >= 8:
            break


if __name__ == "__main__":
    main()