File size: 3,679 Bytes
4ca6263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import re
import librosa

AUDIO_PATH = "sample_trim.wav"  # input recitation audio (used only to measure duration)
ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"  # upstream canonical<->ASR alignment
OUT_PATH = "output/word_timestamps_v2.json"  # per-word timestamp JSON written by main()

# Combining marks stripped during normalization: harakat (fathatan..sukun),
# superscript alef, and the hamza combining marks.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
TATWEEL = "\u0640"

def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy matching.

    Drops tatweel and diacritics, folds hamza-carrying alef forms to bare
    alef, maps alef maqsura to ya and ta marbuta to ha, and collapses
    whitespace runs to single spaces.
    """
    # One-pass character mapping covers tatweel removal and the letter folds.
    fold_table = str.maketrans(
        {TATWEEL: None, "أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه"}
    )
    folded = s.translate(fold_table)
    stripped = ARABIC_DIACRITICS.sub("", folded)
    return re.sub(r"\s+", " ", stripped).strip()

def tokenize_ar_words(s: str):
    """Split *s* into Arabic word tokens; non-Arabic characters are dropped.

    Characters outside the basic Arabic Unicode block (U+0600-U+06FF) act as
    separators. Returns an empty list when nothing Arabic remains.
    """
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    collapsed = re.sub(r"\s+", " ", arabic_only).strip()
    if not collapsed:
        return []
    return collapsed.split(" ")

def main():
    """Assign per-word timestamps by spreading audio time evenly over ASR tokens.

    Loads the audio only to measure its duration, reads the global alignment
    JSON (canonical words matched to ASR tokens), builds an even token
    timeline, maps each canonical word to its matched token's time slot while
    keeping the mapping monotonic, and writes the result to OUT_PATH.
    """
    # Load audio purely to measure duration (resampled to 16 kHz mono).
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Load global alignment (has asr_raw + alignment pairs).
    # BUGFIX: use a context manager so the file handle is closed.
    with open(ALIGN_GLOBAL_PATH, encoding="utf-8") as f:
        g = json.load(f)
    asr_norm = normalize_ar(g["asr_raw"])
    asr_tokens = tokenize_ar_words(asr_norm)

    # Build token timeline: divide total audio time across ASR tokens evenly
    # (MVP approximation; later replace with real forced alignment).
    N = max(1, len(asr_tokens))
    token_times = [
        (round(i / N * total_sec, 3), round((i + 1) / N * total_sec, 3))
        for i in range(N)
    ]

    # Keep only alignment entries that carry a canonical word.
    alignment = [a for a in g["alignment"] if a.get("canon")]

    out_words = []
    last_token_idx = 0
    for idx, a in enumerate(alignment):
        cw = a["canon"]
        tok = a["asr_token"]

        if tok is not None:
            tok_norm = normalize_ar(tok)
            # Monotonic forward search: take the first exact match at or
            # after the previous match, keeping the word->token map in order.
            found = None
            for ti in range(last_token_idx, len(asr_tokens)):
                if asr_tokens[ti] == tok_norm:
                    found = ti
                    break

            if found is None:
                # Fallback: proportional position, kept monotonic.
                found = int((idx / max(1, len(alignment))) * (N - 1))
                found = max(found, last_token_idx)
                # BUGFIX: last_token_idx can equal N once the final token has
                # been consumed; clamp so token_times[found] stays in range.
                found = min(found, N - 1)

            t0, t1 = token_times[found]
            last_token_idx = found + 1
        else:
            # No matched token: proportional fallback (index stays < N).
            found = int((idx / max(1, len(alignment))) * (N - 1))
            t0, t1 = token_times[found]

        out_words.append({
            "index": idx + 1,
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": {"start": t0, "end": t1}
        })

    out = {
        "audio_path": AUDIO_PATH,
        "method": "token-time interpolation (MVP)",
        "stats": {
            "canonical_words": len(out_words),
            "asr_tokens": len(asr_tokens),
            "timestamped": len(out_words)
        },
        "words": out_words
    }

    # BUGFIX: context manager closes (and flushes) the output file.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print("OK ✅ wrote", OUT_PATH)
    # BUGFIX: the original compared len(out_words) to itself; report the
    # timestamped count against the canonical-word count instead.
    print("Words timestamped:", len(out_words), "/", len(alignment))
    # BUGFIX: guard against an empty alignment (IndexError on out_words[0]).
    if out_words:
        print("First:", out_words[0])
        print("Last:", out_words[-1])

# Script entry point: run the timestamping pipeline when executed directly.
if __name__ == "__main__":
    main()