import json
import re

AUDIO_PATH = "sample_trim.wav"
ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"
OUT_PATH = "output/word_timestamps_v2.json"

# Arabic diacritic marks (harakat U+064B–U+0652, superscript alef U+0670,
# and the hamza/madda combining marks) that are stripped during normalization.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
TATWEEL = "\u0640"


def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy matching.

    Removes tatweel and diacritics, unifies hamza-carrying alef forms to
    bare alef, alef maqsura to ya, ta marbuta to ha, and collapses
    whitespace runs to single spaces.
    """
    s = s.replace(TATWEEL, "")
    s = ARABIC_DIACRITICS.sub("", s)
    s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    s = s.replace("ى", "ي")
    s = s.replace("ة", "ه")
    return re.sub(r"\s+", " ", s).strip()


def tokenize_ar_words(s: str) -> list:
    """Split *s* into Arabic word tokens.

    Any character outside the Arabic Unicode block (U+0600–U+06FF) is
    treated as a separator. Returns [] for text with no Arabic content.
    """
    s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s.split(" ") if s else []


def build_token_times(n_tokens: int, total_sec: float) -> list:
    """Divide *total_sec* evenly across *n_tokens* slots.

    Returns a list of (start, end) pairs rounded to milliseconds.
    At least one slot is produced even when n_tokens <= 0, mirroring the
    N = max(1, len(tokens)) guard used by the caller.
    """
    n = max(1, n_tokens)
    return [
        (round(i / n * total_sec, 3), round((i + 1) / n * total_sec, 3))
        for i in range(n)
    ]


def main():
    # Lazy import: librosa is heavy and only needed when the script runs;
    # keeping it here lets the pure text helpers above be imported/tested
    # without the audio stack installed.
    import librosa

    # Load audio only to measure its duration (mono, 16 kHz).
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Load global alignment (has asr_raw + alignment pairs).
    with open(ALIGN_GLOBAL_PATH, encoding="utf-8") as fh:
        g = json.load(fh)
    asr_tokens = tokenize_ar_words(normalize_ar(g["asr_raw"]))

    # Token timeline: divide total audio time evenly across ASR tokens
    # (MVP approximation; later replace with real forced alignment).
    N = max(1, len(asr_tokens))
    token_times = build_token_times(N, total_sec)

    # Assign each canonical word the timestamp of its matched ASR token
    # (if any), otherwise interpolate from its index in the canonical
    # sequence.
    alignment = [a for a in g["alignment"] if a.get("canon")]
    out_words = []
    last_token_idx = 0
    for idx, a in enumerate(alignment):
        cw = a["canon"]
        tok = a["asr_token"]
        if tok is not None:
            tok_norm = normalize_ar(tok)
            # Monotonic forward search: only look from the last consumed
            # token index so the word→token mapping never goes backwards.
            found = None
            for ti in range(last_token_idx, len(asr_tokens)):
                if asr_tokens[ti] == tok_norm:
                    found = ti
                    break
            if found is None:
                # Fallback: proportional position, kept monotonic.
                # BUG FIX: clamp to N - 1 — last_token_idx can equal N
                # after a match on the final token, and indexing
                # token_times[N] raised IndexError in the original.
                found = int((idx / max(1, len(alignment))) * (N - 1))
                found = min(max(found, last_token_idx), N - 1)
            t0, t1 = token_times[found]
            last_token_idx = found + 1
        else:
            # No matched ASR token: pure proportional fallback (does not
            # advance the monotonic cursor).
            found = int((idx / max(1, len(alignment))) * (N - 1))
            t0, t1 = token_times[found]
        out_words.append({
            "index": idx + 1,
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": {"start": t0, "end": t1}
        })

    out = {
        "audio_path": AUDIO_PATH,
        "method": "token-time interpolation (MVP)",
        "stats": {
            "canonical_words": len(out_words),
            "asr_tokens": len(asr_tokens),
            "timestamped": len(out_words)
        },
        "words": out_words
    }
    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Words timestamped:", len(out_words), "/", len(out_words))
    # BUG FIX: guard against an empty alignment — the original indexed
    # out_words[0] / out_words[-1] unconditionally and crashed on [].
    if out_words:
        print("First:", out_words[0])
        print("Last:", out_words[-1])


if __name__ == "__main__":
    main()