# NOTE(review): the following lines are hosting-UI status text captured during
# export, not part of the program:
#   Spaces: Sleeping
#   Sleeping
| import json | |
| import re | |
| import librosa | |
# Input audio clip; its duration anchors the evenly-divided token timeline.
AUDIO_PATH = "sample_trim.wav"
# Global text-alignment JSON produced upstream (carries asr_raw + alignment pairs).
ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"
# Destination for the per-word timestamp JSON written by main().
OUT_PATH = "output/word_timestamps_v2.json"
# Arabic combining marks to strip: harakat/tanwin (U+064B–U+0652), dagger
# alif (U+0670), and the maddah/hamza marks (U+0653–U+0655).
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Kashida/tatweel elongation character, removed during normalization.
TATWEEL = "\u0640"

# Single-pass character map (one C-level str.translate call instead of five
# chained .replace calls): drop tatweel, fold hamza-carrying alifs to bare
# alif, alif maqsura -> ya, ta marbuta -> ha.
_AR_CHAR_MAP = str.maketrans({
    TATWEEL: None,
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ى": "ي",
    "ة": "ه",
})


def normalize_ar(s: str) -> str:
    """Normalize Arabic text for fuzzy token matching.

    Removes tatweel and diacritics, folds hamza/alif variants to bare alif,
    maps alif maqsura to ya and ta marbuta to ha, then collapses runs of
    whitespace to single spaces and strips the ends.
    """
    s = s.translate(_AR_CHAR_MAP)
    s = ARABIC_DIACRITICS.sub("", s)  # pattern precompiled at module load
    return re.sub(r"\s+", " ", s).strip()
def tokenize_ar_words(s: str):
    """Return the Arabic word tokens of *s*, discarding everything else.

    A token is a maximal run of characters in the Arabic Unicode block
    (U+0600–U+06FF); returns an empty list when *s* contains none.
    """
    # Equivalent to: blank out non-Arabic chars, collapse whitespace, split —
    # expressed directly as "find every maximal Arabic run".
    return re.findall(r"[\u0600-\u06FF]+", s)
def main():
    """Assign approximate timestamps to canonical words by spreading the
    audio duration evenly across ASR tokens (MVP stand-in for real forced
    alignment).

    Reads AUDIO_PATH and ALIGN_GLOBAL_PATH; writes OUT_PATH.
    """
    # Load the audio only to measure its total duration in seconds.
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Global alignment file: raw ASR transcript plus canon/ASR word pairs.
    # BUGFIX: use a context manager instead of leaking a bare open() handle.
    with open(ALIGN_GLOBAL_PATH, encoding="utf-8") as f:
        g = json.load(f)
    asr_tokens = tokenize_ar_words(normalize_ar(g["asr_raw"]))

    # Build token timeline: divide total audio time across ASR tokens evenly.
    # (MVP approximation; later replace with real forced alignment.)
    N = max(1, len(asr_tokens))
    token_times = [
        (round((i / N) * total_sec, 3), round(((i + 1) / N) * total_sec, 3))
        for i in range(N)
    ]

    # Assign each canonical word the timestamp of its matched ASR token
    # (if any), otherwise interpolate from its index in the sequence.
    alignment = [a for a in g["alignment"] if a.get("canon")]
    out_words = []
    last_token_idx = 0
    for idx, a in enumerate(alignment):
        cw = a["canon"]
        tok = a["asr_token"]
        if tok is not None:
            tok_norm = normalize_ar(tok)
            # Forward-only search from the last consumed token keeps the
            # word -> token mapping monotonic.
            found = None
            for ti in range(last_token_idx, len(asr_tokens)):
                if asr_tokens[ti] == tok_norm:
                    found = ti
                    break
            if found is None:
                # Fallback: proportional index, kept monotonic.
                found = int((idx / max(1, len(alignment))) * (N - 1))
                # BUGFIX: clamp to the last valid index — bumping up to
                # last_token_idx alone could reach N and overrun token_times.
                found = min(max(found, last_token_idx), N - 1)
            t0, t1 = token_times[found]
            last_token_idx = found + 1
        else:
            # No matched token: purely proportional fallback (deliberately
            # does not advance last_token_idx).
            found = int((idx / max(1, len(alignment))) * (N - 1))
            t0, t1 = token_times[found]
        out_words.append({
            "index": idx + 1,
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": {"start": t0, "end": t1}
        })

    out = {
        "audio_path": AUDIO_PATH,
        "method": "token-time interpolation (MVP)",
        "stats": {
            "canonical_words": len(out_words),
            "asr_tokens": len(asr_tokens),
            "timestamped": len(out_words)
        },
        "words": out_words
    }
    # BUGFIX: write through a context manager so the file is flushed and
    # closed deterministically.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Words timestamped:", len(out_words), "/", len(out_words))
    # BUGFIX: guard the sample prints — indexing an empty list would crash
    # when the alignment has no canonical words.
    if out_words:
        print("First:", out_words[0])
        print("Last:", out_words[-1])
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()