# iRecite-MVP-API / step16b_token_interpolation_timestamps.py
# didodev — Deploy iRecite MVP API (Docker + FastAPI), commit 4ca6263
import json
import re
import librosa
# Input audio clip; its duration anchors the interpolated token timeline.
AUDIO_PATH = "sample_trim.wav"
# Global ASR/canonical alignment JSON produced by an earlier pipeline step.
ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"
# Destination for the per-word timestamp JSON written by main().
OUT_PATH = "output/word_timestamps_v2.json"
# Arabic diacritic marks: harakat (U+064B..U+0652), superscript alef (U+0670),
# and hamza/madda combining marks (U+0653..U+0655) — stripped during normalization.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character, also stripped during normalization.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Normalize Arabic text for matching.

    Strips tatweel and diacritic marks, folds hamza-carrying alef forms to
    bare alef, alef maqsura to ya, and ta marbuta to ha, then collapses
    whitespace runs into single spaces.
    """
    # Remove the elongation character, then every combining diacritic.
    without_tatweel = s.replace(TATWEEL, "")
    without_marks = ARABIC_DIACRITICS.sub("", without_tatweel)
    # Fold orthographic letter variants to one canonical form each.
    folded = (
        without_marks
        .replace("أ", "ا")
        .replace("إ", "ا")
        .replace("آ", "ا")
        .replace("ى", "ي")
        .replace("ة", "ه")
    )
    # Collapse internal whitespace and trim the ends.
    return re.sub(r"\s+", " ", folded).strip()
def tokenize_ar_words(s: str) -> list:
    """Split *s* into Arabic word tokens.

    Any character outside the Arabic Unicode block (U+0600..U+06FF) that is
    not whitespace is replaced by a space, whitespace runs are collapsed, and
    the result is split on single spaces. Returns [] when nothing Arabic
    remains.
    """
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    collapsed = re.sub(r"\s+", " ", arabic_only).strip()
    if not collapsed:
        return []
    return collapsed.split(" ")
def main():
    """Assign approximate word-level timestamps.

    Divides the total audio duration evenly across ASR tokens (MVP
    approximation — later to be replaced by real forced alignment), then maps
    each canonical word onto a token slot via a monotonic forward search,
    falling back to proportional interpolation when no token matches.
    Writes the result to OUT_PATH.
    """
    # Load audio only to measure total duration (mono, 16 kHz).
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Load global alignment (has asr_raw + alignment pairs).
    with open(ALIGN_GLOBAL_PATH, encoding="utf-8") as f:
        g = json.load(f)
    asr_tokens = tokenize_ar_words(normalize_ar(g["asr_raw"]))

    # Token timeline: divide total audio time across ASR tokens evenly.
    N = max(1, len(asr_tokens))
    token_times = [
        (round((i / N) * total_sec, 3), round(((i + 1) / N) * total_sec, 3))
        for i in range(N)
    ]

    # Assign each canonical word the timestamp of its matched ASR token
    # (if any); otherwise interpolate from its index in the canonical sequence.
    alignment = [a for a in g["alignment"] if a.get("canon")]
    out_words = []
    last_token_idx = 0
    for idx, a in enumerate(alignment):
        cw = a["canon"]
        tok = a["asr_token"]
        if tok is not None:
            tok_norm = normalize_ar(tok)
            # Forward-only search from last_token_idx keeps the word→token
            # mapping monotonic.
            found = None
            for ti in range(last_token_idx, len(asr_tokens)):
                if asr_tokens[ti] == tok_norm:
                    found = ti
                    break
            if found is None:
                # Proportional fallback, kept monotonic. BUGFIX: clamp to the
                # last valid slot — after the final token matches,
                # last_token_idx == N, and the previous max(found,
                # last_token_idx) indexed token_times[N] (IndexError).
                found = int((idx / max(1, len(alignment))) * (N - 1))
                found = min(max(found, last_token_idx), N - 1)
            t0, t1 = token_times[found]
            last_token_idx = found + 1
        else:
            # No matched token at all: proportional fallback (does not advance
            # last_token_idx, so matched words stay monotonic).
            found = int((idx / max(1, len(alignment))) * (N - 1))
            t0, t1 = token_times[found]
        out_words.append({
            "index": idx + 1,
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": {"start": t0, "end": t1},
        })

    out = {
        "audio_path": AUDIO_PATH,
        "method": "token-time interpolation (MVP)",
        "stats": {
            "canonical_words": len(out_words),
            "asr_tokens": len(asr_tokens),
            "timestamped": len(out_words),
        },
        "words": out_words,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Words timestamped:", len(out_words), "/", len(alignment))
    # BUGFIX: guard the sample prints — out_words[0]/[-1] raised IndexError
    # when the alignment contained no canonical words.
    if out_words:
        print("First:", out_words[0])
        print("Last:", out_words[-1])


if __name__ == "__main__":
    main()