# iRecite-MVP-API / step16b_token_interpolation_timestamps.py
# didodev — Deploy iRecite MVP API (Docker + FastAPI), commit 4ca6263
import json
import re
import librosa
# Input audio clip; its duration anchors the interpolated token timeline.
AUDIO_PATH = "sample_trim.wav"
# Global ASR/canonical alignment JSON produced by an earlier pipeline step.
ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"
# Destination for the per-word timestamp JSON written by main().
OUT_PATH = "output/word_timestamps_v2.json"
# Arabic diacritic marks: harakat (U+064B..U+0652), superscript alef (U+0670),
# and hamza/madda combining marks (U+0653..U+0655) — stripped during normalization.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Tatweel (kashida) elongation character, also stripped during normalization.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
    """Normalize Arabic text for matching.

    Strips tatweel and diacritic marks, folds hamza-carrying alef forms to
    bare alef, alef maqsura to ya, and ta marbuta to ha, then collapses
    whitespace runs into single spaces.
    """
    # Remove the elongation character, then every combining diacritic.
    without_tatweel = s.replace(TATWEEL, "")
    without_marks = ARABIC_DIACRITICS.sub("", without_tatweel)
    # Fold orthographic letter variants to one canonical form each.
    folded = (
        without_marks
        .replace("أ", "ا")
        .replace("إ", "ا")
        .replace("آ", "ا")
        .replace("ى", "ي")
        .replace("ة", "ه")
    )
    # Collapse internal whitespace and trim the ends.
    return re.sub(r"\s+", " ", folded).strip()
def tokenize_ar_words(s: str) -> list:
    """Split *s* into Arabic word tokens.

    Any character outside the Arabic Unicode block (U+0600..U+06FF) that is
    not whitespace is replaced by a space, whitespace runs are collapsed, and
    the result is split on single spaces. Returns [] when nothing Arabic
    remains.
    """
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    collapsed = re.sub(r"\s+", " ", arabic_only).strip()
    if not collapsed:
        return []
    return collapsed.split(" ")
def main():
    """Assign approximate word-level timestamps.

    Divides the total audio duration evenly across ASR tokens (MVP
    approximation — later to be replaced by real forced alignment), then maps
    each canonical word onto a token slot via a monotonic forward search,
    falling back to proportional interpolation when no token matches.
    Writes the result to OUT_PATH.
    """
    # Load audio only to measure total duration (mono, 16 kHz).
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Load global alignment (has asr_raw + alignment pairs).
    with open(ALIGN_GLOBAL_PATH, encoding="utf-8") as f:
        g = json.load(f)
    asr_tokens = tokenize_ar_words(normalize_ar(g["asr_raw"]))

    # Token timeline: divide total audio time across ASR tokens evenly.
    N = max(1, len(asr_tokens))
    token_times = [
        (round((i / N) * total_sec, 3), round(((i + 1) / N) * total_sec, 3))
        for i in range(N)
    ]

    # Assign each canonical word the timestamp of its matched ASR token
    # (if any); otherwise interpolate from its index in the canonical sequence.
    alignment = [a for a in g["alignment"] if a.get("canon")]
    out_words = []
    last_token_idx = 0
    for idx, a in enumerate(alignment):
        cw = a["canon"]
        tok = a["asr_token"]
        if tok is not None:
            tok_norm = normalize_ar(tok)
            # Forward-only search from last_token_idx keeps the word→token
            # mapping monotonic.
            found = None
            for ti in range(last_token_idx, len(asr_tokens)):
                if asr_tokens[ti] == tok_norm:
                    found = ti
                    break
            if found is None:
                # Proportional fallback, kept monotonic. BUGFIX: clamp to the
                # last valid slot — after the final token matches,
                # last_token_idx == N, and the previous max(found,
                # last_token_idx) indexed token_times[N] (IndexError).
                found = int((idx / max(1, len(alignment))) * (N - 1))
                found = min(max(found, last_token_idx), N - 1)
            t0, t1 = token_times[found]
            last_token_idx = found + 1
        else:
            # No matched token at all: proportional fallback (does not advance
            # last_token_idx, so matched words stay monotonic).
            found = int((idx / max(1, len(alignment))) * (N - 1))
            t0, t1 = token_times[found]
        out_words.append({
            "index": idx + 1,
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": {"start": t0, "end": t1},
        })

    out = {
        "audio_path": AUDIO_PATH,
        "method": "token-time interpolation (MVP)",
        "stats": {
            "canonical_words": len(out_words),
            "asr_tokens": len(asr_tokens),
            "timestamped": len(out_words),
        },
        "words": out_words,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Words timestamped:", len(out_words), "/", len(alignment))
    # BUGFIX: guard the sample prints — out_words[0]/[-1] raised IndexError
    # when the alignment contained no canonical words.
    if out_words:
        print("First:", out_words[0])
        print("Last:", out_words[-1])


if __name__ == "__main__":
    main()