# iRecite-MVP-API / step10_word_segments_and_mapping.py
import json
import os
import numpy as np
import webrtcvad
import librosa
from difflib import SequenceMatcher
from arabic_phonemizer import ArabicPhonemizer
AUDIO_PATH = "sample.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/word_mapping.json"
# VAD settings
VAD_MODE = 2   # aggressiveness 0-3 (higher = more aggressive speech filtering)
FRAME_MS = 30  # webrtcvad accepts only 10, 20, or 30 ms frames
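# At 16 kHz, a 30 ms frame is 480 samples (960 bytes of int16 PCM), one of the
# frame sizes webrtcvad accepts.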
def read_wav_mono16k(path):
    # librosa resamples to 16 kHz mono float32 in [-1, 1]; webrtcvad needs
    # 16-bit PCM, so clip (librosa output can slightly exceed +/-1) and convert.
    audio, sr = librosa.load(path, sr=16000, mono=True)
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return pcm16, 16000
def frame_generator(pcm16, sr, frame_ms):
    # Yield consecutive full frames of frame_ms each; a trailing partial frame
    # is dropped because webrtcvad rejects it.
    n = int(sr * frame_ms / 1000)
    offset = 0
    while offset + n <= len(pcm16):
        yield pcm16[offset:offset + n]
        offset += n
def vad_segments(pcm16, sr, frame_ms, mode):
vad = webrtcvad.Vad(mode)
frames = list(frame_generator(pcm16, sr, frame_ms))
voiced_flags = [vad.is_speech(f.tobytes(), sr) for f in frames]
    # Collapse per-frame flags into (start_frame, end_frame) voiced runs
segments = []
in_seg = False
start_i = 0
for i, v in enumerate(voiced_flags):
if v and not in_seg:
in_seg = True
start_i = i
elif (not v) and in_seg:
in_seg = False
end_i = i
segments.append((start_i, end_i))
if in_seg:
segments.append((start_i, len(voiced_flags)))
    # Merge segments separated by only a short gap (likely intra-word pauses)
merged = []
for s, e in segments:
if not merged:
merged.append([s, e])
else:
prev_s, prev_e = merged[-1]
gap = s - prev_e
            if gap <= 2:  # gap in frames; 2 frames ~= 60 ms at FRAME_MS = 30
merged[-1][1] = e
else:
merged.append([s, e])
    # Convert frame indices to seconds, dropping segments shorter than 100 ms
out = []
for s, e in merged:
t0 = (s * frame_ms) / 1000.0
t1 = (e * frame_ms) / 1000.0
if (t1 - t0) >= 0.10:
out.append((round(t0, 3), round(t1, 3)))
return out
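# vad_segments returns (start, end) pairs in seconds, e.g. (values illustrative
# only): [(0.27, 1.02), (1.35, 2.19), ...]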
def canonical_words(canon):
    # Flatten the per-ayah word lists into one ordered list of word records.
words = []
for ay in canon["ayahs"]:
for w in ay["word_info"]:
words.append({"ayah": ay["ayah"], "word": w["word"], "base": w["base"]})
return words
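# Expected shape of the canonical JSON, inferred from the accesses above (the
# real file may carry extra fields -- this sketch is an assumption):
#   {"ayahs": [{"ayah": 1,
#               "word_info": [{"word": "...", "base": "..."}, ...]}, ...]}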
def similarity(a, b):
    # Character-level similarity in [0, 1]; not called in this MVP pass, kept
    # for the phoneme-alignment step described in the output note below.
    return SequenceMatcher(None, a, b).ratio()
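# The mapping loop in main() pairs segments and canonical words 1:1 in order.
# The comment there also mentions matching "by duration / count"; the helper
# below is a sketch of that idea and is NOT wired into main(). The
# duration-proportional heuristic (a long segment absorbs several short words)
# is our assumption, not shipped MVP behavior.
def map_by_duration(segs, canon_words):
    total = sum(t1 - t0 for t0, t1 in segs) or 1.0
    mapped, cursor = [], 0
    for i, (t0, t1) in enumerate(segs):
        # Words allotted to this segment, proportional to its share of speech time.
        share = max(1, round(len(canon_words) * (t1 - t0) / total))
        words = canon_words[cursor:cursor + share]
        cursor += share
        mapped.append({
            "segment_index": i + 1,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": words,
        })
        if cursor >= len(canon_words):
            break
    return mapped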
def main():
with open(CANON_PATH, "r", encoding="utf-8") as f:
canon = json.load(f)
canon_words = canonical_words(canon)
    ph = ArabicPhonemizer()  # unused in this pass; reserved for phoneme alignment
pcm16, sr = read_wav_mono16k(AUDIO_PATH)
segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)
    # There is no ASR at this step, so we cannot phonemize what was actually
    # recited in each segment. As an approximation we walk through the
    # canonical words in order and pair the N detected segments with the first
    # N canonical words (a duration-aware variant is sketched in
    # map_by_duration above). Still better than the earlier madd-only mapping.
mapped = []
n = min(len(segs), len(canon_words))
for i in range(n):
t0, t1 = segs[i]
cw = canon_words[i]
mapped.append({
"segment_index": i+1,
"timestamp": {"start": t0, "end": t1},
"mapped_canonical": cw
})
out = {
"audio_path": AUDIO_PATH,
"vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
"segments": segs,
"mapped": mapped,
"note": "This is MVP word-like segmentation. Next step will replace sequential mapping with acoustic+phoneme alignment."
}
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
json.dump(out, f, ensure_ascii=False, indent=2)
print("OK ✅ wrote", OUT_PATH)
print("VAD segments:", len(segs))
if mapped:
print("First mapping:", mapped[0])
if __name__ == "__main__":
main()