import json
import os

import numpy as np
import webrtcvad
import librosa
from difflib import SequenceMatcher
from arabic_phonemizer import ArabicPhonemizer

AUDIO_PATH = "sample.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/word_mapping.json"

# VAD settings
VAD_MODE = 2   # 0-3 (higher = more aggressive)
FRAME_MS = 30  # webrtcvad requires 10, 20, or 30 ms frames


def read_wav_mono16k(path):
    # librosa loads float32 in [-1, 1]; webrtcvad needs 16-bit mono PCM
    audio, sr = librosa.load(path, sr=16000, mono=True)
    pcm16 = (audio * 32767).astype(np.int16)
    return pcm16, 16000


def frame_generator(pcm16, sr, frame_ms):
    n = int(sr * frame_ms / 1000)
    offset = 0
    while offset + n <= len(pcm16):  # <= keeps the final full frame
        yield pcm16[offset:offset + n]
        offset += n


def vad_segments(pcm16, sr, frame_ms, mode):
    vad = webrtcvad.Vad(mode)
    frames = list(frame_generator(pcm16, sr, frame_ms))
    voiced_flags = [vad.is_speech(f.tobytes(), sr) for f in frames]

    # Convert per-frame voiced flags into runs of consecutive voiced frames
    segments = []
    in_seg = False
    start_i = 0
    for i, v in enumerate(voiced_flags):
        if v and not in_seg:
            in_seg = True
            start_i = i
        elif (not v) and in_seg:
            in_seg = False
            segments.append((start_i, i))
    if in_seg:
        segments.append((start_i, len(voiced_flags)))

    # Merge runs separated by a short pause (<= 2 frames, ~60 ms at 30 ms frames)
    merged = []
    for s, e in segments:
        if not merged:
            merged.append([s, e])
        else:
            prev_e = merged[-1][1]
            if (s - prev_e) <= 2:
                merged[-1][1] = e
            else:
                merged.append([s, e])

    # Convert frame indices to seconds; drop segments shorter than 100 ms
    out = []
    for s, e in merged:
        t0 = (s * frame_ms) / 1000.0
        t1 = (e * frame_ms) / 1000.0
        if (t1 - t0) >= 0.10:
            out.append((round(t0, 3), round(t1, 3)))
    return out


def canonical_words(canon):
    words = []
    for ay in canon["ayahs"]:
        for w in ay["word_info"]:
            words.append({"ayah": ay["ayah"], "word": w["word"], "base": w["base"]})
    return words


def similarity(a, b):
    # String-similarity helper (SequenceMatcher ratio); not used by the MVP mapping
    return SequenceMatcher(None, a, b).ratio()


def main():
    with open(CANON_PATH, "r", encoding="utf-8") as f:
        canon = json.load(f)
    canon_words = canonical_words(canon)

    ph = ArabicPhonemizer()  # reserved for the phoneme-alignment step; unused by the MVP

    pcm16, sr = read_wav_mono16k(AUDIO_PATH)
    segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)

    # We don't have ASR here, so we can't phonemize a per-segment "best guess"
    # of what was actually recited. Instead we approximate greedily, advancing
    # through the canonical words in order and matching by duration / count
    # (see the duration-based sketch after main()).
    #
    # MVP: map the first N segments to the first N canonical words
    # (still better than madd-only mapping, but it assumes one word per segment).
    mapped = []
    n = min(len(segs), len(canon_words))
    for i in range(n):
        t0, t1 = segs[i]
        cw = canon_words[i]
        mapped.append({
            "segment_index": i + 1,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": cw,
        })

    out = {
        "audio_path": AUDIO_PATH,
        "vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
        "segments": segs,
        "mapped": mapped,
        "note": "This is MVP word-like segmentation. Next step will replace "
                "sequential mapping with acoustic+phoneme alignment.",
    }

    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)  # ensure output/ exists
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("VAD segments:", len(segs))
    if mapped:
        print("First mapping:", mapped[0])


if __name__ == "__main__":
    main()
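

# For reference, the emitted JSON has this shape (timestamps are illustrative,
# not from any particular recording; tuples serialize as JSON arrays):
#
# {
#   "audio_path": "sample.wav",
#   "vad": {"mode": 2, "frame_ms": 30},
#   "segments": [[0.27, 0.81], [0.93, 1.44], ...],
#   "mapped": [
#     {
#       "segment_index": 1,
#       "timestamp": {"start": 0.27, "end": 0.81},
#       "mapped_canonical": {"ayah": 1, "word": "...", "base": "..."}
#     },
#     ...
#   ],
#   "note": "..."
# }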
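

# ---------------------------------------------------------------------------
# Sketch: duration-based mapping (not wired into main()).
#
# The greedy comment above mentions matching "by duration / count". Below is a
# minimal sketch of that idea, under the assumption that a word's base-letter
# count is a crude proxy for its spoken duration: each segment is assigned the
# canonical word whose cumulative character share covers the segment's midpoint
# on the voiced timeline. `proportional_mapping` is a hypothetical helper, not
# the planned acoustic+phoneme alignment.
def proportional_mapping(segs, canon_words):
    total_chars = sum(len(w["base"]) for w in canon_words) or 1
    total_time = (segs[-1][1] - segs[0][0]) if segs else 1.0

    # Cumulative character fraction at the end of each canonical word
    word_ends, cum = [], 0
    for w in canon_words:
        cum += len(w["base"])
        word_ends.append(cum / total_chars)

    mapped, wi = [], 0
    for i, (t0, t1) in enumerate(segs):
        mid_frac = ((t0 + t1) / 2.0 - segs[0][0]) / total_time
        # Advance to the word whose cumulative share covers this midpoint;
        # wi only moves forward, so word order is preserved (words may be
        # skipped or shared by adjacent segments).
        while wi < len(canon_words) - 1 and word_ends[wi] < mid_frac:
            wi += 1
        mapped.append({
            "segment_index": i + 1,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": canon_words[wi],
        })
    return mapped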