# NOTE: the lines "Spaces: / Sleeping / Sleeping" that appeared here were
# Hugging Face Spaces page-status text captured during extraction, not part
# of the source; they are preserved only as this comment so the file parses.
import contextlib
import json
import wave
from difflib import SequenceMatcher
from pathlib import Path

import librosa
import numpy as np
import webrtcvad

from arabic_phonemizer import ArabicPhonemizer
# Input audio and canonical-text paths; OUT_PATH receives the JSON mapping.
AUDIO_PATH = "sample.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/word_mapping.json"

# VAD settings
VAD_MODE = 2  # 0-3 (higher = more aggressive)
FRAME_MS = 30  # 10, 20, or 30ms required
def read_wav_mono16k(path):
    """Load *path* as 16 kHz mono audio and return (int16 PCM array, 16000).

    librosa loads float32 in [-1, 1]; webrtcvad needs raw int16 PCM, so we
    rescale. Resampling can overshoot slightly past +/-1.0, and scaling such
    samples by 32767 would wrap around in int16 — clip first to avoid that
    artifact.
    """
    audio, sr = librosa.load(path, sr=16000, mono=True)
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return pcm16, 16000
def frame_generator(pcm16, sr, frame_ms):
    """Yield consecutive non-overlapping frames of ``frame_ms`` milliseconds.

    Only full frames are yielded; a trailing partial frame is dropped
    because webrtcvad requires fixed-size frames.

    Fix: the original condition ``offset + n < len(pcm16)`` silently dropped
    the final frame whenever the signal length was an exact multiple of the
    frame size; ``<=`` includes it.
    """
    n = int(sr * frame_ms / 1000)  # samples per frame
    offset = 0
    while offset + n <= len(pcm16):
        yield pcm16[offset:offset + n]
        offset += n
def _flags_to_segments(voiced_flags):
    """Collapse a per-frame boolean voiced mask into (start, end) frame-index pairs."""
    segments = []
    in_seg = False
    start_i = 0
    for i, voiced in enumerate(voiced_flags):
        if voiced and not in_seg:
            in_seg = True
            start_i = i
        elif (not voiced) and in_seg:
            in_seg = False
            segments.append((start_i, i))
    if in_seg:  # close a segment still open at end of audio
        segments.append((start_i, len(voiced_flags)))
    return segments


def _merge_close_segments(segments, max_gap_frames=2):
    """Merge adjacent segments separated by at most ``max_gap_frames`` frames
    (~60ms at 30ms frames). Returns mutable [start, end] pairs."""
    merged = []
    for s, e in segments:
        if merged and (s - merged[-1][1]) <= max_gap_frames:
            merged[-1][1] = e  # absorb into previous segment
        else:
            merged.append([s, e])
    return merged


def vad_segments(pcm16, sr, frame_ms, mode):
    """Run WebRTC VAD over int16 PCM and return voiced segments in seconds.

    Pipeline: per-frame speech flags -> raw frame segments -> merge segments
    separated by small gaps -> convert to (t0, t1) seconds, dropping any
    segment shorter than 100ms.

    Returns a list of (start, end) tuples rounded to milliseconds.
    """
    vad = webrtcvad.Vad(mode)
    frames = list(frame_generator(pcm16, sr, frame_ms))
    voiced_flags = [vad.is_speech(f.tobytes(), sr) for f in frames]

    merged = _merge_close_segments(_flags_to_segments(voiced_flags))

    # Convert frame indices to times, discarding sub-100ms blips.
    out = []
    for s, e in merged:
        t0 = (s * frame_ms) / 1000.0
        t1 = (e * frame_ms) / 1000.0
        if (t1 - t0) >= 0.10:
            out.append((round(t0, 3), round(t1, 3)))
    return out
def canonical_words(canon):
    """Flatten the canonical JSON into an ordered list of word records.

    Each record carries the ayah number plus the word's surface and base
    forms, in recitation order.
    """
    return [
        {"ayah": ayah["ayah"], "word": info["word"], "base": info["base"]}
        for ayah in canon["ayahs"]
        for info in ayah["word_info"]
    ]
def similarity(a, b):
    """Return a similarity ratio in [0, 1] between two strings (difflib)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def main():
    """Segment AUDIO_PATH with VAD and map segments onto canonical words.

    MVP approach: there is no ASR here, so the first N voiced segments are
    mapped 1:1, in order, onto the first N canonical words. A later step will
    replace this with acoustic + phoneme alignment.

    Writes the mapping as JSON to OUT_PATH and prints a short summary.
    """
    with open(CANON_PATH, "r", encoding="utf-8") as f:
        canon = json.load(f)
    canon_words = canonical_words(canon)

    # Fix: the original instantiated ArabicPhonemizer() here but never used
    # it; the unused object has been removed.
    pcm16, sr = read_wav_mono16k(AUDIO_PATH)
    segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)

    # Map segment i -> canonical word i; zip stops at the shorter sequence,
    # which matches the original min(len(segs), len(canon_words)) loop.
    mapped = [
        {
            "segment_index": i + 1,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": cw,
        }
        for i, ((t0, t1), cw) in enumerate(zip(segs, canon_words))
    ]

    out = {
        "audio_path": AUDIO_PATH,
        "vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
        "segments": segs,
        "mapped": mapped,
        "note": "This is MVP word-like segmentation. Next step will replace sequential mapping with acoustic+phoneme alignment."
    }

    # Fix: create the output directory first — a fresh checkout has no
    # "output/" folder and open(..., "w") would raise FileNotFoundError.
    Path(OUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("VAD segments:", len(segs))
    if mapped:
        print("First mapping:", mapped[0])


if __name__ == "__main__":
    main()