import json
import os
import numpy as np
import webrtcvad
import librosa
from difflib import SequenceMatcher
from arabic_phonemizer import ArabicPhonemizer

AUDIO_PATH = "sample.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/word_mapping.json"

# VAD settings
VAD_MODE = 2  # 0-3 (higher = more aggressive)
FRAME_MS = 30 # 10, 20, or 30ms required
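# webrtcvad only accepts mono 16-bit PCM at 8/16/32/48 kHz, and frames of exactly
# 10/20/30 ms, which is why the loader below resamples everything to 16 kHz.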

def read_wav_mono16k(path):
    # librosa loads float32 in [-1, 1]; webrtcvad needs 16-bit PCM, so clip and scale
    audio, sr = librosa.load(path, sr=16000, mono=True)
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return pcm16, 16000

def frame_generator(pcm16, sr, frame_ms):
    n = int(sr * frame_ms / 1000)
    offset = 0
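    # Any trailing partial frame is dropped on purpose: webrtcvad rejects frames
    # that are not exactly 10/20/30 ms long.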
    while offset + n <= len(pcm16):
        yield pcm16[offset:offset+n]
        offset += n

def vad_segments(pcm16, sr, frame_ms, mode):
    vad = webrtcvad.Vad(mode)
    frames = list(frame_generator(pcm16, sr, frame_ms))
    voiced_flags = [vad.is_speech(f.tobytes(), sr) for f in frames]

    # Convert voiced_flags into segments in seconds
    segments = []
    in_seg = False
    start_i = 0
    for i, v in enumerate(voiced_flags):
        if v and not in_seg:
            in_seg = True
            start_i = i
        elif (not v) and in_seg:
            in_seg = False
            end_i = i
            segments.append((start_i, end_i))
    if in_seg:
        segments.append((start_i, len(voiced_flags)))

    # Merge segments that are too close
    merged = []
    for s, e in segments:
        if not merged:
            merged.append([s, e])
        else:
            prev_s, prev_e = merged[-1]
            gap = s - prev_e
            if gap <= 2:  # ~60ms gap
                merged[-1][1] = e
            else:
                merged.append([s, e])

    # Convert to time
    out = []
    for s, e in merged:
        t0 = (s * frame_ms) / 1000.0
        t1 = (e * frame_ms) / 1000.0
        if (t1 - t0) >= 0.10:
            out.append((round(t0, 3), round(t1, 3)))
    return out

def canonical_words(canon):
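    # Assumes the fallback JSON has the shape
    # {"ayahs": [{"ayah": ..., "word_info": [{"word": ..., "base": ...}, ...]}, ...]},
    # i.e. one entry per ayah with a list of words, each carrying "word" and "base".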
    words = []
    for ay in canon["ayahs"]:
        for w in ay["word_info"]:
            words.append({"ayah": ay["ayah"], "word": w["word"], "base": w["base"]})
    return words

def similarity(a, b):
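    # Not called in the MVP flow below; kept as the string-similarity helper that
    # the planned phoneme-level matching (see the output "note") is expected to use.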
    return SequenceMatcher(None, a, b).ratio()

def main():
    with open(CANON_PATH, "r", encoding="utf-8") as f:
        canon = json.load(f)

    canon_words = canonical_words(canon)
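    # The phonemizer is created but not used by the sequential mapping below;
    # it only becomes relevant once segments are matched to canonical words by
    # phonemes (the "next step" mentioned in the output note).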
    ph = ArabicPhonemizer()

    pcm16, sr = read_wav_mono16k(AUDIO_PATH)
    segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)

    # We don't have ASR here, so we can't transcribe or phonemize what was actually
    # recited. Instead we approximate by walking through the canonical words in
    # reading order and assigning them to VAD segments greedily (by position;
    # a duration-aware match can come later).
    #
    # MVP: the N detected segments map to the first N canonical words
    # (still better than madd-only mapping).
    mapped = []
    n = min(len(segs), len(canon_words))
    for i in range(n):
        t0, t1 = segs[i]
        cw = canon_words[i]
        mapped.append({
            "segment_index": i+1,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": cw
        })

    out = {
        "audio_path": AUDIO_PATH,
        "vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
        "segments": segs,
        "mapped": mapped,
        "note": "This is MVP word-like segmentation. Next step will replace sequential mapping with acoustic+phoneme alignment."
    }

    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("VAD segments:", len(segs))
    if mapped:
        print("First mapping:", mapped[0])

if __name__ == "__main__":
    main()
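
# Illustrative shape of output/word_mapping.json (values are examples, not real output):
# {
#   "audio_path": "sample.wav",
#   "vad": {"mode": 2, "frame_ms": 30},
#   "segments": [[0.21, 0.84], [1.02, 1.77], ...],
#   "mapped": [
#     {"segment_index": 1,
#      "timestamp": {"start": 0.21, "end": 0.84},
#      "mapped_canonical": {"ayah": 1, "word": "...", "base": "..."}},
#     ...
#   ],
#   "note": "This is MVP word-like segmentation. ..."
# }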