import json
import os
import numpy as np
import webrtcvad
import librosa
from difflib import SequenceMatcher
from arabic_phonemizer import ArabicPhonemizer

AUDIO_PATH = "sample.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/word_mapping.json"

# VAD settings
VAD_MODE = 2  # 0-3 (higher = more aggressive)
FRAME_MS = 30 # 10, 20, or 30ms required
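# webrtcvad only accepts mono 16-bit PCM at 8/16/32/48 kHz, and frames of exactly
# 10/20/30 ms, which is why the loader below resamples everything to 16 kHz.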

def read_wav_mono16k(path):
    # librosa loads float32 in [-1, 1]; webrtcvad needs 16-bit PCM, so clip and scale
    audio, sr = librosa.load(path, sr=16000, mono=True)
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return pcm16, 16000

def frame_generator(pcm16, sr, frame_ms):
    n = int(sr * frame_ms / 1000)
    offset = 0
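    # Any trailing partial frame is dropped on purpose: webrtcvad rejects frames
    # that are not exactly 10/20/30 ms long.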
    while offset + n <= len(pcm16):
        yield pcm16[offset:offset+n]
        offset += n

def vad_segments(pcm16, sr, frame_ms, mode):
    vad = webrtcvad.Vad(mode)
    frames = list(frame_generator(pcm16, sr, frame_ms))
    voiced_flags = [vad.is_speech(f.tobytes(), sr) for f in frames]

    # Convert voiced_flags into segments in seconds
    segments = []
    in_seg = False
    start_i = 0
    for i, v in enumerate(voiced_flags):
        if v and not in_seg:
            in_seg = True
            start_i = i
        elif (not v) and in_seg:
            in_seg = False
            end_i = i
            segments.append((start_i, end_i))
    if in_seg:
        segments.append((start_i, len(voiced_flags)))

    # Merge segments that are too close
    merged = []
    for s, e in segments:
        if not merged:
            merged.append([s, e])
        else:
            prev_s, prev_e = merged[-1]
            gap = s - prev_e
            if gap <= 2:  # ~60ms gap
                merged[-1][1] = e
            else:
                merged.append([s, e])

    # Convert to time
    out = []
    for s, e in merged:
        t0 = (s * frame_ms) / 1000.0
        t1 = (e * frame_ms) / 1000.0
        if (t1 - t0) >= 0.10:
            out.append((round(t0, 3), round(t1, 3)))
    return out

def canonical_words(canon):
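    # Assumes the fallback JSON has the shape
    # {"ayahs": [{"ayah": ..., "word_info": [{"word": ..., "base": ...}, ...]}, ...]},
    # i.e. one entry per ayah with a list of words, each carrying "word" and "base".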
    words = []
    for ay in canon["ayahs"]:
        for w in ay["word_info"]:
            words.append({"ayah": ay["ayah"], "word": w["word"], "base": w["base"]})
    return words

def similarity(a, b):
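    # Not called in the MVP flow below; kept as the string-similarity helper that
    # the planned phoneme-level matching (see the output "note") is expected to use.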
    return SequenceMatcher(None, a, b).ratio()

def main():
    with open(CANON_PATH, "r", encoding="utf-8") as f:
        canon = json.load(f)

    canon_words = canonical_words(canon)
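    # The phonemizer is created but not used by the sequential mapping below;
    # it only becomes relevant once segments are matched to canonical words by
    # phonemes (the "next step" mentioned in the output note).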
    ph = ArabicPhonemizer()

    pcm16, sr = read_wav_mono16k(AUDIO_PATH)
    segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)

    # We don't have ASR here, so we can't transcribe or phonemize what was actually
    # recited. Instead we approximate by walking through the canonical words in
    # reading order and assigning them to VAD segments greedily (by position;
    # a duration-aware match can come later).
    #
    # MVP: the N detected segments map to the first N canonical words
    # (still better than madd-only mapping).
    mapped = []
    n = min(len(segs), len(canon_words))
    for i in range(n):
        t0, t1 = segs[i]
        cw = canon_words[i]
        mapped.append({
            "segment_index": i+1,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": cw
        })

    out = {
        "audio_path": AUDIO_PATH,
        "vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
        "segments": segs,
        "mapped": mapped,
        "note": "This is MVP word-like segmentation. Next step will replace sequential mapping with acoustic+phoneme alignment."
    }

    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("VAD segments:", len(segs))
    if mapped:
        print("First mapping:", mapped[0])

if __name__ == "__main__":
    main()
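
# Illustrative shape of output/word_mapping.json (values are examples, not real output):
# {
#   "audio_path": "sample.wav",
#   "vad": {"mode": 2, "frame_ms": 30},
#   "segments": [[0.21, 0.84], [1.02, 1.77], ...],
#   "mapped": [
#     {"segment_index": 1,
#      "timestamp": {"start": 0.21, "end": 0.84},
#      "mapped_canonical": {"ayah": 1, "word": "...", "base": "..."}},
#     ...
#   ],
#   "note": "This is MVP word-like segmentation. ..."
# }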