File size: 4,599 Bytes
4ca6263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import numpy as np
import librosa
import torch
from dtw import dtw
from transformers import AutoFeatureExtractor, AutoModel
from arabic_phonemizer import ArabicPhonemizer

# Input recitation recording (loaded as mono, resampled to 16 kHz).
AUDIO_PATH = "sample_trim.wav"
# Canonical surah word list (per-ayah word_info records).
CANON_PATH = "data/fatiha_canonical_fallback.json"
# Destination for the alignment result JSON (directory must already exist).
OUT_PATH = "output/alignment_wavlm.json"

# Hugging Face model id used for frame-level speech embeddings.
MODEL_ID = "microsoft/wavlm-base"

def wavlm_embeddings(audio_16k: np.ndarray, sr: int) -> np.ndarray:
    """Return per-frame WavLM embeddings for a mono audio signal.

    Args:
        audio_16k: 1-D waveform samples (expected at 16 kHz).
        sr: sampling rate of ``audio_16k``, passed through to the
            feature extractor.

    Returns:
        ``(frames, hidden)`` float array of last-hidden-state embeddings.
    """
    # Cache the feature extractor and model on the function object so repeated
    # calls don't re-download / re-instantiate the pretrained weights
    # (the original rebuilt both on every invocation).
    if not hasattr(wavlm_embeddings, "_cache"):
        fe = AutoFeatureExtractor.from_pretrained(MODEL_ID)
        model = AutoModel.from_pretrained(MODEL_ID)
        model.eval()  # inference mode: disables dropout etc.
        wavlm_embeddings._cache = (fe, model)
    fe, model = wavlm_embeddings._cache

    inputs = fe(audio_16k, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for feature extraction
        out = model(**inputs)
    # last_hidden_state is (batch=1, frames, hidden); drop the batch dim.
    return out.last_hidden_state[0].cpu().numpy()

def mean_pool(emb: np.ndarray) -> np.ndarray:
    """Collapse a (frames, hidden) embedding matrix into one hidden vector
    by averaging over the frame axis."""
    return np.mean(emb, axis=0)

def load_audio_segment(path, start_s, end_s, sr=16000):
    """Load the mono audio slice ``[start_s, end_s)`` from *path*,
    resampled to ``sr`` Hz."""
    duration = float(end_s - start_s)
    signal, _ = librosa.load(
        path,
        sr=sr,
        mono=True,
        offset=float(start_s),
        duration=duration,
    )
    return signal

def canonical_word_list(canon):
    """Flatten the canonical surah JSON into an ordered list of word records.

    Each record carries the ayah number plus the word's surface form and base.
    """
    return [
        {"ayah": ayah["ayah"], "word": info["word"], "base": info["base"]}
        for ayah in canon["ayahs"]
        for info in ayah["word_info"]
    ]

def vad_segments_from_step8(feedback_path="output/feedback_madd.json"):
    """Return (start, end) time tuples for the long segments already
    detected in the step-8 feedback JSON.

    Args:
        feedback_path: path to a JSON file with a ``segments_detected``
            list of ``{"start": ..., "end": ...}`` objects.

    Returns:
        List of ``(start, end)`` tuples in file order.
    """
    # Context manager fixes the original's leaked file handle.
    with open(feedback_path, encoding="utf-8") as fh:
        data = json.load(fh)
    return [(seg["start"], seg["end"]) for seg in data["segments_detected"]]

def cosine(a, b):
    """Cosine similarity of two vectors; a small epsilon on each norm
    guards against division by zero for all-zero inputs."""
    unit_a = a / (np.linalg.norm(a) + 1e-9)
    unit_b = b / (np.linalg.norm(b) + 1e-9)
    return float(unit_a @ unit_b)

def main():
    """Time-based MVP alignment of detected long segments to canonical words.

    Pipeline:
      1. Load the canonical word list and the long segments from step 8.
      2. Embed the full recitation once with WavLM.
      3. Map each segment's time window to embedding frames and to an
         estimated word index by its relative position in the surah.
      4. Write the alignment JSON to ``OUT_PATH``.

    NOTE(review): this is a placeholder alignment — a later step replaces
    the time-ratio mapping with real phoneme/CTC forced alignment.
    """
    with open(CANON_PATH, encoding="utf-8") as fh:  # closes the handle the original leaked
        canon = json.load(fh)
    canon_words = canonical_word_list(canon)

    # Reuse the long segments already detected in step 8's feedback JSON.
    segs = vad_segments_from_step8()

    # Compute full-audio embedding frames once.
    full_audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    full_emb = wavlm_embeddings(full_audio, sr)

    # Estimate the model's frame rate from the embedding length instead of
    # hard-coding it (WavLM is roughly 50 fps after feature extraction).
    total_sec = len(full_audio) / sr
    frames = full_emb.shape[0]
    fps = frames / total_sec

    results = []
    for i, (s, e) in enumerate(segs, 1):
        # Slice the embedding frames covering this time window.
        f0 = int(max(0, np.floor(s * fps)))
        f1 = int(min(frames, np.ceil(e * fps)))
        if f1 <= f0 + 1:
            continue  # window too short to yield a meaningful vector
        # Mean-pooled segment vector. Unused for scoring yet — kept so the
        # next iteration can compare it against real word prototypes.
        seg_vec = mean_pool(full_emb[f0:f1])

        # Estimate position in the surah by time ratio, then record a small
        # search window of candidate word indices around that estimate.
        t_mid = (s + e) / 2.0
        ratio = t_mid / total_sec
        est_idx = int(ratio * (len(canon_words) - 1))

        W = 6  # half-width of the candidate window, in words
        cand_range = range(max(0, est_idx - W), min(len(canon_words), est_idx + W + 1))

        # MVP: no word-audio prototypes to score against, so just pick the
        # estimated index and expose the candidate window for the next step.
        chosen = est_idx

        results.append({
            "segment_index": i,
            "timestamp": {"start": round(float(s), 3), "end": round(float(e), 3)},
            "estimated_word_index": est_idx,
            "candidate_word_indices": list(cand_range),
            "mapped_word": canon_words[chosen],
            "note": "MVP time-based alignment using WavLM frame mapping. Next step replaces this with phoneme/CTC alignment."
        })

    out = {
        "audio_path": AUDIO_PATH,
        "total_sec": round(float(total_sec), 3),
        "wavlm": {"model_id": MODEL_ID, "frames": int(frames), "fps_est": round(float(fps), 2)},
        "num_canonical_words": len(canon_words),
        "segments_used": len(results),
        "alignment": results
    }

    with open(OUT_PATH, "w", encoding="utf-8") as fh:  # context manager ensures flush/close
        json.dump(out, fh, ensure_ascii=False, indent=2)
    print("OK ✅ wrote", OUT_PATH)
    print("Segments aligned:", len(results))
    if results:
        print("Sample:", results[0])

# Script entry point: run the alignment pipeline when executed directly.
if __name__ == "__main__":
    main()