"""MVP time-based alignment of detected madd segments to canonical surah words.

Uses WavLM frame embeddings only to establish a time->frame mapping; the
actual word choice is a time-ratio estimate (real phoneme/CTC alignment is a
later step, as noted in the output records).
"""

import json
import os

import numpy as np
import librosa
import torch
from dtw import dtw
from transformers import AutoFeatureExtractor, AutoModel
from arabic_phonemizer import ArabicPhonemizer

AUDIO_PATH = "sample_trim.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/alignment_wavlm.json"
MODEL_ID = "microsoft/wavlm-base"

# Memoized WavLM components so repeated wavlm_embeddings() calls do not
# re-download / re-instantiate the model (the original reloaded every call).
_WAVLM_CACHE = {}


def _load_wavlm():
    """Return (feature_extractor, model) for MODEL_ID, loading them once."""
    if "model" not in _WAVLM_CACHE:
        _WAVLM_CACHE["fe"] = AutoFeatureExtractor.from_pretrained(MODEL_ID)
        model = AutoModel.from_pretrained(MODEL_ID)
        model.eval()
        _WAVLM_CACHE["model"] = model
    return _WAVLM_CACHE["fe"], _WAVLM_CACHE["model"]


def wavlm_embeddings(audio_16k: np.ndarray, sr: int) -> np.ndarray:
    """Compute frame-level WavLM embeddings for a mono waveform.

    Args:
        audio_16k: 1-D float waveform (expected 16 kHz for this model).
        sr: sample rate of `audio_16k`.

    Returns:
        Array of shape (frames, hidden_size).
    """
    fe, model = _load_wavlm()
    inputs = fe(audio_16k, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        out = model(**inputs)
    # (frames, hidden)
    return out.last_hidden_state[0].cpu().numpy()


def mean_pool(emb: np.ndarray) -> np.ndarray:
    """Average frame embeddings into a single fixed-size vector."""
    return emb.mean(axis=0)


def load_audio_segment(path, start_s, end_s, sr=16000):
    """Load the [start_s, end_s) window of an audio file as mono at `sr`."""
    audio, _ = librosa.load(
        path,
        sr=sr,
        mono=True,
        offset=float(start_s),
        duration=float(end_s - start_s),
    )
    return audio


def canonical_word_list(canon):
    """Flatten the canonical surah JSON into an ordered list of word dicts.

    Each entry carries the ayah number plus the word's surface and base forms.
    """
    return [
        {"ayah": ay["ayah"], "word": w["word"], "base": w["base"]}
        for ay in canon["ayahs"]
        for w in ay["word_info"]
    ]


def vad_segments_from_step8(feedback_path="output/feedback_madd.json"):
    """Return (start, end) second pairs for segments already detected in step 8."""
    # Use the long segments already detected in your feedback JSON
    with open(feedback_path, encoding="utf-8") as f:
        d = json.load(f)
    return [(s["start"], s["end"]) for s in d["segments_detected"]]


def cosine(a, b):
    """Cosine similarity between two vectors; epsilon guards zero norms."""
    a = a / (np.linalg.norm(a) + 1e-9)
    b = b / (np.linalg.norm(b) + 1e-9)
    return float(np.dot(a, b))


def main():
    """Align detected segments to canonical words by time ratio and write JSON.

    Raises:
        ValueError: if the canonical word list or the audio is empty, which
            would otherwise produce invalid indices / division by zero.
    """
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    canon_words = canonical_word_list(canon)
    if not canon_words:
        raise ValueError(f"No canonical words found in {CANON_PATH}")

    # We will build "prototype embeddings" for each canonical word by phonemizing text
    # For MVP we don't synthesize audio; instead we just keep word order and do local matching.
    # Real version uses forced alignment / phoneme decoding.
    #
    # Here we do a practical improvement: map each detected long segment to a nearby word index
    # based on its relative time position in the recitation.
    segs = vad_segments_from_step8()

    # Compute full-audio embedding frames once
    full_audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    if len(full_audio) == 0:
        raise ValueError(f"Empty audio: {AUDIO_PATH}")
    full_emb = wavlm_embeddings(full_audio, sr)

    # Map time->frame index approximately
    # WavLM frame rate is roughly 50 fps-ish after feature extraction; we estimate using emb length
    total_sec = len(full_audio) / sr
    frames = full_emb.shape[0]
    fps = frames / total_sec

    results = []
    for i, (s, e) in enumerate(segs, 1):
        # Take embedding slice for this time window
        f0 = int(max(0, np.floor(s * fps)))
        f1 = int(min(frames, np.ceil(e * fps)))
        if f1 <= f0 + 1:
            # Window too short to pool meaningfully; skip it.
            continue
        seg_vec = mean_pool(full_emb[f0:f1])

        # Estimate position in surah by time ratio, then search around that word index
        t_mid = (s + e) / 2.0
        ratio = t_mid / total_sec
        # Clamp so out-of-range midpoints can't yield an invalid index.
        est_idx = min(len(canon_words) - 1, max(0, int(ratio * (len(canon_words) - 1))))

        # Search a window around estimated index
        W = 6
        cand_range = range(max(0, est_idx - W), min(len(canon_words), est_idx + W + 1))

        # Score candidates (we don't have word audio prototypes, so we use a simple proxy:
        # compare segment vector to other segment vectors nearby is not helpful.
        # Instead: pick the nearest index as MVP and output the search window.
        # This step is mainly building the structure; next step will add real phoneme decoder/alignment.)
        chosen = est_idx
        results.append({
            "segment_index": i,
            "timestamp": {"start": round(float(s), 3), "end": round(float(e), 3)},
            "estimated_word_index": est_idx,
            "candidate_word_indices": list(cand_range),
            "mapped_word": canon_words[chosen],
            "note": "MVP time-based alignment using WavLM frame mapping. Next step replaces this with phoneme/CTC alignment."
        })

    out = {
        "audio_path": AUDIO_PATH,
        "total_sec": round(float(total_sec), 3),
        "wavlm": {"model_id": MODEL_ID, "frames": int(frames), "fps_est": round(float(fps), 2)},
        "num_canonical_words": len(canon_words),
        "segments_used": len(results),
        "alignment": results,
    }
    # Ensure the output directory exists before writing (original assumed it did).
    os.makedirs(os.path.dirname(OUT_PATH) or ".", exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Segments aligned:", len(results))
    if results:
        print("Sample:", results[0])


if __name__ == "__main__":
    main()