"""
Build phoneme cache for all 114 chapters.

Phonemizes the entire Quran in a single call and saves per-chapter
ChapterReference objects to a pickle file for fast loading at runtime.

Usage:
    python scripts/build_phoneme_cache.py
"""

import pickle
import sys
from collections import defaultdict
from itertools import zip_longest
from pathlib import Path

# The project root must be on sys.path before the project-local imports below.
_project_root = Path(__file__).parent.parent.resolve()
sys.path.insert(0, str(_project_root))

from config import PHONEME_CACHE_PATH
from src.alignment.phoneme_matcher import ChapterReference, RefWord
from src.phonemizer_utils import get_phonemizer

# Fallback average used only when a chapter has no words at all.
_DEFAULT_AVG_PHONES_PER_WORD = 4.0

# Sentinel marking the exhausted side when words and phoneme lists differ in length.
_MISSING = object()


def _group_words_by_surah(words, nested) -> dict[int, list[RefWord]]:
    """Pair each word with its phoneme list and bucket the pairs by surah.

    Args:
        words: Word objects from the phonemizer; each exposes ``text`` and a
            ``location`` with ``surah_num``, ``ayah_num`` and ``word_num``.
        nested: Per-word phoneme sequences, parallel to ``words``.

    Returns:
        Mapping of surah number to its RefWord list, in input order.

    Raises:
        ValueError: If ``words`` and ``nested`` do not have the same length.
    """
    surah_words: dict[int, list[RefWord]] = defaultdict(list)
    # A plain zip() would silently stop at the shorter input and produce a
    # cache that is missing words; zip_longest lets us detect that instead.
    for word, phonemes in zip_longest(words, nested, fillvalue=_MISSING):
        if word is _MISSING or phonemes is _MISSING:
            raise ValueError(
                "Phonemizer returned a different number of words and phoneme "
                "lists; refusing to build a truncated cache"
            )
        loc = word.location
        surah_words[loc.surah_num].append(RefWord(
            text=word.text,
            phonemes=phonemes,
            surah=loc.surah_num,
            ayah=loc.ayah_num,
            word_num=loc.word_num,
        ))
    return surah_words


def _build_chapter_reference(surah_num: int, ref_words: list[RefWord]) -> ChapterReference:
    """Build the ChapterReference (flattened phoneme index) for one surah.

    Args:
        surah_num: Surah number stored on the resulting reference.
        ref_words: The surah's words, in order.

    Returns:
        A ChapterReference holding the words, the concatenated phoneme list,
        a phoneme-index -> word-index map, and per-word start offsets with one
        trailing sentinel offset equal to the total phoneme count.
    """
    flat_phonemes = []
    flat_phone_to_word = []
    word_phone_offsets = []

    for word_idx, word in enumerate(ref_words):
        start = len(flat_phonemes)
        word_phone_offsets.append(start)
        flat_phonemes.extend(word.phonemes)
        # One back-reference to this word per phoneme just appended.
        flat_phone_to_word.extend([word_idx] * (len(flat_phonemes) - start))

    # Sentinel offset so that offsets[i + 1] is always a valid end bound.
    word_phone_offsets.append(len(flat_phonemes))

    if ref_words:
        avg_phones_per_word = len(flat_phonemes) / len(ref_words)
    else:
        avg_phones_per_word = _DEFAULT_AVG_PHONES_PER_WORD

    return ChapterReference(
        surah=surah_num,
        words=ref_words,
        avg_phones_per_word=avg_phones_per_word,
        flat_phonemes=flat_phonemes,
        flat_phone_to_word=flat_phone_to_word,
        word_phone_offsets=word_phone_offsets,
    )


def build_all_chapters() -> dict[int, ChapterReference]:
    """Phonemize entire Quran and build all ChapterReference objects.

    Returns:
        Mapping of surah number to its ChapterReference, inserted in
        ascending surah order.

    Raises:
        ValueError: If the phonemizer result has mismatched word and phoneme
            sequence lengths.
    """
    pm = get_phonemizer()

    print("Phonemizing entire Quran (1-114)...")
    result = pm.phonemize(ref="1-114", stops=["verse"])

    # NOTE(review): these are private attributes of the phonemizer result;
    # switch to a public accessor if the library provides one.
    words = result._words
    nested = result._nested

    print(f"Total words: {len(words)}")

    surah_words = _group_words_by_surah(words, nested)

    chapters: dict[int, ChapterReference] = {
        surah_num: _build_chapter_reference(surah_num, surah_words[surah_num])
        for surah_num in sorted(surah_words)
    }

    print(f"Built {len(chapters)} chapter references")
    return chapters


def main():
    """Build every chapter reference and pickle the mapping to PHONEME_CACHE_PATH."""
    chapters = build_all_chapters()

    output_path = Path(PHONEME_CACHE_PATH)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temp file and rename it into place so an interrupted
    # run never leaves a truncated pickle at the path the runtime loads.
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    with open(tmp_path, "wb") as f:
        pickle.dump(chapters, f, protocol=pickle.HIGHEST_PROTOCOL)
    tmp_path.replace(output_path)

    print(f"Saved to {output_path}")
    print(f"File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")


if __name__ == "__main__":
    main()