# Spaces: Running on Zero
| """ | |
| Build phoneme cache for all 114 chapters. | |
| Phonemizes the entire Quran in a single call and saves per-chapter | |
| ChapterReference objects to a pickle file for fast loading at runtime. | |
| Usage: | |
| python scripts/build_phoneme_cache.py | |
| """ | |
| import pickle | |
| import sys | |
| from collections import defaultdict | |
| from pathlib import Path | |
| _project_root = Path(__file__).parent.parent.resolve() | |
| sys.path.insert(0, str(_project_root)) | |
| from config import PHONEME_CACHE_PATH | |
| from src.alignment.phoneme_matcher import ChapterReference, RefWord | |
| from src.phonemizer_utils import get_phonemizer | |
def build_all_chapters() -> dict[int, ChapterReference]:
    """Phonemize the reference range "1-114" once and build one ChapterReference per surah.

    The phonemizer is invoked a single time for the whole range; its words
    are then bucketed by ``location.surah_num`` and each bucket is turned
    into a ChapterReference carrying flattened phoneme lookup tables.

    Returns:
        Mapping of surah number to its ChapterReference, inserted in
        ascending surah order.
    """
    phonemizer = get_phonemizer()

    print("Phonemizing entire Quran (1-114)...")
    phonemized = phonemizer.phonemize(ref="1-114", stops=["verse"])
    all_words = phonemized._words
    all_phonemes = phonemized._nested
    print(f"Total words: {len(all_words)}")

    # Bucket the words by surah.
    # NOTE(review): zip() stops at the shorter input, so this assumes
    # `_words` and `_nested` are parallel sequences -- confirm against the
    # phonemizer.
    by_surah: dict[int, list[RefWord]] = defaultdict(list)
    for entry, entry_phonemes in zip(all_words, all_phonemes):
        location = entry.location
        ref_word = RefWord(
            text=entry.text,
            phonemes=entry_phonemes,
            surah=location.surah_num,
            ayah=location.ayah_num,
            word_num=location.word_num,
        )
        by_surah[location.surah_num].append(ref_word)

    # Turn each bucket into a ChapterReference, lowest surah number first.
    chapters: dict[int, ChapterReference] = {}
    for surah_num in sorted(by_surah):
        chapter_words = by_surah[surah_num]

        phone_counts = [len(w.phonemes) for w in chapter_words]
        if chapter_words:
            mean_phones = sum(phone_counts) / len(chapter_words)
        else:
            # Fallback for an empty chapter; a bucket only exists once a
            # word has been appended to it, so this is purely defensive.
            mean_phones = 4.0

        phones = []         # every phoneme of the chapter, in word order
        phone_owner = []    # phones[i] belongs to chapter_words[phone_owner[i]]
        offsets = []        # offsets[k] = index in `phones` where word k starts
        for position, w in enumerate(chapter_words):
            start = len(phones)
            offsets.append(start)
            phones.extend(w.phonemes)
            phone_owner.extend([position] * (len(phones) - start))
        # Sentinel: one past the last phoneme, so word k spans
        # offsets[k]:offsets[k + 1] for every k.
        offsets.append(len(phones))

        chapters[surah_num] = ChapterReference(
            surah=surah_num,
            words=chapter_words,
            avg_phones_per_word=mean_phones,
            flat_phonemes=phones,
            flat_phone_to_word=phone_owner,
            word_phone_offsets=offsets,
        )

    print(f"Built {len(chapters)} chapter references")
    return chapters
def main():
    """Build all chapter references and pickle them to PHONEME_CACHE_PATH.

    Creates the destination directory if it is missing, writes the mapping
    with the highest available pickle protocol, then prints the output path
    and the resulting file size in MB.
    """
    chapters = build_all_chapters()

    cache_file = Path(PHONEME_CACHE_PATH)
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    with cache_file.open("wb") as sink:
        pickle.dump(chapters, sink, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved to {cache_file}")
    size_mb = cache_file.stat().st_size / 1024 / 1024
    print(f"File size: {size_mb:.2f} MB")


# Run the cache build only when this file is executed as a script.
if __name__ == "__main__":
    main()