# Quran-multi-aligner / scripts / build_phoneme_cache.py
# Author: hetchyy
# Commit: 20e9692 ("Initial commit")
"""
Build phoneme cache for all 114 chapters.
Phonemizes the entire Quran in a single call and saves per-chapter
ChapterReference objects to a pickle file for fast loading at runtime.
Usage:
python scripts/build_phoneme_cache.py
"""
import pickle
import sys
from collections import defaultdict
from pathlib import Path
_project_root = Path(__file__).parent.parent.resolve()
sys.path.insert(0, str(_project_root))
from config import PHONEME_CACHE_PATH
from src.alignment.phoneme_matcher import ChapterReference, RefWord
from src.phonemizer_utils import get_phonemizer
def build_all_chapters() -> dict[int, ChapterReference]:
    """Phonemize surahs 1-114 in one call and assemble per-surah references.

    The phonemizer is asked for the range ``"1-114"`` with verse stops. Its
    word list and per-word phoneme lists are paired up, bucketed by surah
    number, and each bucket is turned into a ``ChapterReference`` carrying:

    * the surah's ``RefWord`` list,
    * the mean number of phonemes per word,
    * one flat list of every phoneme in the surah,
    * a parallel list mapping each flat phoneme to its word index,
    * per-word start offsets into the flat list, plus one trailing
      sentinel equal to the flat list's length.

    Returns:
        A dict keyed by surah number, populated in ascending surah order.
    """
    phonemizer = get_phonemizer()

    print("Phonemizing entire Quran (1-114)...")
    result = phonemizer.phonemize(ref="1-114", stops=["verse"])
    words = result._words
    nested = result._nested
    print(f"Total words: {len(words)}")

    # Bucket every word, paired with its phonemes, under its surah number.
    by_surah: dict[int, list[RefWord]] = defaultdict(list)
    for entry, phones in zip(words, nested):
        where = entry.location
        bucket = by_surah[where.surah_num]
        bucket.append(
            RefWord(
                text=entry.text,
                phonemes=phones,
                surah=where.surah_num,
                ayah=where.ayah_num,
                word_num=where.word_num,
            )
        )

    # Turn each bucket into a ChapterReference, lowest surah number first.
    chapters: dict[int, ChapterReference] = {}
    for surah_num in sorted(by_surah):
        ref_words = by_surah[surah_num]

        phone_total = sum(len(ref.phonemes) for ref in ref_words)
        if ref_words:
            avg_phones_per_word = phone_total / len(ref_words)
        else:
            # Fallback for an empty bucket; buckets are only created when a
            # word is appended, so this branch is purely defensive.
            avg_phones_per_word = 4.0

        # Every phoneme of the surah in word order, and for each one the
        # index of the word it came from.
        flat_phonemes = [ph for ref in ref_words for ph in ref.phonemes]
        flat_phone_to_word = [
            idx for idx, ref in enumerate(ref_words) for _ in ref.phonemes
        ]

        # Running start offsets; the final entry is the sentinel, equal to
        # the total number of flat phonemes.
        word_phone_offsets = [0]
        for ref in ref_words:
            word_phone_offsets.append(word_phone_offsets[-1] + len(ref.phonemes))

        chapters[surah_num] = ChapterReference(
            surah=surah_num,
            words=ref_words,
            avg_phones_per_word=avg_phones_per_word,
            flat_phonemes=flat_phonemes,
            flat_phone_to_word=flat_phone_to_word,
            word_phone_offsets=word_phone_offsets,
        )

    print(f"Built {len(chapters)} chapter references")
    return chapters
def main():
    """Build every chapter reference and pickle the result to the cache path.

    The destination is ``PHONEME_CACHE_PATH``; its parent directory is
    created if missing. After writing, the output path and the file size
    (bytes divided by 1024 twice) are printed.
    """
    chapters = build_all_chapters()

    output_path = Path(PHONEME_CACHE_PATH)
    cache_dir = output_path.parent
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Stream the pickle straight to disk using the newest protocol available.
    with output_path.open("wb") as cache_file:
        pickle.dump(chapters, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

    size_mb = output_path.stat().st_size / 1024 / 1024
    print(f"Saved to {output_path}")
    print(f"File size: {size_mb:.2f} MB")
# Build the cache only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()