"""
Build phoneme cache for all 114 chapters.
Phonemizes the entire Quran in a single call and saves per-chapter
ChapterReference objects to a pickle file for fast loading at runtime.
Usage:
python scripts/build_phoneme_cache.py
"""
import pickle
import sys
from collections import defaultdict
from pathlib import Path
_project_root = Path(__file__).parent.parent.resolve()
sys.path.insert(0, str(_project_root))
from config import PHONEME_CACHE_PATH
from src.alignment.phoneme_matcher import ChapterReference, RefWord
from src.phonemizer_utils import get_phonemizer
def build_all_chapters() -> dict[int, ChapterReference]:
    """Phonemize the entire Quran once and assemble a ChapterReference per surah."""
    phonemizer = get_phonemizer()
    print("Phonemizing entire Quran (1-114)...")
    # Single phonemizer call over the full range; stops=["verse"] keeps verse boundaries.
    result = phonemizer.phonemize(ref="1-114", stops=["verse"])
    all_words = result._words
    all_phonemes = result._nested
    print(f"Total words: {len(all_words)}")

    # Bucket words (paired with their phoneme lists) by surah number.
    by_surah: dict[int, list[RefWord]] = defaultdict(list)
    for word, phones in zip(all_words, all_phonemes):
        loc = word.location
        by_surah[loc.surah_num].append(
            RefWord(
                text=word.text,
                phonemes=phones,
                surah=loc.surah_num,
                ayah=loc.ayah_num,
                word_num=loc.word_num,
            )
        )

    # Assemble one ChapterReference per surah, with flattened phoneme indexes
    # for fast alignment lookups at runtime.
    chapters: dict[int, ChapterReference] = {}
    for surah in sorted(by_surah):
        surah_ref_words = by_surah[surah]
        phone_total = sum(len(w.phonemes) for w in surah_ref_words)
        # Fallback of 4.0 guards the (unreachable here) empty-surah case.
        avg_phones = phone_total / len(surah_ref_words) if surah_ref_words else 4.0

        flat: list = []            # all phonemes of the surah, in order
        phone_owner: list[int] = []  # parallel list: word index owning each phoneme
        offsets: list[int] = []      # start offset of each word within `flat`
        for word_idx, w in enumerate(surah_ref_words):
            offsets.append(len(flat))
            flat.extend(w.phonemes)
            phone_owner.extend([word_idx] * len(w.phonemes))
        # Sentinel: one-past-the-end offset so offsets[i+1]-offsets[i] is valid.
        offsets.append(len(flat))

        chapters[surah] = ChapterReference(
            surah=surah,
            words=surah_ref_words,
            avg_phones_per_word=avg_phones,
            flat_phonemes=flat,
            flat_phone_to_word=phone_owner,
            word_phone_offsets=offsets,
        )

    print(f"Built {len(chapters)} chapter references")
    return chapters
def main():
    """Build all chapter references and pickle them to PHONEME_CACHE_PATH."""
    chapters = build_all_chapters()
    cache_path = Path(PHONEME_CACHE_PATH)
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    # Highest protocol keeps the pickle compact and fast to load.
    with cache_path.open("wb") as fh:
        pickle.dump(chapters, fh, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Saved to {cache_path}")
    print(f"File size: {cache_path.stat().st_size / 1024 / 1024:.2f} MB")
# Script entry point: build the phoneme cache and write it to disk.
if __name__ == "__main__":
    main()