Spaces:

hetchyy
/

Quran-multi-aligner

Running on Zero

File size: 2,851 Bytes

20e9692

"""
Build phoneme cache for all 114 chapters.

Phonemizes the entire Quran in a single call and saves per-chapter
ChapterReference objects to a pickle file for fast loading at runtime.

Usage:
    python scripts/build_phoneme_cache.py
"""

import pickle
import sys
from collections import defaultdict
from pathlib import Path

# Put the directory two levels above this file at the front of sys.path so the
# `config` and `src` imports below resolve when the script is run directly.
# This must stay ahead of those imports.
_project_root = Path(__file__).parent.parent.resolve()
sys.path.insert(0, str(_project_root))

from config import PHONEME_CACHE_PATH
from src.alignment.phoneme_matcher import ChapterReference, RefWord
from src.phonemizer_utils import get_phonemizer


def _assemble_chapter(surah_num, ref_words):
    """Flatten one surah's RefWord list into a ChapterReference.

    Args:
        surah_num: Surah number stored on the resulting reference.
        ref_words: The surah's RefWord entries, in the order they were grouped.

    Returns:
        A ChapterReference carrying the words, the flattened phoneme list,
        a phoneme-index -> word-index map, and per-word start offsets
        (with one trailing sentinel equal to the total phoneme count).
    """
    phone_count = sum(len(entry.phonemes) for entry in ref_words)
    if ref_words:
        mean_phones = phone_count / len(ref_words)
    else:
        # Fallback used when a chapter has no words at all.
        mean_phones = 4.0

    phones = []
    owners = []
    starts = []
    for position, entry in enumerate(ref_words):
        begin = len(phones)
        starts.append(begin)
        phones.extend(entry.phonemes)
        # One owner entry per phoneme just added for this word.
        owners.extend([position] * (len(phones) - begin))

    # Sentinel so that starts[i + 1] is always a valid end offset for word i.
    starts.append(len(phones))

    return ChapterReference(
        surah=surah_num,
        words=ref_words,
        avg_phones_per_word=mean_phones,
        flat_phonemes=phones,
        flat_phone_to_word=owners,
        word_phone_offsets=starts,
    )


def build_all_chapters() -> dict[int, ChapterReference]:
    """Phonemize surahs 1-114 in one call and build a reference per surah.

    Returns:
        A dict mapping surah number to its ChapterReference, inserted in
        ascending surah order.
    """
    phonemizer = get_phonemizer()

    print("Phonemizing entire Quran (1-114)...")
    phonemized = phonemizer.phonemize(ref="1-114", stops=["verse"])

    all_words = phonemized._words
    all_phonemes = phonemized._nested
    print(f"Total words: {len(all_words)}")

    # Bucket every word (paired with its phonemes) under its surah number.
    by_surah = defaultdict(list)
    for source_word, word_phonemes in zip(all_words, all_phonemes):
        where = source_word.location
        entry = RefWord(
            text=source_word.text,
            phonemes=word_phonemes,
            surah=where.surah_num,
            ayah=where.ayah_num,
            word_num=where.word_num,
        )
        by_surah[where.surah_num].append(entry)

    chapters = {
        surah_num: _assemble_chapter(surah_num, by_surah[surah_num])
        for surah_num in sorted(by_surah)
    }

    print(f"Built {len(chapters)} chapter references")
    return chapters


def main():
    """Build all chapter references and pickle them to PHONEME_CACHE_PATH.

    The pickle is first written to a sibling ``<name>.tmp`` file and then
    moved over the destination, so a failed or interrupted dump does not
    leave a partially written cache at the final path. The parent directory
    is created if it does not exist.
    """
    chapters = build_all_chapters()

    output_path = Path(PHONEME_CACHE_PATH)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Keep the temp file next to the destination so the final rename stays
    # within one directory (and therefore one filesystem).
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(chapters, f, protocol=pickle.HIGHEST_PROTOCOL)
        tmp_path.replace(output_path)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)

    print(f"Saved to {output_path}")
    print(f"File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")


# Build and save the cache only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()