File size: 6,932 Bytes

21f2aa3

#!/usr/bin/env python3
"""
Batch CTC Alignment for All Abdul Basit Surahs
Processes all 114 surahs with the full pipeline:
1. CTC forced alignment (wav2vec2)
2. Grapheme matching (App.tsx compatible)
3. Export to MahQuranApp format

Usage:
    cd ~/Documents/26apps/tajweedsst
    source venv/bin/activate
    python batch_align_all.py
"""
import json
import sys
import time
import torch
from pathlib import Path
from ctc_forced_aligner import (
    load_audio,
    load_alignment_model,
    generate_emissions,
    preprocess_text,
    get_alignments,
    get_spans,
    postprocess_results,
)

# --- Configuration ---------------------------------------------------------
# Every input/output location hangs off the MahQuranApp checkout.
PROJECT_ROOT = Path("/home/absolut7/Documents/26apps/MahQuranApp")
VERSES_PATH = PROJECT_ROOT / "public" / "data" / "verses_v4.json"
OUTPUT_DIR = PROJECT_ROOT / "public" / "data" / "abdul_basit"
AUDIO_DIR = PROJECT_ROOT / "public" / "audio" / "abdul_basit"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Batch size handed to generate_emissions.
BATCH_SIZE = 4

# Combining marks that attach to the preceding base letter.
# Kept identical to the DIACRITICS list in App.tsx (line 176 there).
DIACRITICS = {
    'ً', 'ٌ', 'ٍ', 'َ', 'ُ', 'ِ', 'ّ', 'ْ',
    'ٰ', 'ۖ', 'ۗ', 'ۘ', 'ۙ', 'ۚ', 'ۛ', 'ۜ',
    'ٔ', 'ٓ', 'ـ',
}


def is_diacritic(ch):
    """Return True when ``ch`` is a mark that belongs to the previous letter.

    A character qualifies if it is listed in ``DIACRITICS`` or if its code
    point lies in U+064B..U+0652 or U+0610..U+061A. Intended to agree with
    the check used by splitIntoGraphemes in App.tsx (not visible here).
    """
    if ch in DIACRITICS:
        return True
    code_point = ord(ch)
    in_first_range = 0x064B <= code_point <= 0x0652
    in_second_range = 0x0610 <= code_point <= 0x061A
    return in_first_range or in_second_range


def split_into_graphemes(text):
    """Break ``text`` into clusters of one base character plus its diacritics.

    A space closes the cluster in progress and is itself dropped. A diacritic
    is appended to the cluster in progress; when there is none (for example
    at the very start), the diacritic opens a cluster of its own. Any other
    character closes the current cluster and starts a new one. Intended to
    follow the same rules as splitIntoGraphemes in App.tsx.
    """
    clusters = []
    pending = ''
    for symbol in text:
        if symbol == ' ':
            # Word boundary: flush whatever has been collected so far.
            if pending:
                clusters.append(pending)
            pending = ''
            continue
        if is_diacritic(symbol) and pending:
            pending += symbol
            continue
        # New base character (or a diacritic with nothing to attach to).
        if pending:
            clusters.append(pending)
        pending = symbol
    if pending:
        clusters.append(pending)
    return clusters


def load_quran_text(all_verses, surah_num):
    """Return the whole surah as one string, verses separated by a space.

    ``all_verses`` is looked up with ``str(surah_num)``; an unknown surah
    yields an empty string, and a verse without a 'text' key contributes an
    empty piece.
    """
    surah_verses = all_verses.get(str(surah_num), [])
    verse_texts = [verse.get('text', '') for verse in surah_verses]
    return ' '.join(verse_texts)


def get_grapheme_list(all_verses, surah_num):
    """Return the surah's graphemes in reading order, tagged with their ayah.

    Each verse's text is split on whitespace into words, every word is split
    with ``split_into_graphemes``, and each resulting cluster becomes a dict
    ``{'char': <cluster>, 'ayah': <verse['ayah']>}``. An unknown surah gives
    an empty list.
    """
    entries = []
    for verse in all_verses.get(str(surah_num), []):
        for word in verse['text'].split():
            entries.extend(
                {'char': cluster, 'ayah': verse['ayah']}
                for cluster in split_into_graphemes(word)
            )
    return entries


def _expand_to_char_timings(word_timestamps):
    """Spread each aligned word's time span evenly over its characters.

    Every entry of ``word_timestamps`` is read through its 'text', 'start'
    and 'end' keys. Whitespace characters keep their time slot (the position
    index still advances) but produce no output entry.

    Returns:
        A list of ``{'start': float, 'end': float}`` dicts, one per
        non-whitespace character, in the same unit as the input timestamps.
    """
    char_timings = []
    for wt in word_timestamps:
        word = wt['text']
        start = wt['start']
        end = wt['end']
        # Guard against an empty word so the division cannot fail.
        char_dur = (end - start) / len(word) if word else 0
        for i, char in enumerate(word):
            if not char.isspace():
                char_timings.append({
                    'start': start + i * char_dur,
                    'end': start + (i + 1) * char_dur,
                })
    return char_timings


def _map_chars_to_graphemes(char_timings, grapheme_list):
    """Assign each grapheme the time span of the characters it is made of.

    Graphemes consume ``len(grapheme)`` entries from ``char_timings`` in
    order. Start/end are multiplied by 1000 and truncated to int (presumably
    seconds -> milliseconds; confirm against the aligner's output unit).
    When the character timings run out, the grapheme gets a placeholder
    100-unit span starting where the previous entry ended (or at 0).

    Returns:
        A list of dicts with keys 'idx', 'char', 'ayah', 'start', 'end',
        'duration', 'wordIdx' and 'weight'.
    """
    timing = []
    ci = 0  # cursor into char_timings
    for gi, ginfo in enumerate(grapheme_list):
        g = ginfo['char']
        # Take as many character slots as this grapheme has code points;
        # the slice is shorter (possibly empty) once the timings run out.
        consumed = char_timings[ci:ci + len(g)]
        ci += len(consumed)

        if consumed:
            s = int(consumed[0]['start'] * 1000)
            e = int(consumed[-1]['end'] * 1000)
        else:
            # No aligned audio left for this grapheme: chain a placeholder
            # span onto the previous one so timings stay monotonic.
            s = timing[-1]['end'] if timing else 0
            e = s + 100

        timing.append({
            'idx': gi,
            'char': g,
            'ayah': ginfo['ayah'],
            'start': s,
            'end': e,
            'duration': e - s,
            # NOTE(review): this is a fixed bucket of four graphemes, not a
            # real word index -- confirm what the consumer expects.
            'wordIdx': gi // 4,
            'weight': 1.0
        })
    return timing


def process_surah(surah_num, alignment_model, alignment_tokenizer, all_verses):
    """Align one surah's audio to its text and write per-grapheme timings.

    Pipeline: load audio -> CTC emissions -> text preprocessing -> forced
    alignment -> word spans -> even per-character split -> grapheme mapping
    -> JSON file ``letter_timing_<surah_num>.json`` in ``OUTPUT_DIR``.

    Args:
        surah_num: Surah number; selects ``surah_NNN.mp3`` in ``AUDIO_DIR``
            and names the output file.
        alignment_model: Model from ``load_alignment_model``; its ``dtype``
            and ``device`` are used when loading the audio.
        alignment_tokenizer: Tokenizer passed to ``get_alignments``.
        all_verses: Mapping of surah number (as a string) to a list of verse
            dicts carrying 'text' and 'ayah'.

    Returns:
        ``(count, status)``. ``count`` is the number of timing entries
        written, or ``None`` when the surah was skipped or failed;
        ``status`` is a short human-readable message.
    """
    audio_path = AUDIO_DIR / f"surah_{surah_num:03d}.mp3"
    output_path = OUTPUT_DIR / f"letter_timing_{surah_num}.json"

    if not audio_path.exists():
        return None, "No audio file"

    text = load_quran_text(all_verses, surah_num)
    if not text.strip():
        return None, "No verse text"

    grapheme_list = get_grapheme_list(all_verses, surah_num)

    # Any failure below is reported through the status string so that one
    # bad surah does not abort the whole batch.
    try:
        # Step 1: Load audio
        audio_waveform = load_audio(str(audio_path), alignment_model.dtype, alignment_model.device)

        # Step 2: Generate CTC emissions
        emissions, stride = generate_emissions(
            alignment_model, audio_waveform, batch_size=BATCH_SIZE
        )

        # Step 3: Preprocess text
        tokens_starred, text_starred = preprocess_text(
            text, romanize=True, language="ara",
        )

        # Step 4: Get alignments
        segments, scores, blank_token = get_alignments(
            emissions, tokens_starred, alignment_tokenizer,
        )

        # Step 5: Get spans & post-process
        spans = get_spans(tokens_starred, segments, blank_token)
        word_timestamps = postprocess_results(text_starred, spans, stride, scores)

        # Step 6: Expand to character-level
        char_timings = _expand_to_char_timings(word_timestamps)

        # Step 7: Map CTC chars to graphemes
        timing = _map_chars_to_graphemes(char_timings, grapheme_list)

        # Save. Create the output directory first: without it every surah
        # would fail at open() on a fresh checkout.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(timing, f, ensure_ascii=False, indent=2)

        return len(timing), f"OK ({len(grapheme_list)} graphemes)"

    except Exception as ex:
        return None, f"Error: {ex}"


def main():
    """Run the alignment for surahs 1-114 and print a log plus a summary.

    Loads the alignment model once, reads the verses JSON from
    ``VERSES_PATH``, calls ``process_surah`` for each surah in order, then
    prints success/failure totals and releases the model.
    """
    started_at = time.time()
    rule = "=" * 60

    print(rule)
    print("Batch CTC Alignment - Abdul Basit (All 114 Surahs)")
    print(f"Device: {DEVICE}")
    print(rule)

    # One model instance is shared by every surah.
    print("\n[1] Loading wav2vec alignment model...")
    if DEVICE == "cuda":
        model_dtype = torch.float16
    else:
        model_dtype = torch.float32
    alignment_model, alignment_tokenizer = load_alignment_model(
        DEVICE,
        dtype=model_dtype,
    )
    print("    Model loaded.")

    print("[2] Loading verses...")
    with open(VERSES_PATH, 'r', encoding='utf-8') as verses_file:
        all_verses = json.load(verses_file)
    print(f"    Loaded {len(all_verses)} surahs")

    outcomes = []
    for surah_num in range(1, 115):
        so_far = time.time() - started_at
        print(f"\n[Surah {surah_num:03d}/114] ({so_far:.0f}s elapsed)...")

        count, status = process_surah(
            surah_num, alignment_model, alignment_tokenizer, all_verses
        )
        outcomes.append((surah_num, count, status))

        # A falsy count (None or 0) is reported as a failure.
        if not count:
            print(f"    ✗ {status}")
        else:
            print(f"    ✓ {count} letters - {status}")

    total = time.time() - started_at
    succeeded = len([entry for entry in outcomes if entry[1]])
    failed = len(outcomes) - succeeded

    print(f"\n{rule}")
    print(f"BATCH COMPLETE in {total:.0f}s ({total / 60:.1f}min)")
    print(f"  ✓ Success: {succeeded}/114")
    print(f"  ✗ Failed:  {failed}/114")
    print(rule)

    # Drop the model reference and release cached GPU memory.
    del alignment_model
    torch.cuda.empty_cache()


# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()