#!/usr/bin/env python3
"""
Batch CTC Alignment for All Abdul Basit Surahs
Processes all 114 surahs with the full pipeline:
1. CTC forced alignment (wav2vec2)
2. Grapheme matching (App.tsx compatible)
3. Export to MahQuranApp format
Usage:
cd /Documents/26apps/tajweedsst
source venv/bin/activate
python batch_align_all.py
"""
import json
import sys
import time
import torch
from pathlib import Path
from ctc_forced_aligner import (
load_audio,
load_alignment_model,
generate_emissions,
preprocess_text,
get_alignments,
get_spans,
postprocess_results,
)
# --- Config ----------------------------------------------------------------
# Paths into the MahQuranApp web project: verse JSON in, timing JSON out,
# per-surah MP3s read from AUDIO_DIR.
PROJECT_ROOT = Path("/home/absolut7/Documents/26apps/MahQuranApp")
VERSES_PATH = PROJECT_ROOT / "public/data/verses_v4.json"
OUTPUT_DIR = PROJECT_ROOT / "public/data/abdul_basit"
AUDIO_DIR = PROJECT_ROOT / "public/audio/abdul_basit"
# Prefer GPU when available; main() picks float16 on CUDA, float32 on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Batch size forwarded to generate_emissions() in process_surah().
BATCH_SIZE = 4
# Exact same DIACRITICS as App.tsx line 176 — keep this set in lockstep with
# the front end so grapheme splitting matches what the app renders.
DIACRITICS = set(['ً', 'ٌ', 'ٍ', 'َ', 'ُ', 'ِ', 'ّ', 'ْ', 'ٰ', 'ۖ', 'ۗ', 'ۘ', 'ۙ', 'ۚ', 'ۛ', 'ۜ', 'ٔ', 'ٓ', 'ـ'])
def is_diacritic(ch: str) -> bool:
    """Return True if *ch* is an Arabic diacritic or annotation mark.

    Mirrors the character test in App.tsx splitIntoGraphemes: membership in
    the shared DIACRITICS set, the harakat/tanween block (U+064B-U+0652), or
    the Quranic annotation signs (U+0610-U+061A).
    """
    if ch in DIACRITICS:
        return True
    code = ord(ch)
    return 0x064B <= code <= 0x0652 or 0x0610 <= code <= 0x061A
def split_into_graphemes(text: str) -> list[str]:
    """Split *text* into base-letter + trailing-diacritic clusters.

    Mirrors App.tsx splitIntoGraphemes exactly: a space flushes the pending
    cluster and is never emitted; a diacritic attaches to the cluster being
    built; any other character (including a diacritic with nothing to attach
    to) starts a new cluster.
    """
    clusters = []
    pending = ''
    for ch in text:
        if ch == ' ':
            # Word boundary: emit the cluster in progress, drop the space.
            if pending:
                clusters.append(pending)
            pending = ''
        elif pending and is_diacritic(ch):
            pending += ch
        else:
            if pending:
                clusters.append(pending)
            pending = ch
    if pending:
        clusters.append(pending)
    return clusters
def load_quran_text(all_verses: dict, surah_num: int) -> str:
    """Return the full text of a surah: every verse's 'text', space-joined.

    *all_verses* maps surah numbers (as strings) to lists of verse dicts; a
    missing surah or a verse without 'text' degrades to the empty string.
    """
    surah_verses = all_verses.get(str(surah_num), [])
    parts = [verse.get('text', '') for verse in surah_verses]
    return ' '.join(parts)
def get_grapheme_list(all_verses: dict, surah_num: int) -> list[dict]:
    """Get graphemes with ayah info matching App.tsx rendering.

    Returns a list of ``{'char': grapheme, 'ayah': ayah_number}`` dicts in
    reading order for every grapheme cluster in the surah.

    Fix: use ``v.get('text', '')`` instead of ``v['text']`` so a malformed
    verse record degrades to "no graphemes" rather than raising KeyError —
    consistent with load_quran_text, which already tolerates missing 'text'.
    """
    verses = all_verses.get(str(surah_num), [])
    grapheme_list = []
    for v in verses:
        for word in v.get('text', '').split():
            for g in split_into_graphemes(word):
                # 'ayah' is kept as a hard lookup: a timing entry without an
                # ayah number would be meaningless downstream.
                grapheme_list.append({'char': g, 'ayah': v['ayah']})
    return grapheme_list
def process_surah(surah_num, alignment_model, alignment_tokenizer, all_verses):
    """Process a single surah through the full pipeline.

    Runs CTC forced alignment on the surah's MP3, expands word timestamps to
    per-character timings, maps those onto App.tsx-compatible graphemes, and
    writes the result to ``letter_timing_<n>.json``.

    Args:
        surah_num: 1-based surah number; selects the audio file and output name.
        alignment_model: loaded wav2vec2 alignment model (supplies dtype/device).
        alignment_tokenizer: tokenizer paired with the alignment model.
        all_verses: dict mapping surah number strings to verse-dict lists.

    Returns:
        (count, status): count is the number of timing entries written, or
        None on failure; status is a human-readable message.
    """
    audio_path = AUDIO_DIR / f"surah_{surah_num:03d}.mp3"
    output_path = OUTPUT_DIR / f"letter_timing_{surah_num}.json"
    if not audio_path.exists():
        return None, "No audio file"
    text = load_quran_text(all_verses, surah_num)
    if not text.strip():
        return None, "No verse text"
    grapheme_list = get_grapheme_list(all_verses, surah_num)
    try:
        # Step 1: Load audio (resampled/cast to the model's dtype and device)
        audio_waveform = load_audio(str(audio_path), alignment_model.dtype, alignment_model.device)
        # Step 2: Generate CTC emissions
        emissions, stride = generate_emissions(
            alignment_model, audio_waveform, batch_size=BATCH_SIZE
        )
        # Step 3: Preprocess text (romanized Arabic for the CTC vocabulary)
        tokens_starred, text_starred = preprocess_text(
            text, romanize=True, language="ara",
        )
        # Step 4: Get alignments
        segments, scores, blank_token = get_alignments(
            emissions, tokens_starred, alignment_tokenizer,
        )
        # Step 5: Get spans & post-process into word-level timestamps (seconds)
        spans = get_spans(tokens_starred, segments, blank_token)
        word_timestamps = postprocess_results(text_starred, spans, stride, scores)
        # Step 6: Expand to character-level by splitting each word's span
        # evenly across its characters (linear interpolation, not per-char CTC).
        char_timings = []
        for wt in word_timestamps:
            word = wt['text']
            start = wt['start']
            end = wt['end']
            duration = end - start
            # Guard against empty word text to avoid ZeroDivisionError.
            char_dur = duration / len(word) if word else 0
            for i, char in enumerate(word):
                # Whitespace keeps its index slot (advancing the interpolation)
                # but produces no timing entry.
                if not char.isspace():
                    char_timings.append({
                        'start': start + i * char_dur,
                        'end': start + (i + 1) * char_dur,
                    })
        # Step 7: Map CTC chars to graphemes — consume one char timing per
        # code point of each grapheme, converting seconds to integer ms.
        # NOTE(review): char_timings come from the *romanized* text while
        # graphemes come from the Arabic text; this assumes the positional
        # counts line up closely enough — confirm against output quality.
        timing = []
        ci = 0
        for gi, ginfo in enumerate(grapheme_list):
            g = ginfo['char']
            s, e = None, None
            for _ in range(len(g)):
                if ci < len(char_timings):
                    if s is None:
                        s = int(char_timings[ci]['start'] * 1000)
                    e = int(char_timings[ci]['end'] * 1000)
                    ci += 1
            if s is None:
                # Ran out of CTC char timings: synthesize a 100 ms slot
                # immediately after the previous grapheme (or at 0).
                s = timing[-1]['end'] if timing else 0
                e = s + 100
            timing.append({
                'idx': gi,
                'char': g,
                'ayah': ginfo['ayah'],
                'start': s,
                'end': e,
                'duration': e - s,
                # NOTE(review): gi // 4 groups graphemes in fixed fours —
                # presumably a placeholder, not real word boundaries; verify
                # against the App.tsx consumer of 'wordIdx'.
                'wordIdx': gi // 4,
                'weight': 1.0
            })
        # Save as UTF-8 JSON with Arabic characters kept readable.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(timing, f, ensure_ascii=False, indent=2)
        return len(timing), f"OK ({len(grapheme_list)} graphemes)"
    except Exception as ex:
        # Best-effort batch processing: report the failure, keep the batch going.
        return None, f"Error: {ex}"
def main():
    """Align all 114 surahs sequentially and print a per-surah and final summary."""
    t0 = time.time()
    divider = "=" * 60
    print(divider)
    print("Batch CTC Alignment - Abdul Basit (All 114 Surahs)")
    print(f"Device: {DEVICE}")
    print(divider)

    # Load the wav2vec2 alignment model once and reuse it for every surah;
    # half precision on GPU, full precision on CPU.
    print("\n[1] Loading wav2vec alignment model...")
    alignment_model, alignment_tokenizer = load_alignment_model(
        DEVICE,
        dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    )
    print(" Model loaded.")

    # Read the complete verse corpus once up front.
    print("[2] Loading verses...")
    with open(VERSES_PATH, 'r', encoding='utf-8') as f:
        all_verses = json.load(f)
    print(f" Loaded {len(all_verses)} surahs")

    # Process surahs 1..114 in order, collecting (surah, count, status).
    results = []
    for surah_num in range(1, 115):
        elapsed = time.time() - t0
        print(f"\n[Surah {surah_num:03d}/114] ({elapsed:.0f}s elapsed)...")
        count, status = process_surah(
            surah_num, alignment_model, alignment_tokenizer, all_verses
        )
        results.append((surah_num, count, status))
        if count:
            print(f" ✓ {count} letters - {status}")
        else:
            print(f" ✗ {status}")

    # Final summary: a surah counts as success when it produced any timings.
    elapsed = time.time() - t0
    ok = sum(1 for _, c, _ in results if c)
    fail = sum(1 for _, c, _ in results if not c)
    print("\n" + divider)
    print(f"BATCH COMPLETE in {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f" ✓ Success: {ok}/114")
    print(f" ✗ Failed: {fail}/114")
    print(divider)

    # Drop the model reference and release cached GPU memory (no-op on CPU).
    del alignment_model
    torch.cuda.empty_cache()


if __name__ == "__main__":
    main()
|