hetchyy committed on
Commit
20e9692
·
0 Parent(s):

Initial commit

Browse files
Files changed (50) hide show
  1. .claude/skills/phonemizer/SKILL.md +517 -0
  2. .gitattributes +8 -0
  3. .gitignore +54 -0
  4. README.md +16 -0
  5. app.py +0 -0
  6. config.py +322 -0
  7. data/112.mp3 +3 -0
  8. data/7.mp3 +3 -0
  9. data/84.mp3 +3 -0
  10. data/DigitalKhattV2.otf +3 -0
  11. data/Juz' 30.mp3 +3 -0
  12. data/digital_khatt_v2_script.json +3 -0
  13. data/font_data.py +0 -0
  14. data/ligatures.json +1 -0
  15. data/phoneme_cache.pkl +3 -0
  16. data/phoneme_ngram_index_5.pkl +3 -0
  17. data/phoneme_sub_costs.json +67 -0
  18. data/qpc_hafs.json +3 -0
  19. data/surah-name-v2.ttf +3 -0
  20. data/surah_info.json +0 -0
  21. docs/api.md +300 -0
  22. docs/usage-logging.md +370 -0
  23. requirements.txt +13 -0
  24. scripts/add_open_tanween.py +57 -0
  25. scripts/build_phoneme_cache.py +95 -0
  26. scripts/build_phoneme_ngram_index.py +110 -0
  27. scripts/export_onnx.py +160 -0
  28. scripts/fix_stop_sign_spacing.py +60 -0
  29. setup.py +15 -0
  30. src/__init__.py +1 -0
  31. src/_dp_core.pyx +357 -0
  32. src/alignment/__init__.py +0 -0
  33. src/alignment/alignment_pipeline.py +377 -0
  34. src/alignment/ngram_index.py +39 -0
  35. src/alignment/phoneme_anchor.py +293 -0
  36. src/alignment/phoneme_asr.py +355 -0
  37. src/alignment/phoneme_matcher.py +590 -0
  38. src/alignment/phoneme_matcher_cache.py +59 -0
  39. src/alignment/special_segments.py +295 -0
  40. src/phonemizer_utils.py +12 -0
  41. src/quran_index.py +150 -0
  42. src/segment_processor.py +20 -0
  43. src/segment_types.py +153 -0
  44. src/segmenter/__init__.py +0 -0
  45. src/segmenter/segmenter_aoti.py +379 -0
  46. src/segmenter/segmenter_model.py +158 -0
  47. src/segmenter/vad.py +97 -0
  48. src/zero_gpu.py +146 -0
  49. utils/__init__.py +0 -0
  50. utils/usage_logger.py +593 -0
.claude/skills/phonemizer/SKILL.md ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: phonemizer
3
+ description: Domain expert in Quranic phonetics, Tajweed rules, Arabic Unicode, and the Quranic Phonemizer system. Use for phoneme analysis, IPA mappings, Unicode character inspection, and investigating Tajweed rule patterns.
4
+ allowed-tools:
5
+ - Glob
6
+ - Grep
7
+ - Read
8
+ - Bash
9
+ - Python
10
+ - Python3
11
+ ---
12
+
13
+ # Phonemizer Domain Expert
14
+
15
+ You are a domain expert in Quranic phonetics, Tajweed rules, Arabic Unicode, and the Quranic Phonemizer system. Your role is to help understand patterns and associations between:
16
+ - **Tajweed rules** (pronunciation rules for Quranic recitation)
17
+ - **Unicode codepoints** (Arabic script encoding)
18
+ - **IPA phonemes** (International Phonetic Alphabet representations)
19
+ - **Arabic graphemes** (written characters and diacritics)
20
+
21
+ ---
22
+
23
+ ## Phonemizer API
24
+
25
+ The phonemizer is located at `/mnt/c/Users/ahmed/Documents/Uni/Thesis/Code/phonemizer/`.
26
+
27
+ ### Getting Text and Phonemes
28
+
29
+ ```python
30
+ import sys
31
+ sys.path.insert(0, "/mnt/c/Users/ahmed/Documents/Uni/Thesis/Code/phonemizer")
32
+ from core.phonemizer import Phonemizer
33
+
34
+ pm = Phonemizer()
35
+
36
+ # Get a verse by reference (surah:ayah or surah:ayah:word)
37
+ result = pm.phonemize(ref="2:255") # Ayat al-Kursi
38
+
39
+ # Access text
40
+ text = result.text() # Arabic text with verse markers
41
+
42
+ # Access phonemes
43
+ phonemes_str = result.phonemes_str(phoneme_sep=" ", word_sep="")
44
+ phonemes_list = result.phonemes_list(split='word') # [[phonemes], ...]
45
+ ```
46
+
47
+ ### Reference Formats
48
+
49
+ | Format | Example | Meaning |
50
+ |--------|---------|---------|
51
+ | Surah | `"1"` | Entire Al-Fatiha |
52
+ | Verse | `"2:255"` | Ayat al-Kursi |
53
+ | Word | `"1:1:2"` | Word 2 of 1:1 |
54
+ | Range | `"1:1 - 1:5"` | Verses 1-5 of surah 1 |
55
+ | Cross-surah | `"112 - 114"` | Last 3 surahs |
56
+
57
+ ### Getting Detailed Mappings
58
+
59
+ ```python
60
+ mapping = result.get_mapping()
61
+
62
+ # Flat phoneme sequence
63
+ all_phonemes = mapping.phoneme_sequence # ["b", "i", "s", "m", ...]
64
+
65
+ # Word-level data
66
+ for word in mapping.words:
67
+ print(word.location) # "1:1:1"
68
+ print(word.text) # "بِسْمِ"
69
+ print(word.phonemes) # ["b", "i", "s", "m", "i"]
70
+
71
+ # Letter-level breakdown
72
+ for letter in word.letter_mappings:
73
+ print(f" {letter.char} → {letter.phonemes}")
74
+ print(f" diacritic: {letter.diacritic}")
75
+ print(f" rules: {letter.letter_rules}")
76
+
77
+ # Alignment: phoneme → source letter
78
+ for entry in mapping.alignment:
79
+ print(f"Phoneme '{entry.phoneme}' from '{entry.source_char}'")
80
+ print(f" Tajweed rules: {entry.rules}")
81
+ ```
82
+
83
+ ---
84
+
85
+ ## Data Structures
86
+
87
+ The phonemizer produces structured mappings that the recitation app transforms into frozen dataclasses.
88
+
89
+ ### Phonemizer Core Structures (`phonemizer/core/mapping.py`)
90
+
91
+ #### PhonemizationMapping (root)
92
+ ```python
93
+ @dataclass
94
+ class PhonemizationMapping:
95
+ ref: str # Reference string
96
+ text: str # Arabic text
97
+ words: List[WordMapping] # Per-word breakdown
98
+ phoneme_sequence: List[str] # Flat phoneme list
99
+ alignment: List[AlignmentEntry] # Phoneme→letter mapping
100
+ ```
101
+
102
+ #### WordMapping
103
+ ```python
104
+ @dataclass
105
+ class WordMapping:
106
+ location: str # "surah:ayah:word"
107
+ text: str # Arabic word
108
+ phonemes: List[str] # Word's phonemes
109
+ letter_mappings: List[LetterMapping]
110
+ leading_symbols: List[OtherSymbolMapping] # Before word
111
+ trailing_symbols: List[OtherSymbolMapping] # After word (verse markers)
112
+ is_special_word: bool # Allah, etc.
113
+ is_starting: bool # First word of segment
114
+ is_stopping: bool # Last word (affects phonemization)
115
+ madd_mappings: List[MaddMapping] # Long vowel tracking
116
+ ```
117
+
118
+ #### LetterMapping
119
+ ```python
120
+ @dataclass
121
+ class LetterMapping:
122
+ index: int # Position in word (0-based)
123
+ char: str # Base Arabic character
124
+ phonemes: List[str] # Emitted phonemes (empty = silent)
125
+ diacritic: Optional[str] # "FATHA", "SUKUN", etc.
126
+ has_shaddah: bool # Gemination marker
127
+ extensions: List[ExtensionSymbolMapping] # Dagger alef, maddah, etc.
128
+ other_symbols: List[OtherSymbolMapping] # Tatweel, stop signs
129
+ mapping_type: MappingType # STANDARD, SILENT, ONE_TO_MANY, etc.
130
+ letter_rules: List[str] # Tajweed rules for this letter
131
+ phoneme_rules: List[List[str]] # Rules per emitted phoneme
132
+ ```
133
+
134
+ #### ExtensionSymbolMapping
135
+ ```python
136
+ @dataclass
137
+ class ExtensionSymbolMapping:
138
+ char: str # The extension character (e.g., "ٰ" dagger alef)
139
+ name: str # Identifier: "DAGGER_ALEF", "MADDAH", "MINI_WAW", etc.
140
+ ```
141
+
142
+ #### MaddMapping (Long Vowel Tracking)
143
+ ```python
144
+ @dataclass
145
+ class MaddMapping:
146
+ phoneme_index: int # Index in word's phoneme list
147
+ phoneme: str # Long vowel: "a:", "aˤ:", "u:", "i:"
148
+ letter_index: int # Letter that carries this madd
149
+ vowel_grapheme: str # ا, و, ي, ى, ٰ (source of the vowel)
150
+ has_maddah: bool # Whether ٓ is present
151
+ extension_index: Optional[int] # If vowel is an extension, its index
152
+ madd_type: Optional[str] # 'wajib_muttasil', 'jaiz_munfasil', 'lazim', etc.
153
+ is_lafdh_jalalah: bool # Implicit dagger alef in "Allah"
154
+ is_hamza_fathatan: bool # hamza + fathatan → hamza + fatha + alef
155
+ ```
156
+
157
+ ---
158
+
159
+ ### Recitation App Structures (`recitation_analysis/result.py`)
160
+
161
+ The recitation app transforms phonemizer output into **frozen (immutable)** dataclasses via `ResultBuilder`.
162
+
163
+ #### RecitationResult (top-level container)
164
+ ```python
165
+ @dataclass
166
+ class RecitationResult:
167
+ verse_ref: str
168
+ segment_ref: Optional[str]
169
+ canonical_phonemes: Tuple[str, ...]
170
+ detected_phonemes: Tuple[str, ...] # From ASR
171
+ canonical_words: Tuple[WordData, ...]
172
+ detected_words: Tuple[WordData, ...] # Modified by alignment
173
+ alignment: Tuple[AlignmentEntry, ...]
174
+ madd_mappings: Tuple[MaddMappingData, ...]
175
+ errors: List[RecitationError] # Mutable - added after analysis
176
+ ghunnah_instances: List[GhunnahInstance]
177
+ madd_instances: List[MaddInstance]
178
+ ```
179
+
180
+ #### WordData (frozen)
181
+ ```python
182
+ @dataclass(frozen=True)
183
+ class WordData:
184
+ index: int # Position in verse (0-based)
185
+ location: str # "surah:verse:word"
186
+ letters: Tuple[LetterData, ...] # Letters in the word
187
+ leading_symbols: Tuple[str, ...] # Rare symbols before word
188
+ trailing_symbols: Tuple[str, ...] # Verse markers, etc.
189
+ is_stopping: bool # True if stopping word
190
+
191
+ @property
192
+ def text(self) -> str: # Derived from letters
193
+
194
+ @property
195
+ def phonemes(self) -> Tuple[str, ...]: # All letter phonemes
196
+ ```
197
+
198
+ #### LetterData (frozen)
199
+ ```python
200
+ @dataclass(frozen=True)
201
+ class LetterData:
202
+ index: int # Position in word (0-based)
203
+ char: str # Base Arabic character
204
+ diacritic: Optional[str] # Diacritic name
205
+ diacritic_char: Optional[str] # Diacritic character
206
+ shaddah: bool # Whether shaddah present
207
+ extensions: Tuple[str, ...] # Extension CHARACTERS
208
+ other_symbols: Tuple[str, ...] # Stop signs, etc.
209
+ phonemes: Tuple[str, ...] # Phonemes this letter produces
210
+ letter_rules: Tuple[str, ...] # Tajweed rules
211
+ phoneme_rules: Tuple[Tuple[str, ...], ...] # Per-phoneme rules
212
+ is_silent: bool # True if no phonemes
213
+ name: str # "LAM", "NOON", etc.
214
+ extension_names: Tuple[str, ...] # ("DAGGER_ALEF",)
215
+
216
+ # Insertion tracking (for error highlighting)
217
+ inserted_base: bool
218
+ inserted_diacritic: bool
219
+ inserted_shaddah: bool
220
+ inserted_extensions: Tuple[int, ...]
221
+
222
+ def get_full_text(self) -> str:
223
+ """Builds: base + shaddah + diacritic + extensions + other"""
224
+ ```
225
+
226
+ ---
227
+
228
+ ## Extensions and Vowel Phoneme Production
229
+
230
+ **Key Insight:** Extensions are NOT separate letters with their own phoneme lists. They are symbols attached to a letter that **influence the parent letter's phoneme production**.
231
+
232
+ ### How Extensions Affect Phonemes
233
+
234
+ ```
235
+ ┌─────────────────────────────────────────────────────────────┐
236
+ │ Letter: ل (Lam) │
237
+ │ Diacritic: FATHA (produces "a") │
238
+ │ Extensions: [DAGGER_ALEF] │
239
+ │ │
240
+ │ Phoneme production: │
241
+ │ - Lam base → "l" │
242
+ │ - Fatha + extension → "a:" (long vowel) │
243
+ │ │
244
+ │ Final phonemes: ["l", "a:"] ← stored on LETTER, not ext │
245
+ └─────────────────────────────────────────────────────────────┘
246
+ ```
247
+
248
+ ### Extension Types and Their Effects
249
+
250
+ | Extension Name | Char | Effect on Phonemes |
251
+ |---------------|------|-------------------|
252
+ | `DAGGER_ALEF` | ٰ | Extends fatha: `a` → `a:` |
253
+ | `MADDAH` | ٓ | Marks madd, just visual, usually accompanied by another extension |
254
+ | `MINI_WAW` | ۥ | Extends damma: `u` → `u:` |
255
+ | `MINI_YA_END` | ۦ | Extends kasra: `i` → `i:` |
256
+ | `HAMZA_ABOVE` | ٔ | Adds glottal stop phoneme |
257
+ | `HAMZA_BELOW` | ٕ | Adds glottal stop phoneme |
258
+
259
+ ### Vowel Grapheme Sources
260
+
261
+ Long vowels can come from:
262
+ 1. **Vowel letters**: ا (alef), و (waw), ي (ya), ى (alef maksura)
263
+ 2. **Extensions**: ٰ (dagger alef), ۥ (mini waw), ۦ (mini ya)
264
+
265
+ The `MaddMapping.extension_index` tracks when the vowel comes from an extension:
266
+
267
+ ```python
268
+ # Example: ملٰـٓئِكَة (malaa'ika - angels)
269
+ # The lam has dagger alef extension producing a:
270
+ MaddMapping(
271
+ letter_index=0, # Lam
272
+ phoneme="a:",
273
+ vowel_grapheme="ٰ", # Dagger alef
274
+ extension_index=0, # First extension on the letter
275
+ )
276
+ ```
277
+
278
+ ### Rendering Order
279
+
280
+ When rendering a letter with extensions:
281
+ ```
282
+ base_char + shaddah? + diacritic + extensions + other_symbols
283
+ ل ّ َ ٰ ۟
284
+ ```
285
+
286
+ ---
287
+
288
+ ### Usage
289
+
290
+ ```python
291
+ from recitation_analysis.result_builder import ResultBuilder, get_result_builder
292
+
293
+ # Get singleton builder
294
+ builder = get_result_builder()
295
+
296
+ # Build from phonemizer result
297
+ result = builder.build_from_mapping(
298
+ mapping=phonemizer_result.get_mapping(),
299
+ verse_ref="1:1",
300
+ is_starting_segment=True,
301
+ )
302
+
303
+ # Result contains frozen WordData/LetterData with transforms applied
304
+ for word in result.canonical_words:
305
+ for letter in word.letters:
306
+ print(f"{letter.char} + {letter.extensions} → {letter.phonemes}")
307
+ ```
308
+
309
+ ---
310
+
311
+ ## Phoneme Inventory
312
+
313
+ ### Consonants (Base → Geminated)
314
+
315
+ | Letter | Phoneme | Geminated | Name |
316
+ |--------|---------|-----------|------|
317
+ | ب | `b` | `bb` | Ba |
318
+ | ت | `t` | `tt` | Ta |
319
+ | ث | `θ` | `θθ` | Tha |
320
+ | ج | `ʒ` | `ʒʒ` | Jeem |
321
+ | ح | `ħ` | `ħħ` | Hha |
322
+ | خ | `x` | `xx` | Kha |
323
+ | د | `d` | `dd` | Dal |
324
+ | ذ | `ð` | `ðð` | Thal |
325
+ | ر | `r` | `rr` | Ra |
326
+ | ز | `z` | `zz` | Zain |
327
+ | س | `s` | `ss` | Seen |
328
+ | ش | `ʃ` | `ʃʃ` | Sheen |
329
+ | ص | `sˤ` | `sˤsˤ` | Sad (emphatic) |
330
+ | ض | `dˤ` | `dˤdˤ` | Dad (emphatic) |
331
+ | ط | `tˤ` | `tˤtˤ` | Tta (emphatic) |
332
+ | ظ | `ðˤ` | `ðˤðˤ` | Dtha (emphatic) |
333
+ | ع | `ʕ` | `ʕʕ` | Ain |
334
+ | غ | `ɣ` | - | Ghain |
335
+ | ف | `f` | `ff` | Fa |
336
+ | ق | `q` | `qq` | Qaf |
337
+ | ك | `k` | `kk` | Kaf |
338
+ | ل | `l` | `ll` | Lam |
339
+ | م | `m` | - | Meem |
340
+ | ن | `n` | - | Noon |
341
+ | ه | `h` | `hh` | Ha |
342
+ | و | `w` | `ww` | Waw |
343
+ | ي | `j` | `jj` | Ya |
344
+ | ء | `ʔ` | - | Hamza |
345
+
346
+ ### Vowels
347
+
348
+ | Diacritic | Short | Long | Name |
349
+ |-----------|-------|------|------|
350
+ | فَ | `a` | `a:` | Fatha |
351
+ | فُ | `u` | `u:` | Damma |
352
+ | فِ | `i` | `i:` | Kasra |
353
+ | فً | `an` | - | Fathatan (tanween) |
354
+ | فٌ | `un` | - | Dammatan (tanween) |
355
+ | فٍ | `in` | - | Kasratan (tanween) |
356
+
357
+ ### Emphatic Vowels (after ص ض ط ظ)
358
+
359
+ | Context | Short | Long |
360
+ |---------|-------|------|
361
+ | Emphatic + fatha/alef | `aˤ` | `aˤ:` |
362
+
363
+ ### Tajweed-Specific Phonemes
364
+
365
+ | Phoneme | Rule | Description |
366
+ |---------|------|-------------|
367
+ | `ŋ` | Ikhfaa | Hidden noon/tanween |
368
+ | `m̃` | Idgham | Nasalized meem (ghunnah) |
369
+ | `ñ` | Idgham | Nasalized noon (ghunnah) |
370
+ | `j̃` | Idgham | Nasalized ya (ghunnah) |
371
+ | `w̃` | Idgham | Nasalized waw (ghunnah) |
372
+ | `Q` | Qalqala | Bouncing (mid-word) |
373
+ | `lˤlˤ` | Lam Heavy | Heavy lam in "Allah" |
374
+ | `rˤ` | Ra Heavy | Heavy ra (tafkheem) |
375
+
376
+ ---
377
+
378
+
379
+ ## Searching for Specific Rules
380
+
381
+ The precomputed mappings file is at `data/phonemizer_mappings.json` (118MB, covers surahs 10-114).
382
+
383
+ ### Search for a rule pattern
384
+
385
+ ```bash
386
+ # Find verses containing idgham
387
+ grep -o '"letter_rules":\s*\[[^\]]*idgham[^\]]*\]' data/phonemizer_mappings.json | head -5
388
+
389
+ # Find specific rule type
390
+ grep -o '"rules":\s*\[[^\]]*ikhfaa[^\]]*\]' data/phonemizer_mappings.json | head -5
391
+ ```
392
+
393
+ ### Python search script
394
+
395
+ ```python
396
+ import json
397
+
398
+ with open('data/phonemizer_mappings.json', 'r') as f:
399
+ raw = f.read()
400
+ data = json.loads(json.loads(raw)) if raw.startswith('"') else json.loads(raw)
401
+
402
+ def find_rule(rule_pattern: str, limit: int = 5):
403
+ """Find words containing a specific Tajweed rule."""
404
+ results = []
405
+ for word in data['words']:
406
+ for lm in word.get('letter_mappings', []):
407
+ rules = lm.get('letter_rules', []) + [r for pr in lm.get('phoneme_rules', []) for r in pr]
408
+ if any(rule_pattern.lower() in r.lower() for r in rules):
409
+ results.append({
410
+ 'location': word['location'],
411
+ 'text': word['text'],
412
+ 'char': lm['char'],
413
+ 'phonemes': lm['phonemes'],
414
+ 'rules': rules
415
+ })
416
+ if len(results) >= limit:
417
+ return results
418
+ return results
419
+
420
+ # Example: find idgham rules
421
+ for r in find_rule('idgham', limit=5):
422
+ print(f"{r['location']}: {r['text']} - {r['char']} → {r['phonemes']} ({r['rules']})")
423
+ ```
424
+
425
+ ---
426
+
427
+ ## Text to Unicode Inspection Script
428
+
429
+ To inspect any Arabic text character by character:
430
+
431
+ ```python
432
+ def text_to_unicode_sequence(text: str) -> list:
433
+ """Convert Arabic text to a sequence of Unicode codepoints with names."""
434
+ import unicodedata
435
+ result = []
436
+ for i, char in enumerate(text):
437
+ cp = ord(char)
438
+ try:
439
+ name = unicodedata.name(char)
440
+ except ValueError:
441
+ name = "UNKNOWN"
442
+ result.append({
443
+ 'index': i,
444
+ 'char': char,
445
+ 'codepoint': f"U+{cp:04X}",
446
+ 'decimal': cp,
447
+ 'name': name
448
+ })
449
+ return result
450
+
451
+ def print_unicode_sequence(text: str):
452
+ """Pretty print unicode sequence of text."""
453
+ seq = text_to_unicode_sequence(text)
454
+ print(f"Text: {text}")
455
+ print(f"Length: {len(seq)} codepoints")
456
+ print("-" * 60)
457
+ for item in seq:
458
+ print(f"[{item['index']:2d}] {item['char']!r:6} {item['codepoint']} {item['name']}")
459
+ ```
460
+
461
+ ### Combined: Phonemizer + Unicode Inspection
462
+
463
+ ```python
464
+ import sys
465
+ sys.path.insert(0, "/mnt/c/Users/ahmed/Documents/Uni/Thesis/Code/phonemizer")
466
+ from core.phonemizer import Phonemizer
467
+
468
+ pm = Phonemizer()
469
+ result = pm.phonemize("1:1:1")
470
+
471
+ # Get text and inspect unicode
472
+ text = result.text()
473
+ print_unicode_sequence(text)
474
+
475
+ # Get mapping and correlate
476
+ mapping = result.get_mapping()
477
+ for word in mapping.words:
478
+ print(f"\n=== {word.location}: {word.text} ===")
479
+ print_unicode_sequence(word.text)
480
+ print("\nLetter Mappings:")
481
+ for lm in word.letter_mappings:
482
+ print(f" {lm.char} (U+{ord(lm.char):04X}) → {lm.phonemes}")
483
+ ```
484
+
485
+ ---
486
+
487
+ ## Investigation Workflow
488
+
489
+ When investigating a specific Tajweed rule:
490
+
491
+ 1. **Search mappings file** for the rule pattern:
492
+ ```python
493
+ results = find_rule('qalqala', limit=10)
494
+ ```
495
+
496
+ 2. **Get the verse reference** from the results:
497
+ ```python
498
+ location = results[0]['location'] # e.g., "10:2:7"
499
+ ```
500
+
501
+ 3. **Run phonemizer** on that verse for detailed analysis:
502
+ ```python
503
+ result = pm.phonemize(location)
504
+ mapping = result.get_mapping()
505
+ ```
506
+
507
+ 4. **Inspect unicode** of specific words:
508
+ ```python
509
+ print_unicode_sequence(mapping.words[0].text)
510
+ ```
511
+
512
+ 5. **Examine alignments** to understand phoneme production:
513
+ ```python
514
+ for a in mapping.alignment:
515
+ if 'qalqala' in str(a.rules):
516
+ print(f"Phoneme '{a.phoneme}' from '{a.source_char}' with rules {a.rules}")
517
+ ```
.gitattributes ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Track large binary/data assets with Git LFS
2
+ data/digital_khatt_v2_script.json filter=lfs diff=lfs merge=lfs -text
3
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
4
+ *.otf filter=lfs diff=lfs merge=lfs -text
5
+ *.ttf filter=lfs diff=lfs merge=lfs -text
6
+ *.pkl filter=lfs diff=lfs merge=lfs -text
7
+ data/*.mp3 filter=lfs diff=lfs merge=lfs -text
8
+ data/qpc_hafs.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .env
25
+ .venv
26
+ env/
27
+ venv/
28
+ ENV/
29
+
30
+ # IDE
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Gradio
42
+ flagged/
43
+
44
+ # Exported models
45
+ models/
46
+
47
+ # Test API
48
+ test_api.py
49
+ data/api_result.json
50
+
51
+ CLAUDE.md
52
+ inference_optimization.md
53
+
54
+ docs/
README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Quran Multi-Aligner
3
+ emoji: 🎯
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 6.5.1
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Segment recitations and extract text and word timestamps
11
+ license: mit
12
+ thumbnail: >-
13
+ https://cdn-uploads.huggingface.co/production/uploads/684abe5b6327ae8863d106d2/Rr-R8HNiyJNbaXCE5saU6.png
14
+ ---
15
+
16
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
The diff for this file is too large to render. See raw diff
 
config.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings for the Segments App.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+
7
+ # HF Spaces detection
8
+ IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
9
+
10
+ # Get project root directory
11
+ PROJECT_ROOT = Path(__file__).parent.absolute()
12
+
13
+ # Port for local development
14
+ PORT = 6902
15
+
16
+ # =============================================================================
17
+ # Audio settings
18
+ # =============================================================================
19
+
20
+ RESAMPLE_TYPE = "soxr_lq"
21
+ SEGMENT_AUDIO_DIR = Path("/tmp/segments") # WAV files written here per request
22
+ AUDIO_PRELOAD_COUNT = 5 # First N segments use preload="auto"
23
+ DELETE_CACHE_FREQUENCY = 3600*5 # Gradio cache cleanup interval (seconds)
24
+ DELETE_CACHE_AGE = 3600*5 # Delete cached files older than this (seconds)
25
+
26
+ # =============================================================================
27
+ # Model and data paths
28
+ # =============================================================================
29
+
30
+ # VAD segmenter model
31
+ SEGMENTER_MODEL = "obadx/recitation-segmenter-v2"
32
+
33
+ # Phoneme ASR models (wav2vec2 CTC)
34
+ PHONEME_ASR_MODELS = {
35
+ "Base": "hetchyy/r15_95m",
36
+ "Large": "hetchyy/r7",
37
+ }
38
+ PHONEME_ASR_MODEL_DEFAULT = "Base"
39
+ PHONEME_ASR_MODEL = PHONEME_ASR_MODELS[PHONEME_ASR_MODEL_DEFAULT]
40
+
41
+ DATA_PATH = PROJECT_ROOT / "data"
42
+ SURAH_INFO_PATH = DATA_PATH / "surah_info.json"
43
+
44
+ # Quran script paths
45
+ QURAN_SCRIPT_PATH_COMPUTE = DATA_PATH / "qpc_hafs.json"
46
+ QURAN_SCRIPT_PATH_DISPLAY = DATA_PATH / "qpc_hafs.json"
47
+
48
+ # Pre-built phoneme cache (all 114 chapters)
49
+ PHONEME_CACHE_PATH = DATA_PATH / "phoneme_cache.pkl"
50
+
51
+ # Phoneme n-gram index for anchor detection
52
+ NGRAM_SIZE = 5
53
+ NGRAM_INDEX_PATH = DATA_PATH / f"phoneme_ngram_index_{NGRAM_SIZE}.pkl"
54
+
55
+ # =============================================================================
56
+ # Inference settings
57
+ # =============================================================================
58
+
59
def get_vad_duration(minutes):
    """GPU seconds needed for VAD based on audio minutes.

    Maps the total audio length (in minutes) to the number of GPU
    seconds to reserve, using a descending tier table; anything at
    15 minutes or below falls through to the 5-second floor.
    """
    # (minutes-threshold, gpu-seconds) tiers, checked longest-first.
    tiers = ((180, 60), (120, 40), (60, 25), (30, 15), (15, 10))
    for threshold, gpu_seconds in tiers:
        if minutes > threshold:
            return gpu_seconds
    return 5
73
+
74
def get_asr_duration(minutes, model_name="Base"):
    """GPU seconds needed for ASR based on audio minutes and model size.

    Audio longer than 180 minutes gets a flat 20-second reservation
    regardless of model; otherwise the base reservation (15s above one
    hour, 10s below) is tripled for the "Large" model.
    """
    if minutes > 180:
        return 20
    base = 15 if minutes > 60 else 10
    if model_name != "Large":
        return base
    # Large model is ~3x slower, so scale the GPU reservation accordingly.
    return base * 3
87
+
88
+ # Batching strategy
89
+ BATCHING_STRATEGY = "dynamic" # "naive" (fixed count) or "dynamic" (seconds + pad waste)
90
+
91
+ # Naive batching
92
+ INFERENCE_BATCH_SIZE = 32 # Fixed segments per batch (used when BATCHING_STRATEGY="naive")
93
+
94
+ # Dynamic batching constraints
95
+ MAX_BATCH_SECONDS = 300 # Max total audio seconds per batch (sum of durations)
96
+ MAX_PAD_WASTE = 0.15 # Max fraction of padded tensor that is wasted (0=no waste, 1=all waste)
97
+ MIN_BATCH_SIZE = 8 # Minimum segments per batch (prevents underutilization)
98
+
99
+ # Model precision
100
+ DTYPE = "float16"
101
+ TORCH_COMPILE = True # Apply torch.compile() to GPU models (reduce-overhead mode)
102
+
103
+ # AOTInductor compilation (ZeroGPU optimization)
104
+ AOTI_ENABLED = True # Enable AOT compilation for VAD model on HF Space
105
+ AOTI_MIN_AUDIO_MINUTES = 15 # Min audio duration for dynamic shapes
106
+ AOTI_MAX_AUDIO_MINUTES = 90 # Max audio duration for dynamic shapes
107
+ AOTI_HUB_ENABLED = True # Enable Hub persistence (upload/download compiled models)
108
+ AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti" # Hub repo for compiled model cache
109
+
110
+ # =============================================================================
111
+ # Phoneme-based alignment settings
112
+ # =============================================================================
113
+
114
+ ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
115
+ ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
116
+ ANCHOR_RUN_TRIM_RATIO = 0.15 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
117
+
118
+ # Edit operation costs (Levenshtein hyperparameters)
119
+ COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
120
+ COST_INSERTION = 1.0 # Insert phoneme from reference (R)
121
+ COST_DELETION = 0.8 # Delete phoneme from ASR (P)
122
+
123
+ # Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
124
+ LOOKBACK_WORDS = 15 # Window words to look back from pointer for starting positions
125
+ LOOKAHEAD_WORDS = 10 # Window words to look ahead after expected end position
126
+ MAX_EDIT_DISTANCE = 0.25 # Max normalized edit distance for valid ayah match
127
+ MAX_SPECIAL_EDIT_DISTANCE = 0.35 # Max normalized edit distance for Basmala/Isti'adha detection
128
+ START_PRIOR_WEIGHT = 0.005 # Penalty per word away from expected position
129
+
130
+ # Failed Segments
131
+ RETRY_LOOKBACK_WORDS = 60 # Expanded lookback for retry tier 1+2
132
+ RETRY_LOOKAHEAD_WORDS = 40 # Expanded lookahead for retry tier 1+2
133
+ MAX_EDIT_DISTANCE_RELAXED = 0.45 # Relaxed threshold for retry tier 2
134
+ MAX_CONSECUTIVE_FAILURES = 2 # Re-anchor within surah after this many DP failures
135
+
136
+ # Debug output
137
+ ANCHOR_DEBUG = True # Show detailed n-gram voting info (votes, top candidates)
138
+ PHONEME_ALIGNMENT_DEBUG = True # Show detailed alignment info (R, P, edit costs)
139
+ PHONEME_ALIGNMENT_PROFILING = True # Track and log timing breakdown (DP, window setup, etc.)
140
+
141
+ # =============================================================================
142
+ # Segmentation slider settings
143
+ # =============================================================================
144
+
145
+ # Segmentation presets: (min_silence_ms, min_speech_ms, pad_ms)
146
+ PRESET_MUJAWWAD = (600, 1500, 300) # Slow / Mujawwad recitation
147
+ PRESET_MURATTAL = (200, 1000, 100) # Normal pace (default)
148
+ PRESET_FAST = (75, 750, 40) # Fast recitation
149
+
150
+ # Slider ranges (defaults come from PRESET_MURATTAL)
151
+ MIN_SILENCE_MIN = 25
152
+ MIN_SILENCE_MAX = 1000
153
+ MIN_SILENCE_STEP = 25
154
+
155
+ MIN_SPEECH_MIN = 500
156
+ MIN_SPEECH_MAX = 2000
157
+ MIN_SPEECH_STEP = 250
158
+
159
+ PAD_MIN = 0
160
+ PAD_MAX = 300
161
+ PAD_STEP = 25
162
+
163
+ # =============================================================================
164
+ # Confidence thresholds for color coding
165
+ # =============================================================================
166
+
167
+ CONFIDENCE_HIGH = 0.8 # >= this: Green
168
+ CONFIDENCE_MED = 0.6 # >= this: Yellow, below: Red
169
+ REVIEW_SUMMARY_MAX_SEGMENTS = 15 # Max segment numbers to list before truncating
170
+
171
+ # Undersegmentation detection thresholds
172
+ # Flagged when (word_count >= MIN_WORDS OR ayah_span >= MIN_AYAH_SPAN) AND duration >= MIN_DURATION
173
+ UNDERSEG_MIN_WORDS = 20 # Word count threshold
174
+ UNDERSEG_MIN_AYAH_SPAN = 2 # Ayah span threshold (segment crosses ayah boundary)
175
+ UNDERSEG_MIN_DURATION = 15 # Duration gate (seconds)
176
+
177
+ # =============================================================================
178
+ # MFA forced alignment (word-level timestamps via HF Space)
179
+ # =============================================================================
180
+
181
+ MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
182
+ MFA_TIMEOUT = 120
183
+
184
+ # =============================================================================
185
+ # Usage logging (pushed to HF Hub via ParquetScheduler)
186
+ # =============================================================================
187
+
188
+ USAGE_LOG_DATASET_REPO = "hetchyy/quran-aligner-logs"
189
+ USAGE_LOG_PUSH_INTERVAL_MINUTES = 60
190
+
191
+ # =============================================================================
192
+ # Progress bar settings
193
+ # =============================================================================
194
+
195
+ PROGRESS_PROCESS_AUDIO = {
196
+ "vad_asr": (0.00, "Segmenting and transcribing..."),
197
+ "asr": (0.15, "Running ASR..."),
198
+ "special_segments": (0.50, "Detecting special segments..."),
199
+ "anchor": (0.60, "Anchor detection..."),
200
+ "matching": (0.80, "Text matching..."),
201
+ "building": (0.90, "Building results..."),
202
+ "done": (1.00, "Done!"),
203
+ }
204
+
205
+ PROGRESS_RESEGMENT = {
206
+ "resegment": (0.00, "Resegmenting..."),
207
+ "asr": (0.15, "Running ASR..."),
208
+ "special_segments": (0.50, "Detecting special segments..."),
209
+ "anchor": (0.60, "Anchor detection..."),
210
+ "matching": (0.80, "Text matching..."),
211
+ "building": (0.90, "Building results..."),
212
+ "done": (1.00, "Done!"),
213
+ }
214
+
215
+ PROGRESS_RETRANSCRIBE = {
216
+ "retranscribe": (0.00, "Retranscribing with {model} model..."),
217
+ "asr": (0.15, "Running ASR..."),
218
+ "special_segments": (0.50, "Detecting special segments..."),
219
+ "anchor": (0.60, "Anchor detection..."),
220
+ "matching": (0.80, "Text matching..."),
221
+ "building": (0.90, "Building results..."),
222
+ "done": (1.00, "Done!"),
223
+ }
224
+
225
+ PROGRESS_COMPUTE_TIMESTAMPS = {
226
+ "upload": (0.00, "Uploading audio to MFA space..."),
227
+ "align": (0.20, "Running forced alignment..."),
228
+ "inject": (0.90, "Applying word timestamps..."),
229
+ "done": (1.00, "Done!"),
230
+ }
231
+
232
+ MFA_PROGRESS_SEGMENT_RATE = 0.05 # seconds per segment for progress bar animation
233
+
234
+ # =============================================================================
235
+ # UI settings
236
+ # =============================================================================
237
+
238
+ # Main layout column scales
239
+ LEFT_COLUMN_SCALE = 4
240
+ RIGHT_COLUMN_SCALE = 6
241
+
242
+ # Arabic font stack
243
+ ARABIC_FONT_STACK = "'DigitalKhatt', 'Traditional Arabic'"
244
+
245
+ QURAN_TEXT_SIZE_PX = 24 # Size for Quran text in segment cards
246
+ ARABIC_WORD_SPACING = "0.2em" # Word spacing for Arabic text
247
+
248
+ # =============================================================================
249
+ # Animation settings
250
+ # =============================================================================
251
+
252
+ # Animation granularity
253
+ ANIM_GRANULARITIES = ["Words", "Characters"]
254
+ ANIM_GRANULARITY_DEFAULT = "Words"
255
+
256
+ ANIM_WORD_COLOR = "#49c3b3" # Green highlight for active word
257
+ ANIM_STYLE_ROW_SCALES = (2, 6, 1, 1) # Granularity, Style, Verse Only, Color
258
+
259
+ ANIM_OPACITY_PREV_DEFAULT = 0.3 # Default "before" opacity
260
+ ANIM_OPACITY_AFTER_DEFAULT = 0.3 # Default "after" opacity
261
+ ANIM_OPACITY_STEP = 0.1 # Opacity slider step size
262
+
263
+ # Mega-card text styling sliders
264
+ MEGA_WORD_SPACING_MIN = 0.0
265
+ MEGA_WORD_SPACING_MAX = 1.0
266
+ MEGA_WORD_SPACING_STEP = 0.05
267
+ MEGA_WORD_SPACING_DEFAULT = 0.2 # matches ARABIC_WORD_SPACING
268
+
269
+ MEGA_TEXT_SIZE_MIN = 12
270
+ MEGA_TEXT_SIZE_MAX = 60
271
+ MEGA_TEXT_SIZE_STEP = 2
272
+ MEGA_TEXT_SIZE_DEFAULT = 30 # matches QURAN_TEXT_SIZE_PX
273
+ MEGA_SURAH_LIGATURE_SIZE = 2 # em — surah name ligature font size in megacard
274
+
275
+ MEGA_LINE_SPACING_MIN = 1.5
276
+ MEGA_LINE_SPACING_MAX = 3.0
277
+ MEGA_LINE_SPACING_STEP = 0.1
278
+ MEGA_LINE_SPACING_DEFAULT = 2 # matches mega-card line-height
279
+
280
+ # Window engine settings (all modes use the window engine internally)
281
+ ANIM_WINDOW_PREV_DEFAULT = 4 # Default number of visible previous words/chars
282
+ ANIM_WINDOW_AFTER_DEFAULT = 4 # Default number of visible after words/chars
283
+ ANIM_WINDOW_PREV_MIN = 0
284
+ ANIM_WINDOW_AFTER_MIN = 0
285
+ ANIM_WINDOW_PREV_MAX = 15
286
+ ANIM_WINDOW_AFTER_MAX = 15
287
+
288
+ # Presets map mode names to window engine parameter values
289
+ ANIM_DISPLAY_MODE_DEFAULT = "Reveal"
290
+ ANIM_DISPLAY_MODES = ["Reveal", "Fade", "Spotlight", "Isolate", "Consume", "Custom"]
291
+ ANIM_PRESETS = {
292
+ "Reveal": {
293
+ "prev_opacity": 1.0,
294
+ "prev_words": ANIM_WINDOW_PREV_MAX,
295
+ "after_opacity": 0.0,
296
+ "after_words": 0,
297
+ },
298
+ "Fade": {
299
+ "prev_opacity": 1.0,
300
+ "prev_words": ANIM_WINDOW_PREV_MAX,
301
+ "after_opacity": 0.3,
302
+ "after_words": ANIM_WINDOW_AFTER_MAX,
303
+ },
304
+ "Spotlight": {
305
+ "prev_opacity": 0.3,
306
+ "prev_words": ANIM_WINDOW_PREV_MAX,
307
+ "after_opacity": 0.3,
308
+ "after_words": ANIM_WINDOW_AFTER_MAX,
309
+ },
310
+ "Isolate": {
311
+ "prev_opacity": 0,
312
+ "prev_words": 0,
313
+ "after_opacity": 0,
314
+ "after_words": 0,
315
+ },
316
+ "Consume": {
317
+ "prev_opacity": 0,
318
+ "prev_words": 0,
319
+ "after_opacity": 0.3,
320
+ "after_words": ANIM_WINDOW_AFTER_MAX,
321
+ }
322
+ }
data/112.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe1176e324ffe694ef16825c02f2102cd7f1f0d9bfe66148529aed65e9893fe
3
+ size 432301
data/7.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aef46f287881fbc2f110dbb553877c8e3a0e5ab01ba3289803e512de18bc970b
3
+ size 46313600
data/84.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e801be0312b45546ceea22735a0d9eb4286b3b9a506ba6e98bc8965341ea9ecf
3
+ size 2480855
data/DigitalKhattV2.otf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0935c48269a57c9808e52dfae47864c189396452901c689977156036a72dd217
3
+ size 521832
data/Juz' 30.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9493fd2161f098aa0c846998f6e4d45f8dd08517657cb10653132a2ee228e0b
3
+ size 54459943
data/digital_khatt_v2_script.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd396239908253a07dc530580fd978c515cbac1b7751b2680296580fb62b247c
3
+ size 14832957
data/font_data.py ADDED
The diff for this file is too large to render. See raw diff
 
data/ligatures.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"surah-1":"surah001","surah-2":"surah002","surah-3":"surah003","surah-4":"surah004","surah-5":"surah005","surah-6":"surah006","surah-7":"surah007","surah-8":"surah008","surah-9":"surah009","surah-10":"surah010","surah-11":"surah011","surah-12":"surah012","surah-13":"surah013","surah-14":"surah014","surah-15":"surah015","surah-16":"surah016","surah-17":"surah017","surah-18":"surah018","surah-19":"surah019","surah-20":"surah020","surah-21":"surah021","surah-22":"surah022","surah-23":"surah023","surah-24":"surah024","surah-25":"surah025","surah-26":"surah026","surah-27":"surah027","surah-28":"surah028","surah-29":"surah029","surah-30":"surah030","surah-31":"surah031","surah-32":"surah032","surah-33":"surah033","surah-34":"surah034","surah-35":"surah035","surah-36":"surah036","surah-37":"surah037","surah-38":"surah038","surah-39":"surah039","surah-40":"surah040","surah-41":"surah041","surah-42":"surah042","surah-43":"surah043","surah-44":"surah044","surah-45":"surah045","surah-46":"surah046","surah-47":"surah047","surah-48":"surah048","surah-49":"surah049","surah-50":"surah050","surah-51":"surah051","surah-52":"surah052","surah-53":"surah053","surah-54":"surah054","surah-55":"surah055","surah-56":"surah056","surah-57":"surah057","surah-58":"surah058","surah-59":"surah059","surah-60":"surah060","surah-61":"surah061","surah-62":"surah062","surah-63":"surah063","surah-64":"surah064","surah-65":"surah065","surah-66":"surah066","surah-67":"surah067","surah-68":"surah068","surah-69":"surah069","surah-70":"surah070","surah-71":"surah071","surah-72":"surah072","surah-73":"surah073","surah-74":"surah074","surah-75":"surah075","surah-76":"surah076","surah-77":"surah077","surah-78":"surah078","surah-79":"surah079","surah-80":"surah080","surah-81":"surah081","surah-82":"surah082","surah-83":"surah083","surah-84":"surah084","surah-85":"surah085","surah-86":"surah086","surah-87":"surah087","surah-88":"surah088","surah-89":"surah089","surah-90":"surah090","surah-91":"surah091","sur
ah-92":"surah092","surah-93":"surah093","surah-94":"surah094","surah-95":"surah095","surah-96":"surah096","surah-97":"surah097","surah-98":"surah098","surah-99":"surah099","surah-100":"surah100","surah-101":"surah101","surah-102":"surah102","surah-103":"surah103","surah-104":"surah104","surah-105":"surah105","surah-106":"surah106","surah-107":"surah107","surah-108":"surah108","surah-109":"surah109","surah-110":"surah110","surah-111":"surah111","surah-112":"surah112","surah-113":"surah113","surah-114":"surah114"}
data/phoneme_cache.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:027283ab3be8a239b99ba4b3ffeb869efddc3da6fce12e02473d3e335dbf3a04
3
+ size 7964064
data/phoneme_ngram_index_5.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b522664de41f590fc18fad385f023cb1a85829623cbcf035fe18152be52bc739
3
+ size 6205946
data/phoneme_sub_costs.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_meta": {
3
+ "description": "Phoneme substitution costs for DP alignment. Keys are 'phA|phB' (sorted). Values are float costs (0-1). Default substitution cost is 1.0.",
4
+ "version": 1
5
+ },
6
+
7
+ "consonant_gemination": {
8
+ "b|bb": 0.25,
9
+ "t|tt": 0.25,
10
+ "d|dd": 0.25,
11
+ "f|ff": 0.25,
12
+ "h|hh": 0.25,
13
+ "j|jj": 0.25,
14
+ "k|kk": 0.25,
15
+ "l|ll": 0.25,
16
+ "q|qq": 0.25,
17
+ "r|rr": 0.25,
18
+ "s|ss": 0.25,
19
+ "w|ww": 0.25,
20
+ "x|xx": 0.25,
21
+ "z|zz": 0.25,
22
+ "ð|ðð": 0.25,
23
+ "θ|θθ": 0.25,
24
+ "ħ|ħħ": 0.25,
25
+ "ʃ|ʃʃ": 0.25,
26
+ "ʒ|ʒʒ": 0.25,
27
+ "ʕ|ʕʕ": 0.25,
28
+ "sˤ|sˤsˤ": 0.25,
29
+ "dˤ|dˤdˤ": 0.25,
30
+ "tˤ|tˤtˤ": 0.25,
31
+ "ðˤ|ðˤðˤ": 0.25
32
+ },
33
+
34
+ "emphatic_pairs": {
35
+ "a|aˤ": 0.25,
36
+ "a:|aˤ:": 0.25,
37
+ "r|rˤ": 0.25,
38
+ "rˤ|rˤrˤ": 0.25,
39
+ "r|rˤrˤ": 0.25,
40
+ "l|lˤlˤ": 0.25,
41
+ "ll|lˤlˤ": 0.25
42
+ },
43
+
44
+ "nasal_ghunnah": {
45
+ "m|m̃": 0.25,
46
+ "n|ñ": 0.25,
47
+ "n|ŋ": 0.25,
48
+ "ŋ|ñ": 0.25,
49
+ "ŋ|m̃": 0.25,
50
+ "j|j̃": 0.25,
51
+ "w|w̃": 0.25
52
+ },
53
+
54
+ "vowel_length": {
55
+ "a|a:": 0.25,
56
+ "aˤ|aˤ:": 0.25,
57
+ "a|aˤ:": 0.25,
58
+ "aˤ|a:": 0.25,
59
+ "i|i:": 0.25,
60
+ "u|u:": 0.25
61
+ },
62
+
63
+ "common_subs": {
64
+ "n|l": 0.25,
65
+ "n|m": 0.25
66
+ }
67
+ }
data/qpc_hafs.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b2f91a19769275d0da57464002beacd8cec396b02b520aa14d17e3b135012a7
3
+ size 11596756
data/surah-name-v2.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d4678ef53ef76c361c32c13a9ad26317b8f8219089ab5822aafa6ed5d17502a
3
+ size 580388
data/surah_info.json ADDED
The diff for this file is too large to render. See raw diff
 
docs/api.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API Documentation
2
+
3
+ ## Current Endpoints
4
+
5
+ ### `POST /process_audio_json`
6
+
7
+ Stateless endpoint. Accepts audio and segmentation parameters, returns aligned JSON output.
8
+
9
+ **Inputs:** audio file, min_silence_ms, min_speech_ms, pad_ms, model_name, device
10
+
11
+ **Returns:** JSON with `segments` array (segment index, timestamps, Quran references, matched text, confidence, errors).
12
+
13
+ **Limitation:** Every call requires re-uploading the audio. No way to resegment or retranscribe without re-sending the full file.
14
+
15
+ ---
16
+
17
+ ## Planned: Session-Based Endpoints
18
+
19
+ The Gradio UI already caches intermediate results (preprocessed audio, VAD output, segment boundaries, model name) in `gr.State` so that resegment/retranscribe operations skip expensive steps. But `gr.State` is WebSocket-only — API clients using `gradio_client` can't benefit from this.
20
+
21
+ ### Approach: Server-Side Session Store
22
+
23
+ On the first request, the server stores all intermediate data keyed by a UUID (`audio_id`) and returns it in the response. Subsequent requests reference this `audio_id` instead of re-uploading audio.
24
+
25
+ **What gets stored per session:**
26
+ - Preprocessed audio (float32, 16kHz mono) — saved to disk as `.npy`
27
+ - Raw VAD speech intervals — in memory (small)
28
+ - VAD completeness flags — in memory
29
+ - Cleaned segment boundaries — in memory
30
+ - Model name used — in memory
31
+
32
+ **Lifecycle:** Sessions expire after the same TTL as the existing Gradio cache (5 hours). A background thread purges expired sessions periodically. Audio files live under `/tmp/sessions/{audio_id}/`.
33
+
34
+ ### `POST /process_audio_session`
35
+
36
+ Full pipeline. Same as `/process_audio_json` but additionally creates a server-side session.
37
+
38
+ **Inputs:** audio file, min_silence_ms, min_speech_ms, pad_ms, model_name, device
39
+
40
+ **Returns:** Same JSON as `/process_audio_json` with an added `audio_id` field.
41
+
42
+ ### `POST /resegment_session`
43
+
44
+ Re-cleans VAD boundaries with new segmentation parameters and re-runs ASR. Skips audio upload, preprocessing, and VAD inference.
45
+
46
+ **Inputs:** audio_id, min_silence_ms, min_speech_ms, pad_ms, model_name, device
47
+
48
+ **Returns:** JSON with `segments` array and the same `audio_id`.
49
+
50
+ ### `POST /retranscribe_session`
51
+
52
+ Re-runs ASR with a different model on the existing segment boundaries. Skips audio upload, preprocessing, VAD, and resegmentation.
53
+
54
+ **Inputs:** audio_id, model_name, device
55
+
56
+ **Returns:** JSON with `segments` array and the same `audio_id`.
57
+
58
+ ### `POST /realign_from_timestamps`
59
+
60
+ Accepts an arbitrary list of `(start, end)` timestamp pairs and runs ASR + phoneme alignment on each slice. Skips VAD entirely — the client defines the segment boundaries directly. This is the core endpoint for timeline-based editing where the user drags segment boundaries manually.
61
+
62
+ **Inputs:** audio_id, timestamps (list of `{start, end}` objects in seconds), model_name, device
63
+
64
+ **Returns:** JSON with `segments` array and the same `audio_id`. Session boundaries are updated to match the provided timestamps.
65
+
66
+ Subsumes `/resegment_session` for most client use cases — the client can split, merge, and drag boundaries however they want, then send the final timestamp list in one call.
67
+
68
+ ---
69
+
70
+ ## Planned: Segment Editing Endpoints
71
+
72
+ Fine-grained operations for modifying individual segments without reprocessing the full recitation.
73
+
74
+ ### `POST /split_segment`
75
+
76
+ Split one segment at a given timestamp into two. Re-runs alignment on each half independently.
77
+
78
+ **Inputs:** audio_id, segment_index, split_time (seconds)
79
+
80
+ **Returns:** Updated `segments` array with the split segment replaced by two new segments.
81
+
82
+ ### `POST /merge_segments`
83
+
84
+ Merge two adjacent segments into one. Re-runs alignment on the combined audio slice.
85
+
86
+ **Inputs:** audio_id, segment_index_a, segment_index_b (must be adjacent)
87
+
88
+ **Returns:** Updated `segments` array with the two segments replaced by one.
89
+
90
+ ### `POST /adjust_boundary`
91
+
92
+ Shift a segment's start or end time. Re-runs alignment on the affected segment(s) and its neighbour if boundaries overlap.
93
+
94
+ **Inputs:** audio_id, segment_index, new_start (seconds, optional), new_end (seconds, optional)
95
+
96
+ **Returns:** Updated `segments` array.
97
+
98
+ ### `POST /override_segment_text`
99
+
100
+ Manually assign a Quran reference range to a segment, skipping alignment entirely. For when the aligner gets it wrong and the user knows the correct ayah.
101
+
102
+ **Inputs:** audio_id, segment_index, ref_from (e.g. `"2:255:1"`), ref_to (e.g. `"2:255:7"`)
103
+
104
+ **Returns:** Updated segment with the overridden reference and corresponding Quran text.
105
+
106
+ ### `POST /bulk_update_segments`
107
+
108
+ Batch update: client sends a full modified segment list (adjusted times, overridden labels). Server validates, persists to session, and optionally re-aligns changed segments.
109
+
110
+ **Inputs:** audio_id, segments (list of `{start, end, ref_from?, ref_to?}`), realign (boolean, default true — re-run ASR on segments whose boundaries changed)
111
+
112
+ **Returns:** Full updated `segments` array.
113
+
114
+ ---
115
+
116
+ ## Planned: Word-Level Timing
117
+
118
+ ### `POST /compute_word_timestamps`
119
+
120
+ Compute word-level start/end times for every word in every segment. This is the backbone of karaoke-style highlighting and word-by-word caption animation.
121
+
122
+ **Inputs:** audio_id, model_name, device
123
+
124
+ **Returns:** JSON with per-segment word timestamps:
125
+ ```json
126
+ {
127
+ "audio_id": "...",
128
+ "segments": [
129
+ {
130
+ "segment": 1,
131
+ "words": [
132
+ {"word": "بِسْمِ", "start": 0.81, "end": 1.12},
133
+ {"word": "اللَّهِ", "start": 1.12, "end": 1.45}
134
+ ]
135
+ }
136
+ ]
137
+ }
138
+ ```
139
+
140
+ ---
141
+
142
+ ## Planned: Export Endpoints
143
+
144
+ Generate subtitle files from session data. All accept `audio_id` and optionally use word-level timestamps if previously computed.
145
+
146
+ ### `POST /export_srt`
147
+
148
+ Standard SRT subtitle format. One entry per segment (or per word if `word_level=true`).
149
+
150
+ **Inputs:** audio_id, word_level (boolean, default false)
151
+
152
+ **Returns:** SRT file content.
153
+
154
+ ### `POST /export_vtt`
155
+
156
+ WebVTT format. Supports styling cues and is the standard for web video players.
157
+
158
+ **Inputs:** audio_id, word_level (boolean, default false)
159
+
160
+ **Returns:** VTT file content.
161
+
162
+ ### `POST /export_ass`
163
+
164
+ ASS/SSA format with Arabic font and styling presets. Most useful for video editors producing styled Quran captions.
165
+
166
+ **Inputs:** audio_id, word_level (boolean, default false), font_name (optional), font_size (optional)
167
+
168
+ **Returns:** ASS file content.
169
+
170
+ ---
171
+
172
+ ## Planned: Quran Lookup Endpoints
173
+
174
+ Utility endpoints for client-side UI (dropdowns, search, manual labelling).
175
+
176
+ ### `GET /quran_text`
177
+
178
+ Return Quran text with diacritics for a given reference range.
179
+
180
+ **Inputs:** ref_from (e.g. `"2:255:1"`), ref_to (e.g. `"2:255:7"`)
181
+
182
+ **Returns:** `{"text": "...", "ref_from": "...", "ref_to": "..."}`. All 114 chapters are pre-cached in memory.
183
+
184
+ ### `GET /surah_info`
185
+
186
+ List of all surahs with metadata.
187
+
188
+ **Returns:** Array of `{number, name_arabic, name_english, ayah_count, revelation_type}`.
189
+
190
+ ---
191
+
192
+ ## Planned: Recitation Analytics
193
+
194
+ ### `POST /recitation_stats`
195
+
196
+ Derive pace and timing analytics from an existing session's alignment results.
197
+
198
+ **Inputs:** audio_id
199
+
200
+ **Returns:**
201
+ ```json
202
+ {
203
+ "audio_id": "...",
204
+ "total_duration_sec": 312.5,
205
+ "total_segments": 7,
206
+ "total_words": 86,
207
+ "words_per_minute": 16.5,
208
+ "avg_segment_duration_sec": 8.2,
209
+ "avg_pause_duration_sec": 1.4,
210
+ "per_segment": [
211
+ {
212
+ "segment": 1,
213
+ "ref_from": "112:1:1",
214
+ "ref_to": "112:1:4",
215
+ "duration_sec": 2.18,
216
+ "word_count": 4,
217
+ "words_per_minute": 110.1,
218
+ "pause_after_sec": 1.82
219
+ }
220
+ ]
221
+ }
222
+ ```
223
+
224
+ Useful for learning apps tracking student fluency, reciter comparisons, or detecting rushed/slow sections.
225
+
226
+ ---
227
+
228
+ ## Planned: Streaming
229
+
230
+ ### `POST /process_chunk`
231
+
232
+ Streaming-friendly endpoint for incremental audio processing. The client sends audio chunks as they become available, and the server returns partial alignment results progressively. Designed for live "now playing" displays (e.g. Quran radio showing the current ayah in real time).
233
+
234
+ **Inputs:** audio_id (optional — omit on first chunk to start a new session), audio_chunk (raw audio bytes), is_final (boolean)
235
+
236
+ **Returns:**
237
+ ```json
238
+ {
239
+ "audio_id": "...",
240
+ "status": "partial",
241
+ "latest_segments": [
242
+ {
243
+ "segment": 5,
244
+ "ref_from": "36:1:1",
245
+ "ref_to": "36:1:2",
246
+ "matched_text": "يسٓ",
247
+ "time_from": 24.3,
248
+ "time_to": 25.8,
249
+ "confidence": 0.95
250
+ }
251
+ ]
252
+ }
253
+ ```
254
+
255
+ When `is_final=true`, the server finalises the session and returns the complete aligned output (same structure as `/process_audio_session`).
256
+
257
+ **Chunking notes:** The server buffers audio internally and runs VAD + ASR when enough speech has accumulated to form a segment. Earlier segments are locked in and won't change; only the trailing edge is provisional.
258
+
259
+ ---
260
+
261
+ ## Planned: Health / Status
262
+
263
+ ### `GET /health`
264
+
265
+ Server status for monitoring dashboards and client-side availability checks.
266
+
267
+ **Returns:**
268
+ ```json
269
+ {
270
+ "status": "ok",
271
+ "gpu_available": true,
272
+ "gpu_quota_exhausted": false,
273
+ "quota_reset_time": null,
274
+ "active_sessions": 12,
275
+ "models_loaded": ["Base", "Large"],
276
+ "uptime_sec": 84200
277
+ }
278
+ ```
279
+
280
+ ---
281
+
282
+ ## Error Handling
283
+
284
+ If `audio_id` is missing, expired, or invalid, session endpoints return:
285
+
286
+ ```json
287
+ {"error": "Session not found or expired", "segments": []}
288
+ ```
289
+
290
+ The client should call `/process_audio_session` again to get a fresh session.
291
+
292
+ ---
293
+
294
+ ## Design Notes
295
+
296
+ - **Thread safety:** Gradio handles concurrent requests via threading. The session store uses a lock around its internal dict.
297
+ - **Storage:** Audio on disk (can be large), metadata in memory (always small). Audio loaded via memory-mapped reads on demand.
298
+ - **No auth needed:** Session IDs are 128-bit random UUIDs — effectively unguessable.
299
+ - **HF Spaces compatibility:** `/tmp` is ephemeral and cleared on restart, which is fine since sessions are transient. The existing `allowed_paths=["/tmp"]` covers the new directory.
300
+ - **Backward compatible:** `/process_audio_json` remains unchanged.
docs/usage-logging.md ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Usage Logging
2
+
3
+ ## Part 1 — Reference: `recitation_app` Logging
4
+
5
+ Documents the recitation logging system used in `recitation_app` to collect anonymised analysis data on HuggingFace Hub. Included here as a reference for the quran_aligner schema below.
6
+
7
+ ### Dataset
8
+
9
+ | Property | Value |
10
+ |----------|-------|
11
+ | Repo | `hetchyy/recitation-logs` (private) |
12
+ | Type | HuggingFace Dataset |
13
+ | Format | Parquet files in `data/` |
14
+ | Push interval | 1 minute |
15
+
16
+ Configured in `config.py`:
17
+
18
+ ```python
19
+ USAGE_LOG_DATASET_REPO = "hetchyy/recitation-logs"
20
+ USAGE_LOG_PUSH_INTERVAL_MINUTES = 1
21
+ USAGE_LOG_AUDIO = False # toggleable at runtime
22
+ ```
23
+
24
+ ### Schema
25
+
26
+ Defined in `utils/usage_logger.py` as `_RECITATION_SCHEMA`:
27
+
28
+ | Field | HF Type | Description |
29
+ |-------|---------|-------------|
30
+ | `audio` | `Audio` | Optional FLAC-encoded audio bytes embedded in parquet |
31
+ | `timestamp` | `Value(string)` | ISO 8601 datetime of the analysis |
32
+ | `user_id` | `Value(string)` | SHA-256 hash (12-char) of username or IP+UA |
33
+ | `verse_ref` | `Value(string)` | Quranic reference, e.g. `"1:1"` |
34
+ | `canonical_text` | `Value(string)` | Arabic text of the verse |
35
+ | `segments` | `Value(string)` | JSON array of segment results (see below) |
36
+ | `multi_model` | `Value(bool)` | Whether multiple ASR models were used |
37
+ | `settings` | `Value(string)` | JSON dict of Tajweed settings |
38
+ | `vad_timestamps` | `Value(string)` | JSON list of VAD segment boundaries |
39
+
40
+ #### Segment object (inside `segments` JSON)
41
+
42
+ ```json
43
+ {
44
+ "segment_ref": "1:1",
45
+ "canonical_phonemes": "b i s m i ...",
46
+ "detected_phonemes": "b i s m i ..."
47
+ }
48
+ ```
49
+
50
+ #### Settings object (inside `settings` JSON)
51
+
52
+ ```json
53
+ {
54
+ "tolerance": 0.15,
55
+ "iqlab_sound": "m",
56
+ "ghunnah_length": 2,
57
+ "jaiz_length": 4,
58
+ "wajib_length": 4,
59
+ "arid_length": 2,
60
+ "leen_length": 2
61
+ }
62
+ ```
63
+
64
+ ### ParquetScheduler
65
+
66
+ Custom subclass of `huggingface_hub.CommitScheduler` (`utils/usage_logger.py`).
67
+
68
+ #### How it works
69
+
70
+ 1. **Buffer** — Rows accumulate in an in-memory list via `.append(row)`. Access is protected by a threading lock.
71
+ 2. **Flush** — On each scheduler tick (every `USAGE_LOG_PUSH_INTERVAL_MINUTES`):
72
+ - Lock the buffer, swap it out, release the lock.
73
+ - For any `audio` field containing a file path, read the file and convert to `{"path": filename, "bytes": binary_data}`.
74
+ - Build a PyArrow table from the rows.
75
+ - Embed the HF feature schema in parquet metadata:
76
+ ```python
77
+ table.replace_schema_metadata(
78
+ {"huggingface": json.dumps({"info": {"features": schema}})}
79
+ )
80
+ ```
81
+ - Write to a temp parquet file, then upload via `api.upload_file()` to `data/{uuid4()}.parquet`.
82
+ - Clean up temp audio files.
83
+
84
+ #### Audio encoding
85
+
86
+ When `USAGE_LOG_AUDIO` is enabled:
87
+
88
+ ```python
89
+ sf.write(filepath, audio_array, sample_rate, format="FLAC")
90
+ row["audio"] = str(filepath) # ParquetScheduler reads and embeds the bytes
91
+ ```
92
+
93
+ The audio is 16kHz mono, encoded as FLAC, and stored as embedded bytes inside the parquet file.
94
+
95
+ ### Lazy Initialisation
96
+
97
+ Schedulers are **not** created at import time. They are initialised on first call to `_ensure_schedulers()` using double-checked locking:
98
+
99
+ ```python
100
+ _recitation_scheduler = None
101
+ _schedulers_initialized = False
102
+ _init_lock = threading.Lock()
103
+
104
+ def _ensure_schedulers():
105
+ global _recitation_scheduler, _schedulers_initialized
106
+ if _schedulers_initialized:
107
+ return
108
+ with _init_lock:
109
+ if _schedulers_initialized:
110
+ return
111
+ _schedulers_initialized = True
112
+ _recitation_scheduler = ParquetScheduler(
113
+ repo_id=USAGE_LOG_DATASET_REPO,
114
+ schema=_RECITATION_SCHEMA,
115
+ every=USAGE_LOG_PUSH_INTERVAL_MINUTES,
116
+ path_in_repo="data",
117
+ repo_type="dataset",
118
+ private=True,
119
+ )
120
+ ```
121
+
122
+ This avoids interfering with ZeroGPU, which is sensitive to early network calls.
123
+
124
+ ### Error Logging
125
+
126
+ Errors use a separate `CommitScheduler` (not `ParquetScheduler`) that watches a local directory:
127
+
128
+ - Local path: `/usage_logs/errors/error_log-{uuid4()}.jsonl`
129
+ - Remote path: `data/errors/`
130
+ - Format: JSONL with fields `timestamp`, `user_id`, `verse_ref`, `error_message`
131
+
132
+ Errors are appended to the JSONL file under a file lock. The `CommitScheduler` syncs the directory to Hub periodically.
133
+
134
+ ### User Anonymisation
135
+
136
+ ```python
137
+ def get_user_id(request) -> str:
138
+ username = getattr(request, "username", None)
139
+ if username:
140
+ return hashlib.sha256(username.encode()).hexdigest()[:12]
141
+ ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
142
+ ua = headers.get("user-agent", "")
143
+ return hashlib.sha256(f"{ip}|{ua}".encode()).hexdigest()[:12]
144
+ ```
145
+
146
+ - Logged-in HF users: hash of username
147
+ - Anonymous users: hash of IP + User-Agent
148
+ - Always truncated to 12 hex characters
149
+
150
+ ### Fallback
151
+
152
+ If the scheduler fails to initialise (no HF token, network issues), rows are written to a local JSONL file at `usage_logs/recitations_fallback.jsonl` (without audio).
153
+
154
+ ### Integration Point
155
+
156
+ Logging is called from the audio processing handler (`ui/handlers/audio_processing.py`) after each analysis completes:
157
+
158
+ ```python
159
+ log_analysis(
160
+ user_id, ref, text, segments,
161
+ multi_model=bool(use_multi),
162
+ settings=_settings,
163
+ audio=audio_for_log, # tuple of (sample_rate, np.ndarray) or None
164
+ vad_timestamps=vad_ts, # list of [start, end] pairs
165
+ )
166
+ ```
167
+
168
+ Errors are logged separately:
169
+
170
+ ```python
171
+ log_error(user_id, ref, "Audio loading failed")
172
+ ```
173
+
174
+ ### Dependencies
175
+
176
+ - `huggingface_hub` — `CommitScheduler` base class and Hub API
177
+ - `pyarrow` — Parquet table creation and schema metadata
178
+ - `soundfile` — FLAC audio encoding
179
+
180
+ ---
181
+
182
+ ## Part 2 — `quran_aligner` Logging Schema
183
+
184
+ Schema for logging alignment runs from this project. One row per audio upload. The row is mutated in-place while it sits in the `ParquetScheduler` buffer (before the next push-to-Hub tick). Run-level fields (profiling, reciter stats, quality stats, settings) are **overwritten** to reflect the latest run. Segment results are **appended** so every setting combination is preserved.
185
+
186
+ ### Run-level fields
187
+
188
+ #### Identity
189
+
190
+ | Field | HF Type | Description |
191
+ |-------|---------|-------------|
192
+ | `audio` | `Audio` | FLAC-encoded audio (16kHz mono) |
193
+ | `audio_id` | `Value(string)` | `{sha256(audio_bytes)[:16]}:{timestamp}`, e.g. `a3f7b2c91e04d8f2:20260203T141532` |
194
+ | `timestamp` | `Value(string)` | ISO 8601 datetime truncated to seconds, e.g. `2026-02-03T01:50:45` |
195
+ | `user_id` | `Value(string)` | SHA-256 hash (12-char) of IP+UA |
196
+
197
+ The `audio_id` hash prefix enables grouping/deduplication of the same recording across runs; the timestamp suffix makes each run unique. Cost is ~90ms for a 5-minute recording.
198
+
199
+ #### Input metadata
200
+
201
+ | Field | HF Type | Description |
202
+ |-------|---------|-------------|
203
+ | `audio_duration_s` | `Value(float64)` | Total audio duration in seconds |
204
+ | `num_segments` | `Value(int32)` | Number of VAD segments |
205
+ | `surah` | `Value(int32)` | Detected surah (1-114) |
206
+
207
+ #### Segmentation settings
208
+
209
+ | Field | HF Type | Description |
210
+ |-------|---------|-------------|
211
+ | `min_silence_ms` | `Value(int32)` | Minimum silence duration to split |
212
+ | `min_speech_ms` | `Value(int32)` | Minimum speech duration for a valid segment |
213
+ | `pad_ms` | `Value(int32)` | Padding around speech segments |
214
+ | `asr_model` | `Value(string)` | `"Base"` (`hetchyy/r15_95m`) or `"Large"` (`hetchyy/r7`) |
215
+ | `device` | `Value(string)` | `"GPU"` or `"CPU"` |
216
+
217
+ #### Profiling (seconds)
218
+
219
+ | Field | HF Type | Description |
220
+ |-------|---------|-------------|
221
+ | `total_time` | `Value(float64)` | End-to-end pipeline wall time |
222
+ | `vad_queue_time` | `Value(float64)` | VAD queue wait time |
223
+ | `vad_gpu_time` | `Value(float64)` | VAD actual GPU execution |
224
+ | `asr_gpu_time` | `Value(float64)` | ASR actual GPU execution |
225
+ | `dp_total_time` | `Value(float64)` | Total DP alignment across all segments |
226
+
227
+ #### Quality & retry stats
228
+
229
+ | Field | HF Type | Description |
230
+ |-------|---------|-------------|
231
+ | `segments_passed` | `Value(int32)` | Segments with confidence > 0 |
232
+ | `segments_failed` | `Value(int32)` | Segments with confidence <= 0 |
233
+ | `mean_confidence` | `Value(float64)` | Average confidence across all segments |
234
+ | `tier1_retries` | `Value(int32)` | Expanded-window retry attempts |
235
+ | `tier1_passed` | `Value(int32)` | Successful tier 1 retries |
236
+ | `tier2_retries` | `Value(int32)` | Relaxed-threshold retry attempts |
237
+ | `tier2_passed` | `Value(int32)` | Successful tier 2 retries |
238
+ | `reanchors` | `Value(int32)` | Re-anchor events (after consecutive failures) |
239
+ | `special_merges` | `Value(int32)` | Basmala-fused segments |
240
+
241
+ #### Reciter stats
242
+
243
+ Computed from matched segments (those with `word_count > 0`). Already calculated in `app.py:877-922` for console output.
244
+
245
+ | Field | HF Type | Description |
246
+ |-------|---------|-------------|
247
+ | `words_per_minute` | `Value(float64)` | `total_words / (total_speech_s / 60)` |
248
+ | `phonemes_per_second` | `Value(float64)` | `total_phonemes / total_speech_s` |
249
+ | `avg_segment_duration` | `Value(float64)` | Mean duration of matched segments |
250
+ | `std_segment_duration` | `Value(float64)` | Std dev of matched segment durations |
251
+ | `avg_pause_duration` | `Value(float64)` | Mean inter-segment silence gap |
252
+ | `std_pause_duration` | `Value(float64)` | Std dev of pause durations |
253
+
254
+ #### Session flags
255
+
256
+ | Field | HF Type | Description |
257
+ |-------|---------|-------------|
258
+ | `resegmented` | `Value(bool)` | User resegmented with different VAD settings |
259
+ | `retranscribed` | `Value(bool)` | User retranscribed with a different ASR model |
260
+
261
+ #### Segments, timestamps & error
262
+
263
+ | Field | HF Type | Description |
264
+ |-------|---------|-------------|
265
+ | `segments` | `Value(string)` | JSON array of run objects (see below) — **appended** on resegment/retranscribe |
266
+ | `word_timestamps` | `Value(string)` | JSON array of per-segment MFA word timings (see below), null until computed |
267
+ | `error` | `Value(string)` | Top-level error message if the pipeline failed |
268
+
269
+ ### Segment runs (inside `segments` JSON)
270
+
271
+ Each run with different settings appends a new run object. The array preserves the full history so every setting combination is available.
272
+
273
+ ```json
274
+ [
275
+ {
276
+ "min_silence_ms": 200,
277
+ "min_speech_ms": 1000,
278
+ "pad_ms": 100,
279
+ "asr_model": "Base",
280
+ "segments": [
281
+ {
282
+ "idx": 1,
283
+ "start": 0.512,
284
+ "end": 3.841,
285
+ "duration": 3.329,
286
+ "ref": "2:255:1-2:255:5",
287
+ "confidence": 0.87,
288
+ "word_count": 5,
289
+ "ayah_span": 1,
290
+ "phoneme_count": 42,
291
+ "undersegmented": false,
292
+ "missing_words": false,
293
+ "special_type": null,
294
+ "error": null
295
+ }
296
+ ]
297
+ },
298
+ {
299
+ "min_silence_ms": 600,
300
+ "min_speech_ms": 1500,
301
+ "pad_ms": 300,
302
+ "asr_model": "Base",
303
+ "segments": [...]
304
+ }
305
+ ]
306
+ ```
307
+
308
+ #### Run object
309
+
310
+ | Field | Type | Description |
311
+ |-------|------|-------------|
312
+ | `min_silence_ms` | int | Silence setting used for this run |
313
+ | `min_speech_ms` | int | Speech setting used for this run |
314
+ | `pad_ms` | int | Pad setting used for this run |
315
+ | `asr_model` | string | `"Base"` or `"Large"` |
316
+ | `segments` | array | Per-segment objects for this run |
317
+
318
+ #### Per-segment object
319
+
320
+ | Field | Type | Description |
321
+ |-------|------|-------------|
322
+ | `idx` | int | 1-indexed segment number |
323
+ | `start` | float | Segment start time in seconds |
324
+ | `end` | float | Segment end time in seconds |
325
+ | `duration` | float | `end - start` |
326
+ | `ref` | string | Matched reference `"S:A:W1-S:A:W2"`, empty if failed |
327
+ | `confidence` | float | Alignment confidence [0.0, 1.0] |
328
+ | `word_count` | int | Number of words matched |
329
+ | `ayah_span` | int | Number of ayahs spanned |
330
+ | `phoneme_count` | int | Length of ASR phoneme sequence |
331
+ | `undersegmented` | bool | Flagged if word_count >= 20 or ayah_span >= 2 and duration >= 15s |
332
+ | `missing_words` | bool | Gaps detected in word alignment |
333
+ | `special_type` | string\|null | `"Basmala"`, `"Isti'adha"`, `"Isti'adha+Basmala"`, or null |
334
+ | `error` | string\|null | Per-segment error message |
335
+
336
+ ### Word timestamps (inside `word_timestamps` JSON)
337
+
338
+ Populated when the user computes MFA timestamps. Array of per-segment word timing arrays:
339
+
340
+ ```json
341
+ [
342
+ {
343
+ "segment_idx": 1,
344
+ "ref": "2:255:1-2:255:5",
345
+ "words": [
346
+ {"word": "ٱللَّهُ", "start": 0.512, "end": 0.841},
347
+ {"word": "لَآ", "start": 0.870, "end": 1.023}
348
+ ]
349
+ }
350
+ ]
351
+ ```
352
+
353
+ ### In-place mutation
354
+
355
+ The row dict is appended to `ParquetScheduler` on the initial run, and a reference is stored in `gr.State`. Subsequent actions (resegment, retranscribe, compute timestamps) mutate the dict in-place before the next push-to-Hub tick (every 1 minute).
356
+
357
+ - **Overwritten on each run:** profiling, quality/retry stats, reciter stats, run-level settings (`min_silence_ms`, `asr_model`, etc.), `num_segments`, `surah`.
358
+ - **Appended on each run:** `segments` JSON array gains a new run object with its settings and per-segment results.
359
+ - **Set once:** `word_timestamps` is populated when the user computes MFA timestamps (null until then).
360
+ - **If the push already fired** before a subsequent action, the mutation is a no-op on the already-uploaded row. The new results are lost for that row — acceptable since the initial run is always captured.
361
+
362
+ ### Design rationale
363
+
364
+ - **Settings are denormalised** into each row so config changes can be correlated with quality without joins.
365
+ - **Profiling fields are flat columns**, not nested JSON, so they are directly queryable in the HF dataset viewer and pandas.
366
+ - **Segments are an array of run objects** — each run includes its settings alongside the per-segment results, so different setting combinations are preserved even though run-level fields reflect the latest state.
367
+ - **`mean_confidence` is pre-computed** at the run level for easy filtering and sorting without parsing the segments array.
368
+ - **Audio is always uploaded** as the first column so every run is reproducible and the dataset is playable in the HF viewer.
369
+ - **`audio_id`** combines a content hash with a timestamp — the hash prefix groups re-runs of the same recording, the suffix makes each row unique.
370
+ - **All sources are from existing objects** — `ProfilingData` (segment_processor.py), `SegmentInfo` (segment_processor.py), and `config.py` values. No new computation is required beyond assembling the row.
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=6.5.1
2
+ spaces>=0.44.0
3
+ torch==2.8.0
4
+ transformers==5.0.0
5
+ accelerate==1.11.0
6
+ librosa==0.10.2
7
+ numpy>=1.24.0,<2.0.0
8
+ requests>=2.28.0
9
+ pyarrow>=14.0.0
10
+ soundfile>=0.12.0
11
+ cython>=3.0.0
12
+ recitations_segmenter==1.0.0
13
+ git+https://github.com/Hetchy/Quranic-Phonemizer.git@1b6a8cc
scripts/add_open_tanween.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Replace regular tanween in qpc_hafs.json with open tanween where digital_khatt uses them."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ DATA_DIR = Path(__file__).resolve().parent.parent / "data"
7
+
8
+ OPEN_TO_REGULAR = {
9
+ "\u08F0": "\u064B", # open fathatan → regular fathatan
10
+ "\u08F1": "\u064C", # open dammatan → regular dammatan
11
+ "\u08F2": "\u064D", # open kasratan → regular kasratan
12
+ }
13
+ REGULAR_TO_OPEN = {v: k for k, v in OPEN_TO_REGULAR.items()}
14
+
15
+ def main():
16
+ khatt = json.loads((DATA_DIR / "digital_khatt_v2_script.json").read_text("utf-8"))
17
+ qpc = json.loads((DATA_DIR / "qpc_hafs.json").read_text("utf-8"))
18
+
19
+ counts = {"\u08F0": 0, "\u08F1": 0, "\u08F2": 0}
20
+ mismatches = []
21
+
22
+ for key, khatt_entry in khatt.items():
23
+ if key not in qpc:
24
+ continue
25
+ khatt_text = khatt_entry["text"]
26
+ qpc_text = qpc[key]["text"]
27
+
28
+ for open_char, regular_char in OPEN_TO_REGULAR.items():
29
+ if open_char in khatt_text:
30
+ if regular_char in qpc_text:
31
+ qpc_text = qpc_text.replace(regular_char, open_char)
32
+ counts[open_char] += 1
33
+ else:
34
+ mismatches.append((key, open_char, khatt_text, qpc[key]["text"]))
35
+
36
+ qpc[key]["text"] = qpc_text
37
+
38
+ print("Replacements:")
39
+ for char, count in counts.items():
40
+ name = {"\u08F0": "fathatan", "\u08F1": "dammatan", "\u08F2": "kasratan"}[char]
41
+ print(f" open {name} (U+{ord(char):04X}): {count} words")
42
+ print(f" total: {sum(counts.values())} words")
43
+
44
+ if mismatches:
45
+ print(f"\nMismatches ({len(mismatches)}):")
46
+ for key, char, kt, qt in mismatches[:10]:
47
+ print(f" {key}: khatt has U+{ord(char):04X} but qpc missing regular equivalent")
48
+ print(f" khatt: {kt}")
49
+ print(f" qpc: {qt}")
50
+
51
+ out_path = DATA_DIR / "qpc_hafs.json"
52
+ out_path.write_text(json.dumps(qpc, ensure_ascii=False, indent=2) + "\n", "utf-8")
53
+ print(f"\nSaved to {out_path}")
54
+
55
+
56
+ if __name__ == "__main__":
57
+ main()
scripts/build_phoneme_cache.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Build phoneme cache for all 114 chapters.
3
+
4
+ Phonemizes the entire Quran in a single call and saves per-chapter
5
+ ChapterReference objects to a pickle file for fast loading at runtime.
6
+
7
+ Usage:
8
+ python scripts/build_phoneme_cache.py
9
+ """
10
+
11
+ import pickle
12
+ import sys
13
+ from collections import defaultdict
14
+ from pathlib import Path
15
+
16
+ _project_root = Path(__file__).parent.parent.resolve()
17
+ sys.path.insert(0, str(_project_root))
18
+
19
+ from config import PHONEME_CACHE_PATH
20
+ from src.alignment.phoneme_matcher import ChapterReference, RefWord
21
+ from src.phonemizer_utils import get_phonemizer
22
+
23
+
24
+ def build_all_chapters() -> dict[int, ChapterReference]:
25
+ """Phonemize entire Quran and build all ChapterReference objects."""
26
+ pm = get_phonemizer()
27
+
28
+ print("Phonemizing entire Quran (1-114)...")
29
+ result = pm.phonemize(ref="1-114", stops=["verse"])
30
+
31
+ words = result._words
32
+ nested = result._nested
33
+ print(f"Total words: {len(words)}")
34
+
35
+ # Group by surah
36
+ surah_words: dict[int, list[RefWord]] = defaultdict(list)
37
+ for word, phonemes in zip(words, nested):
38
+ loc = word.location
39
+ surah_words[loc.surah_num].append(RefWord(
40
+ text=word.text,
41
+ phonemes=phonemes,
42
+ surah=loc.surah_num,
43
+ ayah=loc.ayah_num,
44
+ word_num=loc.word_num,
45
+ ))
46
+
47
+ # Build ChapterReference for each surah
48
+ chapters: dict[int, ChapterReference] = {}
49
+ for surah_num in sorted(surah_words):
50
+ ref_words = surah_words[surah_num]
51
+
52
+ total_phones = sum(len(w.phonemes) for w in ref_words)
53
+ avg_phones_per_word = total_phones / len(ref_words) if ref_words else 4.0
54
+
55
+ flat_phonemes = []
56
+ flat_phone_to_word = []
57
+ word_phone_offsets = []
58
+
59
+ for word_idx, word in enumerate(ref_words):
60
+ word_phone_offsets.append(len(flat_phonemes))
61
+ for ph in word.phonemes:
62
+ flat_phonemes.append(ph)
63
+ flat_phone_to_word.append(word_idx)
64
+
65
+ # Sentinel offset
66
+ word_phone_offsets.append(len(flat_phonemes))
67
+
68
+ chapters[surah_num] = ChapterReference(
69
+ surah=surah_num,
70
+ words=ref_words,
71
+ avg_phones_per_word=avg_phones_per_word,
72
+ flat_phonemes=flat_phonemes,
73
+ flat_phone_to_word=flat_phone_to_word,
74
+ word_phone_offsets=word_phone_offsets,
75
+ )
76
+
77
+ print(f"Built {len(chapters)} chapter references")
78
+ return chapters
79
+
80
+
81
+ def main():
82
+ chapters = build_all_chapters()
83
+
84
+ output_path = Path(PHONEME_CACHE_PATH)
85
+ output_path.parent.mkdir(parents=True, exist_ok=True)
86
+
87
+ with open(output_path, "wb") as f:
88
+ pickle.dump(chapters, f, protocol=pickle.HIGHEST_PROTOCOL)
89
+
90
+ print(f"Saved to {output_path}")
91
+ print(f"File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")
92
+
93
+
94
+ if __name__ == "__main__":
95
+ main()
scripts/build_phoneme_ngram_index.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Build phoneme n-gram index for the entire Quran.
3
+
4
+ Usage:
5
+ python scripts/build_phoneme_ngram_index.py
6
+ python scripts/build_phoneme_ngram_index.py --ngram-size 4
7
+ """
8
+
9
+ import argparse
10
+ import pickle
11
+ import sys
12
+ from collections import defaultdict
13
+ from pathlib import Path
14
+ from typing import Dict, List, Tuple
15
+
16
+ from tqdm import tqdm
17
+
18
+ # Add project root to path
19
+ _project_root = Path(__file__).parent.parent.resolve()
20
+ sys.path.insert(0, str(_project_root))
21
+
22
+ from config import NGRAM_SIZE, NGRAM_INDEX_PATH
23
+ from src.alignment.ngram_index import PhonemeNgramIndex
24
+ from src.phonemizer_utils import get_phonemizer
25
+
26
+
27
+ def build_index(ngram_size: int) -> PhonemeNgramIndex:
28
+ """Build the phoneme n-gram index from the entire Quran."""
29
+ pm = get_phonemizer()
30
+
31
+ print("Phonemizing entire Quran (surahs 1-114)...")
32
+ result = pm.phonemize(ref="1-114", stops=["verse"])
33
+
34
+ # Use _words (locations) + _nested (phonemes) directly — avoids slow get_mapping()
35
+ words = result._words
36
+ nested = result._nested
37
+ print(f"Total words from phonemizer: {len(words)}")
38
+
39
+ # Group phonemes by (surah, ayah) from word locations
40
+ verse_phonemes: Dict[Tuple[int, int], List[str]] = defaultdict(list)
41
+ for word, phonemes in tqdm(zip(words, nested), total=len(words), desc="Grouping words by verse"):
42
+ if not phonemes:
43
+ continue
44
+ loc = word.location
45
+ verse_phonemes[(loc.surah_num, loc.ayah_num)].extend(phonemes)
46
+
47
+ print(f"Total verses: {len(verse_phonemes)}")
48
+
49
+ # Extract n-grams per verse
50
+ ngram_positions: Dict[Tuple[str, ...], List[Tuple[int, int]]] = defaultdict(list)
51
+ total_ngrams = 0
52
+
53
+ for (surah, ayah), phonemes in tqdm(verse_phonemes.items(), desc="Building n-grams"):
54
+ if len(phonemes) < ngram_size:
55
+ continue
56
+ for i in range(len(phonemes) - ngram_size + 1):
57
+ ng = tuple(phonemes[i : i + ngram_size])
58
+ ngram_positions[ng].append((surah, ayah))
59
+ total_ngrams += 1
60
+
61
+ # Build counts
62
+ ngram_counts: Dict[Tuple[str, ...], int] = {
63
+ ng: len(positions) for ng, positions in ngram_positions.items()
64
+ }
65
+
66
+ print(f"Total n-gram occurrences: {total_ngrams}")
67
+ print(f"Unique n-grams: {len(ngram_positions)}")
68
+ if ngram_counts:
69
+ min_count = min(ngram_counts.values())
70
+ max_count = max(ngram_counts.values())
71
+ print(f"Count range: {min_count} - {max_count}")
72
+
73
+ return PhonemeNgramIndex(
74
+ ngram_positions=dict(ngram_positions),
75
+ ngram_counts=ngram_counts,
76
+ ngram_size=ngram_size,
77
+ total_ngrams=total_ngrams,
78
+ )
79
+
80
+
81
+ def main():
82
+ parser = argparse.ArgumentParser(description="Build phoneme n-gram index for Quran")
83
+ parser.add_argument(
84
+ "--ngram-size",
85
+ type=int,
86
+ default=NGRAM_SIZE,
87
+ help=f"N-gram size (default: {NGRAM_SIZE})",
88
+ )
89
+ parser.add_argument(
90
+ "--output",
91
+ type=str,
92
+ default=str(NGRAM_INDEX_PATH),
93
+ help=f"Output path (default: {NGRAM_INDEX_PATH})",
94
+ )
95
+ args = parser.parse_args()
96
+
97
+ index = build_index(args.ngram_size)
98
+
99
+ output_path = Path(args.output)
100
+ output_path.parent.mkdir(parents=True, exist_ok=True)
101
+
102
+ with open(output_path, "wb") as f:
103
+ pickle.dump(index, f, protocol=pickle.HIGHEST_PROTOCOL)
104
+
105
+ print(f"Saved index to {output_path}")
106
+ print(f"File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")
107
+
108
+
109
+ if __name__ == "__main__":
110
+ main()
scripts/export_onnx.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export phoneme ASR models to optimized ONNX format and upload to HF Hub.
2
+
3
+ Usage:
4
+ pip install optimum[onnxruntime] onnxruntime
5
+ HF_TOKEN=... python scripts/export_onnx.py
6
+ HF_TOKEN=... python scripts/export_onnx.py --quantize # + dynamic INT8
7
+ HF_TOKEN=... python scripts/export_onnx.py --models Base # single model
8
+
9
+ Exports fp32 ONNX with ORT graph optimizations baked in.
10
+ Optionally applies dynamic INT8 quantization for CPU inference.
11
+
12
+ Note: ORTOptimizer's transformer-specific fusions (attention, LayerNorm, GELU)
13
+ do NOT support wav2vec2. We use ORT's general graph optimizations instead
14
+ (constant folding, redundant node elimination, common subexpression elimination).
15
+ Runtime ORT_ENABLE_ALL adds further optimizations at session load time.
16
+ """
17
+
18
+ import argparse
19
+ import os
20
+ import shutil
21
+ import sys
22
+ from pathlib import Path
23
+
24
+ # Add project root to path
25
+ sys.path.insert(0, str(Path(__file__).parent.parent))
26
+
27
+ from config import PHONEME_ASR_MODELS
28
+
29
+
30
+ def get_hf_token():
31
+ """Get HF token from env or cached login."""
32
+ hf_token = os.environ.get("HF_TOKEN")
33
+ if not hf_token:
34
+ try:
35
+ from huggingface_hub import HfFolder
36
+ hf_token = HfFolder.get_token()
37
+ except Exception:
38
+ pass
39
+ if not hf_token:
40
+ print("WARNING: No HF token found. Set HF_TOKEN env var or run `huggingface-cli login`.")
41
+ return hf_token
42
+
43
+
44
+ def _optimize_graph(model_path: Path):
45
+ """Apply general ORT graph optimizations (no transformer-specific fusions).
46
+
47
+ Bakes constant folding, redundant node elimination, and common subexpression
48
+ elimination into the model file so they don't need to run at session load time.
49
+ """
50
+ import onnxruntime as ort
51
+
52
+ model_file = str(model_path / "model.onnx")
53
+ optimized_file = str(model_path / "model_optimized.onnx")
54
+
55
+ sess_options = ort.SessionOptions()
56
+ sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
57
+ sess_options.optimized_model_filepath = optimized_file
58
+
59
+ # Create session just to trigger optimization and save
60
+ ort.InferenceSession(model_file, sess_options, providers=["CPUExecutionProvider"])
61
+
62
+ # Replace original with optimized
63
+ os.replace(optimized_file, model_file)
64
+
65
+
66
+ def export_model(model_name: str, model_path: str, output_dir: Path, hf_token: str,
67
+ quantize: bool = False):
68
+ """Export a single model to optimized fp32 ONNX and push to HF Hub."""
69
+ from optimum.onnxruntime import ORTModelForCTC
70
+
71
+ print(f"\n{'='*60}")
72
+ print(f"Exporting '{model_name}' ({model_path})")
73
+ print(f"{'='*60}")
74
+
75
+ # Clean output dir for fresh export
76
+ if output_dir.exists():
77
+ shutil.rmtree(output_dir)
78
+ output_dir.mkdir(parents=True)
79
+
80
+ # Step 1: Export to ONNX (fp32)
81
+ print(f" [1/5] Exporting fp32 ONNX...")
82
+ model = ORTModelForCTC.from_pretrained(model_path, export=True, token=hf_token)
83
+ model.save_pretrained(output_dir)
84
+ print(f" Saved to {output_dir}")
85
+
86
+ # Step 2: Apply general ORT graph optimizations
87
+ # (wav2vec2 is not supported by ORTOptimizer's transformer-specific fusions,
88
+ # so we use ORT's built-in graph optimizations directly)
89
+ print(f" [2/5] Applying ORT graph optimizations...")
90
+ _optimize_graph(output_dir)
91
+ print(f" Graph optimization complete")
92
+
93
+ # Step 3: Optional dynamic INT8 quantization
94
+ model_file = output_dir / "model.onnx"
95
+ if quantize:
96
+ print(f" [3/5] Applying dynamic INT8 quantization (avx2)...")
97
+ from onnxruntime.quantization import QuantType, quantize_dynamic
98
+
99
+ quantized_file = output_dir / "model_quantized.onnx"
100
+ quantize_dynamic(
101
+ model_input=str(model_file),
102
+ model_output=str(quantized_file),
103
+ weight_type=QuantType.QInt8,
104
+ )
105
+ # Replace original with quantized
106
+ os.replace(str(quantized_file), str(model_file))
107
+ print(f" INT8 quantization complete")
108
+ else:
109
+ print(f" [3/5] Skipping quantization (use --quantize to enable)")
110
+
111
+ # Step 4: Verify with dummy forward pass
112
+ print(f" [4/5] Verifying model...")
113
+ import numpy as np
114
+ import onnxruntime as ort
115
+
116
+ sess = ort.InferenceSession(str(model_file), providers=["CPUExecutionProvider"])
117
+ input_info = sess.get_inputs()[0]
118
+ print(f" Input: name={input_info.name}, type={input_info.type}, shape={input_info.shape}")
119
+ dummy = np.random.randn(1, 16000).astype(np.float32)
120
+ out = sess.run(None, {"input_values": dummy})
121
+ print(f" Output shape: {out[0].shape} (dtype={out[0].dtype})")
122
+ del sess, dummy, out
123
+
124
+ # Step 5: Push to HF Hub
125
+ print(f" [5/5] Uploading to HF Hub...")
126
+ from huggingface_hub import HfApi
127
+
128
+ repo_name = model_path.split("/")[-1]
129
+ hub_repo = f"hetchyy/{repo_name}-onnx"
130
+ api = HfApi(token=hf_token)
131
+ api.create_repo(repo_id=hub_repo, repo_type="model", private=True, exist_ok=True)
132
+ api.upload_folder(folder_path=str(output_dir), repo_id=hub_repo, repo_type="model")
133
+ print(f" Pushed to {hub_repo}")
134
+
135
+
136
+ def main():
137
+ parser = argparse.ArgumentParser(description="Export phoneme ASR models to optimized ONNX")
138
+ parser.add_argument("--quantize", action="store_true",
139
+ help="Apply dynamic INT8 quantization after graph optimization")
140
+ parser.add_argument("--models", nargs="+", choices=list(PHONEME_ASR_MODELS.keys()),
141
+ default=list(PHONEME_ASR_MODELS.keys()),
142
+ help="Which models to export (default: all)")
143
+ args = parser.parse_args()
144
+
145
+ hf_token = get_hf_token()
146
+
147
+ models_dir = Path(__file__).parent.parent / "models"
148
+ models_dir.mkdir(exist_ok=True)
149
+
150
+ for name in args.models:
151
+ path = PHONEME_ASR_MODELS[name]
152
+ output_dir = models_dir / f"onnx_{name}"
153
+ export_model(name, path, output_dir, hf_token, quantize=args.quantize)
154
+
155
+ suffix = " + INT8 quantized" if args.quantize else ""
156
+ print(f"\nDone. ONNX fp32 optimized{suffix} models exported and uploaded.")
157
+
158
+
159
+ if __name__ == "__main__":
160
+ main()
scripts/fix_stop_sign_spacing.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fix stop sign spacing in qpc_hafs.json.
4
+
5
+ The DigitalKhatt font expects stop signs (U+06D6 to U+06DB) as combining marks
6
+ directly attached to words, but the current data has spaces before them.
7
+
8
+ This script removes spaces before stop signs:
9
+ 'رَيْبَ ۛ' → 'رَيْبَۛ'
10
+ """
11
+
12
+ import json
13
+ import re
14
+ from pathlib import Path
15
+
16
+ # Stop sign characters (U+06D6 to U+06DB)
17
+ STOP_SIGNS = '\u06D6\u06D7\u06D8\u06D9\u06DA\u06DB'
18
+
19
+ # Pattern to match space followed by stop sign
20
+ SPACE_BEFORE_STOP_PATTERN = re.compile(f' ([{STOP_SIGNS}])')
21
+
22
+
23
+ def fix_stop_sign_spacing(text: str) -> str:
24
+ """Remove spaces before stop signs."""
25
+ return SPACE_BEFORE_STOP_PATTERN.sub(r'\1', text)
26
+
27
+
28
+ def main():
29
+ data_path = Path(__file__).parent.parent / 'data' / 'qpc_hafs.json'
30
+
31
+ print(f"Loading {data_path}...")
32
+ with open(data_path, 'r', encoding='utf-8') as f:
33
+ data = json.load(f)
34
+
35
+ modified_count = 0
36
+
37
+ for key, entry in data.items():
38
+ if 'text' in entry:
39
+ original = entry['text']
40
+ fixed = fix_stop_sign_spacing(original)
41
+ if fixed != original:
42
+ entry['text'] = fixed
43
+ modified_count += 1
44
+ if modified_count <= 5: # Show first 5 examples
45
+ print(f" {key}: {repr(original)} → {repr(fixed)}")
46
+
47
+ if modified_count > 5:
48
+ print(f" ... and {modified_count - 5} more entries")
49
+
50
+ print(f"\nModified {modified_count} entries")
51
+
52
+ print(f"Saving to {data_path}...")
53
+ with open(data_path, 'w', encoding='utf-8') as f:
54
+ json.dump(data, f, ensure_ascii=False, indent=2)
55
+
56
+ print("Done!")
57
+
58
+
59
+ if __name__ == '__main__':
60
+ main()
setup.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build Cython extensions (DP alignment core)."""
2
+
3
+ from setuptools import setup, Extension
4
+ from Cython.Build import cythonize
5
+
6
+ extensions = [
7
+ Extension(
8
+ "src._dp_core",
9
+ ["src/_dp_core.pyx"],
10
+ ),
11
+ ]
12
+
13
+ setup(
14
+ ext_modules=cythonize(extensions, language_level="3"),
15
+ )
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Core processing module
src/_dp_core.pyx ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cython: boundscheck=False, wraparound=False, cdivision=True
2
+ """
3
+ Cython-accelerated word-boundary-constrained substring Levenshtein DP.
4
+
5
+ Drop-in replacement for the pure-Python align_with_word_boundaries() in
6
+ phoneme_matcher.py. Callers still pass plain Python lists of strings;
7
+ encoding to integer arrays happens inside this module.
8
+ """
9
+
10
+ from libc.stdlib cimport malloc, free
11
+ from libc.math cimport INFINITY, fabs
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Phoneme → integer encoding (built lazily on first call)
15
+ # ---------------------------------------------------------------------------
16
+
17
+ cdef dict _phoneme_to_id = {}
18
+ cdef int _num_phonemes = 0
19
+ cdef double *_sub_matrix = NULL # flat _num_phonemes × _num_phonemes
20
+ cdef double _default_sub = 1.0
21
+
22
+
23
+ cdef int _encode_phoneme(str p):
24
+ """Return integer id for *p*, assigning a new one if unseen."""
25
+ global _num_phonemes
26
+ cdef int pid
27
+ try:
28
+ pid = _phoneme_to_id[p]
29
+ except KeyError:
30
+ pid = _num_phonemes
31
+ _phoneme_to_id[p] = pid
32
+ _num_phonemes += 1
33
+ return pid
34
+
35
+
36
+ def init_substitution_matrix(dict sub_costs, double default_sub):
37
+ """Build the dense substitution-cost matrix from the Python dict.
38
+
39
+ Must be called once before the first DP call (phoneme_matcher.py does
40
+ this at import time).
41
+
42
+ Parameters
43
+ ----------
44
+ sub_costs : dict[(str, str), float]
45
+ Phoneme-pair substitution costs (both orderings already present).
46
+ default_sub : float
47
+ Cost used for pairs not in *sub_costs*.
48
+ """
49
+ global _sub_matrix, _default_sub, _num_phonemes
50
+
51
+ _default_sub = default_sub
52
+
53
+ # First pass: make sure every phoneme in sub_costs has an id
54
+ for (a, b) in sub_costs:
55
+ _encode_phoneme(a)
56
+ _encode_phoneme(b)
57
+
58
+ # Allocate matrix (will be re-allocated if new phonemes appear later)
59
+ _rebuild_matrix(sub_costs)
60
+
61
+
62
+ cdef void _rebuild_matrix(dict sub_costs):
63
+ """(Re)allocate and fill the dense cost matrix."""
64
+ global _sub_matrix, _num_phonemes, _default_sub
65
+
66
+ cdef int size = _num_phonemes
67
+ cdef int i, j
68
+
69
+ if _sub_matrix != NULL:
70
+ free(_sub_matrix)
71
+
72
+ _sub_matrix = <double *>malloc(size * size * sizeof(double))
73
+ if _sub_matrix == NULL:
74
+ raise MemoryError("Failed to allocate substitution matrix")
75
+
76
+ # Fill with default
77
+ for i in range(size * size):
78
+ _sub_matrix[i] = _default_sub
79
+
80
+ # Diagonal = 0 (match)
81
+ for i in range(size):
82
+ _sub_matrix[i * size + i] = 0.0
83
+
84
+ # Overrides from dict
85
+ cdef int aid, bid
86
+ cdef double cost
87
+ for (a, b), cost in sub_costs.items():
88
+ aid = _phoneme_to_id.get(a, -1)
89
+ bid = _phoneme_to_id.get(b, -1)
90
+ if aid >= 0 and bid >= 0:
91
+ _sub_matrix[aid * size + bid] = cost
92
+
93
+
94
+ cdef inline double _get_sub_cost(int pid, int rid, int size) nogil:
95
+ """Look up substitution cost from the dense matrix."""
96
+ if pid == rid:
97
+ return 0.0
98
+ if pid < size and rid < size:
99
+ return _sub_matrix[pid * size + rid]
100
+ return _default_sub
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # Main DP function
105
+ # ---------------------------------------------------------------------------
106
+
107
+ def cy_align_with_word_boundaries(
108
+ list P_list,
109
+ list R_list,
110
+ list R_phone_to_word_list,
111
+ int expected_word,
112
+ double prior_weight,
113
+ double cost_sub,
114
+ double cost_del,
115
+ double cost_ins,
116
+ ):
117
+ """Word-boundary-constrained substring alignment (Cython).
118
+
119
+ Identical semantics to the pure-Python version. Returns the same
120
+ (best_j, best_j_start, best_cost, best_norm_dist) tuple, with
121
+ ``(None, None, INF, INF)`` on failure.
122
+ """
123
+ cdef int m = len(P_list)
124
+ cdef int n = len(R_list)
125
+ cdef double INF_VAL = INFINITY
126
+
127
+ if m == 0 or n == 0:
128
+ return (None, None, float('inf'), float('inf'))
129
+
130
+ # ------------------------------------------------------------------
131
+ # Encode string lists → C arrays
132
+ # ------------------------------------------------------------------
133
+ cdef int *P_ids = <int *>malloc(m * sizeof(int))
134
+ cdef int *R_ids = <int *>malloc(n * sizeof(int))
135
+ cdef int *R_w = <int *>malloc(n * sizeof(int))
136
+ if P_ids == NULL or R_ids == NULL or R_w == NULL:
137
+ if P_ids != NULL: free(P_ids)
138
+ if R_ids != NULL: free(R_ids)
139
+ if R_w != NULL: free(R_w)
140
+ raise MemoryError()
141
+
142
+ cdef int i, j
143
+ cdef bint need_rebuild = False
144
+
145
+ for i in range(m):
146
+ p = P_list[i]
147
+ if p not in _phoneme_to_id:
148
+ _encode_phoneme(p)
149
+ need_rebuild = True
150
+ P_ids[i] = _phoneme_to_id[p]
151
+
152
+ for j in range(n):
153
+ r = R_list[j]
154
+ if r not in _phoneme_to_id:
155
+ _encode_phoneme(r)
156
+ need_rebuild = True
157
+ R_ids[j] = _phoneme_to_id[r]
158
+ R_w[j] = <int>R_phone_to_word_list[j]
159
+
160
+ # If new phonemes appeared, rebuild the matrix so ids are covered
161
+ if need_rebuild and _sub_matrix != NULL:
162
+ # We need the original sub_costs dict, but we don't have it here.
163
+ # The safest approach: expand matrix with defaults for new phonemes.
164
+ _grow_matrix()
165
+
166
+ cdef int mat_size = _num_phonemes
167
+
168
+ # ------------------------------------------------------------------
169
+ # Precompute boundary flags
170
+ # ------------------------------------------------------------------
171
+ cdef char *start_boundary = <char *>malloc((n + 1) * sizeof(char))
172
+ cdef char *end_boundary = <char *>malloc((n + 1) * sizeof(char))
173
+ if start_boundary == NULL or end_boundary == NULL:
174
+ free(P_ids); free(R_ids); free(R_w)
175
+ if start_boundary != NULL: free(start_boundary)
176
+ if end_boundary != NULL: free(end_boundary)
177
+ raise MemoryError()
178
+
179
+ # start_boundary[j]: can alignment start at column j?
180
+ start_boundary[0] = 1 # column 0 always valid
181
+ for j in range(1, n):
182
+ start_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0
183
+ start_boundary[n] = 0 # can't start at or past end
184
+
185
+ # end_boundary[j]: can alignment end at column j?
186
+ end_boundary[0] = 0 # can't end before consuming anything
187
+ for j in range(1, n):
188
+ end_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0
189
+ end_boundary[n] = 1 # end of reference always valid
190
+
191
+ # ------------------------------------------------------------------
192
+ # DP arrays (two-row rolling)
193
+ # ------------------------------------------------------------------
194
+ cdef double *prev_cost = <double *>malloc((n + 1) * sizeof(double))
195
+ cdef double *curr_cost = <double *>malloc((n + 1) * sizeof(double))
196
+ cdef int *prev_start = <int *>malloc((n + 1) * sizeof(int))
197
+ cdef int *curr_start = <int *>malloc((n + 1) * sizeof(int))
198
+ if (prev_cost == NULL or curr_cost == NULL or
199
+ prev_start == NULL or curr_start == NULL):
200
+ free(P_ids); free(R_ids); free(R_w)
201
+ free(start_boundary); free(end_boundary)
202
+ if prev_cost != NULL: free(prev_cost)
203
+ if curr_cost != NULL: free(curr_cost)
204
+ if prev_start != NULL: free(prev_start)
205
+ if curr_start != NULL: free(curr_start)
206
+ raise MemoryError()
207
+
208
+ # Initialise row 0
209
+ for j in range(n + 1):
210
+ if start_boundary[j]:
211
+ prev_cost[j] = 0.0
212
+ prev_start[j] = j
213
+ else:
214
+ prev_cost[j] = INF_VAL
215
+ prev_start[j] = -1
216
+
217
+ # ------------------------------------------------------------------
218
+ # Core DP loop (no Python objects touched → runs at C speed)
219
+ # ------------------------------------------------------------------
220
+ cdef double del_option, ins_option, sub_option, sc
221
+ cdef double *tmp_d
222
+ cdef int *tmp_i
223
+ cdef bint col0_start = start_boundary[0]
224
+
225
+ for i in range(1, m + 1):
226
+ if col0_start:
227
+ curr_cost[0] = i * cost_del
228
+ curr_start[0] = 0
229
+ else:
230
+ curr_cost[0] = INF_VAL
231
+ curr_start[0] = -1
232
+
233
+ for j in range(1, n + 1):
234
+ del_option = prev_cost[j] + cost_del
235
+ ins_option = curr_cost[j - 1] + cost_ins
236
+ sc = _get_sub_cost(P_ids[i - 1], R_ids[j - 1], mat_size)
237
+ sub_option = prev_cost[j - 1] + sc
238
+
239
+ if sub_option <= del_option and sub_option <= ins_option:
240
+ curr_cost[j] = sub_option
241
+ curr_start[j] = prev_start[j - 1]
242
+ elif del_option <= ins_option:
243
+ curr_cost[j] = del_option
244
+ curr_start[j] = prev_start[j]
245
+ else:
246
+ curr_cost[j] = ins_option
247
+ curr_start[j] = curr_start[j - 1]
248
+
249
+ # Swap rows
250
+ tmp_d = prev_cost; prev_cost = curr_cost; curr_cost = tmp_d
251
+ tmp_i = prev_start; prev_start = curr_start; curr_start = tmp_i
252
+
253
+ # ------------------------------------------------------------------
254
+ # Best-match selection (end boundaries only)
255
+ # ------------------------------------------------------------------
256
+ cdef double best_score = INF_VAL
257
+ cdef int best_j = -1
258
+ cdef int best_j_start = -1
259
+ cdef double best_cost_val = INF_VAL
260
+ cdef double best_norm = INF_VAL
261
+
262
+ cdef double dist, norm_dist, prior, score
263
+ cdef int j_start_val, ref_len, denom, start_word
264
+
265
+ for j in range(1, n + 1):
266
+ if not end_boundary[j]:
267
+ continue
268
+ if prev_cost[j] >= INF_VAL:
269
+ continue
270
+
271
+ dist = prev_cost[j]
272
+ j_start_val = prev_start[j]
273
+
274
+ ref_len = j - j_start_val
275
+ denom = m if m > ref_len else ref_len
276
+ if denom < 1:
277
+ denom = 1
278
+ norm_dist = dist / denom
279
+
280
+ if j_start_val < n:
281
+ start_word = R_w[j_start_val]
282
+ else:
283
+ start_word = R_w[j - 1]
284
+
285
+ prior = prior_weight * fabs(<double>(start_word - expected_word))
286
+ score = norm_dist + prior
287
+
288
+ if score < best_score:
289
+ best_score = score
290
+ best_j = j
291
+ best_j_start = j_start_val
292
+ best_cost_val = dist
293
+ best_norm = norm_dist
294
+
295
+ # ------------------------------------------------------------------
296
+ # Cleanup
297
+ # ------------------------------------------------------------------
298
+ free(P_ids); free(R_ids); free(R_w)
299
+ free(start_boundary); free(end_boundary)
300
+ free(prev_cost); free(curr_cost)
301
+ free(prev_start); free(curr_start)
302
+
303
+ if best_j < 0:
304
+ return (None, None, float('inf'), float('inf'))
305
+
306
+ return (best_j, best_j_start, best_cost_val, best_norm)
307
+
308
+
309
# ---------------------------------------------------------------------------
# Helper: grow matrix when new phonemes are encountered at runtime
# ---------------------------------------------------------------------------

cdef void _grow_matrix():
    """Expand the substitution matrix to cover newly added phonemes.

    The matrix is rebuilt from scratch at the new size: every off-diagonal
    entry gets the default substitution cost and the diagonal gets 0.0.
    NOTE: previously loaded known-pair costs are NOT preserved (see the
    comments near the bottom of this function) — the original sub_costs
    dict is not available here.  This path only triggers when a brand-new
    phoneme appears at runtime, which is extremely rare; the init call
    covers all known phonemes.

    On allocation failure this function returns silently, leaving the
    existing matrix (or NULL) in place.
    """
    global _sub_matrix, _num_phonemes

    cdef int old_size
    cdef int new_size = _num_phonemes
    cdef double *new_mat

    if _sub_matrix == NULL:
        # No matrix yet — allocate fresh with defaults
        _sub_matrix = <double *>malloc(new_size * new_size * sizeof(double))
        if _sub_matrix == NULL:
            return
        for i in range(new_size * new_size):
            _sub_matrix[i] = _default_sub
        for i in range(new_size):
            # Identical phonemes substitute for free
            _sub_matrix[i * new_size + i] = 0.0
        return

    # Figure out old size from current allocation
    # We track it implicitly: old_size = new_size - (number of phonemes added since last build)
    # Simpler: just rebuild from scratch with defaults + diagonal
    new_mat = <double *>malloc(new_size * new_size * sizeof(double))
    if new_mat == NULL:
        return

    cdef int i, j_idx
    for i in range(new_size * new_size):
        new_mat[i] = _default_sub
    for i in range(new_size):
        new_mat[i * new_size + i] = 0.0

    # Copy old entries (old matrix was some smaller size).
    # We don't know old_size exactly, so we just keep the new defaults.
    # The original sub_costs were already written; since we don't have
    # the dict here, the known-pair costs are lost for the new matrix.
    # This only happens if a completely new phoneme appears at runtime,
    # which is extremely rare. The init call covers all 69 known phonemes.

    free(_sub_matrix)
    _sub_matrix = new_mat
src/alignment/__init__.py ADDED
File without changes
src/alignment/alignment_pipeline.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Orchestration for phoneme-based alignment and retries."""
2
+
3
+ from typing import List, Tuple
4
+
5
+ from config import (
6
+ ANCHOR_SEGMENTS,
7
+ MAX_CONSECUTIVE_FAILURES,
8
+ RETRY_LOOKBACK_WORDS,
9
+ RETRY_LOOKAHEAD_WORDS,
10
+ MAX_EDIT_DISTANCE_RELAXED,
11
+ PHONEME_ALIGNMENT_PROFILING,
12
+ )
13
+
14
+
15
def run_phoneme_matching(
    phoneme_texts: List[List[str]],
    detected_surah: int,
    first_quran_idx: int = 0,
    special_results: List[tuple] = None,
    start_pointer: int = 0,
) -> Tuple[List[tuple], dict, set]:
    """
    Phoneme-based segment matching using substring DP.

    Walks the segments sequentially, keeping a word pointer into the current
    chapter reference. Handles chapter transitions, Basmala fused with the
    first verse, a two-tier graduated retry (wider window, then relaxed
    threshold), and a global n-gram re-anchor after repeated failures.
    Finally flags segments adjacent to gaps in the matched word coverage.

    Args:
        phoneme_texts: List of phoneme lists (each is a list of phoneme strings)
        detected_surah: Surah number from anchor search
        first_quran_idx: Index where Quran segments start (after specials)
        special_results: Results for special segments (Isti'adha/Basmala)
        start_pointer: Initial word pointer from anchor voting

    Returns:
        (results, profiling_dict, gap_segments)
        results: List[(matched_text, score, matched_ref), ...]
        profiling_dict: counters (plus timing keys when profiling is enabled)
        gap_segments: set of result indices adjacent to a reference-word gap
    """
    from .phoneme_matcher import align_segment, get_matched_text
    from .phoneme_matcher_cache import get_chapter_reference
    # NOTE(review): reanchor_within_surah appears unused in this function — confirm before removing
    from .phoneme_anchor import reanchor_within_surah, verse_to_word_index, find_anchor_by_voting
    from .ngram_index import get_ngram_index

    # Only import time if profiling enabled
    if PHONEME_ALIGNMENT_PROFILING:
        import time
        total_start = time.perf_counter()
        ref_build_start = time.perf_counter()

    # Build/get cached chapter reference (includes phonemizer call if not cached)
    chapter_ref = get_chapter_reference(detected_surah)

    if PHONEME_ALIGNMENT_PROFILING:
        ref_build_time = time.perf_counter() - ref_build_start

    # Initialize results with special segments
    results = list(special_results) if special_results else []
    # Parallel list: None for specials/failures, (start_word_idx, end_word_idx) for matches
    word_indices = [None] * len(results)

    # Timing accumulators (only used if profiling enabled)
    if PHONEME_ALIGNMENT_PROFILING:
        dp_times = []
        window_setup_total = 0.0
        result_build_total = 0.0

    # Track whether the next segment might have Basmala fused with verse content
    # NOTE(review): SPECIAL_PHONEMES appears unused here — confirm before removing
    from .special_segments import SPECIAL_PHONEMES, SPECIAL_TEXT
    basmala_already_detected = any(
        r[2] in ("Basmala", "Isti'adha+Basmala") for r in (special_results or [])
    )
    is_first_after_transition = not basmala_already_detected

    special_merges = 0

    # Process Quran segments with phoneme alignment
    pointer = start_pointer
    num_segments = 0
    consecutive_failures = 0
    skip_count = 0
    pending_specials = []
    tier1_attempts = 0
    tier1_passed = 0
    tier1_segments = []
    tier2_attempts = 0
    tier2_passed = 0
    tier2_segments = []
    consec_reanchors = 0
    segments_attempted = 0
    segments_passed = 0

    for i, asr_phonemes in enumerate(phoneme_texts[first_quran_idx:]):
        # Handle segments consumed by inter-chapter special detection
        if skip_count > 0:
            results.append(pending_specials.pop(0))
            word_indices.append(None)
            skip_count -= 1
            continue

        segment_idx = first_quran_idx + i + 1  # 1-indexed for display
        segments_attempted += 1

        alignment, timing = align_segment(asr_phonemes, chapter_ref, pointer, segment_idx)
        num_segments += 1

        # Accumulate timing if profiling enabled
        if PHONEME_ALIGNMENT_PROFILING:
            dp_times.append(timing['dp_time'])
            window_setup_total += timing['window_setup_time']
            result_build_total += timing['result_build_time']

        # Chapter transition: pointer past end of chapter
        if alignment is None and pointer >= chapter_ref.num_words:
            from .special_segments import detect_inter_chapter_specials
            remaining_phonemes = phoneme_texts[first_quran_idx + i:]
            inter_specials, num_consumed = detect_inter_chapter_specials(remaining_phonemes)

            if chapter_ref.surah == 1:
                # After Al-Fatiha, the next chapter could be anything — global reanchor
                print(f" [CHAPTER-END] Surah 1 complete at segment {segment_idx}, "
                      f"running global reanchor...")

                # Use segments after specials for anchor voting
                anchor_offset = first_quran_idx + i + num_consumed
                anchor_remaining = phoneme_texts[anchor_offset:]

                reanchor_surah, reanchor_ayah = find_anchor_by_voting(
                    anchor_remaining, get_ngram_index(), ANCHOR_SEGMENTS,
                )

                if reanchor_surah > 0:
                    next_surah = reanchor_surah
                    chapter_ref = get_chapter_reference(next_surah)
                    pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
                    print(f" [GLOBAL-REANCHOR] Anchored to Surah {next_surah}, "
                          f"Ayah {reanchor_ayah}, word {pointer}")
                else:
                    # Fallback: assume chapter 2
                    next_surah = 2
                    chapter_ref = get_chapter_reference(next_surah)
                    pointer = 0
                    print(f" [GLOBAL-REANCHOR] No anchor found, falling back to Surah 2")
            else:
                next_surah = chapter_ref.surah + 1
                if next_surah > 114:
                    pass  # No more chapters — fall through to failure handling
                else:
                    print(f" [CHAPTER-END] Surah {chapter_ref.surah} complete at segment {segment_idx}, "
                          f"transitioning to Surah {next_surah}")
                    chapter_ref = get_chapter_reference(next_surah)
                    pointer = 0

            if next_surah <= 114:
                detected_surah = next_surah
                consecutive_failures = 0

                if num_consumed > 0:
                    has_basmala = any(s[2] in ("Basmala", "Isti'adha+Basmala") for s in inter_specials)
                    is_first_after_transition = not has_basmala
                    # Current segment is a special — append its result
                    results.append(inter_specials[0])
                    word_indices.append(None)
                    # Queue remaining specials for subsequent segments
                    if num_consumed > 1:
                        pending_specials = list(inter_specials[1:])
                        skip_count = num_consumed - 1

                    continue
                else:
                    is_first_after_transition = True
                    # No specials — re-try alignment on this segment against the new chapter
                    alignment, timing = align_segment(asr_phonemes, chapter_ref, pointer, segment_idx)
                    num_segments += 1
                    if PHONEME_ALIGNMENT_PROFILING:
                        dp_times.append(timing['dp_time'])
                        window_setup_total += timing['window_setup_time']
                        result_build_total += timing['result_build_time']
                    # Fall through to existing if/else below

        # Basmala-fused retry: if this is the first segment after a transition
        # and Basmala wasn't detected, the reciter may have merged Basmala with
        # the first verse. Always try prepending Basmala phonemes to R and pick
        # the better result (even if the plain alignment already succeeded).
        if is_first_after_transition:
            is_first_after_transition = False

            basmala_alignment, basmala_timing = align_segment(
                asr_phonemes, chapter_ref, pointer, segment_idx,
                basmala_prefix=True)
            num_segments += 1
            if PHONEME_ALIGNMENT_PROFILING:
                dp_times.append(basmala_timing['dp_time'])
                window_setup_total += basmala_timing['window_setup_time']
                result_build_total += basmala_timing['result_build_time']

            if basmala_alignment and basmala_alignment.basmala_consumed:
                # Keep the fused interpretation only if it beats the plain one
                existing_conf = alignment.confidence if alignment else 0.0
                if basmala_alignment.confidence > existing_conf:
                    matched_text = SPECIAL_TEXT["Basmala"] + " " + get_matched_text(chapter_ref, basmala_alignment)
                    result = (matched_text, basmala_alignment.confidence, basmala_alignment.matched_ref)
                    pointer = basmala_alignment.end_word_idx + 1
                    consecutive_failures = 0
                    word_indices.append((basmala_alignment.start_word_idx, basmala_alignment.end_word_idx))
                    results.append(result)
                    special_merges += 1
                    segments_passed += 1
                    print(f" [BASMALA-FUSED] Segment {segment_idx}: Basmala merged with verse "
                          f"(fused conf={basmala_alignment.confidence:.2f} > plain conf={existing_conf:.2f})")
                    continue
            # Basmala-fused didn't win — fall through with original alignment

        if alignment:
            is_first_after_transition = False
            matched_text = get_matched_text(chapter_ref, alignment)
            result = (matched_text, alignment.confidence, alignment.matched_ref)
            pointer = alignment.end_word_idx + 1  # Advance pointer
            consecutive_failures = 0
            word_indices.append((alignment.start_word_idx, alignment.end_word_idx))
            segments_passed += 1
        else:
            # === Graduated retry ===
            # Tier 1: expanded window, same threshold
            tier1_attempts += 1
            tier1_segments.append(segment_idx)
            alignment, timing = align_segment(
                asr_phonemes, chapter_ref, pointer, segment_idx,
                lookback_override=RETRY_LOOKBACK_WORDS,
                lookahead_override=RETRY_LOOKAHEAD_WORDS,
            )
            num_segments += 1
            if PHONEME_ALIGNMENT_PROFILING:
                dp_times.append(timing['dp_time'])
                window_setup_total += timing['window_setup_time']
                result_build_total += timing['result_build_time']

            # Tier 2: expanded window + relaxed threshold
            tier2_entered = False
            if alignment is None:
                tier2_entered = True
                tier2_attempts += 1
                tier2_segments.append(segment_idx)
                alignment, timing = align_segment(
                    asr_phonemes, chapter_ref, pointer, segment_idx,
                    lookback_override=RETRY_LOOKBACK_WORDS,
                    lookahead_override=RETRY_LOOKAHEAD_WORDS,
                    max_edit_distance_override=MAX_EDIT_DISTANCE_RELAXED,
                )
                num_segments += 1
                if PHONEME_ALIGNMENT_PROFILING:
                    dp_times.append(timing['dp_time'])
                    window_setup_total += timing['window_setup_time']
                    result_build_total += timing['result_build_time']

            if alignment:
                # Retry succeeded
                is_first_after_transition = False
                matched_text = get_matched_text(chapter_ref, alignment)
                result = (matched_text, alignment.confidence, alignment.matched_ref)
                pointer = alignment.end_word_idx + 1
                consecutive_failures = 0
                word_indices.append((alignment.start_word_idx, alignment.end_word_idx))
                segments_passed += 1
                if tier2_entered:
                    tier2_passed += 1
                else:
                    tier1_passed += 1
                print(f" [RETRY-OK] Segment {segment_idx}: recovered via expanded window/relaxed threshold")
            else:
                # Real failure after all retries
                result = ("", 0.0, "")
                consecutive_failures += 1
                word_indices.append(None)

                if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                    consec_reanchors += 1
                    # Global re-anchor (not constrained to current surah)
                    remaining_idx = first_quran_idx + i + 1
                    remaining_texts = phoneme_texts[remaining_idx:]
                    if remaining_texts:
                        reanchor_surah, reanchor_ayah = find_anchor_by_voting(
                            remaining_texts, get_ngram_index(), ANCHOR_SEGMENTS,
                        )
                        if reanchor_surah > 0:
                            if reanchor_surah != detected_surah:
                                detected_surah = reanchor_surah
                                chapter_ref = get_chapter_reference(detected_surah)
                            pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
                            print(f" [GLOBAL-REANCHOR] Jumped to Surah {detected_surah}, "
                                  f"Ayah {reanchor_ayah}, word {pointer}")
                            consecutive_failures = 0

        results.append(result)

    # Post-processing: detect consecutive segments with reference gaps
    gap_segments = set()

    prev_matched_idx = None
    for idx in range(len(results)):
        if word_indices[idx] is None:
            continue

        if prev_matched_idx is not None:
            prev_end = word_indices[prev_matched_idx][1]
            curr_start = word_indices[idx][0]
            gap = curr_start - prev_end - 1

            if gap > 0:
                gap_segments.add(prev_matched_idx)
                gap_segments.add(idx)

                print(f" [GAP] {gap} word(s) missing between segments "
                      f"{prev_matched_idx + 1} and {idx + 1}")

        prev_matched_idx = idx

    # Edge case: missing words at start of expected range
    first_matched = next((i for i, w in enumerate(word_indices) if w is not None), None)
    if first_matched is not None:
        first_start = word_indices[first_matched][0]
        if first_start > start_pointer:
            gap_segments.add(first_matched)
            print(f" [GAP] {first_start - start_pointer} word(s) missing before first segment {first_matched + 1}")

    # Edge case: missing words at end of current verse
    # Only flag if the last matched segment is also the final segment overall.
    # If there are trailing no-match segments after it, those account for the
    # remaining audio — the words aren't missing, they just failed to align.
    # Compare against the verse boundary (not chapter end), since a recitation
    # doesn't necessarily cover the entire chapter.
    last_matched = next((i for i in range(len(word_indices) - 1, -1, -1) if word_indices[i] is not None), None)
    if last_matched is not None and last_matched == len(word_indices) - 1:
        last_end = word_indices[last_matched][1]
        last_ayah = chapter_ref.words[last_end].ayah
        # Find the last word index that belongs to the same verse
        verse_end = last_end
        while verse_end + 1 < chapter_ref.num_words and chapter_ref.words[verse_end + 1].ayah == last_ayah:
            verse_end += 1
        if last_end < verse_end:
            gap_segments.add(last_matched)
            print(f" [GAP] {verse_end - last_end} word(s) missing after last segment {last_matched + 1}")

    # Build profiling dict
    if PHONEME_ALIGNMENT_PROFILING:
        total_time = time.perf_counter() - total_start
        profiling = {
            "total_time": total_time,
            "ref_build_time": ref_build_time,
            "dp_total_time": sum(dp_times),
            "dp_min_time": min(dp_times) if dp_times else 0.0,
            "dp_max_time": max(dp_times) if dp_times else 0.0,
            "window_setup_time": window_setup_total,
            "result_build_time": result_build_total,
            "num_segments": num_segments,
            "tier1_attempts": tier1_attempts,
            "tier1_passed": tier1_passed,
            "tier1_segments": tier1_segments,
            "tier2_attempts": tier2_attempts,
            "tier2_passed": tier2_passed,
            "tier2_segments": tier2_segments,
            "consec_reanchors": consec_reanchors,
            "segments_attempted": segments_attempted,
            "segments_passed": segments_passed,
            "special_merges": special_merges,
        }
    else:
        profiling = {
            "num_segments": num_segments,
            "tier1_attempts": tier1_attempts,
            "tier1_passed": tier1_passed,
            "tier1_segments": tier1_segments,
            "tier2_attempts": tier2_attempts,
            "tier2_passed": tier2_passed,
            "tier2_segments": tier2_segments,
            "consec_reanchors": consec_reanchors,
            "segments_attempted": segments_attempted,
            "segments_passed": segments_passed,
            "special_merges": special_merges,
        }

    return results, profiling, gap_segments
src/alignment/ngram_index.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phoneme n-gram index: dataclass and cached loader.
3
+ """
4
+
5
+ import pickle
6
+ from dataclasses import dataclass
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ from config import NGRAM_INDEX_PATH
10
+
11
+
12
@dataclass
class PhonemeNgramIndex:
    """Pre-computed n-gram index for the entire Quran.

    Loaded from a pickle at runtime via get_ngram_index(); presumably
    built offline by scripts/build_phoneme_ngram_index.py — verify.
    Used for anchor voting: each ASR n-gram looks up its (surah, ayah)
    positions and is weighted by rarity (1/count).
    """

    # n-gram -> list of (surah, ayah) positions where it occurs
    ngram_positions: Dict[Tuple[str, ...], List[Tuple[int, int]]]

    # n-gram -> total occurrence count (for rarity weighting)
    ngram_counts: Dict[Tuple[str, ...], int]

    # Metadata
    ngram_size: int      # the "n" the index was built with
    total_ngrams: int    # total occurrences across the corpus
25
+
26
+
27
# Process-wide cache so the pickle is deserialized at most once.
_INDEX: Optional[PhonemeNgramIndex] = None


def get_ngram_index() -> PhonemeNgramIndex:
    """Return the phoneme n-gram index, loading it from disk on first use."""
    global _INDEX
    if _INDEX is not None:
        return _INDEX

    print(f"[NGRAM] Loading index from {NGRAM_INDEX_PATH}...")
    with open(NGRAM_INDEX_PATH, "rb") as handle:
        _INDEX = pickle.load(handle)
    print(f"[NGRAM] Loaded: {len(_INDEX.ngram_positions)} unique {_INDEX.ngram_size}-grams, "
          f"{_INDEX.total_ngrams} total occurrences")
    return _INDEX
src/alignment/phoneme_anchor.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phoneme n-gram voting for global anchor detection.
3
+
4
+ Replaces Whisper-based text matching for chapter/verse identification.
5
+ Each ASR n-gram that matches the Quran index votes for (surah, ayah)
6
+ weighted by rarity (1/count). The highest vote total wins the surah.
7
+ Then we find the best contiguous run of voted ayahs in that surah and
8
+ anchor to the first ayah of that run.
9
+ """
10
+
11
+ from collections import defaultdict
12
+ from typing import Dict, List, Tuple
13
+
14
+ from config import ANCHOR_DEBUG, ANCHOR_RARITY_WEIGHTING, ANCHOR_RUN_TRIM_RATIO
15
+ from .ngram_index import PhonemeNgramIndex
16
+ from .phoneme_matcher import ChapterReference
17
+
18
+
19
+ def _find_best_contiguous_run(
20
+ ayah_weights: Dict[int, float],
21
+ ) -> Tuple[int, int, float]:
22
+ """
23
+ Find the contiguous run of consecutive ayahs with highest total weight.
24
+
25
+ Args:
26
+ ayah_weights: {ayah_number: vote_weight} for a single surah
27
+
28
+ Returns:
29
+ (start_ayah, end_ayah, total_weight) of the best run
30
+ """
31
+ if not ayah_weights:
32
+ return (0, 0, 0.0)
33
+
34
+ sorted_ayahs = sorted(ayah_weights.keys())
35
+
36
+ # Build runs of consecutive ayahs
37
+ runs: List[Tuple[int, int, float]] = [] # (start, end, total_weight)
38
+ run_start = sorted_ayahs[0]
39
+ run_end = sorted_ayahs[0]
40
+ run_weight = ayah_weights[sorted_ayahs[0]]
41
+
42
+ for i in range(1, len(sorted_ayahs)):
43
+ ayah = sorted_ayahs[i]
44
+ if ayah == run_end + 1:
45
+ # Extends current run
46
+ run_end = ayah
47
+ run_weight += ayah_weights[ayah]
48
+ else:
49
+ # Gap — save current run, start new one
50
+ runs.append((run_start, run_end, run_weight))
51
+ run_start = ayah
52
+ run_end = ayah
53
+ run_weight = ayah_weights[ayah]
54
+
55
+ # Don't forget the last run
56
+ runs.append((run_start, run_end, run_weight))
57
+
58
+ # Pick run with highest total weight
59
+ best_start, best_end, best_weight = max(runs, key=lambda r: r[2])
60
+
61
+ # Trim leading/trailing ayahs whose weight < ANCHOR_RUN_TRIM_RATIO * max
62
+ max_w = max(ayah_weights[a] for a in range(best_start, best_end + 1))
63
+ threshold = ANCHOR_RUN_TRIM_RATIO * max_w
64
+
65
+ while best_start < best_end and ayah_weights[best_start] < threshold:
66
+ best_weight -= ayah_weights[best_start]
67
+ best_start += 1
68
+
69
+ while best_end > best_start and ayah_weights[best_end] < threshold:
70
+ best_weight -= ayah_weights[best_end]
71
+ best_end -= 1
72
+
73
+ return (best_start, best_end, best_weight)
74
+
75
+
76
def find_anchor_by_voting(
    phoneme_texts: List[List[str]],
    ngram_index: PhonemeNgramIndex,
    n_segments: int,
) -> Tuple[int, int]:
    """
    Vote on (surah, ayah) using n-gram rarity weighting.

    Two-phase selection:
    1. Raw voting determines the winning surah (highest total weight across all ayahs)
    2. Within that surah, find the best contiguous run of ayahs and return
       the first ayah of that run as the anchor point.

    Args:
        phoneme_texts: Phoneme lists for segments (starting from first Quran segment)
        ngram_index: Pre-built n-gram index
        n_segments: Number of segments to use for voting

    Returns:
        (surah, ayah) of best match, or (0, 0) if nothing found
    """
    # Concatenate first N non-empty segments
    combined: List[str] = []
    segments_used = 0
    for phonemes in phoneme_texts[:n_segments]:
        if phonemes:
            combined.extend(phonemes)
            segments_used += 1

    n = ngram_index.ngram_size

    if ANCHOR_DEBUG:
        print(f"\n{'=' * 60}")
        print(f"ANCHOR VOTING DEBUG")
        print(f"{'=' * 60}")
        print(f" Segments used: {segments_used}/{n_segments}")
        print(f" Combined phonemes: {len(combined)}")
        print(f" N-gram size: {n}")
        if combined:
            print(f" ASR phonemes: {' '.join(combined[:30])}{'...' if len(combined) > 30 else ''}")

    # Extract n-grams from ASR (empty list when combined is shorter than n)
    asr_ngrams = [
        tuple(combined[i : i + n])
        for i in range(len(combined) - n + 1)
    ]

    if ANCHOR_DEBUG:
        print(f" ASR n-grams extracted: {len(asr_ngrams)}")

    # =========================================================================
    # Phase 1: Raw voting — accumulate (surah, ayah) votes
    # =========================================================================
    votes: Dict[Tuple[int, int], float] = defaultdict(float)
    matched_ngrams = 0
    missed_ngrams = 0

    for ng in asr_ngrams:
        if ng not in ngram_index.ngram_positions:
            missed_ngrams += 1
            continue

        matched_ngrams += 1
        # Rare n-grams carry more evidence: weight = 1/occurrence-count
        weight = (1.0 / ngram_index.ngram_counts[ng]) if ANCHOR_RARITY_WEIGHTING else 1.0

        for surah, ayah in ngram_index.ngram_positions[ng]:
            votes[(surah, ayah)] += weight

    if ANCHOR_DEBUG:
        print(f" N-grams matched: {matched_ngrams}/{len(asr_ngrams)} "
              f"({missed_ngrams} missed)")
        print(f" Distinct (surah, ayah) voted for: {len(votes)}")

    if not votes:
        if ANCHOR_DEBUG:
            print(f" RESULT: No votes cast — returning (0, 0)")
            print(f"{'=' * 60}\n")
        return (0, 0)

    # =========================================================================
    # Phase 1b: Determine winning surah (by total weight across all ayahs)
    # =========================================================================
    surah_totals: Dict[int, float] = defaultdict(float)
    for (s, a), w in votes.items():
        surah_totals[s] += w

    winning_surah = max(surah_totals, key=surah_totals.get)

    if ANCHOR_DEBUG:
        ranked_surahs = sorted(surah_totals.items(), key=lambda kv: kv[1], reverse=True)
        print(f"\n Surah vote totals (top 5):")
        print(f" {'Surah':>5} {'Total Weight':>12}")
        print(f" {'-' * 20}")
        for s, w in ranked_surahs[:5]:
            marker = " <-- winner" if s == winning_surah else ""
            print(f" {s:>5} {w:>12.3f}{marker}")

    # =========================================================================
    # Phase 2: Within winning surah, find best contiguous ayah run
    # =========================================================================
    ayah_weights: Dict[int, float] = {}
    for (s, a), w in votes.items():
        if s == winning_surah:
            ayah_weights[a] = w

    run_start, run_end, run_weight = _find_best_contiguous_run(ayah_weights)

    if ANCHOR_DEBUG:
        # Show per-ayah votes in this surah
        print(f"\n Surah {winning_surah} ayah votes:")
        print(f" {'Ayah':>5} {'Weight':>8} {'In Best Run':>11}")
        print(f" {'-' * 28}")
        for a in sorted(ayah_weights.keys()):
            in_run = "***" if run_start <= a <= run_end else ""
            print(f" {a:>5} {ayah_weights[a]:>8.3f} {in_run:>11}")

        print(f"\n Best contiguous run (after trim): ayahs {run_start}-{run_end} "
              f"(weight={run_weight:.3f}, trim_ratio={ANCHOR_RUN_TRIM_RATIO})")
        print(f" RESULT: Surah {winning_surah}, Ayah {run_start} (start of run)")
        print(f"{'=' * 60}\n")

    return (winning_surah, run_start)
198
+
199
+
200
def reanchor_within_surah(
    phoneme_texts: List[List[str]],
    ngram_index: PhonemeNgramIndex,
    surah: int,
    n_segments: int,
) -> int:
    """
    Re-anchor within a known surah after consecutive DP failures.

    Same n-gram voting as find_anchor_by_voting but:
    - Only counts votes for the given surah (skip all others)
    - Returns ayah (start of best contiguous run), or 0 if no votes

    Args:
        phoneme_texts: Remaining unprocessed phoneme lists
        ngram_index: Pre-built n-gram index
        surah: Current surah (fixed)
        n_segments: How many segments to use for voting

    Returns:
        ayah number to re-anchor to (0 = failed)
    """
    # Concatenate first N non-empty segments
    combined: List[str] = []
    segments_used = 0
    for phonemes in phoneme_texts[:n_segments]:
        if phonemes:
            combined.extend(phonemes)
            segments_used += 1

    n = ngram_index.ngram_size

    if ANCHOR_DEBUG:
        print(f"\n{'=' * 60}")
        print(f"RE-ANCHOR WITHIN SURAH {surah}")
        print(f"{'=' * 60}")
        print(f" Segments used: {segments_used}/{n_segments}")
        print(f" Combined phonemes: {len(combined)}")

    # Extract n-grams from ASR
    asr_ngrams = [
        tuple(combined[i : i + n])
        for i in range(len(combined) - n + 1)
    ]

    # Vote — only accumulate weight for positions in the given surah
    ayah_weights: Dict[int, float] = defaultdict(float)
    matched_ngrams = 0

    for ng in asr_ngrams:
        if ng not in ngram_index.ngram_positions:
            continue
        matched_ngrams += 1
        # Rarity weighting: rare n-grams carry more evidence
        weight = (1.0 / ngram_index.ngram_counts[ng]) if ANCHOR_RARITY_WEIGHTING else 1.0
        for s, a in ngram_index.ngram_positions[ng]:
            if s == surah:
                ayah_weights[a] += weight

    if ANCHOR_DEBUG:
        print(f" N-grams matched: {matched_ngrams}/{len(asr_ngrams)}")
        print(f" Ayahs with votes: {len(ayah_weights)}")

    if not ayah_weights:
        if ANCHOR_DEBUG:
            print(f" RESULT: No votes — returning 0")
            print(f"{'=' * 60}\n")
        return 0

    # Plain dict copy so the helper sees a fixed mapping, not a defaultdict
    run_start, run_end, run_weight = _find_best_contiguous_run(dict(ayah_weights))

    if ANCHOR_DEBUG:
        print(f" Best contiguous run (after trim): ayahs {run_start}-{run_end} "
              f"(weight={run_weight:.3f}, trim_ratio={ANCHOR_RUN_TRIM_RATIO})")
        print(f" RESULT: Ayah {run_start}")
        print(f"{'=' * 60}\n")

    return run_start
277
+
278
+
279
def verse_to_word_index(chapter_ref: ChapterReference, ayah: int) -> int:
    """
    Return the index of the first word belonging to a given ayah.

    Args:
        chapter_ref: Pre-built chapter reference
        ayah: Verse number to find

    Returns:
        Word index into chapter_ref.words, or 0 if the ayah is absent
    """
    return next(
        (idx for idx, word in enumerate(chapter_ref.words) if word.ayah == ayah),
        0,
    )
src/alignment/phoneme_asr.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phoneme ASR using wav2vec2 CTC model."""
2
+
3
+ import os
4
+ import time
5
+ import torch
6
+ import numpy as np
7
+ from typing import List, Dict, Any
8
+
9
+ from config import (
10
+ PHONEME_ASR_MODELS, PHONEME_ASR_MODEL_DEFAULT, DTYPE, IS_HF_SPACE, TORCH_COMPILE,
11
+ BATCHING_STRATEGY, INFERENCE_BATCH_SIZE,
12
+ MAX_BATCH_SECONDS, MAX_PAD_WASTE, MIN_BATCH_SIZE,
13
+ )
14
+ from ..zero_gpu import ZERO_GPU_AVAILABLE, is_quota_exhausted
15
+
16
+
17
+ _cache = {} # model_name -> {"model": Model, "processor": Processor, "device": str}
18
+
19
+ _TORCH_DTYPE = torch.float16 if DTYPE == "float16" else torch.float32
20
+
21
+
22
+ def _get_hf_token():
23
+ """Get HF token from env var or stored login."""
24
+ token = os.environ.get("HF_TOKEN")
25
+ if not token:
26
+ try:
27
+ from huggingface_hub import HfFolder
28
+ token = HfFolder.get_token()
29
+ except Exception:
30
+ pass
31
+ return token
32
+
33
+
34
def _get_device_and_dtype():
    """Pick the (device, dtype) pair for model placement.

    On HF Spaces / ZeroGPU the model stays on CPU so that CUDA is only
    initialised inside a @gpu_decorator function; otherwise CUDA is used
    whenever it is available.
    """
    gpu_ok = not (IS_HF_SPACE or ZERO_GPU_AVAILABLE) and torch.cuda.is_available()
    chosen = torch.device("cuda") if gpu_ok else torch.device("cpu")
    return chosen, _TORCH_DTYPE
45
+
46
+
47
def load_phoneme_asr(model_name=PHONEME_ASR_MODEL_DEFAULT):
    """Load phoneme ASR model on CPU. Returns (model, processor).

    Models are loaded once and cached per model_name. Both base and large
    can be cached simultaneously. Use move_phoneme_asr_to_gpu() inside
    GPU-decorated functions to move to CUDA.

    Args:
        model_name: Key into PHONEME_ASR_MODELS ("base" or "large").

    Returns:
        (model, processor) tuple; the model is in eval mode.

    Raises:
        KeyError: if model_name is not a key of PHONEME_ASR_MODELS.
    """
    # Cache hit: reuse the previously loaded model/processor pair.
    if model_name in _cache:
        entry = _cache[model_name]
        return entry["model"], entry["processor"]

    import logging
    from transformers import AutoModelForCTC, AutoProcessor

    # Suppress verbose transformers logging during load
    logging.getLogger("transformers").setLevel(logging.WARNING)

    model_path = PHONEME_ASR_MODELS[model_name]
    print(f"Loading phoneme ASR: {model_path} ({model_name})")

    # Use HF_TOKEN for private model access
    hf_token = _get_hf_token()

    # On ZeroGPU this resolves to CPU; GPU move happens later on demand.
    device, dtype = _get_device_and_dtype()

    model = AutoModelForCTC.from_pretrained(
        model_path, token=hf_token, attn_implementation="sdpa"
    )
    model.to(device, dtype=dtype)
    model.eval()
    # torch.compile is skipped on Spaces/ZeroGPU (model may hop devices).
    if TORCH_COMPILE and not (IS_HF_SPACE or ZERO_GPU_AVAILABLE):
        model = torch.compile(model, mode="reduce-overhead")

    processor = AutoProcessor.from_pretrained(model_path, token=hf_token)

    _cache[model_name] = {
        "model": model,
        "processor": processor,
        "device": device.type,
    }

    print(f"Phoneme ASR ({model_name}) loaded on {device}")
    return model, processor
90
+
91
+
92
def move_phoneme_asr_to_gpu(model_name=None):
    """Move cached phoneme ASR model(s) onto CUDA.

    Args:
        model_name: Move only this cached model; falsy moves all of them.

    Intended to be called inside @gpu_decorator functions on HF Spaces.
    Idempotent (models already on CUDA are untouched) and a no-op when
    the ZeroGPU quota is exhausted or CUDA is unavailable.
    """
    if is_quota_exhausted() or not torch.cuda.is_available():
        return

    targets = [model_name] if model_name else list(_cache.keys())
    cuda = torch.device("cuda")

    for key in targets:
        entry = _cache.get(key)
        if entry is None:
            continue
        if next(entry["model"].parameters()).device.type == "cuda":
            continue
        entry["model"] = entry["model"].to(cuda, dtype=_TORCH_DTYPE)
        entry["device"] = "cuda"
        print(f"[PHONEME ASR] Moved '{key}' to CUDA")
117
+
118
+
119
def move_phoneme_asr_to_cpu(model_name=None):
    """Move cached phoneme ASR model(s) back onto the CPU.

    Args:
        model_name: Move only this cached model; falsy moves all of them.

    Called when a GPU lease fails or the quota is exhausted so that CPU
    fallback inference can proceed. Idempotent (models already on CPU are
    untouched).

    NOTE(review): the model keeps _TORCH_DTYPE after the move — if that is
    float16, CPU op support varies; confirm float32 isn't required here.
    """
    targets = [model_name] if model_name else list(_cache.keys())
    cpu = torch.device("cpu")

    for key in targets:
        entry = _cache.get(key)
        if entry is None:
            continue
        if next(entry["model"].parameters()).device.type == "cpu":
            continue
        entry["model"] = entry["model"].to(cpu, dtype=_TORCH_DTYPE)
        entry["device"] = "cpu"
        print(f"[PHONEME ASR] Moved '{key}' to CPU")
141
+
142
+
143
def ids_to_phoneme_list(ids: List[int], tokenizer, pad_id: int) -> List[str]:
    """
    Decode CTC token IDs into a phoneme list.

    Standard CTC greedy post-processing:
    1. Drop pad/blank tokens
    2. Collapse runs of consecutive identical tokens
    3. Drop the word delimiter "|"

    Pad and "|" both reset the collapse window, so identical phonemes
    separated by a blank are kept as two emissions.
    """
    tokens = tokenizer.convert_ids_to_tokens(ids)
    if not tokens:
        return []

    # Resolve the literal pad/blank token string once.
    blank = tokenizer.convert_ids_to_tokens([pad_id])[0] if pad_id is not None else "[PAD]"

    phonemes: List[str] = []
    last = None
    for tok in tokens:
        if tok == blank or tok == "|":
            # Special tokens are dropped but still break duplicate runs.
            last = tok
            continue
        if tok != last:
            phonemes.append(tok)
            last = tok
    return phonemes
180
+
181
+
182
def build_batches_naive(sorted_indices: List[int], batch_size: int) -> List[List[int]]:
    """Chunk indices into fixed-count batches (original behavior)."""
    batches: List[List[int]] = []
    for start in range(0, len(sorted_indices), batch_size):
        batches.append(sorted_indices[start:start + batch_size])
    return batches
186
+
187
+
188
def build_batches(sorted_indices: List[int], durations: List[float]) -> List[List[int]]:
    """Greedily pack duration-sorted indices into dynamic batches.

    A batch is closed early when adding the next clip would either push
    total audio past MAX_BATCH_SECONDS or raise the padding-waste fraction
    (1 - sum/(n*max), i.e. wasted tensor compute) above MAX_PAD_WASTE —
    but never while the batch is still smaller than MIN_BATCH_SIZE
    (avoids underutilization).
    """
    batches: List[List[int]] = []
    current: List[int] = []
    current_seconds = 0.0

    for idx in sorted_indices:
        dur = durations[idx]

        if not current:
            current = [idx]
            current_seconds = dur
            continue

        # Input is sorted ascending, so the candidate is the new longest clip.
        total = current_seconds + dur
        count = len(current) + 1
        waste = 1.0 - total / (count * dur) if dur > 0 else 0.0

        limit_hit = total > MAX_BATCH_SECONDS or waste > MAX_PAD_WASTE
        if limit_hit and len(current) >= MIN_BATCH_SIZE:
            batches.append(current)
            current = [idx]
            current_seconds = dur
        else:
            current.append(idx)
            current_seconds = total

    if current:
        batches.append(current)

    return batches
228
+
229
+
230
def _transcribe_batch_pytorch(
    segment_audios: List[np.ndarray],
    durations: List[float],
    batches: List[List[int]],
    model,
    processor,
    tokenizer,
    pad_id: int,
    device: torch.device,
    dtype: torch.dtype,
) -> tuple:
    """PyTorch inference path (GPU or CPU fallback).

    Runs feature extraction, forward pass, and greedy CTC decoding for
    each pre-built batch of indices, writing decoded phoneme lists back
    into input order.

    Returns:
        (results, batch_profiling): results[i] is the phoneme list for
        segment_audios[i]; batch_profiling is one dict per batch with
        timing and duration statistics.
    """
    # Pre-size results so each batch can write back by original index.
    results: List[List[str]] = [[] for _ in segment_audios]
    batch_profiling = []

    for batch_num_idx, batch_idx in enumerate(batches):
        batch_audios = [segment_audios[i] for i in batch_idx]
        batch_durations = [durations[i] for i in batch_idx]

        batch_num = batch_num_idx + 1
        t0 = time.time()

        # Feature extraction + GPU transfer
        t_feat_start = time.time()
        inputs = processor(
            batch_audios,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs.input_values.to(device=device, dtype=dtype)
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device=device)
        feat_time = time.time() - t_feat_start

        # Model inference
        t_infer_start = time.time()
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
            logits = outputs.logits
        # Synchronize so infer_time measures actual GPU work, not launch time.
        if device.type == "cuda":
            torch.cuda.synchronize()
        infer_time = time.time() - t_infer_start

        # CTC greedy decode
        t_decode_start = time.time()
        predicted_ids = torch.argmax(logits, dim=-1)

        for j in range(predicted_ids.shape[0]):
            ids_list = predicted_ids[j].cpu().tolist()
            phoneme_list = ids_to_phoneme_list(ids_list, tokenizer, pad_id)
            # batch_idx[j] maps the batch row back to the original segment slot.
            results[batch_idx[j]] = phoneme_list
        decode_time = time.time() - t_decode_start

        # Free batch tensors eagerly to keep peak (GPU) memory low.
        del input_values, attention_mask, outputs, logits, predicted_ids

        batch_time = time.time() - t0

        batch_profiling.append({
            "batch_num": batch_num,
            "size": len(batch_audios),
            "time": batch_time,
            "feat_time": feat_time,
            "infer_time": infer_time,
            "decode_time": decode_time,
            "min_dur": min(batch_durations),
            "max_dur": max(batch_durations),
            "avg_dur": sum(batch_durations) / len(batch_durations),
            "total_seconds": sum(batch_durations),
            # Fraction of padded tensor that is silence (0 when max duration is 0).
            "pad_waste": 1.0 - sum(batch_durations) / (len(batch_durations) * max(batch_durations)) if max(batch_durations) > 0 else 0.0,
        })

    return results, batch_profiling
304
+
305
+
306
def transcribe_batch(segment_audios: List[np.ndarray], sample_rate: int, model_name: str = PHONEME_ASR_MODEL_DEFAULT) -> tuple:
    """Transcribe audio segments to phoneme lists, sorted by duration for efficiency.

    Args:
        segment_audios: List of audio arrays (assumed 16 kHz — resampled at source;
            the `sample_rate` argument is not used in the duration computation below)
        sample_rate: Audio sample rate
        model_name: Which ASR model to use ("base" or "large")

    Returns:
        4-tuple (results, batch_profiling, sorting_time, batch_build_time) where
        results is List[List[str]] (one phoneme list per input segment, in input
        order), batch_profiling is a list of dicts with per-batch timing and
        duration stats, and the two floats are setup timings in seconds.
    """
    if not segment_audios:
        return [], [], 0.0, 0.0

    model, processor = load_phoneme_asr(model_name)
    if model is None:
        # Defensive guard — load_phoneme_asr as written never returns None;
        # TODO(review): confirm whether this path is intentional.
        return [[] for _ in segment_audios], [], 0.0, 0.0

    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype
    tokenizer = processor.tokenizer
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # Compute durations (audio assumed to be 16kHz — resampled at source)
    durations = [len(audio) / 16000.0 for audio in segment_audios]

    # Sort indices by duration, then build dynamic batches
    t_sort = time.time()
    sorted_indices = sorted(range(len(segment_audios)), key=lambda i: durations[i])
    sorting_time = time.time() - t_sort

    t_batch_build = time.time()
    if BATCHING_STRATEGY == "dynamic":
        batches = build_batches(sorted_indices, durations)
    else:
        batches = build_batches_naive(sorted_indices, INFERENCE_BATCH_SIZE)
    batch_build_time = time.time() - t_batch_build

    backend = "PyTorch" + (f" ({device.type})" if device.type != "cpu" else " (CPU)")
    print(f"[PHONEME ASR] Using {backend}")
    results, batch_profiling = _transcribe_batch_pytorch(
        segment_audios, durations, batches,
        model, processor, tokenizer, pad_id, device, dtype,
    )

    sizes = [p["size"] for p in batch_profiling]
    print(f"[PHONEME ASR] {len(segment_audios)} segments in {len(batch_profiling)} batches "
          f"(sizes: {min(sizes)}-{max(sizes)}, sort: {sorting_time:.3f}s, batch build: {batch_build_time:.3f}s)")
    return results, batch_profiling, sorting_time, batch_build_time
src/alignment/phoneme_matcher.py ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phoneme-based alignment using substring Levenshtein DP.
3
+
4
+ This module implements the core alignment algorithm for matching ASR phoneme
5
+ sequences to reference Quranic text phonemes with word-boundary constraints.
6
+ """
7
+
8
+ import json
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional, Tuple
11
+
12
+ from config import (
13
+ DATA_PATH,
14
+ LOOKBACK_WORDS,
15
+ LOOKAHEAD_WORDS,
16
+ MAX_EDIT_DISTANCE,
17
+ START_PRIOR_WEIGHT,
18
+ COST_SUBSTITUTION,
19
+ COST_DELETION,
20
+ COST_INSERTION,
21
+ PHONEME_ALIGNMENT_DEBUG,
22
+ PHONEME_ALIGNMENT_PROFILING,
23
+ )
24
+
25
+ from ..phonemizer_utils import get_phonemizer
26
+
27
+
28
+ # =============================================================================
29
+ # Phoneme Substitution Cost Lookup
30
+ # =============================================================================
31
+
32
+
33
def _load_substitution_costs() -> Dict[Tuple[str, str], float]:
    """Load phoneme-pair substitution costs from the JSON data file.

    The file groups "a|b" -> cost entries under named sections (an optional
    "_meta" section is skipped). Both orderings (a, b) and (b, a) are stored
    so lookups need only a plain tuple.

    Returns:
        Mapping (phoneme, phoneme) -> cost; empty when the data file is
        missing (callers then fall back to the default substitution cost).
    """
    path = DATA_PATH / "phoneme_sub_costs.json"
    if not path.exists():
        return {}
    # Explicit encoding: phoneme symbols are non-ASCII and the platform
    # default text encoding is not guaranteed to be UTF-8 (PEP 597).
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)
    costs: Dict[Tuple[str, str], float] = {}
    for key, section in raw.items():
        if key == "_meta":
            continue
        for pair_str, cost in section.items():
            a, b = pair_str.split("|")
            c = float(cost)
            costs[(a, b)] = c
            costs[(b, a)] = c
    return costs
+
54
+
55
+ _SUBSTITUTION_COSTS: Dict[Tuple[str, str], float] = _load_substitution_costs()
56
+
57
+ # Try to load Cython-accelerated DP; fall back to pure Python silently.
58
+ try:
59
+ from .._dp_core import cy_align_with_word_boundaries, init_substitution_matrix
60
+ init_substitution_matrix(_SUBSTITUTION_COSTS, COST_SUBSTITUTION)
61
+ _USE_CYTHON_DP = True
62
+ except ImportError:
63
+ _USE_CYTHON_DP = False
64
+
65
+
66
def get_sub_cost(p: str, r: str, default: float) -> float:
    """Substitution cost between two phonemes.

    Exact matches are free; known pairs use their tabulated cost;
    everything else falls back to *default*.
    """
    return 0.0 if p == r else _SUBSTITUTION_COSTS.get((p, r), default)
+ return _SUBSTITUTION_COSTS.get((p, r), default)
74
+
75
+
76
+ # =============================================================================
77
+ # Data Structures
78
+ # =============================================================================
79
+
80
+
81
@dataclass
class RefWord:
    """Reference word with phoneme metadata.

    One word of the reference text together with its phonemization and
    its position (surah/ayah/word) — the atomic unit the aligner maps
    ASR phonemes onto.
    """
    text: str            # Arabic text
    phonemes: List[str]  # Phoneme list for this word
    surah: int           # Surah number
    ayah: int            # Verse number within surah
    word_num: int        # Word number within verse (1-indexed)

    @property
    def location(self) -> str:
        """Format as 'surah:ayah:word'."""
        return f"{self.surah}:{self.ayah}:{self.word_num}"
+ return f"{self.surah}:{self.ayah}:{self.word_num}"
94
+
95
+
96
@dataclass
class ChapterReference:
    """Pre-built reference data for a chapter.

    The flat_* fields are a phoneme-level flattening of `words`, computed
    once so each segment's search window is a pair of list slices instead
    of a per-segment rebuild.
    """
    surah: int
    words: List[RefWord]
    avg_phones_per_word: float  # Mean phonemes per word (used to estimate window size)

    # Pre-flattened phoneme data (avoids rebuilding per segment)
    flat_phonemes: List[str]       # All phonemes concatenated
    flat_phone_to_word: List[int]  # Word index for each phoneme (GLOBAL indices)
    word_phone_offsets: List[int]  # Prefix sum: word i starts at offset[i]; carries a trailing sentinel

    @property
    def num_words(self) -> int:
        """Number of words in the chapter."""
        return len(self.words)
+ return len(self.words)
111
+
112
+
113
@dataclass
class AlignmentResult:
    """Result of aligning a segment.

    Word indices refer to ChapterReference.words; the phoneme indices
    (j_start / best_j) refer to the reference window R handed to the DP.
    """
    start_word_idx: int  # Index into ChapterReference.words
    end_word_idx: int    # Index into ChapterReference.words (inclusive)
    edit_cost: float     # Raw edit distance (may be non-integer with substitution costs)
    confidence: float    # 1.0 - (edit_cost / max(asr_len, ref_len))

    # For debugging
    j_start: int  # Start phoneme index in R window
    best_j: int   # End phoneme index in R window (exclusive)

    # Resolved word references
    start_word: RefWord
    end_word: RefWord

    # Whether Basmala prefix was consumed by the alignment
    basmala_consumed: bool = False

    @property
    def ref_from(self) -> str:
        """Start reference as 'surah:ayah:word'."""
        return self.start_word.location

    @property
    def ref_to(self) -> str:
        """End reference as 'surah:ayah:word'."""
        return self.end_word.location

    @property
    def matched_ref(self) -> str:
        """Combined reference as 'start-end'."""
        return f"{self.ref_from}-{self.ref_to}"
+
147
+
148
+ # =============================================================================
149
+ # Helper Functions
150
+ # =============================================================================
151
+
152
+
153
def parse_location(location: str) -> Tuple[int, int, int]:
    """Parse 'surah:ayah:word' into (surah, ayah, word_num)."""
    parts = location.split(":")
    surah, ayah, word_num = int(parts[0]), int(parts[1]), int(parts[2])
    return surah, ayah, word_num
+ return int(parts[0]), int(parts[1]), int(parts[2])
157
+
158
+
159
def get_matched_text(chapter_ref: ChapterReference, result: AlignmentResult) -> str:
    """Arabic text of the matched word span, space-joined."""
    matched = chapter_ref.words[result.start_word_idx : result.end_word_idx + 1]
    return ' '.join(word.text for word in matched)
+ return ' '.join(w.text for w in words)
163
+
164
+
165
+ def _word_loc(words: List, idx: int) -> str:
166
+ """Format word index as 'ayah:word_num'."""
167
+ if idx < 0 or idx >= len(words):
168
+ return f"?({idx})"
169
+ w = words[idx]
170
+ return f"{w.ayah}:{w.word_num}"
171
+
172
+
173
def print_debug_info(
    P: List[str],
    R: List[str],
    result: Optional[AlignmentResult],
    segment_idx: int,
    pointer: int,
    win_start: int,
    win_end: int,
    words: List = None,
) -> None:
    """Print detailed alignment debug info.

    No-op unless PHONEME_ALIGNMENT_DEBUG is set. Shows the search window,
    the ASR (P) and reference (R) phoneme streams (R truncated past 40
    phonemes), and either the matched span or a no-match marker.
    """
    if not PHONEME_ALIGNMENT_DEBUG:
        return

    print("\n" + "━" * 60)
    print(f"[PHONEME ALIGN] Segment {segment_idx}")
    print("─" * 60)
    loc_range = ""
    if words:
        loc_range = f" = {_word_loc(words, win_start)}-{_word_loc(words, win_end)}"
    print(f" Window: words [{win_start}-{win_end}]{loc_range} "
          f"({win_end - win_start} words, {len(R)} phonemes)")
    ptr_loc = ""
    if words:
        ptr_loc = f" = {_word_loc(words, pointer)}"
    print(f" Expected start: word {pointer}{ptr_loc}")
    print()
    # Truncate long reference windows to keep the dump readable.
    if len(R) <= 40:
        print(f" R: {' '.join(R)}")
    else:
        print(f" R: {' '.join(R[:20])} ... {' '.join(R[-20:])}")
    print(f" P: {' '.join(P)} ({len(P)} phonemes)")
    print()

    if result:
        recovered = R[result.j_start:result.best_j]
        print(f" ✓ MATCH: words [{result.start_word_idx}-{result.end_word_idx}] "
              f"({result.end_word_idx - result.start_word_idx + 1} words)")
        print(f" Recovered: {' '.join(recovered)} ({len(recovered)} phonemes)")
        print(f" Edit cost: {result.edit_cost}")
        print(f" Confidence: {result.confidence:.2f}")
    else:
        print(f" ✗ NO MATCH (no candidates passed threshold)")

    print("━" * 60)
+
219
+
220
+ # =============================================================================
221
+ # Chapter Reference Building
222
+ # =============================================================================
223
+
224
+
225
def build_chapter_reference(surah_num: int) -> ChapterReference:
    """Build phoneme reference for entire chapter.

    Phonemizes the whole surah (with verse-boundary stopping rules), wraps
    each word in a RefWord, and pre-flattens the phoneme stream for fast
    per-segment windowing.

    Args:
        surah_num: Surah number to phonemize.

    Returns:
        ChapterReference with words, average phonemes/word, and flattened
        phoneme data (word_phone_offsets carries a trailing sentinel).
    """
    pm = get_phonemizer()

    # Phonemize entire chapter with stopping rules at verse boundaries
    result = pm.phonemize(
        ref=str(surah_num),
        stops=["verse"]
    )

    # Get mapping - provides word metadata and phonemes directly
    mapping = result.get_mapping()

    # Build RefWord list - WordMapping already has phonemes as List[str]
    words = []
    for word in mapping.words:
        surah, ayah, word_num = parse_location(word.location)
        words.append(RefWord(
            text=word.text,
            phonemes=word.phonemes,  # Direct access, no string parsing needed
            surah=surah,
            ayah=ayah,
            word_num=word_num,
        ))

    # Compute average phonemes per word (4.0 fallback for an empty chapter)
    total_phones = sum(len(w.phonemes) for w in words)
    avg_phones_per_word = total_phones / len(words) if words else 4.0

    # Pre-flatten phonemes for efficient windowing (avoids per-segment rebuilds)
    flat_phonemes = []
    flat_phone_to_word = []
    word_phone_offsets = []

    for word_idx, word in enumerate(words):
        word_phone_offsets.append(len(flat_phonemes))  # Start offset for this word
        for ph in word.phonemes:
            flat_phonemes.append(ph)
            flat_phone_to_word.append(word_idx)

    # Sentinel: offset past last phoneme (for slicing convenience)
    word_phone_offsets.append(len(flat_phonemes))

    return ChapterReference(
        surah=surah_num,
        words=words,
        avg_phones_per_word=avg_phones_per_word,
        flat_phonemes=flat_phonemes,
        flat_phone_to_word=flat_phone_to_word,
        word_phone_offsets=word_phone_offsets,
    )
+ )
276
+
277
+
278
+ # =============================================================================
279
+ # Word-Boundary-Constrained Alignment (DP)
280
+ # =============================================================================
281
+
282
+
283
def align_with_word_boundaries(
    P: List[str],
    R: List[str],
    R_phone_to_word: List[int],
    expected_word: int = 0,
    prior_weight: float = START_PRIOR_WEIGHT,
    cost_sub: float = COST_SUBSTITUTION,
    cost_del: float = COST_DELETION,
    cost_ins: float = COST_INSERTION,
) -> Tuple[Optional[int], Optional[int], float, float]:
    """
    Word-boundary-constrained substring alignment.

    Combines DP computation with best-match selection:
    - Start: only word-start positions allowed (INF cost otherwise)
    - End: only word-end positions evaluated as candidates

    Runs in O(len(P) * len(R)) time with two rolling rows of memory.
    Delegates to the Cython implementation when available.

    Args:
        P: ASR phoneme sequence
        R: Reference phoneme window
        R_phone_to_word: Maps phoneme index -> word index (GLOBAL indices)
        expected_word: Expected starting word index (for position prior)
        prior_weight: Penalty per word distance from expected
        cost_sub: Substitution cost
        cost_del: Deletion cost (delete from P)
        cost_ins: Insertion cost (insert from R)

    Returns:
        (best_j, best_j_start, best_cost, best_norm_dist) or (None, None, INF, INF)
    """
    if _USE_CYTHON_DP:
        return cy_align_with_word_boundaries(
            P, R, R_phone_to_word,
            expected_word, prior_weight,
            cost_sub, cost_del, cost_ins,
        )

    # --- Pure Python fallback ---
    m, n = len(P), len(R)
    INF = float('inf')

    if m == 0 or n == 0:
        return None, None, INF, float('inf')

    # DP column semantics:
    # Column j represents "consumed j phonemes" / boundary after phoneme j-1
    # Column 0 = before any phonemes, Column n = after all phonemes
    # Phoneme indices are 0..n-1, DP columns are 0..n

    def is_start_boundary(j: int) -> bool:
        """Can alignment START at DP column j? (before phoneme j)"""
        if j >= n:
            return False  # Can't start at or past end
        if j == 0:
            return True  # Column 0 is always valid start (first word)
        # Valid if phoneme j begins a new word
        return R_phone_to_word[j] != R_phone_to_word[j - 1]

    def is_end_boundary(j: int) -> bool:
        """Can alignment END at DP column j? (after phoneme j-1)"""
        if j == 0:
            return False  # Can't end before consuming anything
        if j == n:
            return True  # Column n (end of reference) always valid
        # Valid if phoneme j starts a new word (meaning j-1 ended a word)
        return R_phone_to_word[j] != R_phone_to_word[j - 1]

    # Initialize: free start ONLY at word boundaries.
    # prev_start[j] remembers which start column the best path through j used.
    prev_cost = [0.0 if is_start_boundary(j) else INF for j in range(n + 1)]
    prev_start = [j if is_start_boundary(j) else -1 for j in range(n + 1)]

    curr_cost = [0.0] * (n + 1)
    curr_start = [0] * (n + 1)

    # DP computation (two rolling rows; rows indexed by consumed P phonemes)
    for i in range(1, m + 1):
        curr_cost[0] = i * cost_del if is_start_boundary(0) else INF
        curr_start[0] = 0 if is_start_boundary(0) else -1

        for j in range(1, n + 1):
            del_option = prev_cost[j] + cost_del
            ins_option = curr_cost[j-1] + cost_ins
            sub_option = prev_cost[j-1] + get_sub_cost(P[i-1], R[j-1], cost_sub)

            # Tie-break prefers substitution, then deletion (stable paths).
            if sub_option <= del_option and sub_option <= ins_option:
                curr_cost[j] = sub_option
                curr_start[j] = prev_start[j-1]
            elif del_option <= ins_option:
                curr_cost[j] = del_option
                curr_start[j] = prev_start[j]
            else:
                curr_cost[j] = ins_option
                curr_start[j] = curr_start[j-1]

        prev_cost, curr_cost = curr_cost, prev_cost
        prev_start, curr_start = curr_start, prev_start

    # After DP: evaluate only valid end boundary positions
    # prev_cost/prev_start now contain the final row (after m iterations)
    best_score = float('inf')  # Score includes float norm_dist, so keep as float
    best_j = None
    best_j_start = None
    best_cost = INF
    best_norm_dist = float('inf')

    for j in range(1, n + 1):
        # Skip non-end-boundary positions
        if not is_end_boundary(j):
            continue

        # Skip infinite cost (no valid alignment ends here)
        if prev_cost[j] >= INF:
            continue

        dist = prev_cost[j]
        j_start = prev_start[j]

        # Compute normalized edit distance
        ref_len = j - j_start
        denom = max(m, ref_len, 1)
        norm_dist = dist / denom

        # Position prior on start word
        start_word = R_phone_to_word[j_start] if j_start < n else R_phone_to_word[j - 1]
        prior = prior_weight * abs(start_word - expected_word)
        score = norm_dist + prior

        if score < best_score:
            best_score = score
            best_j = j
            best_j_start = j_start
            best_cost = dist
            best_norm_dist = norm_dist

    return best_j, best_j_start, best_cost, best_norm_dist
+ return best_j, best_j_start, best_cost, best_norm_dist
418
+
419
+
420
+ # =============================================================================
421
+ # Per-Segment Alignment
422
+ # =============================================================================
423
+
424
+
425
def align_segment(
    asr_phonemes: List[str],
    chapter_ref: ChapterReference,
    pointer: int,
    segment_idx: int = 0,
    basmala_prefix: bool = False,
    lookback_override: Optional[int] = None,
    lookahead_override: Optional[int] = None,
    max_edit_distance_override: Optional[float] = None,
) -> Tuple[Optional[AlignmentResult], dict]:
    """
    Align ASR phonemes to reference using substring Levenshtein DP.

    Pipeline: build a word window around `pointer`, slice the pre-flattened
    reference phonemes, optionally prepend Basmala phonemes, run the
    boundary-constrained DP, threshold the normalized distance, and map
    phoneme indices back to word indices.

    Args:
        asr_phonemes: Phoneme sequence from ASR for this segment
        chapter_ref: Pre-built chapter reference data
        pointer: First unprocessed word index (0 at start of chapter)
        segment_idx: Segment number for debug output
        basmala_prefix: If True, prepend Basmala phonemes to the R window
                        so the DP can consume a fused Basmala+verse segment
        lookback_override: Override LOOKBACK_WORDS for this call
        lookahead_override: Override LOOKAHEAD_WORDS for this call
        max_edit_distance_override: Override MAX_EDIT_DISTANCE for this call

    Returns: (AlignmentResult or None, timing_dict) — timing values stay 0.0
    unless PHONEME_ALIGNMENT_PROFILING is enabled.
    """
    timing = {'window_setup_time': 0.0, 'dp_time': 0.0, 'result_build_time': 0.0}

    # Only import time if profiling is enabled
    if PHONEME_ALIGNMENT_PROFILING:
        import time

    P = asr_phonemes
    m = len(P)

    if m == 0:
        return None, timing

    words = chapter_ref.words
    avg_phones = chapter_ref.avg_phones_per_word
    num_words = chapter_ref.num_words

    # === WINDOW SETUP ===
    if PHONEME_ALIGNMENT_PROFILING:
        t0 = time.perf_counter()

    # 1. Estimate word count from phoneme count
    est_words = max(1, round(m / avg_phones))

    # 2. Define search window (word indices)
    lb = lookback_override if lookback_override is not None else LOOKBACK_WORDS
    la = lookahead_override if lookahead_override is not None else LOOKAHEAD_WORDS
    win_start = max(0, pointer - lb)
    win_end = min(num_words, pointer + est_words + la)

    # End of chapter check
    if win_start >= num_words:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['window_setup_time'] = time.perf_counter() - t0
        if PHONEME_ALIGNMENT_DEBUG:
            print(f"[PHONEME ALIGN] Segment {segment_idx}: Past end of chapter")
        return None, timing

    # 3. Slice pre-flattened phoneme window
    phone_start = chapter_ref.word_phone_offsets[win_start]
    phone_end = chapter_ref.word_phone_offsets[win_end]

    R = chapter_ref.flat_phonemes[phone_start:phone_end]
    R_phone_to_word = chapter_ref.flat_phone_to_word[phone_start:phone_end]

    # Optionally prepend Basmala phonemes so the DP can consume fused Basmala+verse.
    # Prefix positions get a sentinel word index so they never map to a real word.
    BASMALA_SENTINEL = -1
    prefix_phonemes = None
    if basmala_prefix:
        from .special_segments import SPECIAL_PHONEMES
        prefix_phonemes = SPECIAL_PHONEMES["Basmala"]

    if prefix_phonemes is not None:
        prefix_len = len(prefix_phonemes)
        R = list(prefix_phonemes) + list(R)
        R_phone_to_word = [BASMALA_SENTINEL] * prefix_len + list(R_phone_to_word)

    n = len(R)

    if n == 0:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['window_setup_time'] = time.perf_counter() - t0
        if PHONEME_ALIGNMENT_DEBUG:
            print(f"[PHONEME ALIGN] Segment {segment_idx}: Empty reference window")
        return None, timing

    if PHONEME_ALIGNMENT_PROFILING:
        timing['window_setup_time'] = time.perf_counter() - t0

    # === DP ===
    if PHONEME_ALIGNMENT_PROFILING:
        t0 = time.perf_counter()

    # 4. Run word-boundary-constrained alignment (DP + selection in one pass)
    best_j, j_start, best_cost, norm_dist = align_with_word_boundaries(
        P, R, R_phone_to_word,
        expected_word=pointer,
        prior_weight=START_PRIOR_WEIGHT
    )

    if PHONEME_ALIGNMENT_PROFILING:
        timing['dp_time'] = time.perf_counter() - t0

    # === RESULT BUILD ===
    if PHONEME_ALIGNMENT_PROFILING:
        t0 = time.perf_counter()

    if best_j is None:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['result_build_time'] = time.perf_counter() - t0
        print_debug_info(P, R, None, segment_idx, pointer, win_start, win_end, words)
        return None, timing

    # 5. Check acceptance threshold
    threshold = max_edit_distance_override if max_edit_distance_override is not None else MAX_EDIT_DISTANCE
    if norm_dist > threshold:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['result_build_time'] = time.perf_counter() - t0
        print_debug_info(P, R, None, segment_idx, pointer, win_start, win_end, words)
        return None, timing

    # 6. Confidence is 1 - normalized distance
    confidence = 1.0 - norm_dist

    # 7. Map phoneme indices to word indices
    start_word_idx = R_phone_to_word[j_start]
    end_word_idx = R_phone_to_word[best_j - 1]

    # Handle prefix: if alignment starts in the prefix region, find the first real word
    basmala_consumed = False
    if prefix_phonemes is not None and start_word_idx == BASMALA_SENTINEL:
        basmala_consumed = True
        for k in range(j_start, best_j):
            if R_phone_to_word[k] != BASMALA_SENTINEL:
                start_word_idx = R_phone_to_word[k]
                break
        else:
            # Entire match is just Basmala with no verse content — reject
            if PHONEME_ALIGNMENT_PROFILING:
                timing['result_build_time'] = time.perf_counter() - t0
            return None, timing

    result = AlignmentResult(
        start_word_idx=start_word_idx,
        end_word_idx=end_word_idx,
        edit_cost=best_cost,
        confidence=confidence,
        j_start=j_start,
        best_j=best_j,
        start_word=words[start_word_idx],
        end_word=words[end_word_idx],
        basmala_consumed=basmala_consumed,
    )

    if PHONEME_ALIGNMENT_PROFILING:
        timing['result_build_time'] = time.perf_counter() - t0

    # Debug output
    print_debug_info(P, R, result, segment_idx, pointer, win_start, win_end, words)

    return result, timing
+ return result, timing
src/alignment/phoneme_matcher_cache.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cache for ChapterReference objects.
3
+
4
+ Loads pre-built chapter references from a pickle file (built by
5
+ scripts/build_phoneme_cache.py) to avoid runtime phonemization.
6
+ """
7
+
8
+ import pickle
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from .phoneme_matcher import ChapterReference
13
+
14
+ # Global cache: surah number -> ChapterReference
15
+ _chapter_cache: dict[int, "ChapterReference"] = {}
16
+
17
+
18
def get_chapter_reference(surah: int) -> "ChapterReference":
    """
    Return the (possibly cached) chapter reference for a surah.

    Args:
        surah: Surah number (1-114)

    Returns:
        ChapterReference with pre-built phoneme data
    """
    cached = _chapter_cache.get(surah)
    if cached is None:
        # Fallback: build at runtime if cache wasn't preloaded.
        from .phoneme_matcher import build_chapter_reference
        print(f"[CACHE] WARNING: Building reference for Surah {surah} at runtime "
              "(phoneme cache not loaded — run scripts/build_phoneme_cache.py)")
        cached = build_chapter_reference(surah)
        _chapter_cache[surah] = cached
    return cached
35
+
36
+
37
def preload_all_chapters() -> None:
    """Load all 114 chapter references from the pre-built cache file."""
    from config import PHONEME_CACHE_PATH

    # Guard clause: no pickle on disk -> build everything at runtime.
    if not PHONEME_CACHE_PATH.exists():
        print(f"[CACHE] WARNING: {PHONEME_CACHE_PATH} not found, "
              "falling back to runtime phonemization")
        print("[CACHE] Run: python scripts/build_phoneme_cache.py")
        for surah in range(1, 115):
            get_chapter_reference(surah)
        print(f"[CACHE] All 114 chapters built at runtime")
        return

    # Fast path: bulk-load the pre-built references from the pickle file.
    print(f"[CACHE] Loading phoneme cache from {PHONEME_CACHE_PATH}...")
    with open(PHONEME_CACHE_PATH, "rb") as f:
        loaded: dict[int, "ChapterReference"] = pickle.load(f)
    _chapter_cache.update(loaded)
    print(f"[CACHE] Loaded {len(loaded)} chapters from cache")
54
+
55
+
56
def clear_chapter_cache() -> None:
    """Clear cache (for memory management).

    Drops every loaded ChapterReference; subsequent lookups will reload or
    rebuild on demand via get_chapter_reference().
    """
    _chapter_cache.clear()
    print("[CACHE] Cleared chapter cache")
59
+ print("[CACHE] Cleared chapter cache")
src/alignment/special_segments.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phoneme-based special segment detection for Basmala and Isti'adha.
3
+
4
+ These are common recitation openers that need special handling:
5
+ - Isti'adha: "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم" (I seek refuge in Allah)
6
+ - Basmala: "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم" (In the name of Allah)
7
+
8
+ Detection uses phoneme edit distance for robustness against ASR errors.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import List, Tuple, Optional
14
+
15
+ # =============================================================================
16
+ # Constants
17
+ # =============================================================================
18
+
19
+ from config import MAX_SPECIAL_EDIT_DISTANCE
20
+
21
# Special phoneme sequences — IPA-style tokens matching the phoneme ASR's
# output alphabet, so they can be compared directly with edit distance.
SPECIAL_PHONEMES = {
    "Isti'adha": [
        "ʔ", "a", "ʕ", "u:", "ð", "u", "b", "i", "ll", "a:", "h", "i",
        "m", "i", "n", "a", "ʃʃ", "a", "j", "tˤ", "aˤ:", "n", "i",
        "rˤrˤ", "aˤ", "ʒ", "i:", "m"
    ],
    "Basmala": [
        "b", "i", "s", "m", "i", "ll", "a:", "h", "i", "rˤrˤ", "aˤ",
        "ħ", "m", "a:", "n", "i", "rˤrˤ", "aˤ", "ħ", "i:", "m"
    ],
}

# Combined = Isti'adha + Basmala (for detecting both in one segment)
COMBINED_PHONEMES = SPECIAL_PHONEMES["Isti'adha"] + SPECIAL_PHONEMES["Basmala"]

# Arabic text for display (returned to callers as the matched_text)
SPECIAL_TEXT = {
    "Isti'adha": "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم",
    "Basmala": "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم",
}
42
+
43
+
44
+ # =============================================================================
45
+ # Levenshtein Distance
46
+ # =============================================================================
47
+
48
def levenshtein_distance(seq1: List[str], seq2: List[str]) -> int:
    """
    Compute standard Levenshtein edit distance between two sequences.

    Args:
        seq1: First sequence (list of phonemes)
        seq2: Second sequence (list of phonemes)

    Returns:
        Edit distance (number of insertions, deletions, substitutions)
    """
    # Trivial cases: distance to an empty sequence is the other's length.
    if not seq1:
        return len(seq2)
    if not seq2:
        return len(seq1)

    width = len(seq2)
    # Single rolling row (Wagner-Fischer with O(n) memory).
    previous = list(range(width + 1))

    for row, a in enumerate(seq1, start=1):
        current = [row]
        for col, b in enumerate(seq2, start=1):
            if a == b:
                # Characters match: carry the diagonal cost unchanged.
                current.append(previous[col - 1])
            else:
                # Cheapest of delete / insert / substitute, plus one.
                current.append(1 + min(previous[col],
                                       current[col - 1],
                                       previous[col - 1]))
        previous = current

    return previous[width]
85
+
86
+
87
def phoneme_edit_distance(asr_phonemes: List[str], ref_phonemes: List[str]) -> float:
    """
    Compute normalized edit distance between two phoneme sequences.

    Args:
        asr_phonemes: ASR output phoneme sequence
        ref_phonemes: Reference phoneme sequence

    Returns:
        Normalized edit distance (0.0 = identical, 1.0 = completely different)
    """
    # Either side empty -> treat as maximally different.
    if not asr_phonemes or not ref_phonemes:
        return 1.0

    longest = max(len(asr_phonemes), len(ref_phonemes))
    return levenshtein_distance(asr_phonemes, ref_phonemes) / longest
105
+
106
+
107
+ # =============================================================================
108
+ # Special Segment Detection
109
+ # =============================================================================
110
+
111
def detect_special_segments(
    phoneme_texts: List[List[str]],
    vad_segments: List,
    segment_audios: List,
) -> Tuple[List, List, List[Tuple[str, float, str]], int]:
    """
    Detect special segments (Isti'adha/Basmala) using phoneme edit distance.

    Detection order:
    1. Try COMBINED (Isti'adha + Basmala) on segment 0 → split if match
    2. Else try Isti'adha on segment 0 → if match, try Basmala on segment 1
    3. Else try Basmala on segment 0
    4. Else no specials

    Args:
        phoneme_texts: List of phoneme lists from ASR
        vad_segments: List of VadSegment objects
        segment_audios: List of audio arrays

    Returns:
        (updated_vad_segments, updated_audios, special_results, first_quran_idx)

        special_results: List of tuples (matched_text, score, ref) for compatibility
        first_quran_idx: Index where Quran segments start (after specials)
    """
    # Import here to avoid circular imports
    from ..segment_types import VadSegment

    # Defensive: any empty input means there is nothing to classify.
    if not phoneme_texts or not vad_segments or not segment_audios:
        return vad_segments, segment_audios, [], 0

    special_results: List[Tuple[str, float, str]] = []

    # Segment 0 phonemes (already a list)
    seg0_phonemes = phoneme_texts[0] if phoneme_texts[0] else []

    # ==========================================================================
    # 1. Try COMBINED (Isti'adha + Basmala in one segment)
    # ==========================================================================
    combined_dist = phoneme_edit_distance(seg0_phonemes, COMBINED_PHONEMES)

    if combined_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Combined Isti'adha+Basmala detected (dist={combined_dist:.2f})")

        # Split segment 0 by midpoint.
        # NOTE(review): a plain midpoint split assumes the two specials take
        # roughly equal time — the boundary is approximate by design.
        seg = vad_segments[0]
        audio = segment_audios[0]
        mid_time = (seg.start_time + seg.end_time) / 2.0
        # max(1, ...) guarantees both halves are non-empty even for tiny audio.
        mid_sample = max(1, len(audio) // 2)

        # Create two new segments
        new_vads = [
            VadSegment(start_time=seg.start_time, end_time=mid_time, segment_idx=0),
            VadSegment(start_time=mid_time, end_time=seg.end_time, segment_idx=1),
        ]
        new_audios = [
            audio[:mid_sample],
            audio[mid_sample:],
        ]

        # Add remaining segments with reindexed segment_idx (shifted by one
        # because segment 0 became two segments).
        for i, vs in enumerate(vad_segments[1:], start=2):
            new_vads.append(VadSegment(
                start_time=vs.start_time,
                end_time=vs.end_time,
                segment_idx=i
            ))
        new_audios.extend(segment_audios[1:])

        # Special results for both (confidence = 1 - distance); both entries
        # share the same confidence because they were scored as one sequence.
        confidence = 1.0 - combined_dist
        special_results = [
            (SPECIAL_TEXT["Isti'adha"], confidence, "Isti'adha"),
            (SPECIAL_TEXT["Basmala"], confidence, "Basmala"),
        ]

        return new_vads, new_audios, special_results, 2

    # ==========================================================================
    # 2. Try Isti'adha on segment 0
    # ==========================================================================
    istiadha_dist = phoneme_edit_distance(seg0_phonemes, SPECIAL_PHONEMES["Isti'adha"])

    if istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Isti'adha detected on segment 0 (dist={istiadha_dist:.2f})")
        special_results.append(
            (SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist, "Isti'adha")
        )

        # Try Basmala on segment 1 (only if a non-empty segment 1 exists)
        if len(phoneme_texts) >= 2 and phoneme_texts[1]:
            seg1_phonemes = phoneme_texts[1]
            basmala_dist = phoneme_edit_distance(seg1_phonemes, SPECIAL_PHONEMES["Basmala"])

            if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
                print(f"[SPECIAL] Basmala detected on segment 1 (dist={basmala_dist:.2f})")
                special_results.append(
                    (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
                )
                return vad_segments, segment_audios, special_results, 2
            else:
                print(f"[SPECIAL] No Basmala on segment 1 (dist={basmala_dist:.2f})")

        # Isti'adha only: Quran content starts at segment 1.
        return vad_segments, segment_audios, special_results, 1

    # ==========================================================================
    # 3. Try Basmala on segment 0
    # ==========================================================================
    basmala_dist = phoneme_edit_distance(seg0_phonemes, SPECIAL_PHONEMES["Basmala"])

    if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Basmala detected on segment 0 (dist={basmala_dist:.2f})")
        special_results.append(
            (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
        )
        return vad_segments, segment_audios, special_results, 1

    # ==========================================================================
    # 4. No specials detected
    # ==========================================================================
    print(f"[SPECIAL] No special segments detected "
          f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")

    return vad_segments, segment_audios, [], 0
235
+
236
+
237
def detect_inter_chapter_specials(
    phoneme_texts: List[List[str]],
) -> Tuple[List[Tuple[str, float, str]], int]:
    """
    Detect special segments between chapters (phoneme-only, no audio splitting).

    Same detection order as detect_special_segments:
    1. Combined (Isti'adha + Basmala) on segment 0
    2. Isti'adha on segment 0 -> if match, Basmala on segment 1
    3. Basmala on segment 0
    4. Nothing

    Returns:
        (special_results, num_consumed)
        special_results: List of (matched_text, score, ref) tuples
        num_consumed: Number of segments consumed as specials
    """
    if not phoneme_texts or not phoneme_texts[0]:
        return [], 0

    first_seg = phoneme_texts[0]

    # Case 1: one segment containing both Isti'adha and Basmala.
    dist_both = phoneme_edit_distance(first_seg, COMBINED_PHONEMES)
    if dist_both <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Combined Isti'adha+Basmala detected (dist={dist_both:.2f})")
        combined_text = SPECIAL_TEXT["Isti'adha"] + " ۝ " + SPECIAL_TEXT["Basmala"]
        return [(combined_text, 1.0 - dist_both, "Isti'adha+Basmala")], 1

    # Case 2: Isti'adha alone, possibly followed by Basmala in the next segment.
    dist_isti = phoneme_edit_distance(first_seg, SPECIAL_PHONEMES["Isti'adha"])
    if dist_isti <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Isti'adha detected (dist={dist_isti:.2f})")
        found = [(SPECIAL_TEXT["Isti'adha"], 1.0 - dist_isti, "Isti'adha")]

        next_seg = phoneme_texts[1] if len(phoneme_texts) >= 2 else None
        if next_seg:
            dist_basm = phoneme_edit_distance(next_seg, SPECIAL_PHONEMES["Basmala"])
            if dist_basm <= MAX_SPECIAL_EDIT_DISTANCE:
                print(f"[INTER-CHAPTER] Basmala detected on next segment (dist={dist_basm:.2f})")
                found.append((SPECIAL_TEXT["Basmala"], 1.0 - dist_basm, "Basmala"))
                return found, 2
            print(f"[INTER-CHAPTER] No Basmala on next segment (dist={dist_basm:.2f})")

        return found, 1

    # Case 3: Basmala alone on segment 0.
    dist_basm0 = phoneme_edit_distance(first_seg, SPECIAL_PHONEMES["Basmala"])
    if dist_basm0 <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Basmala detected (dist={dist_basm0:.2f})")
        return [(SPECIAL_TEXT["Basmala"], 1.0 - dist_basm0, "Basmala")], 1

    # Case 4: nothing special here.
    print(f"[INTER-CHAPTER] No special segments detected "
          f"(istiadha={dist_isti:.2f}, basmala={dist_basm0:.2f})")
    return [], 0
src/phonemizer_utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phonemizer integration for reference phonemes."""
2
+
3
# Module-level lazily-created Phonemizer singleton.
_pm = None


def get_phonemizer():
    """Get or create Phonemizer instance."""
    global _pm
    if _pm is not None:
        return _pm
    # Deferred import keeps module load cheap until phonemization is needed.
    from core.phonemizer import Phonemizer
    _pm = Phonemizer()
    return _pm
src/quran_index.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ QuranIndex: Pre-indexed Quran words for reference lookup and display.
3
+
4
+ Uses dual-script loading:
5
+ - QPC Hafs (qpc_hafs.json) for computation (indices, word counts, lookups)
6
+ - Digital Khatt (digital_khatt_v2_script.json) for display (renders correctly with DK font)
7
+
8
+ Stop signs in Digital Khatt are combining marks attached to words, while QPC Hafs
9
+ has spaces before stop signs. The DigitalKhatt font renders DK text correctly.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from config import QURAN_SCRIPT_PATH_COMPUTE, QURAN_SCRIPT_PATH_DISPLAY
20
+
21
+
22
+ # Verse marker prefix to filter out (end-of-ayah markers)
23
+ VERSE_MARKER_PREFIX = '۝'
24
+
25
+
26
@dataclass
class WordInfo:
    """Information about a single Quran word (one record per word token)."""
    global_idx: int    # Position in flat word array
    surah: int         # Surah (chapter) number
    ayah: int          # Ayah (verse) number within the surah
    word: int          # Word position within the ayah
    text: str          # QPC Hafs text (computation)
    display_text: str  # Digital Khatt text (display)
35
+
36
+
37
@dataclass
class QuranIndex:
    """
    Pre-indexed Quran words for reference lookup and display.

    Used to convert matched references (e.g. "2:255:1-2:255:5") back to
    original Arabic text with verse markers for UI rendering.
    """
    words: list[WordInfo]  # All words in order
    word_lookup: dict[tuple[int, int, int], int]  # (surah, ayah, word) -> global_idx

    @classmethod
    def load(cls, compute_path: Optional[Path] = None, display_path: Optional[Path] = None) -> "QuranIndex":
        """
        Load and index the Quran from dual script sources.

        Uses QPC Hafs as primary (determines word structure) and Digital Khatt
        for display text. Falls back to QPC text if DK entry is missing.

        Filters out verse markers (۝) - they're not real words.

        Args:
            compute_path: Override for the QPC Hafs JSON path (defaults to config).
            display_path: Override for the Digital Khatt JSON path (defaults to config).
        """
        if compute_path is None:
            compute_path = QURAN_SCRIPT_PATH_COMPUTE
        if display_path is None:
            display_path = QURAN_SCRIPT_PATH_DISPLAY

        with open(compute_path, "r", encoding="utf-8") as f:
            compute_data = json.load(f)
        with open(display_path, "r", encoding="utf-8") as f:
            display_data = json.load(f)

        words: list[WordInfo] = []
        word_lookup: dict[tuple[int, int, int], int] = {}

        # Sort by location key to ensure order (1:1:1, 1:1:2, ..., 114:6:3);
        # JSON key order is not guaranteed to be canonical.
        sorted_keys = sorted(compute_data.keys(), key=_parse_location_key)

        for key in sorted_keys:
            entry = compute_data[key]
            text = entry["text"]

            # Skip verse markers (QPC shouldn't have any, but safety check)
            if text.startswith(VERSE_MARKER_PREFIX):
                continue

            surah = int(entry["surah"])
            ayah = int(entry["ayah"])
            word = int(entry["word"])

            # Get display text from Digital Khatt, fallback to QPC text
            dk_entry = display_data.get(key)
            display_text = dk_entry["text"] if dk_entry else text

            word_info = WordInfo(
                global_idx=len(words),
                surah=surah,
                ayah=ayah,
                word=word,
                text=text,
                display_text=display_text,
            )
            words.append(word_info)
            word_lookup[(surah, ayah, word)] = word_info.global_idx

        print(f"[QuranIndex] Loaded {len(words)} words")

        return cls(
            words=words,
            word_lookup=word_lookup,
        )

    def ref_to_indices(self, ref: str) -> Optional[tuple[int, int]]:
        """
        Convert a ref like '2:255:1-2:255:5' or '2:255:5' to global start/end indices.

        Returns None for malformed refs or locations absent from the index;
        parse errors are deliberately swallowed (best-effort lookup).
        """
        if not ref or ":" not in ref:
            return None
        try:
            if "-" in ref:
                start_ref, end_ref = ref.split("-")
            else:
                start_ref = end_ref = ref

            def _lookup(r: str) -> Optional[int]:
                # Resolve a single 'surah:ayah:word' location to its global index.
                parts = r.split(":")
                if len(parts) < 3:
                    return None
                return self.word_lookup.get((int(parts[0]), int(parts[1]), int(parts[2])))

            start_idx = _lookup(start_ref)
            end_idx = _lookup(end_ref)
            if start_idx is None or end_idx is None:
                return None
            return start_idx, end_idx
        except Exception:
            return None
133
+
134
+
135
+ def _parse_location_key(key: str) -> tuple[int, int, int]:
136
+ """Parse location key like '2:255:3' into (surah, ayah, word) for sorting."""
137
+ parts = key.split(":")
138
+ return (int(parts[0]), int(parts[1]), int(parts[2]))
139
+
140
+
141
# Global singleton - loaded on first access
_quran_index_cache: Optional[QuranIndex] = None


def get_quran_index() -> QuranIndex:
    """Get or create the global QuranIndex singleton."""
    global _quran_index_cache
    if _quran_index_cache is not None:
        return _quran_index_cache
    _quran_index_cache = QuranIndex.load()
    return _quran_index_cache
src/segment_processor.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility façade for segmentation pipeline modules."""
2
+
3
+ from .segment_types import VadSegment, SegmentInfo, ProfilingData
4
+ from .segmenter.segmenter_model import load_segmenter, ensure_models_on_gpu, ensure_models_on_cpu
5
+ from .segmenter.segmenter_aoti import test_vad_aoti_export, apply_aoti_compiled
6
+ from .segmenter.vad import detect_speech_segments
7
+ from .alignment.alignment_pipeline import run_phoneme_matching
8
+
9
+ __all__ = [
10
+ "VadSegment",
11
+ "SegmentInfo",
12
+ "ProfilingData",
13
+ "load_segmenter",
14
+ "ensure_models_on_gpu",
15
+ "ensure_models_on_cpu",
16
+ "detect_speech_segments",
17
+ "run_phoneme_matching",
18
+ "test_vad_aoti_export",
19
+ "apply_aoti_compiled",
20
+ ]
src/segment_types.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data types for the segmentation pipeline."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+
7
@dataclass
class VadSegment:
    """Raw VAD segment with timing info."""
    start_time: float  # Segment start time (presumably seconds — confirm against VAD output)
    end_time: float    # Segment end time (same unit as start_time)
    segment_idx: int   # Position of this segment in the VAD output order
13
+
14
+
15
@dataclass
class SegmentInfo:
    """Processed segment with transcription and matching results."""
    start_time: float      # Segment start time (same unit as VadSegment)
    end_time: float        # Segment end time
    transcribed_text: str  # Text produced by ASR for this segment
    matched_text: str      # Matched Quran text for display
    matched_ref: str       # e.g. "2:255:1-2:255:5"
    match_score: float     # Matching score (higher is better — see matcher)
    error: Optional[str] = None              # Error message when processing failed
    has_missing_words: bool = False          # Flag set when the match skipped words
    potentially_undersegmented: bool = False  # Flag: segment may cover too much audio
27
+
28
+
29
@dataclass
class ProfilingData:
    """Profiling metrics for the processing pipeline.

    All *_time fields hold durations in seconds (see _fmt/summary).
    """
    # Preprocessing
    resample_time: float = 0.0  # Audio resampling time
    # VAD profiling
    vad_model_load_time: float = 0.0
    vad_model_move_time: float = 0.0
    vad_inference_time: float = 0.0
    vad_gpu_time: float = 0.0  # Actual GPU lease execution time
    vad_wall_time: float = 0.0  # Wall-clock time (includes queue wait)
    # Phoneme ASR profiling
    asr_time: float = 0.0  # Wav2vec wall-clock time (includes queue wait)
    asr_gpu_time: float = 0.0  # Actual GPU lease execution time
    asr_model_move_time: float = 0.0  # ASR model GPU move time
    asr_sorting_time: float = 0.0  # Duration-sorting time
    asr_batch_build_time: float = 0.0  # Dynamic batch construction time
    asr_batch_profiling: Optional[list] = None  # Per-batch timing details (dicts; see summary)
    # Global anchor profiling
    anchor_time: float = 0.0  # N-gram voting anchor detection
    # Phoneme alignment profiling
    phoneme_total_time: float = 0.0  # Overall phoneme matching time
    phoneme_ref_build_time: float = 0.0  # Time to build chapter reference
    phoneme_dp_total_time: float = 0.0  # Total DP time across all segments
    phoneme_dp_min_time: float = 0.0  # Min DP time per segment
    phoneme_dp_max_time: float = 0.0  # Max DP time per segment
    phoneme_window_setup_time: float = 0.0  # Total window slicing time
    phoneme_result_build_time: float = 0.0  # Total result construction time
    phoneme_num_segments: int = 0  # Number of segments aligned
    match_wall_time: float = 0.0  # Total matching wall-clock time
    # Retry / reanchor counters
    tier1_attempts: int = 0
    tier1_passed: int = 0
    tier1_segments: Optional[list] = None  # Segment indices that passed tier-1 retry
    tier2_attempts: int = 0
    tier2_passed: int = 0
    tier2_segments: Optional[list] = None  # Segment indices that passed tier-2 retry
    consec_reanchors: int = 0
    segments_attempted: int = 0
    segments_passed: int = 0
    special_merges: int = 0
    # Result building profiling
    result_build_time: float = 0.0  # Total result building time
    result_audio_encode_time: float = 0.0  # Audio-to-data-URL encoding
    # Total pipeline time
    total_time: float = 0.0  # End-to-end pipeline time

    @property
    def phoneme_dp_avg_time(self) -> float:
        """Average DP time per segment (0.0 when no segments were aligned)."""
        if self.phoneme_num_segments == 0:
            return 0.0
        return self.phoneme_dp_total_time / self.phoneme_num_segments

    @staticmethod
    def _fmt(seconds):
        """Format seconds as m:ss.fff when >= 60s, else as s.fffs."""
        if seconds >= 60:
            m, s = divmod(seconds, 60)
            return f"{int(m)}:{s:06.3f}"
        return f"{seconds:.3f}s"

    def summary(self) -> str:
        """Return a formatted profiling summary."""
        _fmt = self._fmt
        lines = [
            "\n" + "=" * 60,
            "PROFILING SUMMARY",
            "=" * 60,
            f" Preprocessing:",
            f"   Resample: {self.resample_time:.3f}s",
            f" VAD: wall {_fmt(self.vad_wall_time)}",
            f"   GPU Time: {self.vad_gpu_time:.3f}s (queue {self.vad_wall_time - self.vad_gpu_time:.3f}s)",
            f"   Model Load: {self.vad_model_load_time:.3f}s",
            f"   Model Move: {self.vad_model_move_time:.3f}s",
            f"   Inference: {self.vad_inference_time:.3f}s",
            f" Phoneme ASR: wall {_fmt(self.asr_time)}",
            f"   GPU Time: {self.asr_gpu_time:.3f}s (queue {self.asr_time - self.asr_gpu_time:.3f}s)",
            f"   Model Move: {self.asr_model_move_time:.3f}s",
            f"   Sorting: {self.asr_sorting_time:.3f}s",
            f"   Batch Build: {self.asr_batch_build_time:.3f}s",
            f"   Batches: {len(self.asr_batch_profiling) if self.asr_batch_profiling else 0}",
        ]
        if self.asr_batch_profiling:
            # Per-batch breakdown: size, time, duration range, padding waste.
            for b in self.asr_batch_profiling:
                lines.append(
                    f"     Batch {b['batch_num']:>2}: {b['size']:>3} segs | "
                    f"{b['time']:.3f}s | "
                    f"{b['min_dur']:.2f}-{b['max_dur']:.2f}s "
                    f"(A {b['avg_dur']:.2f}s, T {b['total_seconds']:.1f}s, W {b['pad_waste']:.0%})"
                )
        lines += [
            f" Global Anchor:",
            f"   N-gram Voting: {self.anchor_time:.3f}s",
            f" Phoneme Alignment: wall {_fmt(self.match_wall_time)}",
            f"   Ref Build: {self.phoneme_ref_build_time:.3f}s",
            f"   Window Setup: {self.phoneme_window_setup_time:.3f}s",
            f"   DP Total: {self.phoneme_dp_total_time:.3f}s",
            f"   Segments: {self.phoneme_num_segments}",
            f"   DP Avg/segment: {1000*self.phoneme_dp_avg_time:.3f}ms",
            f"   DP Min: {1000*self.phoneme_dp_min_time:.3f}ms",
            f"   DP Max: {1000*self.phoneme_dp_max_time:.3f}ms",
        ]
        pct = 100 * self.segments_passed / self.segments_attempted if self.segments_attempted else 0
        t1_segs = self.tier1_segments or []
        t2_segs = self.tier2_segments or []
        lines += [
            f" Alignment Stats:",
            f"   Attempted: {self.segments_attempted}",
            f"   Passed: {self.segments_passed} ({pct:.1f}%)",
            f"   Tier 1 Retries: {self.tier1_passed}/{self.tier1_attempts} passed segments: {t1_segs}",
            f"   Tier 2 Retries: {self.tier2_passed}/{self.tier2_attempts} passed segments: {t2_segs}",
            f"   Reanchors (consec failures): {self.consec_reanchors}",
            f"   Special Merges: {self.special_merges}",
            "-" * 60,
        ]
        # Sum of the top-level profiled stages; the remainder is "unaccounted".
        profiled_sum = (self.resample_time + self.vad_wall_time + self.asr_time
                        + self.anchor_time + self.match_wall_time + self.result_build_time)
        unaccounted = self.total_time - profiled_sum
        lines += [
            f" PROFILED SUM: {_fmt(profiled_sum)}",
            f" TOTAL (wall): {_fmt(self.total_time)} (unaccounted: {_fmt(unaccounted)})",
            "=" * 60,
        ]
        return "\n".join(lines)
src/segmenter/__init__.py ADDED
File without changes
src/segmenter/segmenter_aoti.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """AoTInductor compilation utilities for the VAD segmenter."""
2
+
3
+ import torch
4
+
5
+ from config import (
6
+ AOTI_ENABLED, AOTI_MIN_AUDIO_MINUTES, AOTI_MAX_AUDIO_MINUTES,
7
+ AOTI_HUB_REPO, AOTI_HUB_ENABLED,
8
+ )
9
+ from .segmenter_model import _segmenter_cache
10
+
11
+
12
+ # =============================================================================
13
+ # AoT Compilation Test
14
+ # =============================================================================
15
+
16
# Module-level AoTI state. "exported"/"compiled" hold artifacts and "tested"
# flags whether the export test ran. Loaders additionally insert an "applied"
# key (read via .get("applied") in is_aoti_applied) once a compiled forward
# has been installed on the model.
_aoti_cache = {
    "exported": None,
    "compiled": None,
    "tested": False,
}
21
+
22
+
23
def is_aoti_applied() -> bool:
    """Return True if a compiled AoTI model has been applied."""
    # "applied" may be absent until a loader sets it; .get keeps this safe.
    applied_flag = _aoti_cache.get("applied")
    return bool(applied_flag)
26
+
27
+
28
def _get_aoti_hub_filename():
    """Generate Hub filename encoding min/max audio duration."""
    return "vad_aoti_{}min_{}min.pt2".format(
        AOTI_MIN_AUDIO_MINUTES, AOTI_MAX_AUDIO_MINUTES
    )
31
+
32
+
33
def _try_load_aoti_from_hub(model):
    """
    Try to load a pre-compiled AoTI model from the Hub.

    Downloads the ``.pt2`` archive matching the configured audio-duration
    bounds, wraps it with the ZeroGPU AOTI runtime, swaps it in as
    ``model.forward``, and drains the module's parameters.

    Args:
        model: VAD segmenter module whose forward pass is replaced in place.

    Returns:
        True if successful, False otherwise.
    """
    import os
    import time

    if not AOTI_HUB_ENABLED:
        print("[AoTI] Hub persistence disabled")
        return False

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("[AoTI] HF_TOKEN not set, cannot access Hub")
        return False

    filename = _get_aoti_hub_filename()
    # Fix: log the actual target filename (previously printed a literal
    # "(unknown)" placeholder even though `filename` was computed above).
    print(f"[AoTI] Checking Hub for pre-compiled model: {AOTI_HUB_REPO}/{filename}")

    try:
        from huggingface_hub import hf_hub_download, HfApi

        # Check if file exists in repo
        api = HfApi(token=token)
        try:
            files = api.list_repo_files(AOTI_HUB_REPO, token=token)
            if filename not in files:
                print(f"[AoTI] Compiled model not found on Hub (available: {files})")
                return False
        except Exception as e:
            print(f"[AoTI] Could not list Hub repo: {e}")
            return False

        # Download the compiled graph
        t0 = time.time()
        compiled_graph_file = hf_hub_download(
            AOTI_HUB_REPO, filename, token=token
        )
        download_time = time.time() - t0
        print(f"[AoTI] Downloaded from Hub in {download_time:.1f}s: {compiled_graph_file}")

        # Load using ZeroGPU AOTI utilities
        from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights, drain_module_parameters

        state_dict = model.state_dict()
        zerogpu_weights = ZeroGPUWeights({name: weight for name, weight in state_dict.items()})
        compiled = ZeroGPUCompiledModel(compiled_graph_file, zerogpu_weights)

        # Replace forward method so inference runs the compiled graph
        setattr(model, "forward", compiled)
        drain_module_parameters(model)

        _aoti_cache["compiled"] = compiled
        _aoti_cache["applied"] = True
        print(f"[AoTI] Loaded and applied compiled model from Hub")
        return True

    except Exception as e:
        # Best-effort: any failure falls back to fresh compilation elsewhere.
        print(f"[AoTI] Failed to load from Hub: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return False
96
+
97
+
98
def _push_aoti_to_hub(compiled):
    """
    Push compiled AoTI model to Hub for future reuse.

    Writes the compiled object's in-memory archive to a temp file and uploads
    it under the duration-encoded filename.

    Args:
        compiled: Compiled AoTI model object exposing an ``archive_file``
            (a BytesIO holding the ``.pt2`` archive).

    Returns:
        True on successful upload, False otherwise.
    """
    import os
    import time
    import tempfile

    if not AOTI_HUB_ENABLED:
        print("[AoTI] Hub persistence disabled, skipping upload")
        return False

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("[AoTI] HF_TOKEN not set, cannot upload to Hub")
        return False

    filename = _get_aoti_hub_filename()
    # Fix: log the actual target filename (previously printed a literal
    # "(unknown)" placeholder even though `filename` was computed above).
    print(f"[AoTI] Uploading compiled model to Hub: {AOTI_HUB_REPO}/{filename}")

    try:
        from huggingface_hub import HfApi, create_repo

        api = HfApi(token=token)

        # Create repo if it doesn't exist
        try:
            create_repo(AOTI_HUB_REPO, exist_ok=True, token=token)
        except Exception as e:
            print(f"[AoTI] Repo creation note: {e}")

        # Get the archive file from the compiled object
        archive = compiled.archive_file
        if archive is None:
            print("[AoTI] Compiled object has no archive_file, cannot upload")
            return False

        t0 = time.time()

        # Write archive to temp file and upload
        with tempfile.TemporaryDirectory() as tmpdir:
            output_path = os.path.join(tmpdir, filename)

            # archive is a BytesIO object
            with open(output_path, "wb") as f:
                f.write(archive.getvalue())

            info = api.upload_file(
                repo_id=AOTI_HUB_REPO,
                path_or_fileobj=output_path,
                path_in_repo=filename,
                commit_message=f"Add compiled VAD model ({AOTI_MIN_AUDIO_MINUTES}-{AOTI_MAX_AUDIO_MINUTES} min)",
                token=token,
            )

        upload_time = time.time() - t0
        print(f"[AoTI] Uploaded to Hub in {upload_time:.1f}s: {info}")
        return True

    except Exception as e:
        # Best-effort: a failed upload only costs a future recompilation.
        print(f"[AoTI] Failed to upload to Hub: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return False
162
+
163
+
164
def test_vad_aoti_export():
    """
    Test torch.export AoT compilation for VAD model using spaces.aoti_capture.
    Must be called AFTER model is on GPU (inside GPU-decorated function).

    Checks Hub for pre-compiled model first. If found, loads it directly.
    Otherwise, compiles fresh and uploads to Hub for future reuse.

    Uses aoti_capture to capture the EXACT call signature from a real inference
    call to segment_recitations, ensuring the export matches what the model
    actually receives during inference.

    Returns:
        dict: Result flags and timings (export/compile success and durations,
        hub_loaded/hub_uploaded, optional "compiled" object and "error" text).
    """
    import time

    results = {
        "export_success": False,
        "export_time": 0.0,
        "compile_success": False,
        "compile_time": 0.0,
        "hub_loaded": False,
        "hub_uploaded": False,
        "error": None,
    }

    if not AOTI_ENABLED:
        results["error"] = "AoTI disabled in config"
        print("[AoTI] Disabled via AOTI_ENABLED=False")
        return results

    # Only attempt once per process; repeated calls are no-ops.
    if _aoti_cache["tested"]:
        print("[AoTI] Already tested this session, skipping")
        return {"skipped": True, **results}

    _aoti_cache["tested"] = True

    # Check model is loaded and on GPU
    if not _segmenter_cache["loaded"] or _segmenter_cache["model"] is None:
        results["error"] = "Model not loaded"
        print(f"[AoTI] {results['error']}")
        return results

    model = _segmenter_cache["model"]
    processor = _segmenter_cache["processor"]
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype

    if device.type != "cuda":
        results["error"] = f"Model not on GPU (device={device})"
        print(f"[AoTI] {results['error']}")
        return results

    print(f"[AoTI] Testing torch.export on VAD model (device={device}, dtype={dtype})")

    # Import spaces for aoti_capture
    try:
        import spaces
    except ImportError:
        results["error"] = "spaces module not available"
        print(f"[AoTI] {results['error']}")
        return results

    # Try to load pre-compiled model from Hub first
    if _try_load_aoti_from_hub(model):
        results["hub_loaded"] = True
        results["compile_success"] = True
        print("[AoTI] Using pre-compiled model from Hub")
        return results

    # No cached model found - compile fresh
    print("[AoTI] No cached model on Hub, compiling fresh...")

    # Convert config minutes to samples (16kHz audio).
    # FIX: removed unused max_samples local; the max bound is derived later
    # from captured frame counts, not from raw samples.
    SAMPLES_PER_MINUTE = 16000 * 60
    min_samples = int(AOTI_MIN_AUDIO_MINUTES * SAMPLES_PER_MINUTE)

    # Create test audio for capture - use min duration to save memory
    # MUST be on CPU - segment_recitations moves to GPU internally
    test_audio = torch.randn(min_samples, device="cpu")
    print(f"[AoTI] Test audio: {min_samples} samples ({AOTI_MIN_AUDIO_MINUTES} min)")

    # Capture the exact args/kwargs used by segment_recitations
    try:
        from recitations_segmenter import segment_recitations

        print("[AoTI] Capturing call signature via aoti_capture...")
        with spaces.aoti_capture(model) as call:
            segment_recitations(
                [test_audio], model, processor,
                device=device, dtype=dtype, batch_size=1,
            )

        print(f"[AoTI] Captured args: {len(call.args)} positional, {list(call.kwargs.keys())} kwargs")

    except Exception as e:
        results["error"] = f"aoti_capture failed: {type(e).__name__}: {e}"
        print(f"[AoTI] {results['error']}")
        import traceback
        traceback.print_exc()
        return results

    # Build dynamic shapes from captured tensors
    # The sequence dimension (T) varies with audio length
    try:
        from torch.export import export, Dim

        # Derive frame rate from captured tensor (model's actual output rate)
        # Find the first 2D+ tensor to get the captured frame count
        captured_frames = None
        for val in list(call.kwargs.values()) + list(call.args):
            if isinstance(val, torch.Tensor) and val.dim() >= 2:
                captured_frames = val.shape[1]
                break

        if captured_frames is None:
            raise ValueError("No 2D+ tensor found in captured args/kwargs")

        # Calculate frames per minute from captured data
        frames_per_minute = captured_frames / AOTI_MIN_AUDIO_MINUTES
        min_frames = captured_frames  # Already at min duration
        max_frames = int(AOTI_MAX_AUDIO_MINUTES * frames_per_minute)
        dynamic_T = Dim("T", min=min_frames, max=max_frames)
        print(f"[AoTI] Captured {captured_frames} frames for {AOTI_MIN_AUDIO_MINUTES} min = {frames_per_minute:.1f} frames/min")
        print(f"[AoTI] Dynamic shape range: {min_frames}-{max_frames} frames")

        # Build dynamic_shapes dict matching the captured signature
        dynamic_shapes_args = []
        for arg in call.args:
            if isinstance(arg, torch.Tensor) and arg.dim() >= 2:
                # Assume sequence dim is dim 1 for 2D+ tensors
                dynamic_shapes_args.append({1: dynamic_T})
            else:
                dynamic_shapes_args.append(None)

        dynamic_shapes_kwargs = {}
        for key, val in call.kwargs.items():
            if isinstance(val, torch.Tensor) and val.dim() >= 2:
                dynamic_shapes_kwargs[key] = {1: dynamic_T}
            else:
                dynamic_shapes_kwargs[key] = None

        print(f"[AoTI] Dynamic shapes - args: {dynamic_shapes_args}, kwargs: {list(dynamic_shapes_kwargs.keys())}")

        t0 = time.time()
        # Export using captured signature - guarantees match with inference
        exported = export(
            model,
            args=call.args,
            kwargs=call.kwargs,
            dynamic_shapes=(dynamic_shapes_args, dynamic_shapes_kwargs) if dynamic_shapes_args else dynamic_shapes_kwargs,
            strict=False,
        )
        results["export_time"] = time.time() - t0
        results["export_success"] = True
        _aoti_cache["exported"] = exported
        print(f"[AoTI] torch.export SUCCESS in {results['export_time']:.1f}s")

    except Exception as e:
        results["error"] = f"torch.export failed: {type(e).__name__}: {e}"
        print(f"[AoTI] {results['error']}")
        import traceback
        traceback.print_exc()
        return results

    # Attempt spaces.aoti_compile
    try:
        t0 = time.time()
        compiled = spaces.aoti_compile(exported)
        results["compile_time"] = time.time() - t0
        results["compile_success"] = True
        _aoti_cache["compiled"] = compiled
        print(f"[AoTI] spaces.aoti_compile SUCCESS in {results['compile_time']:.1f}s")

        # Return compiled object - apply happens OUTSIDE GPU lease (in main process)
        results["compiled"] = compiled
        print(f"[AoTI] Compiled object ready for apply")

        # Upload to Hub for future reuse
        if _push_aoti_to_hub(compiled):
            results["hub_uploaded"] = True

    except Exception as e:
        results["error"] = f"aoti_compile failed: {type(e).__name__}: {e}"
        print(f"[AoTI] {results['error']}")
        import traceback
        traceback.print_exc()

    return results
354
+
355
+
356
def apply_aoti_compiled(compiled):
    """
    Attach a previously compiled AoTI graph to the cached VAD segmenter.

    Must be called OUTSIDE the GPU lease, in the main process.

    Returns:
        bool: True when the compiled graph was applied, False otherwise.
    """
    if compiled is None:
        print("[AoTI] No compiled object to apply")
        return False

    vad_model = _segmenter_cache.get("model")
    if vad_model is None:
        print("[AoTI] Model not loaded, cannot apply")
        return False

    try:
        import spaces

        spaces.aoti_apply(compiled, vad_model)
        _aoti_cache["compiled"] = compiled
        _aoti_cache["applied"] = True
        print(f"[AoTI] Compiled model applied to VAD (model_id={id(vad_model)})")
        return True
    except Exception as exc:
        print(f"[AoTI] Apply failed: {exc}")
        return False
src/segmenter/segmenter_model.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model lifecycle and device management for the VAD segmenter."""
2
+
3
+ import torch
4
+
5
+ from config import SEGMENTER_MODEL, DTYPE, IS_HF_SPACE, TORCH_COMPILE
6
+ from ..zero_gpu import ZERO_GPU_AVAILABLE, is_quota_exhausted, is_user_forced_cpu
7
+
8
+
9
+ # =============================================================================
10
+ # Model caches
11
+ # =============================================================================
12
+
13
+ _segmenter_cache = {"model": None, "processor": None, "loaded": False, "load_time": 0.0, "device": None}
14
+ _env_logged = False
15
+
16
+
17
def _log_env_once():
    """Print library/GPU version info a single time, for debugging HF Space mismatches."""
    global _env_logged
    if _env_logged:
        return
    _env_logged = True
    try:
        import importlib.metadata as _im

        def _ver(pkg: str) -> str:
            # Best-effort: report "unknown" for packages without metadata.
            try:
                return _im.version(pkg)
            except Exception:
                return "unknown"

        cudnn_ver = (
            torch.backends.cudnn.version()
            if torch.backends.cudnn.is_available()
            else 'none'
        )
        print(f"[ENV] torch={torch.__version__} cuda={torch.version.cuda} cudnn={cudnn_ver}")
        print(f"[ENV] transformers={_ver('transformers')} recitations_segmenter={_ver('recitations_segmenter')}")
        if torch.cuda.is_available():
            print(f"[ENV] GPU={torch.cuda.get_device_name(0)}")
    except Exception as e:
        print(f"[ENV] Failed to log env: {e}")
+ print(f"[ENV] Failed to log env: {e}")
38
+
39
+
40
+ _TORCH_DTYPE = torch.float16 if DTYPE == "float16" else torch.float32
41
+
42
+
43
def _get_device_and_dtype():
    """Pick the (device, dtype) pair for initial model loading.

    On HF Spaces / ZeroGPU we always start on CPU; GPU moves happen later
    inside a GPU lease. Otherwise use CUDA when available.
    """
    use_cuda = (
        not (IS_HF_SPACE or ZERO_GPU_AVAILABLE)
        and torch.cuda.is_available()
    )
    chosen = torch.device("cuda") if use_cuda else torch.device("cpu")
    return chosen, _TORCH_DTYPE
50
+
51
+
52
def ensure_models_on_gpu(asr_model_name: str | None = None) -> float:
    """
    Move models to GPU. Call this INSIDE a GPU-decorated function
    after ZeroGPU lease is acquired.

    Args:
        asr_model_name: If provided, move only this ASR model to GPU.
            If None, skip ASR model movement (e.g. during VAD-only lease).

    Skips if quota exhausted or CUDA unavailable.
    Idempotent: checks current device before moving.

    Returns:
        float: Time in seconds spent moving models to GPU (0.0 if skipped).
    """
    import time
    from ..alignment.phoneme_asr import move_phoneme_asr_to_gpu

    # Three reasons to stay on CPU: user opt-out, exhausted ZeroGPU quota,
    # or no CUDA device in this process.
    if is_user_forced_cpu() or is_quota_exhausted() or not torch.cuda.is_available():
        return 0.0

    device = torch.device("cuda")
    dtype = _TORCH_DTYPE
    move_start = time.time()

    # Move segmenter to GPU (only if loaded and not already there)
    if _segmenter_cache["loaded"] and _segmenter_cache["model"] is not None:
        model = _segmenter_cache["model"]
        if next(model.parameters()).device.type != "cuda":
            print("[GPU] Moving segmenter to CUDA...")
            model.to(device, dtype=dtype)
            _segmenter_cache["model"] = model
            _segmenter_cache["device"] = "cuda"
            print("[GPU] Segmenter on CUDA")

    # Move phoneme ASR to GPU (only the requested model)
    if asr_model_name is not None:
        move_phoneme_asr_to_gpu(asr_model_name)

    return time.time() - move_start
92
+
93
+
94
def ensure_models_on_cpu():
    """
    Move all models back to CPU. Called when GPU lease fails or quota
    is exhausted so that CPU fallback inference can proceed.

    Idempotent: checks current device before moving.
    """
    from ..alignment.phoneme_asr import move_phoneme_asr_to_cpu

    # Move segmenter to CPU if it is loaded and currently elsewhere
    if _segmenter_cache["loaded"] and _segmenter_cache["model"] is not None:
        seg_model = _segmenter_cache["model"]
        already_cpu = next(seg_model.parameters()).device.type == "cpu"
        if not already_cpu:
            print("[CPU] Moving segmenter to CPU...")
            seg_model.to(torch.device("cpu"), dtype=_TORCH_DTYPE)
            _segmenter_cache["model"] = seg_model
            _segmenter_cache["device"] = "cpu"
            print("[CPU] Segmenter on CPU")

    # Move phoneme ASR to CPU
    move_phoneme_asr_to_cpu()
119
+
120
def load_segmenter():
    """Load the VAD segmenter model on CPU. Returns (model, processor, load_time).

    Models are loaded once and cached in _segmenter_cache; subsequent calls
    return the cached objects with a load_time of 0.0. Use
    ensure_models_on_gpu() inside GPU-decorated functions to move to CUDA.
    On failure, returns (None, None, 0.0) so callers can fall back.
    """
    if _segmenter_cache["loaded"]:
        return _segmenter_cache["model"], _segmenter_cache["processor"], 0.0

    import time
    start_time = time.time()

    try:
        from transformers import AutoModelForAudioFrameClassification, AutoFeatureExtractor

        print(f"Loading segmenter: {SEGMENTER_MODEL}")
        device, dtype = _get_device_and_dtype()

        model = AutoModelForAudioFrameClassification.from_pretrained(SEGMENTER_MODEL)
        model.to(device, dtype=dtype)
        model.eval()
        # torch.compile is skipped on HF Spaces / ZeroGPU (AoTI path is used there)
        if TORCH_COMPILE and not (IS_HF_SPACE or ZERO_GPU_AVAILABLE):
            model = torch.compile(model, mode="reduce-overhead")

        processor = AutoFeatureExtractor.from_pretrained(SEGMENTER_MODEL)

        load_time = time.time() - start_time
        _segmenter_cache["model"] = model
        _segmenter_cache["processor"] = processor
        _segmenter_cache["loaded"] = True
        _segmenter_cache["load_time"] = load_time
        _segmenter_cache["device"] = device.type

        print(f"Segmenter loaded on {device} in {load_time:.2f}s")
        return model, processor, load_time

    except Exception as e:
        # Best-effort: callers treat (None, None, 0.0) as "no segmenter"
        print(f"Failed to load segmenter: {e}")
        return None, None, 0.0
src/segmenter/vad.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """VAD inference utilities."""
2
+
3
+ from typing import List, Tuple
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from .segmenter_aoti import is_aoti_applied
9
+ from .segmenter_model import load_segmenter, _log_env_once
10
+
11
+
12
def detect_speech_segments(
    audio: np.ndarray,
    sample_rate: int,
    min_silence_ms: int,
    min_speech_ms: int,
    pad_ms: int
) -> tuple[List[Tuple[float, float]], dict, "torch.Tensor | None", "torch.Tensor | None"]:
    """
    Detect speech segments in audio using VAD.

    Args:
        audio: Audio waveform (mono, float32)
        sample_rate: Sample rate of audio
        min_silence_ms: Minimum silence duration to split segments
        min_speech_ms: Minimum speech duration for a valid segment
        pad_ms: Padding around speech segments

    Returns:
        Tuple of (intervals, profiling_dict, raw_speech_intervals, raw_is_complete) where:
        - intervals: List of (start_time, end_time) tuples in seconds
        - profiling_dict: {"model_load_time": float, "inference_time": float}
        - raw_speech_intervals: Raw VAD intervals before cleaning (for resegmentation),
          or None when the model could not be loaded / produced no output
        - raw_is_complete: Raw VAD completeness flags (for resegmentation), or None
    """
    # BUG FIX: return annotation previously declared a 2-tuple while every
    # return statement (and the docstring) produces a 4-tuple.
    import time

    model, processor, model_load_time = load_segmenter()
    if model is None:
        # Fallback: treat whole audio as one segment
        return [(0, len(audio) / sample_rate)], {"model_load_time": 0.0, "inference_time": 0.0}, None, None

    inference_start = time.time()
    _log_env_once()

    try:
        from recitations_segmenter import segment_recitations, clean_speech_intervals

        audio_tensor = torch.from_numpy(audio).float()

        device = next(model.parameters()).device
        dtype = next(model.parameters()).dtype

        # Log AoTI status
        if is_aoti_applied():
            print("[VAD] Using AOTInductor-compiled model")

        # Run segmentation
        outputs = segment_recitations(
            [audio_tensor], model, processor,
            device=device, dtype=dtype, batch_size=1,
        )

        if not outputs:
            inference_time = time.time() - inference_start
            return [(0, len(audio) / sample_rate)], {"model_load_time": model_load_time, "inference_time": inference_time}, None, None

        # Clean speech intervals with user parameters
        clean_out = clean_speech_intervals(
            outputs[0].speech_intervals,
            outputs[0].is_complete,
            min_silence_duration_ms=min_silence_ms,
            min_speech_duration_ms=min_speech_ms,
            pad_duration_ms=pad_ms,
            return_seconds=True,
        )

        inference_time = time.time() - inference_start
        intervals = clean_out.clean_speech_intervals.tolist()

        raw_count = len(outputs[0].speech_intervals)
        final_count = len(intervals)
        removed = raw_count - final_count
        print(f"[VAD] Raw model intervals: {raw_count}, after cleaning: {final_count} "
              f"({removed} removed by silence merge + min_speech={min_speech_ms}ms filter)")

        raw_speech_intervals = outputs[0].speech_intervals
        raw_is_complete = outputs[0].is_complete

        return [(start, end) for start, end in intervals], {"model_load_time": model_load_time, "inference_time": inference_time}, raw_speech_intervals, raw_is_complete

    except Exception as e:
        print(f"VAD error: {e}")
        import traceback
        traceback.print_exc()
        # Let gpu_with_fallback handle retries on CPU
        raise
src/zero_gpu.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for integrating Hugging Face Spaces ZeroGPU without breaking
3
+ local or non-ZeroGPU environments.
4
+ """
5
+
6
+ import re
7
+ from typing import Callable, TypeVar
8
+ from functools import wraps
9
+
10
+ T = TypeVar("T", bound=Callable)
11
+
12
+ # Default values in case the spaces package is unavailable (e.g., local runs).
13
+ ZERO_GPU_AVAILABLE = False
14
+
15
+ # Track whether we've fallen back to CPU due to quota exhaustion
16
+ _gpu_quota_exhausted = False
17
+ _quota_reset_time = None # e.g. "13:53:59"
18
+ _user_forced_cpu = False
19
+
20
try:
    import spaces  # type: ignore

    # Real ZeroGPU decorator when running inside an HF Space.
    gpu_decorator = spaces.GPU  # pragma: no cover
    ZERO_GPU_AVAILABLE = True
except Exception:
    def gpu_decorator(*decorator_args, **decorator_kwargs):
        """
        No-op replacement for spaces.GPU so code can run without the package
        or outside of a ZeroGPU Space.
        """

        def wrapper(func: T) -> T:
            return func

        # Support both bare @gpu_decorator and @gpu_decorator(...)
        if decorator_args and callable(decorator_args[0]) and not decorator_kwargs:
            return decorator_args[0]
        return wrapper
39
+
40
+
41
def is_quota_exhausted() -> bool:
    """Return True if ZeroGPU quota has been marked exhausted this session."""
    return _gpu_quota_exhausted
44
+
45
+
46
def is_user_forced_cpu() -> bool:
    """Return True if the user manually selected CPU mode via force_cpu_mode()."""
    return _user_forced_cpu
49
+
50
+
51
def get_quota_reset_time() -> str | None:
    """Return the quota reset time string (e.g. '13:53:59'), or None if unknown."""
    return _quota_reset_time
54
+
55
+
56
def reset_quota_flag() -> None:
    """Reset the quota exhausted flag (e.g., after quota resets).

    Also clears the cached reset time and the user's manual CPU selection,
    so subsequent calls may attempt the GPU path again.
    """
    global _gpu_quota_exhausted, _quota_reset_time, _user_forced_cpu
    _gpu_quota_exhausted = False
    _quota_reset_time = None
    _user_forced_cpu = False
62
+
63
+
64
def force_cpu_mode() -> None:
    """Force all GPU-decorated functions to skip GPU and run on CPU.

    Sets the module flag read by gpu_with_fallback() and immediately moves
    any loaded models back to CPU.
    """
    global _user_forced_cpu
    _user_forced_cpu = True
    _move_models_to_cpu()
69
+
70
+
71
def _move_models_to_cpu() -> None:
    """Move all models back to CPU for fallback inference.

    Best-effort: failures are logged and swallowed so the CPU fallback call
    itself can still proceed.
    """
    try:
        from .segmenter.segmenter_model import ensure_models_on_cpu
        ensure_models_on_cpu()
    except Exception as e:
        print(f"[CPU] Failed to move models to CPU: {e}")
78
+
79
+
80
def gpu_with_fallback(duration=60):
    """
    Decorator that wraps a GPU function with automatic CPU fallback.

    If ZeroGPU quota is exceeded, the function runs on CPU instead.
    The decorated function should call ensure_models_on_gpu() internally,
    which checks is_quota_exhausted() to decide whether to move to CUDA.

    Args:
        duration: Seconds requested for the ZeroGPU lease (passed to spaces.GPU).

    Usage:
        @gpu_with_fallback(duration=60)
        def my_gpu_func(data):
            ensure_models_on_gpu()  # Moves to CUDA if quota not exhausted
            # ... inference using model's current device ...
    """
    def decorator(func: T) -> T:
        # Create the GPU-wrapped version (plain func when spaces is absent)
        if ZERO_GPU_AVAILABLE:
            gpu_func = gpu_decorator(duration=duration)(func)
        else:
            gpu_func = func

        @wraps(func)
        def wrapper(*args, **kwargs):
            global _gpu_quota_exhausted, _quota_reset_time

            # If user explicitly chose CPU mode, skip GPU entirely
            if _user_forced_cpu:
                print("[CPU] User selected CPU mode")
                return func(*args, **kwargs)

            # If quota already exhausted, go straight to CPU
            if _gpu_quota_exhausted:
                print("[GPU] Quota exhausted, using CPU fallback")
                _move_models_to_cpu()
                return func(*args, **kwargs)

            # Try GPU first
            try:
                return gpu_func(*args, **kwargs)
            except Exception as e:
                # ZeroGPU raises gradio.Error with title="ZeroGPU quota exceeded";
                # fall back to a substring check on the message otherwise.
                is_quota_error = getattr(e, 'title', '') == "ZeroGPU quota exceeded"
                if not is_quota_error:
                    is_quota_error = 'quota' in str(e).lower()

                if is_quota_error:
                    print(f"[GPU] Quota exceeded, falling back to CPU: {e}")
                    _gpu_quota_exhausted = True
                    # Parse reset time from message like "Try again in 13:53:59"
                    match = re.search(r'Try again in (\d+:\d{2}:\d{2})', str(e))
                    if match:
                        _quota_reset_time = match.group(1)
                    _move_models_to_cpu()
                    return func(*args, **kwargs)
                else:
                    # Non-quota errors are always re-raised; timeouts just get
                    # an extra diagnostic log line first.
                    err_lower = str(e).lower()
                    is_timeout = (
                        'timeout' in err_lower
                        or 'duration' in err_lower
                        or 'time limit' in err_lower
                    )
                    if is_timeout:
                        print(f"[GPU] Timeout error in {func.__name__}: {e}")
                    raise

        return wrapper
    return decorator
146
+ return decorator
utils/__init__.py ADDED
File without changes
utils/usage_logger.py ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage logger that pushes alignment runs to a HF Dataset repo.
3
+
4
+ Uses a ParquetScheduler (subclass of CommitScheduler) to buffer rows in memory
5
+ and periodically write+upload parquet files with embedded audio to the Hub.
6
+ Error logs use a separate CommitScheduler with JSONL files.
7
+ Falls back to local-only logging if schedulers can't initialize.
8
+
9
+ Scheduler creation is deferred to first use so that background threads don't
10
+ interfere with ZeroGPU's startup function scan.
11
+ """
12
+
13
+ import hashlib
14
+ import io
15
+ import json
16
+ import threading
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+ from uuid import uuid4
21
+
22
+ import numpy as np
23
+
24
+ # =========================================================================
25
+ # Directory setup
26
+ # =========================================================================
27
+
28
+ LOG_DIR = Path("usage_logs")
29
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
30
+
31
+ ERROR_DIR = LOG_DIR / "errors"
32
+ ERROR_DIR.mkdir(parents=True, exist_ok=True)
33
+
34
+ ERROR_LOG_PATH = ERROR_DIR / f"error_log-{uuid4()}.jsonl"
35
+
36
+ # =========================================================================
37
+ # ParquetScheduler class definition (no instances created at import time)
38
+ # =========================================================================
39
+
40
+ _HAS_DEPS = False
41
+ try:
42
+ import pyarrow as pa
43
+ import pyarrow.parquet as pq
44
+ from huggingface_hub import CommitScheduler
45
+ from config import USAGE_LOG_DATASET_REPO, USAGE_LOG_PUSH_INTERVAL_MINUTES
46
+
47
+ _HAS_DEPS = True
48
+ except Exception:
49
+ pass
50
+
51
+ # HF features schema (column order matters — audio first for HF viewer widget)
52
+ _ALIGNER_SCHEMA: Dict[str, Dict[str, str]] = {
53
+ # Identity
54
+ "audio": {"_type": "Audio"},
55
+ "audio_id": {"_type": "Value", "dtype": "string"},
56
+ "timestamp": {"_type": "Value", "dtype": "string"},
57
+ "user_id": {"_type": "Value", "dtype": "string"},
58
+ # Input metadata
59
+ "audio_duration_s": {"_type": "Value", "dtype": "float64"},
60
+ "num_segments": {"_type": "Value", "dtype": "int32"},
61
+ "surah": {"_type": "Value", "dtype": "int32"},
62
+ # Segmentation settings
63
+ "min_silence_ms": {"_type": "Value", "dtype": "int32"},
64
+ "min_speech_ms": {"_type": "Value", "dtype": "int32"},
65
+ "pad_ms": {"_type": "Value", "dtype": "int32"},
66
+ "asr_model": {"_type": "Value", "dtype": "string"},
67
+ "device": {"_type": "Value", "dtype": "string"},
68
+ # Profiling
69
+ "total_time": {"_type": "Value", "dtype": "float64"},
70
+ "vad_queue_time": {"_type": "Value", "dtype": "float64"},
71
+ "vad_gpu_time": {"_type": "Value", "dtype": "float64"},
72
+ "asr_gpu_time": {"_type": "Value", "dtype": "float64"},
73
+ "dp_total_time": {"_type": "Value", "dtype": "float64"},
74
+ # Quality & retry
75
+ "segments_passed": {"_type": "Value", "dtype": "int32"},
76
+ "segments_failed": {"_type": "Value", "dtype": "int32"},
77
+ "mean_confidence": {"_type": "Value", "dtype": "float64"},
78
+ "tier1_retries": {"_type": "Value", "dtype": "int32"},
79
+ "tier1_passed": {"_type": "Value", "dtype": "int32"},
80
+ "tier2_retries": {"_type": "Value", "dtype": "int32"},
81
+ "tier2_passed": {"_type": "Value", "dtype": "int32"},
82
+ "reanchors": {"_type": "Value", "dtype": "int32"},
83
+ "special_merges": {"_type": "Value", "dtype": "int32"},
84
+ # Reciter stats
85
+ "words_per_minute": {"_type": "Value", "dtype": "float64"},
86
+ "phonemes_per_second": {"_type": "Value", "dtype": "float64"},
87
+ "avg_segment_duration": {"_type": "Value", "dtype": "float64"},
88
+ "std_segment_duration": {"_type": "Value", "dtype": "float64"},
89
+ "avg_pause_duration": {"_type": "Value", "dtype": "float64"},
90
+ "std_pause_duration": {"_type": "Value", "dtype": "float64"},
91
+ # Session flags
92
+ "resegmented": {"_type": "Value", "dtype": "bool"},
93
+ "retranscribed": {"_type": "Value", "dtype": "bool"},
94
+ # Segments, timestamps & error
95
+ "segments": {"_type": "Value", "dtype": "string"},
96
+ "word_timestamps": {"_type": "Value", "dtype": "string"},
97
+ "char_timestamps": {"_type": "Value", "dtype": "string"},
98
+ "error": {"_type": "Value", "dtype": "string"},
99
+ }
100
+
101
if _HAS_DEPS:
    class ParquetScheduler(CommitScheduler):
        """Buffers rows in memory and uploads a parquet file each interval.

        Audio values are stored as file paths in the row dict; on push they are
        read as bytes and embedded in the parquet using the HF Audio struct.
        """

        def __init__(
            self,
            *,
            repo_id: str,
            schema: Optional[Dict[str, Dict[str, str]]] = None,
            every: Union[int, float] = 5,
            path_in_repo: Optional[str] = "data",
            repo_type: Optional[str] = "dataset",
            private: bool = False,
        ) -> None:
            super().__init__(
                repo_id=repo_id,
                folder_path="dummy",  # not used — we upload directly
                every=every,
                path_in_repo=path_in_repo,
                repo_type=repo_type,
                private=private,
            )
            # In-memory row buffer, guarded by the scheduler's lock.
            self._rows: List[Dict[str, Any]] = []
            # Optional pre-declared HF features schema (column -> feature dict).
            self._schema = schema

        def append(self, row: Dict[str, Any]) -> None:
            """Queue one row for the next scheduled push (thread-safe)."""
            with self.lock:
                self._rows.append(row)

        def push_to_hub(self) -> None:
            """Drain the row buffer and upload it as one parquet file.

            Called periodically by the CommitScheduler background thread.
            Columns missing from the declared schema get inferred types;
            Image/Audio columns holding file paths are read and embedded as
            bytes, and the source files are deleted after a successful drain.
            """
            # Swap the buffer out under the lock so appends are never blocked
            # for the duration of the upload.
            with self.lock:
                rows = self._rows
                self._rows = []
            if not rows:
                return

            print(f"[USAGE_LOG] Pushing {len(rows)} alignment row(s) to Hub.")

            schema: Dict[str, Dict] = dict(self._schema) if self._schema else {}
            paths_to_cleanup: List[Path] = []

            for row in rows:
                for key, value in row.items():
                    if key not in schema:
                        schema[key] = _infer_schema(key, value)

                    # Embed media files as {"path", "bytes"} structs expected
                    # by the HF Image/Audio features; drop missing files.
                    if value is not None and schema[key].get("_type") in ("Image", "Audio"):
                        file_path = Path(value)
                        if file_path.is_file():
                            row[key] = {
                                "path": file_path.name,
                                "bytes": file_path.read_bytes(),
                            }
                            paths_to_cleanup.append(file_path)
                        else:
                            row[key] = None

            # Pad every row to the full column set so pyarrow sees a
            # rectangular table.
            for row in rows:
                for feature in schema:
                    if feature not in row:
                        row[feature] = None

            table = pa.Table.from_pylist(rows)
            # Embed the HF features schema so the dataset viewer renders
            # audio widgets instead of raw structs.
            table = table.replace_schema_metadata(
                {"huggingface": json.dumps({"info": {"features": schema}})}
            )

            archive = None
            try:
                import tempfile
                archive = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
                pq.write_table(table, archive.name)
                self.api.upload_file(
                    repo_id=self.repo_id,
                    repo_type=self.repo_type,
                    revision=self.revision,
                    path_in_repo=f"{self.path_in_repo}/{uuid4()}.parquet",
                    path_or_fileobj=archive.name,
                )
                print("[USAGE_LOG] Parquet commit completed.")
            except Exception as e:
                # Best-effort: a failed push drops this batch but keeps the
                # scheduler alive for future intervals.
                print(f"[USAGE_LOG] Failed to upload parquet: {e}")
            finally:
                if archive:
                    archive.close()
                    Path(archive.name).unlink(missing_ok=True)

            for path in paths_to_cleanup:
                path.unlink(missing_ok=True)
194
+
195
+ def _infer_schema(key: str, value: Any) -> Dict[str, str]:
196
+ if "image" in key:
197
+ return {"_type": "Image"}
198
+ if "audio" in key:
199
+ return {"_type": "Audio"}
200
+ if isinstance(value, bool):
201
+ return {"_type": "Value", "dtype": "bool"}
202
+ if isinstance(value, int):
203
+ return {"_type": "Value", "dtype": "int64"}
204
+ if isinstance(value, float):
205
+ return {"_type": "Value", "dtype": "float64"}
206
+ if isinstance(value, bytes):
207
+ return {"_type": "Value", "dtype": "binary"}
208
+ return {"_type": "Value", "dtype": "string"}
209
+
210
+
211
# =========================================================================
# Lazy scheduler initialization (deferred to first use)
# =========================================================================

# Populated by _ensure_schedulers() on first logging call; stay None in
# local-only mode (missing deps or init failure).
_aligner_scheduler = None   # ParquetScheduler buffering alignment rows
_error_scheduler = None     # CommitScheduler syncing the error JSONL dir
_schedulers_initialized = False  # one-shot guard; never reset, even on failure
_init_lock = threading.Lock()      # serializes first-time initialization
_fallback_lock = threading.Lock()  # guards local JSONL files when schedulers are absent
220
+
221
+
222
def _ensure_schedulers() -> None:
    """Initialize the Hub schedulers once, on first use (double-checked locking).

    The initialized flag is set *before* attempting construction, so a failed
    init is never retried — the process stays in local-only mode for its
    lifetime. Any failure is logged and swallowed; logging must never break
    the app.
    """
    global _aligner_scheduler, _error_scheduler, _schedulers_initialized
    # Fast path: unlocked read; benign under the GIL since the flag only
    # ever flips False -> True.
    if _schedulers_initialized:
        return
    with _init_lock:
        if _schedulers_initialized:
            return
        # Mark initialized up-front so concurrent/later calls don't retry
        # a failing init on every log call.
        _schedulers_initialized = True
        if not _HAS_DEPS:
            print("[USAGE_LOG] Dependencies missing (local-only mode).")
            return
        try:
            # Alignment rows -> parquet files under data/ in the dataset repo.
            _aligner_scheduler = ParquetScheduler(
                repo_id=USAGE_LOG_DATASET_REPO,
                schema=_ALIGNER_SCHEMA,
                every=USAGE_LOG_PUSH_INTERVAL_MINUTES,
                path_in_repo="data",
                repo_type="dataset",
                private=True,
            )
            # Error JSONL files -> folder sync under data/errors.
            _error_scheduler = CommitScheduler(
                repo_id=USAGE_LOG_DATASET_REPO,
                repo_type="dataset",
                folder_path=ERROR_DIR,
                path_in_repo="data/errors",
                private=True,
                every=USAGE_LOG_PUSH_INTERVAL_MINUTES,
            )
        except Exception as e:
            print(f"[USAGE_LOG] Scheduler init failed (local-only mode): {e}")
252
+
253
+
254
+ # =========================================================================
255
+ # Helpers
256
+ # =========================================================================
257
+
258
+
259
def _get_error_lock():
    """Return the lock guarding error-log writes.

    Uses the error scheduler's own lock when it exists (so writes don't race
    its folder sync), else the module-level fallback lock.
    """
    _ensure_schedulers()
    return _fallback_lock if _error_scheduler is None else _error_scheduler.lock
264
+
265
+
266
def get_user_id(request) -> str:
    """SHA-256 hash (12-char) of IP+UA from a gr.Request, or 'unknown'."""
    try:
        hdrs = request.headers
        # Prefer the first hop of X-Forwarded-For; fall back to X-Real-IP,
        # then to an empty string.
        client_ip = hdrs.get("x-forwarded-for", "").split(",")[0].strip()
        if not client_ip:
            client_ip = hdrs.get("x-real-ip", "") or ""
        agent = hdrs.get("user-agent", "")
        fingerprint = f"{client_ip}|{agent}".encode()
        return hashlib.sha256(fingerprint).hexdigest()[:12]
    except Exception:
        # Any malformed/missing request object degrades to an anonymous id.
        return "unknown"
279
+
280
+
281
+ def _compute_audio_id(audio: np.ndarray, ts: datetime) -> str:
282
+ """Content hash (16-char) + compact timestamp."""
283
+ audio_hash = hashlib.sha256(audio.tobytes()).hexdigest()[:16]
284
+ return f"{audio_hash}:{ts.strftime('%Y%m%dT%H%M%S')}"
285
+
286
+
287
def _encode_audio_flac(audio: np.ndarray, sample_rate: int, audio_id: str) -> str:
    """Encode audio to a temp FLAC file; returns the file path."""
    # Imported lazily so the module loads even without soundfile installed.
    import soundfile as sf

    out_dir = LOG_DIR / "tmp_audio"
    out_dir.mkdir(parents=True, exist_ok=True)
    # ':' from the audio_id is not filesystem-safe on all platforms.
    target = out_dir / "{}.flac".format(audio_id.replace(":", "-"))
    sf.write(str(target), audio, sample_rate, format="FLAC")
    return str(target)
297
+
298
+
299
def _sync_row_to_scheduler(row: Dict[str, Any]) -> None:
    """Ensure *row* is represented in the scheduler buffer.

    gr.State may deserialize the dict (creating a copy), and push_to_hub
    detaches rows from the buffer. This helper finds the original row by
    audio_id and updates it, or re-appends if it was already pushed.
    """
    if _aligner_scheduler is None:
        return
    audio_id = row.get("audio_id")
    if not audio_id:
        return
    with _aligner_scheduler.lock:
        match = next(
            (r for r in _aligner_scheduler._rows if r.get("audio_id") == audio_id),
            None,
        )
        if match is not None:
            # Update the buffered row in-place (handles gr.State copies).
            match.update(row)
        else:
            # Row was already pushed — re-append (audio file may be gone, that's ok).
            _aligner_scheduler._rows.append(row)
319
+
320
+
321
+ # =========================================================================
322
+ # Public logging API
323
+ # =========================================================================
324
+
325
+
326
def log_alignment(
    *,
    audio: np.ndarray,
    sample_rate: int,
    request=None,
    # Input metadata
    audio_duration_s: float,
    num_segments: int,
    surah: int,
    # Settings
    min_silence_ms: int,
    min_speech_ms: int,
    pad_ms: int,
    asr_model: str,
    device: str,
    # Profiling
    total_time: float,
    vad_queue_time: float,
    vad_gpu_time: float,
    asr_gpu_time: float,
    dp_total_time: float,
    # Quality & retry
    segments_passed: int,
    segments_failed: int,
    mean_confidence: float,
    tier1_retries: int,
    tier1_passed: int,
    tier2_retries: int,
    tier2_passed: int,
    reanchors: int,
    special_merges: int,
    # Reciter stats
    words_per_minute: float,
    phonemes_per_second: float,
    avg_segment_duration: float,
    std_segment_duration: float,
    avg_pause_duration: float,
    std_pause_duration: float,
    # Segments
    log_segments: List[dict],
) -> Optional[Dict[str, Any]]:
    """Log an alignment run. Returns the row dict reference for in-place mutation.

    Builds one dataset row (settings, profiling numbers, quality/retry
    counters, reciter stats, and a JSON-encoded list of per-run segment
    data), encodes the audio to a temp FLAC file, and appends the row to
    the parquet scheduler (or a local JSONL fallback).

    The returned dict can be stored in gr.State and mutated on
    resegment/retranscribe/timestamps before the scheduler pushes.
    Returns None if logging failed; never raises.
    """
    _ensure_schedulers()
    try:
        # Timestamp is reused for both the row and the content-based id.
        ts = datetime.now()
        audio_id = _compute_audio_id(audio, ts)
        user_id = get_user_id(request) if request else "unknown"

        # Build the segments JSON: array of run objects (later runs from
        # resegment/retranscribe are appended by update_alignment_row).
        segments_runs = [{
            "min_silence_ms": int(min_silence_ms),
            "min_speech_ms": int(min_speech_ms),
            "pad_ms": int(pad_ms),
            "asr_model": asr_model,
            "segments": log_segments,
        }]

        # Encode audio to FLAC temp file (scheduler embeds bytes on push)
        audio_path = _encode_audio_flac(audio, sample_rate, audio_id)

        row: Dict[str, Any] = {
            "audio": audio_path,
            "audio_id": audio_id,
            "timestamp": ts.isoformat(timespec="seconds"),
            "user_id": user_id,
            # Input metadata
            "audio_duration_s": audio_duration_s,
            "num_segments": num_segments,
            "surah": surah,
            # Settings (latest)
            "min_silence_ms": int(min_silence_ms),
            "min_speech_ms": int(min_speech_ms),
            "pad_ms": int(pad_ms),
            "asr_model": asr_model,
            "device": device,
            # Profiling
            "total_time": total_time,
            "vad_queue_time": vad_queue_time,
            "vad_gpu_time": vad_gpu_time,
            "asr_gpu_time": asr_gpu_time,
            "dp_total_time": dp_total_time,
            # Quality & retry
            "segments_passed": segments_passed,
            "segments_failed": segments_failed,
            "mean_confidence": mean_confidence,
            "tier1_retries": tier1_retries,
            "tier1_passed": tier1_passed,
            "tier2_retries": tier2_retries,
            "tier2_passed": tier2_passed,
            "reanchors": reanchors,
            "special_merges": special_merges,
            # Reciter stats
            "words_per_minute": words_per_minute,
            "phonemes_per_second": phonemes_per_second,
            "avg_segment_duration": avg_segment_duration,
            "std_segment_duration": std_segment_duration,
            "avg_pause_duration": avg_pause_duration,
            "std_pause_duration": std_pause_duration,
            # Session flags (flipped later by update_alignment_row)
            "resegmented": False,
            "retranscribed": False,
            # Segments & error (timestamps filled by update_word_timestamps)
            "segments": json.dumps(segments_runs),
            "word_timestamps": None,
            "char_timestamps": None,
            "error": None,
        }

        if _aligner_scheduler is not None:
            _aligner_scheduler.append(row)
        else:
            # Local-only mode: persist without the audio payload.
            _write_fallback(row)

        return row

    except Exception as e:
        # Logging must never break the alignment pipeline.
        print(f"[USAGE_LOG] Failed to log alignment: {e}")
        return None
448
+
449
+
450
def update_alignment_row(
    row: Dict[str, Any],
    *,
    action: str,
    # Input metadata (overwritten)
    audio_duration_s: float,
    num_segments: int,
    surah: int,
    # Settings for this run
    min_silence_ms: int,
    min_speech_ms: int,
    pad_ms: int,
    asr_model: str,
    device: str,
    # Profiling
    total_time: float,
    vad_queue_time: float,
    vad_gpu_time: float,
    asr_gpu_time: float,
    dp_total_time: float,
    # Quality & retry
    segments_passed: int,
    segments_failed: int,
    mean_confidence: float,
    tier1_retries: int,
    tier1_passed: int,
    tier2_retries: int,
    tier2_passed: int,
    reanchors: int,
    special_merges: int,
    # Reciter stats
    words_per_minute: float,
    phonemes_per_second: float,
    avg_segment_duration: float,
    std_segment_duration: float,
    avg_pause_duration: float,
    std_pause_duration: float,
    # Segments
    log_segments: List[dict],
) -> None:
    """Mutate an existing row dict in-place and ensure it's in the scheduler buffer.

    Run-level fields (settings, profiling, quality, reciter stats) are
    overwritten with the latest run's values, the matching session flag is
    set, and the new run's segment data is appended to the JSON-encoded
    "segments" array so earlier runs are preserved.

    After mutation, syncs the row into the scheduler's buffer so the update
    is captured even if gr.State returned a deserialized copy or if the
    original row was already pushed to Hub.

    Args:
        row: The dict returned by log_alignment(), stored in gr.State.
        action: "resegment" or "retranscribe".

    Never raises; failures are printed and swallowed.
    """
    try:
        # Overwrite run-level fields
        row["audio_duration_s"] = audio_duration_s
        row["num_segments"] = num_segments
        row["surah"] = surah
        row["min_silence_ms"] = int(min_silence_ms)
        row["min_speech_ms"] = int(min_speech_ms)
        row["pad_ms"] = int(pad_ms)
        row["asr_model"] = asr_model
        row["device"] = device
        row["total_time"] = total_time
        row["vad_queue_time"] = vad_queue_time
        row["vad_gpu_time"] = vad_gpu_time
        row["asr_gpu_time"] = asr_gpu_time
        row["dp_total_time"] = dp_total_time
        row["segments_passed"] = segments_passed
        row["segments_failed"] = segments_failed
        row["mean_confidence"] = mean_confidence
        row["tier1_retries"] = tier1_retries
        row["tier1_passed"] = tier1_passed
        row["tier2_retries"] = tier2_retries
        row["tier2_passed"] = tier2_passed
        row["reanchors"] = reanchors
        row["special_merges"] = special_merges
        row["words_per_minute"] = words_per_minute
        row["phonemes_per_second"] = phonemes_per_second
        row["avg_segment_duration"] = avg_segment_duration
        row["std_segment_duration"] = std_segment_duration
        row["avg_pause_duration"] = avg_pause_duration
        row["std_pause_duration"] = std_pause_duration

        # Set session flag (unrecognized actions set no flag, by design)
        if action == "resegment":
            row["resegmented"] = True
        elif action == "retranscribe":
            row["retranscribed"] = True

        # Append new run to segments array
        segments_runs = json.loads(row.get("segments") or "[]")
        segments_runs.append({
            "min_silence_ms": int(min_silence_ms),
            "min_speech_ms": int(min_speech_ms),
            "pad_ms": int(pad_ms),
            "asr_model": asr_model,
            "segments": log_segments,
        })
        row["segments"] = json.dumps(segments_runs)

        # Sync with scheduler buffer — the row from gr.State may be a
        # deserialized copy, or the original may have already been pushed.
        _sync_row_to_scheduler(row)

    except Exception as e:
        print(f"[USAGE_LOG] Failed to update alignment row: {e}")
554
+
555
+
556
def update_word_timestamps(
    row: Dict[str, Any],
    word_timestamps_json: str,
    char_timestamps_json: Optional[str] = None,
) -> None:
    """Set word and char timestamps fields on an existing row and sync to scheduler."""
    try:
        updates: Dict[str, Any] = {"word_timestamps": word_timestamps_json}
        # char timestamps are optional; leave the field untouched when absent.
        if char_timestamps_json is not None:
            updates["char_timestamps"] = char_timestamps_json
        row.update(updates)
        _sync_row_to_scheduler(row)
    except Exception as e:
        print(f"[USAGE_LOG] Failed to update word timestamps: {e}")
569
+
570
+
571
def log_error(user_id: str, error_message: str) -> None:
    """Log a pipeline error to JSONL."""
    try:
        entry = {
            "timestamp": datetime.now().isoformat(timespec="seconds"),
            "user_id": user_id,
            "error_message": error_message or "",
        }
        # Serialize before taking the lock; each entry is one JSONL line.
        line = json.dumps(entry) + "\n"
        with _get_error_lock():
            with ERROR_LOG_PATH.open("a") as f:
                f.write(line)
    except Exception:
        # Error logging is best-effort by design — never propagate.
        pass
584
+
585
+
586
def _write_fallback(row: Dict[str, Any]) -> None:
    """Local-only fallback: write JSONL (without audio)."""
    target = LOG_DIR / "alignments_fallback.jsonl"
    # Drop the audio path — the payload is only meaningful to the scheduler.
    slim = dict(row)
    slim.pop("audio", None)
    with _fallback_lock:
        with target.open("a") as f:
            f.write(json.dumps(slim) + "\n")