Spaces:
Running on Zero
Running on Zero
Commit ·
20e9692
0
Parent(s):
Initial commit
Browse files- .claude/skills/phonemizer/SKILL.md +517 -0
- .gitattributes +8 -0
- .gitignore +54 -0
- README.md +16 -0
- app.py +0 -0
- config.py +322 -0
- data/112.mp3 +3 -0
- data/7.mp3 +3 -0
- data/84.mp3 +3 -0
- data/DigitalKhattV2.otf +3 -0
- data/Juz' 30.mp3 +3 -0
- data/digital_khatt_v2_script.json +3 -0
- data/font_data.py +0 -0
- data/ligatures.json +1 -0
- data/phoneme_cache.pkl +3 -0
- data/phoneme_ngram_index_5.pkl +3 -0
- data/phoneme_sub_costs.json +67 -0
- data/qpc_hafs.json +3 -0
- data/surah-name-v2.ttf +3 -0
- data/surah_info.json +0 -0
- docs/api.md +300 -0
- docs/usage-logging.md +370 -0
- requirements.txt +13 -0
- scripts/add_open_tanween.py +57 -0
- scripts/build_phoneme_cache.py +95 -0
- scripts/build_phoneme_ngram_index.py +110 -0
- scripts/export_onnx.py +160 -0
- scripts/fix_stop_sign_spacing.py +60 -0
- setup.py +15 -0
- src/__init__.py +1 -0
- src/_dp_core.pyx +357 -0
- src/alignment/__init__.py +0 -0
- src/alignment/alignment_pipeline.py +377 -0
- src/alignment/ngram_index.py +39 -0
- src/alignment/phoneme_anchor.py +293 -0
- src/alignment/phoneme_asr.py +355 -0
- src/alignment/phoneme_matcher.py +590 -0
- src/alignment/phoneme_matcher_cache.py +59 -0
- src/alignment/special_segments.py +295 -0
- src/phonemizer_utils.py +12 -0
- src/quran_index.py +150 -0
- src/segment_processor.py +20 -0
- src/segment_types.py +153 -0
- src/segmenter/__init__.py +0 -0
- src/segmenter/segmenter_aoti.py +379 -0
- src/segmenter/segmenter_model.py +158 -0
- src/segmenter/vad.py +97 -0
- src/zero_gpu.py +146 -0
- utils/__init__.py +0 -0
- utils/usage_logger.py +593 -0
.claude/skills/phonemizer/SKILL.md
ADDED
|
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: phonemizer
|
| 3 |
+
description: Domain expert in Quranic phonetics, Tajweed rules, Arabic Unicode, and the Quranic Phonemizer system. Use for phoneme analysis, IPA mappings, Unicode character inspection, and investigating Tajweed rule patterns.
|
| 4 |
+
allowed-tools:
|
| 5 |
+
- Glob
|
| 6 |
+
- Grep
|
| 7 |
+
- Read
|
| 8 |
+
- Bash
|
| 9 |
+
- Python
|
| 10 |
+
- Python3
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Phonemizer Domain Expert
|
| 14 |
+
|
| 15 |
+
You are a domain expert in Quranic phonetics, Tajweed rules, Arabic Unicode, and the Quranic Phonemizer system. Your role is to help understand patterns and associations between:
|
| 16 |
+
- **Tajweed rules** (pronunciation rules for Quranic recitation)
|
| 17 |
+
- **Unicode codepoints** (Arabic script encoding)
|
| 18 |
+
- **IPA phonemes** (International Phonetic Alphabet representations)
|
| 19 |
+
- **Arabic graphemes** (written characters and diacritics)
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Phonemizer API
|
| 24 |
+
|
| 25 |
+
The phonemizer is located at `/mnt/c/Users/ahmed/Documents/Uni/Thesis/Code/phonemizer/`.
|
| 26 |
+
|
| 27 |
+
### Getting Text and Phonemes
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
import sys
|
| 31 |
+
sys.path.insert(0, "/mnt/c/Users/ahmed/Documents/Uni/Thesis/Code/phonemizer")
|
| 32 |
+
from core.phonemizer import Phonemizer
|
| 33 |
+
|
| 34 |
+
pm = Phonemizer()
|
| 35 |
+
|
| 36 |
+
# Get a verse by reference (surah:ayah or surah:ayah:word)
|
| 37 |
+
result = pm.phonemize(ref="2:255") # Ayat al-Kursi
|
| 38 |
+
|
| 39 |
+
# Access text
|
| 40 |
+
text = result.text() # Arabic text with verse markers
|
| 41 |
+
|
| 42 |
+
# Access phonemes
|
| 43 |
+
phonemes_str = result.phonemes_str(phoneme_sep=" ", word_sep="")
|
| 44 |
+
phonemes_list = result.phonemes_list(split='word') # [[phonemes], ...]
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### Reference Formats
|
| 48 |
+
|
| 49 |
+
| Format | Example | Meaning |
|
| 50 |
+
|--------|---------|---------|
|
| 51 |
+
| Surah | `"1"` | Entire Al-Fatiha |
|
| 52 |
+
| Verse | `"2:255"` | Ayat al-Kursi |
|
| 53 |
+
| Word | `"1:1:2"` | Word 2 of 1:1 |
|
| 54 |
+
| Range | `"1:1 - 1:5"` | Verses 1-5 of surah 1 |
|
| 55 |
+
| Cross-surah | `"112 - 114"` | Last 3 surahs |
|
| 56 |
+
|
| 57 |
+
### Getting Detailed Mappings
|
| 58 |
+
|
| 59 |
+
```python
|
| 60 |
+
mapping = result.get_mapping()
|
| 61 |
+
|
| 62 |
+
# Flat phoneme sequence
|
| 63 |
+
all_phonemes = mapping.phoneme_sequence # ["b", "i", "s", "m", ...]
|
| 64 |
+
|
| 65 |
+
# Word-level data
|
| 66 |
+
for word in mapping.words:
|
| 67 |
+
print(word.location) # "1:1:1"
|
| 68 |
+
print(word.text) # "بِسْمِ"
|
| 69 |
+
print(word.phonemes) # ["b", "i", "s", "m", "i"]
|
| 70 |
+
|
| 71 |
+
# Letter-level breakdown
|
| 72 |
+
for letter in word.letter_mappings:
|
| 73 |
+
print(f" {letter.char} → {letter.phonemes}")
|
| 74 |
+
print(f" diacritic: {letter.diacritic}")
|
| 75 |
+
print(f" rules: {letter.letter_rules}")
|
| 76 |
+
|
| 77 |
+
# Alignment: phoneme → source letter
|
| 78 |
+
for entry in mapping.alignment:
|
| 79 |
+
print(f"Phoneme '{entry.phoneme}' from '{entry.source_char}'")
|
| 80 |
+
print(f" Tajweed rules: {entry.rules}")
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
---
|
| 84 |
+
|
| 85 |
+
## Data Structures
|
| 86 |
+
|
| 87 |
+
The phonemizer produces structured mappings that the recitation app transforms into frozen dataclasses.
|
| 88 |
+
|
| 89 |
+
### Phonemizer Core Structures (`phonemizer/core/mapping.py`)
|
| 90 |
+
|
| 91 |
+
#### PhonemizationMapping (root)
|
| 92 |
+
```python
|
| 93 |
+
@dataclass
|
| 94 |
+
class PhonemizationMapping:
|
| 95 |
+
ref: str # Reference string
|
| 96 |
+
text: str # Arabic text
|
| 97 |
+
words: List[WordMapping] # Per-word breakdown
|
| 98 |
+
phoneme_sequence: List[str] # Flat phoneme list
|
| 99 |
+
alignment: List[AlignmentEntry] # Phoneme→letter mapping
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
#### WordMapping
|
| 103 |
+
```python
|
| 104 |
+
@dataclass
|
| 105 |
+
class WordMapping:
|
| 106 |
+
location: str # "surah:ayah:word"
|
| 107 |
+
text: str # Arabic word
|
| 108 |
+
phonemes: List[str] # Word's phonemes
|
| 109 |
+
letter_mappings: List[LetterMapping]
|
| 110 |
+
leading_symbols: List[OtherSymbolMapping] # Before word
|
| 111 |
+
trailing_symbols: List[OtherSymbolMapping] # After word (verse markers)
|
| 112 |
+
is_special_word: bool # Allah, etc.
|
| 113 |
+
is_starting: bool # First word of segment
|
| 114 |
+
is_stopping: bool # Last word (affects phonemization)
|
| 115 |
+
madd_mappings: List[MaddMapping] # Long vowel tracking
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
#### LetterMapping
|
| 119 |
+
```python
|
| 120 |
+
@dataclass
|
| 121 |
+
class LetterMapping:
|
| 122 |
+
index: int # Position in word (0-based)
|
| 123 |
+
char: str # Base Arabic character
|
| 124 |
+
phonemes: List[str] # Emitted phonemes (empty = silent)
|
| 125 |
+
diacritic: Optional[str] # "FATHA", "SUKUN", etc.
|
| 126 |
+
has_shaddah: bool # Gemination marker
|
| 127 |
+
extensions: List[ExtensionSymbolMapping] # Dagger alef, maddah, etc.
|
| 128 |
+
other_symbols: List[OtherSymbolMapping] # Tatweel, stop signs
|
| 129 |
+
mapping_type: MappingType # STANDARD, SILENT, ONE_TO_MANY, etc.
|
| 130 |
+
letter_rules: List[str] # Tajweed rules for this letter
|
| 131 |
+
phoneme_rules: List[List[str]] # Rules per emitted phoneme
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
#### ExtensionSymbolMapping
|
| 135 |
+
```python
|
| 136 |
+
@dataclass
|
| 137 |
+
class ExtensionSymbolMapping:
|
| 138 |
+
char: str # The extension character (e.g., "ٰ" dagger alef)
|
| 139 |
+
name: str # Identifier: "DAGGER_ALEF", "MADDAH", "MINI_WAW", etc.
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
#### MaddMapping (Long Vowel Tracking)
|
| 143 |
+
```python
|
| 144 |
+
@dataclass
|
| 145 |
+
class MaddMapping:
|
| 146 |
+
phoneme_index: int # Index in word's phoneme list
|
| 147 |
+
phoneme: str # Long vowel: "a:", "aˤ:", "u:", "i:"
|
| 148 |
+
letter_index: int # Letter that carries this madd
|
| 149 |
+
vowel_grapheme: str # ا, و, ي, ى, ٰ (source of the vowel)
|
| 150 |
+
has_maddah: bool # Whether ٓ is present
|
| 151 |
+
extension_index: Optional[int] # If vowel is an extension, its index
|
| 152 |
+
madd_type: Optional[str] # 'wajib_muttasil', 'jaiz_munfasil', 'lazim', etc.
|
| 153 |
+
is_lafdh_jalalah: bool # Implicit dagger alef in "Allah"
|
| 154 |
+
is_hamza_fathatan: bool # hamza + fathatan → hamza + fatha + alef
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
---
|
| 158 |
+
|
| 159 |
+
### Recitation App Structures (`recitation_analysis/result.py`)
|
| 160 |
+
|
| 161 |
+
The recitation app transforms phonemizer output into **frozen (immutable)** dataclasses via `ResultBuilder`.
|
| 162 |
+
|
| 163 |
+
#### RecitationResult (top-level container)
|
| 164 |
+
```python
|
| 165 |
+
@dataclass
|
| 166 |
+
class RecitationResult:
|
| 167 |
+
verse_ref: str
|
| 168 |
+
segment_ref: Optional[str]
|
| 169 |
+
canonical_phonemes: Tuple[str, ...]
|
| 170 |
+
detected_phonemes: Tuple[str, ...] # From ASR
|
| 171 |
+
canonical_words: Tuple[WordData, ...]
|
| 172 |
+
detected_words: Tuple[WordData, ...] # Modified by alignment
|
| 173 |
+
alignment: Tuple[AlignmentEntry, ...]
|
| 174 |
+
madd_mappings: Tuple[MaddMappingData, ...]
|
| 175 |
+
errors: List[RecitationError] # Mutable - added after analysis
|
| 176 |
+
ghunnah_instances: List[GhunnahInstance]
|
| 177 |
+
madd_instances: List[MaddInstance]
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
#### WordData (frozen)
|
| 181 |
+
```python
|
| 182 |
+
@dataclass(frozen=True)
|
| 183 |
+
class WordData:
|
| 184 |
+
index: int # Position in verse (0-based)
|
| 185 |
+
location: str # "surah:verse:word"
|
| 186 |
+
letters: Tuple[LetterData, ...] # Letters in the word
|
| 187 |
+
leading_symbols: Tuple[str, ...] # Rare symbols before word
|
| 188 |
+
trailing_symbols: Tuple[str, ...] # Verse markers, etc.
|
| 189 |
+
is_stopping: bool # True if stopping word
|
| 190 |
+
|
| 191 |
+
@property
|
| 192 |
+
def text(self) -> str: # Derived from letters
|
| 193 |
+
|
| 194 |
+
@property
|
| 195 |
+
def phonemes(self) -> Tuple[str, ...]: # All letter phonemes
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
#### LetterData (frozen)
|
| 199 |
+
```python
|
| 200 |
+
@dataclass(frozen=True)
|
| 201 |
+
class LetterData:
|
| 202 |
+
index: int # Position in word (0-based)
|
| 203 |
+
char: str # Base Arabic character
|
| 204 |
+
diacritic: Optional[str] # Diacritic name
|
| 205 |
+
diacritic_char: Optional[str] # Diacritic character
|
| 206 |
+
shaddah: bool # Whether shaddah present
|
| 207 |
+
extensions: Tuple[str, ...] # Extension CHARACTERS
|
| 208 |
+
other_symbols: Tuple[str, ...] # Stop signs, etc.
|
| 209 |
+
phonemes: Tuple[str, ...] # Phonemes this letter produces
|
| 210 |
+
letter_rules: Tuple[str, ...] # Tajweed rules
|
| 211 |
+
phoneme_rules: Tuple[Tuple[str, ...], ...] # Per-phoneme rules
|
| 212 |
+
is_silent: bool # True if no phonemes
|
| 213 |
+
name: str # "LAM", "NOON", etc.
|
| 214 |
+
extension_names: Tuple[str, ...] # ("DAGGER_ALEF",)
|
| 215 |
+
|
| 216 |
+
# Insertion tracking (for error highlighting)
|
| 217 |
+
inserted_base: bool
|
| 218 |
+
inserted_diacritic: bool
|
| 219 |
+
inserted_shaddah: bool
|
| 220 |
+
inserted_extensions: Tuple[int, ...]
|
| 221 |
+
|
| 222 |
+
def get_full_text(self) -> str:
|
| 223 |
+
"""Builds: base + shaddah + diacritic + extensions + other"""
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## Extensions and Vowel Phoneme Production
|
| 229 |
+
|
| 230 |
+
**Key Insight:** Extensions are NOT separate letters with their own phoneme lists. They are symbols attached to a letter that **influence the parent letter's phoneme production**.
|
| 231 |
+
|
| 232 |
+
### How Extensions Affect Phonemes
|
| 233 |
+
|
| 234 |
+
```
|
| 235 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 236 |
+
│ Letter: ل (Lam) │
|
| 237 |
+
│ Diacritic: FATHA (produces "a") │
|
| 238 |
+
│ Extensions: [DAGGER_ALEF] │
|
| 239 |
+
│ │
|
| 240 |
+
│ Phoneme production: │
|
| 241 |
+
│ - Lam base → "l" │
|
| 242 |
+
│ - Fatha + extension → "a:" (long vowel) │
|
| 243 |
+
│ │
|
| 244 |
+
│ Final phonemes: ["l", "a:"] ← stored on LETTER, not ext │
|
| 245 |
+
└─────────────────────────────────────────────────────────────┘
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
### Extension Types and Their Effects
|
| 249 |
+
|
| 250 |
+
| Extension Name | Char | Effect on Phonemes |
|
| 251 |
+
|---------------|------|-------------------|
|
| 252 |
+
| `DAGGER_ALEF` | ٰ | Extends fatha: `a` → `a:` |
|
| 253 |
+
| `MADDAH` | ٓ | Marks madd, just visual, usually accompanied with another extension |
|
| 254 |
+
| `MINI_WAW` | ۥ | Extends damma: `u` → `u:` |
|
| 255 |
+
| `MINI_YA_END` | ۦ | Extends kasra: `i` → `i:` |
|
| 256 |
+
| `HAMZA_ABOVE` | ٔ | Adds glottal stop phoneme |
|
| 257 |
+
| `HAMZA_BELOW` | ٕ | Adds glottal stop phoneme |
|
| 258 |
+
|
| 259 |
+
### Vowel Grapheme Sources
|
| 260 |
+
|
| 261 |
+
Long vowels can come from:
|
| 262 |
+
1. **Vowel letters**: ا (alef), و (waw), ي (ya), ى (alef maksura)
|
| 263 |
+
2. **Extensions**: ٰ (dagger alef), ۥ (mini waw), ۦ (mini ya)
|
| 264 |
+
|
| 265 |
+
The `MaddMapping.extension_index` tracks when the vowel comes from an extension:
|
| 266 |
+
|
| 267 |
+
```python
|
| 268 |
+
# Example: ملٰـٓئِكَة (malaa'ika - angels)
|
| 269 |
+
# The lam has dagger alef extension producing a:
|
| 270 |
+
MaddMapping(
|
| 271 |
+
letter_index=0, # Lam
|
| 272 |
+
phoneme="a:",
|
| 273 |
+
vowel_grapheme="ٰ", # Dagger alef
|
| 274 |
+
extension_index=0, # First extension on the letter
|
| 275 |
+
)
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
### Rendering Order
|
| 279 |
+
|
| 280 |
+
When rendering a letter with extensions:
|
| 281 |
+
```
|
| 282 |
+
base_char + shaddah? + diacritic + extensions + other_symbols
|
| 283 |
+
ل ّ َ ٰ ۟
|
| 284 |
+
```
|
| 285 |
+
|
| 286 |
+
---
|
| 287 |
+
|
| 288 |
+
### Usage
|
| 289 |
+
|
| 290 |
+
```python
|
| 291 |
+
from recitation_analysis.result_builder import ResultBuilder, get_result_builder
|
| 292 |
+
|
| 293 |
+
# Get singleton builder
|
| 294 |
+
builder = get_result_builder()
|
| 295 |
+
|
| 296 |
+
# Build from phonemizer result
|
| 297 |
+
result = builder.build_from_mapping(
|
| 298 |
+
mapping=phonemizer_result.get_mapping(),
|
| 299 |
+
verse_ref="1:1",
|
| 300 |
+
is_starting_segment=True,
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
# Result contains frozen WordData/LetterData with transforms applied
|
| 304 |
+
for word in result.canonical_words:
|
| 305 |
+
for letter in word.letters:
|
| 306 |
+
print(f"{letter.char} + {letter.extensions} → {letter.phonemes}")
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
## Phoneme Inventory
|
| 312 |
+
|
| 313 |
+
### Consonants (Base → Geminated)
|
| 314 |
+
|
| 315 |
+
| Letter | Phoneme | Geminated | Name |
|
| 316 |
+
|--------|---------|-----------|------|
|
| 317 |
+
| ب | `b` | `bb` | Ba |
|
| 318 |
+
| ت | `t` | `tt` | Ta |
|
| 319 |
+
| ث | `θ` | `θθ` | Tha |
|
| 320 |
+
| ج | `ʒ` | `ʒʒ` | Jeem |
|
| 321 |
+
| ح | `ħ` | `ħħ` | Hha |
|
| 322 |
+
| خ | `x` | `xx` | Kha |
|
| 323 |
+
| د | `d` | `dd` | Dal |
|
| 324 |
+
| ذ | `ð` | `ðð` | Thal |
|
| 325 |
+
| ر | `r` | `rr` | Ra |
|
| 326 |
+
| ز | `z` | `zz` | Zain |
|
| 327 |
+
| س | `s` | `ss` | Seen |
|
| 328 |
+
| ش | `ʃ` | `ʃʃ` | Sheen |
|
| 329 |
+
| ص | `sˤ` | `sˤsˤ` | Sad (emphatic) |
|
| 330 |
+
| ض | `dˤ` | `dˤdˤ` | Dad (emphatic) |
|
| 331 |
+
| ط | `tˤ` | `tˤtˤ` | Tta (emphatic) |
|
| 332 |
+
| ظ | `ðˤ` | `ðˤðˤ` | Dtha (emphatic) |
|
| 333 |
+
| ع | `ʕ` | `ʕʕ` | Ain |
|
| 334 |
+
| غ | `ɣ` | - | Ghain |
|
| 335 |
+
| ف | `f` | `ff` | Fa |
|
| 336 |
+
| ق | `q` | `qq` | Qaf |
|
| 337 |
+
| ك | `k` | `kk` | Kaf |
|
| 338 |
+
| ل | `l` | `ll` | Lam |
|
| 339 |
+
| م | `m` | - | Meem |
|
| 340 |
+
| ن | `n` | - | Noon |
|
| 341 |
+
| ه | `h` | `hh` | Ha |
|
| 342 |
+
| و | `w` | `ww` | Waw |
|
| 343 |
+
| ي | `j` | `jj` | Ya |
|
| 344 |
+
| ء | `ʔ` | - | Hamza |
|
| 345 |
+
|
| 346 |
+
### Vowels
|
| 347 |
+
|
| 348 |
+
| Diacritic | Short | Long | Name |
|
| 349 |
+
|-----------|-------|------|------|
|
| 350 |
+
| فَ | `a` | `a:` | Fatha |
|
| 351 |
+
| فُ | `u` | `u:` | Damma |
|
| 352 |
+
| فِ | `i` | `i:` | Kasra |
|
| 353 |
+
| فً | `an` | - | Fathatan (tanween) |
|
| 354 |
+
| فٌ | `un` | - | Dammatan (tanween) |
|
| 355 |
+
| فٍ | `in` | - | Kasratan (tanween) |
|
| 356 |
+
|
| 357 |
+
### Emphatic Vowels (after ص ض ط ظ)
|
| 358 |
+
|
| 359 |
+
| Context | Short | Long |
|
| 360 |
+
|---------|-------|------|
|
| 361 |
+
| Emphatic + fatha/alef | `aˤ` | `aˤ:` |
|
| 362 |
+
|
| 363 |
+
### Tajweed-Specific Phonemes
|
| 364 |
+
|
| 365 |
+
| Phoneme | Rule | Description |
|
| 366 |
+
|---------|------|-------------|
|
| 367 |
+
| `ŋ` | Ikhfaa | Hidden noon/tanween |
|
| 368 |
+
| `m̃` | Idgham | Nasalized meem (ghunnah) |
|
| 369 |
+
| `ñ` | Idgham | Nasalized noon (ghunnah) |
|
| 370 |
+
| `j̃` | Idgham | Nasalized ya (ghunnah) |
|
| 371 |
+
| `w̃` | Idgham | Nasalized waw (ghunnah) |
|
| 372 |
+
| `Q` | Qalqala | Bouncing (mid-word) |
|
| 373 |
+
| `lˤlˤ` | Lam Heavy | Heavy lam in "Allah" |
|
| 374 |
+
| `rˤ` | Ra Heavy | Heavy ra (tafkheem) |
|
| 375 |
+
|
| 376 |
+
---
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
## Searching for Specific Rules
|
| 380 |
+
|
| 381 |
+
The precomputed mappings file is at `data/phonemizer_mappings.json` (118MB, covers surahs 10-114).
|
| 382 |
+
|
| 383 |
+
### Search for a rule pattern
|
| 384 |
+
|
| 385 |
+
```bash
|
| 386 |
+
# Find verses containing idgham
|
| 387 |
+
grep -o '"letter_rules":\s*\[[^\]]*idgham[^\]]*\]' data/phonemizer_mappings.json | head -5
|
| 388 |
+
|
| 389 |
+
# Find specific rule type
|
| 390 |
+
grep -o '"rules":\s*\[[^\]]*ikhfaa[^\]]*\]' data/phonemizer_mappings.json | head -5
|
| 391 |
+
```
|
| 392 |
+
|
| 393 |
+
### Python search script
|
| 394 |
+
|
| 395 |
+
```python
|
| 396 |
+
import json
|
| 397 |
+
|
| 398 |
+
with open('data/phonemizer_mappings.json', 'r') as f:
|
| 399 |
+
raw = f.read()
|
| 400 |
+
data = json.loads(json.loads(raw)) if raw.startswith('"') else json.loads(raw)
|
| 401 |
+
|
| 402 |
+
def find_rule(rule_pattern: str, limit: int = 5):
|
| 403 |
+
"""Find words containing a specific Tajweed rule."""
|
| 404 |
+
results = []
|
| 405 |
+
for word in data['words']:
|
| 406 |
+
for lm in word.get('letter_mappings', []):
|
| 407 |
+
rules = lm.get('letter_rules', []) + [r for pr in lm.get('phoneme_rules', []) for r in pr]
|
| 408 |
+
if any(rule_pattern.lower() in r.lower() for r in rules):
|
| 409 |
+
results.append({
|
| 410 |
+
'location': word['location'],
|
| 411 |
+
'text': word['text'],
|
| 412 |
+
'char': lm['char'],
|
| 413 |
+
'phonemes': lm['phonemes'],
|
| 414 |
+
'rules': rules
|
| 415 |
+
})
|
| 416 |
+
if len(results) >= limit:
|
| 417 |
+
return results
|
| 418 |
+
return results
|
| 419 |
+
|
| 420 |
+
# Example: find idgham rules
|
| 421 |
+
for r in find_rule('idgham', limit=5):
|
| 422 |
+
print(f"{r['location']}: {r['text']} - {r['char']} → {r['phonemes']} ({r['rules']})")
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
|
| 427 |
+
## Text to Unicode Inspection Script
|
| 428 |
+
|
| 429 |
+
To inspect any Arabic text character by character:
|
| 430 |
+
|
| 431 |
+
```python
|
| 432 |
+
def text_to_unicode_sequence(text: str) -> list:
|
| 433 |
+
"""Convert Arabic text to a sequence of Unicode codepoints with names."""
|
| 434 |
+
import unicodedata
|
| 435 |
+
result = []
|
| 436 |
+
for i, char in enumerate(text):
|
| 437 |
+
cp = ord(char)
|
| 438 |
+
try:
|
| 439 |
+
name = unicodedata.name(char)
|
| 440 |
+
except ValueError:
|
| 441 |
+
name = "UNKNOWN"
|
| 442 |
+
result.append({
|
| 443 |
+
'index': i,
|
| 444 |
+
'char': char,
|
| 445 |
+
'codepoint': f"U+{cp:04X}",
|
| 446 |
+
'decimal': cp,
|
| 447 |
+
'name': name
|
| 448 |
+
})
|
| 449 |
+
return result
|
| 450 |
+
|
| 451 |
+
def print_unicode_sequence(text: str):
|
| 452 |
+
"""Pretty print unicode sequence of text."""
|
| 453 |
+
seq = text_to_unicode_sequence(text)
|
| 454 |
+
print(f"Text: {text}")
|
| 455 |
+
print(f"Length: {len(seq)} codepoints")
|
| 456 |
+
print("-" * 60)
|
| 457 |
+
for item in seq:
|
| 458 |
+
print(f"[{item['index']:2d}] {item['char']!r:6} {item['codepoint']} {item['name']}")
|
| 459 |
+
```
|
| 460 |
+
|
| 461 |
+
### Combined: Phonemizer + Unicode Inspection
|
| 462 |
+
|
| 463 |
+
```python
|
| 464 |
+
import sys
|
| 465 |
+
sys.path.insert(0, "/mnt/c/Users/ahmed/Documents/Uni/Thesis/Code/phonemizer")
|
| 466 |
+
from core.phonemizer import Phonemizer
|
| 467 |
+
|
| 468 |
+
pm = Phonemizer()
|
| 469 |
+
result = pm.phonemize("1:1:1")
|
| 470 |
+
|
| 471 |
+
# Get text and inspect unicode
|
| 472 |
+
text = result.text()
|
| 473 |
+
print_unicode_sequence(text)
|
| 474 |
+
|
| 475 |
+
# Get mapping and correlate
|
| 476 |
+
mapping = result.get_mapping()
|
| 477 |
+
for word in mapping.words:
|
| 478 |
+
print(f"\n=== {word.location}: {word.text} ===")
|
| 479 |
+
print_unicode_sequence(word.text)
|
| 480 |
+
print("\nLetter Mappings:")
|
| 481 |
+
for lm in word.letter_mappings:
|
| 482 |
+
print(f" {lm.char} (U+{ord(lm.char):04X}) → {lm.phonemes}")
|
| 483 |
+
```
|
| 484 |
+
|
| 485 |
+
---
|
| 486 |
+
|
| 487 |
+
## Investigation Workflow
|
| 488 |
+
|
| 489 |
+
When investigating a specific Tajweed rule:
|
| 490 |
+
|
| 491 |
+
1. **Search mappings file** for the rule pattern:
|
| 492 |
+
```python
|
| 493 |
+
results = find_rule('qalqala', limit=10)
|
| 494 |
+
```
|
| 495 |
+
|
| 496 |
+
2. **Get the verse reference** from the results:
|
| 497 |
+
```python
|
| 498 |
+
location = results[0]['location'] # e.g., "10:2:7"
|
| 499 |
+
```
|
| 500 |
+
|
| 501 |
+
3. **Run phonemizer** on that verse for detailed analysis:
|
| 502 |
+
```python
|
| 503 |
+
result = pm.phonemize(location)
|
| 504 |
+
mapping = result.get_mapping()
|
| 505 |
+
```
|
| 506 |
+
|
| 507 |
+
4. **Inspect unicode** of specific words:
|
| 508 |
+
```python
|
| 509 |
+
print_unicode_sequence(mapping.words[0].text)
|
| 510 |
+
```
|
| 511 |
+
|
| 512 |
+
5. **Examine alignments** to understand phoneme production:
|
| 513 |
+
```python
|
| 514 |
+
for a in mapping.alignment:
|
| 515 |
+
if 'qalqala' in str(a.rules):
|
| 516 |
+
print(f"Phoneme '{a.phoneme}' from '{a.source_char}' with rules {a.rules}")
|
| 517 |
+
```
|
.gitattributes
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# No LFS tracking - push files directly
|
| 2 |
+
data/digital_khatt_v2_script.json filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.otf filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ttf filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
data/*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
data/qpc_hafs.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual environments
|
| 24 |
+
.env
|
| 25 |
+
.venv
|
| 26 |
+
env/
|
| 27 |
+
venv/
|
| 28 |
+
ENV/
|
| 29 |
+
|
| 30 |
+
# IDE
|
| 31 |
+
.idea/
|
| 32 |
+
.vscode/
|
| 33 |
+
*.swp
|
| 34 |
+
*.swo
|
| 35 |
+
*~
|
| 36 |
+
|
| 37 |
+
# OS
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
| 40 |
+
|
| 41 |
+
# Gradio
|
| 42 |
+
flagged/
|
| 43 |
+
|
| 44 |
+
# Exported models
|
| 45 |
+
models/
|
| 46 |
+
|
| 47 |
+
# Test API
|
| 48 |
+
test_api.py
|
| 49 |
+
data/api_result.json
|
| 50 |
+
|
| 51 |
+
CLAUDE.md
|
| 52 |
+
inference_optimization.md
|
| 53 |
+
|
| 54 |
+
docs/
|
README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Quran Multi-Aligner
|
| 3 |
+
emoji: 🎯
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.5.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
short_description: Segment recitations and extract text and word timestamps
|
| 11 |
+
license: mit
|
| 12 |
+
thumbnail: >-
|
| 13 |
+
https://cdn-uploads.huggingface.co/production/uploads/684abe5b6327ae8863d106d2/Rr-R8HNiyJNbaXCE5saU6.png
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
config.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration settings for the Segments App.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
# HF Spaces detection
|
| 8 |
+
IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
|
| 9 |
+
|
| 10 |
+
# Get project root directory
|
| 11 |
+
PROJECT_ROOT = Path(__file__).parent.absolute()
|
| 12 |
+
|
| 13 |
+
# Port for local development
|
| 14 |
+
PORT = 6902
|
| 15 |
+
|
| 16 |
+
# =============================================================================
|
| 17 |
+
# Audio settings
|
| 18 |
+
# =============================================================================
|
| 19 |
+
|
| 20 |
+
RESAMPLE_TYPE = "soxr_lq"
|
| 21 |
+
SEGMENT_AUDIO_DIR = Path("/tmp/segments") # WAV files written here per request
|
| 22 |
+
AUDIO_PRELOAD_COUNT = 5 # First N segments use preload="auto"
|
| 23 |
+
DELETE_CACHE_FREQUENCY = 3600*5 # Gradio cache cleanup interval (seconds)
|
| 24 |
+
DELETE_CACHE_AGE = 3600*5 # Delete cached files older than this (seconds)
|
| 25 |
+
|
| 26 |
+
# =============================================================================
|
| 27 |
+
# Model and data paths
|
| 28 |
+
# =============================================================================
|
| 29 |
+
|
| 30 |
+
# VAD segmenter model
|
| 31 |
+
SEGMENTER_MODEL = "obadx/recitation-segmenter-v2"
|
| 32 |
+
|
| 33 |
+
# Phoneme ASR models (wav2vec2 CTC)
|
| 34 |
+
PHONEME_ASR_MODELS = {
|
| 35 |
+
"Base": "hetchyy/r15_95m",
|
| 36 |
+
"Large": "hetchyy/r7",
|
| 37 |
+
}
|
| 38 |
+
PHONEME_ASR_MODEL_DEFAULT = "Base"
|
| 39 |
+
PHONEME_ASR_MODEL = PHONEME_ASR_MODELS[PHONEME_ASR_MODEL_DEFAULT]
|
| 40 |
+
|
| 41 |
+
DATA_PATH = PROJECT_ROOT / "data"
|
| 42 |
+
SURAH_INFO_PATH = DATA_PATH / "surah_info.json"
|
| 43 |
+
|
| 44 |
+
# Quran script paths
|
| 45 |
+
QURAN_SCRIPT_PATH_COMPUTE = DATA_PATH / "qpc_hafs.json"
|
| 46 |
+
QURAN_SCRIPT_PATH_DISPLAY = DATA_PATH / "qpc_hafs.json"
|
| 47 |
+
|
| 48 |
+
# Pre-built phoneme cache (all 114 chapters)
|
| 49 |
+
PHONEME_CACHE_PATH = DATA_PATH / "phoneme_cache.pkl"
|
| 50 |
+
|
| 51 |
+
# Phoneme n-gram index for anchor detection
|
| 52 |
+
NGRAM_SIZE = 5
|
| 53 |
+
NGRAM_INDEX_PATH = DATA_PATH / f"phoneme_ngram_index_{NGRAM_SIZE}.pkl"
|
| 54 |
+
|
| 55 |
+
# =============================================================================
|
| 56 |
+
# Inference settings
|
| 57 |
+
# =============================================================================
|
| 58 |
+
|
| 59 |
+
def get_vad_duration(minutes):
|
| 60 |
+
"""GPU seconds needed for VAD based on audio minutes."""
|
| 61 |
+
if minutes > 180:
|
| 62 |
+
return 60
|
| 63 |
+
elif minutes > 120:
|
| 64 |
+
return 40
|
| 65 |
+
elif minutes > 60:
|
| 66 |
+
return 25
|
| 67 |
+
elif minutes > 30:
|
| 68 |
+
return 15
|
| 69 |
+
elif minutes > 15:
|
| 70 |
+
return 10
|
| 71 |
+
else:
|
| 72 |
+
return 5
|
| 73 |
+
|
| 74 |
+
def get_asr_duration(minutes, model_name="Base"):
|
| 75 |
+
"""GPU seconds needed for ASR based on audio minutes and model size."""
|
| 76 |
+
if minutes > 180:
|
| 77 |
+
return 20
|
| 78 |
+
elif minutes > 60:
|
| 79 |
+
base = 15
|
| 80 |
+
else:
|
| 81 |
+
base = 10
|
| 82 |
+
|
| 83 |
+
if model_name == "Large":
|
| 84 |
+
LARGE_MODEL_DURATION_FACTOR = 3
|
| 85 |
+
return int(base * LARGE_MODEL_DURATION_FACTOR)
|
| 86 |
+
return base
|
| 87 |
+
|
| 88 |
+
# Batching strategy
|
| 89 |
+
BATCHING_STRATEGY = "dynamic" # "naive" (fixed count) or "dynamic" (seconds + pad waste)
|
| 90 |
+
|
| 91 |
+
# Naive batching
|
| 92 |
+
INFERENCE_BATCH_SIZE = 32 # Fixed segments per batch (used when BATCHING_STRATEGY="naive")
|
| 93 |
+
|
| 94 |
+
# Dynamic batching constraints
|
| 95 |
+
MAX_BATCH_SECONDS = 300 # Max total audio seconds per batch (sum of durations)
|
| 96 |
+
MAX_PAD_WASTE = 0.15 # Max fraction of padded tensor that is wasted (0=no waste, 1=all waste)
|
| 97 |
+
MIN_BATCH_SIZE = 8 # Minimum segments per batch (prevents underutilization)
|
| 98 |
+
|
| 99 |
+
# Model precision
|
| 100 |
+
DTYPE = "float16"
|
| 101 |
+
TORCH_COMPILE = True # Apply torch.compile() to GPU models (reduce-overhead mode)
|
| 102 |
+
|
| 103 |
+
# AOTInductor compilation (ZeroGPU optimization)
|
| 104 |
+
AOTI_ENABLED = True # Enable AOT compilation for VAD model on HF Space
|
| 105 |
+
AOTI_MIN_AUDIO_MINUTES = 15 # Min audio duration for dynamic shapes
|
| 106 |
+
AOTI_MAX_AUDIO_MINUTES = 90 # Max audio duration for dynamic shapes
|
| 107 |
+
AOTI_HUB_ENABLED = True # Enable Hub persistence (upload/download compiled models)
|
| 108 |
+
AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti" # Hub repo for compiled model cache
|
| 109 |
+
|
| 110 |
+
# =============================================================================
|
| 111 |
+
# Phoneme-based alignment settings
|
| 112 |
+
# =============================================================================
|
| 113 |
+
|
| 114 |
+
ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
|
| 115 |
+
ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
|
| 116 |
+
ANCHOR_RUN_TRIM_RATIO = 0.15 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
|
| 117 |
+
|
| 118 |
+
# Edit operation costs (Levenshtein hyperparameters)
|
| 119 |
+
COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
|
| 120 |
+
COST_INSERTION = 1.0 # Insert phoneme from reference (R)
|
| 121 |
+
COST_DELETION = 0.8 # Delete phoneme from ASR (P)
|
| 122 |
+
|
| 123 |
+
# Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
|
| 124 |
+
LOOKBACK_WORDS = 15 # Window words to look back from pointer for starting positions
|
| 125 |
+
LOOKAHEAD_WORDS = 10 # Window words to look ahead after expected end position
|
| 126 |
+
MAX_EDIT_DISTANCE = 0.25 # Max normalized edit distance for valid ayah match
|
| 127 |
+
MAX_SPECIAL_EDIT_DISTANCE = 0.35 # Max normalized edit distance for Basmala/Isti'adha detection
|
| 128 |
+
START_PRIOR_WEIGHT = 0.005 # Penalty per word away from expected position
|
| 129 |
+
|
| 130 |
+
# Failed Segments
|
| 131 |
+
RETRY_LOOKBACK_WORDS = 60 # Expanded lookback for retry tier 1+2
|
| 132 |
+
RETRY_LOOKAHEAD_WORDS = 40 # Expanded lookahead for retry tier 1+2
|
| 133 |
+
MAX_EDIT_DISTANCE_RELAXED = 0.45 # Relaxed threshold for retry tier 2
|
| 134 |
+
MAX_CONSECUTIVE_FAILURES = 2 # Re-anchor within surah after this many DP failures
|
| 135 |
+
|
| 136 |
+
# Debug output
|
| 137 |
+
ANCHOR_DEBUG = True # Show detailed n-gram voting info (votes, top candidates)
|
| 138 |
+
PHONEME_ALIGNMENT_DEBUG = True # Show detailed alignment info (R, P, edit costs)
|
| 139 |
+
PHONEME_ALIGNMENT_PROFILING = True # Track and log timing breakdown (DP, window setup, etc.)
|
| 140 |
+
|
| 141 |
+
# =============================================================================
|
| 142 |
+
# Segmentation slider settings
|
| 143 |
+
# =============================================================================
|
| 144 |
+
|
| 145 |
+
# Segmentation presets: (min_silence_ms, min_speech_ms, pad_ms)
|
| 146 |
+
PRESET_MUJAWWAD = (600, 1500, 300) # Slow / Mujawwad recitation
|
| 147 |
+
PRESET_MURATTAL = (200, 1000, 100) # Normal pace (default)
|
| 148 |
+
PRESET_FAST = (75, 750, 40) # Fast recitation
|
| 149 |
+
|
| 150 |
+
# Slider ranges (defaults come from PRESET_MURATTAL)
|
| 151 |
+
MIN_SILENCE_MIN = 25
|
| 152 |
+
MIN_SILENCE_MAX = 1000
|
| 153 |
+
MIN_SILENCE_STEP = 25
|
| 154 |
+
|
| 155 |
+
MIN_SPEECH_MIN = 500
|
| 156 |
+
MIN_SPEECH_MAX = 2000
|
| 157 |
+
MIN_SPEECH_STEP = 250
|
| 158 |
+
|
| 159 |
+
PAD_MIN = 0
|
| 160 |
+
PAD_MAX = 300
|
| 161 |
+
PAD_STEP = 25
|
| 162 |
+
|
| 163 |
+
# =============================================================================
|
| 164 |
+
# Confidence thresholds for color coding
|
| 165 |
+
# =============================================================================
|
| 166 |
+
|
| 167 |
+
CONFIDENCE_HIGH = 0.8 # >= this: Green
|
| 168 |
+
CONFIDENCE_MED = 0.6 # >= this: Yellow, below: Red
|
| 169 |
+
REVIEW_SUMMARY_MAX_SEGMENTS = 15 # Max segment numbers to list before truncating
|
| 170 |
+
|
| 171 |
+
# Undersegmentation detection thresholds
|
| 172 |
+
# Flagged when (word_count >= MIN_WORDS OR ayah_span >= MIN_AYAH_SPAN) AND duration >= MIN_DURATION
|
| 173 |
+
UNDERSEG_MIN_WORDS = 20 # Word count threshold
|
| 174 |
+
UNDERSEG_MIN_AYAH_SPAN = 2 # Ayah span threshold (segment crosses ayah boundary)
|
| 175 |
+
UNDERSEG_MIN_DURATION = 15 # Duration gate (seconds)
|
| 176 |
+
|
| 177 |
+
# =============================================================================
|
| 178 |
+
# MFA forced alignment (word-level timestamps via HF Space)
|
| 179 |
+
# =============================================================================
|
| 180 |
+
|
| 181 |
+
MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
|
| 182 |
+
MFA_TIMEOUT = 120
|
| 183 |
+
|
| 184 |
+
# =============================================================================
|
| 185 |
+
# Usage logging (pushed to HF Hub via ParquetScheduler)
|
| 186 |
+
# =============================================================================
|
| 187 |
+
|
| 188 |
+
USAGE_LOG_DATASET_REPO = "hetchyy/quran-aligner-logs"
|
| 189 |
+
USAGE_LOG_PUSH_INTERVAL_MINUTES = 60
|
| 190 |
+
|
| 191 |
+
# =============================================================================
|
| 192 |
+
# Progress bar settings
|
| 193 |
+
# =============================================================================
|
| 194 |
+
|
| 195 |
+
PROGRESS_PROCESS_AUDIO = {
|
| 196 |
+
"vad_asr": (0.00, "Segmenting and transcribing..."),
|
| 197 |
+
"asr": (0.15, "Running ASR..."),
|
| 198 |
+
"special_segments": (0.50, "Detecting special segments..."),
|
| 199 |
+
"anchor": (0.60, "Anchor detection..."),
|
| 200 |
+
"matching": (0.80, "Text matching..."),
|
| 201 |
+
"building": (0.90, "Building results..."),
|
| 202 |
+
"done": (1.00, "Done!"),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
PROGRESS_RESEGMENT = {
|
| 206 |
+
"resegment": (0.00, "Resegmenting..."),
|
| 207 |
+
"asr": (0.15, "Running ASR..."),
|
| 208 |
+
"special_segments": (0.50, "Detecting special segments..."),
|
| 209 |
+
"anchor": (0.60, "Anchor detection..."),
|
| 210 |
+
"matching": (0.80, "Text matching..."),
|
| 211 |
+
"building": (0.90, "Building results..."),
|
| 212 |
+
"done": (1.00, "Done!"),
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
PROGRESS_RETRANSCRIBE = {
|
| 216 |
+
"retranscribe": (0.00, "Retranscribing with {model} model..."),
|
| 217 |
+
"asr": (0.15, "Running ASR..."),
|
| 218 |
+
"special_segments": (0.50, "Detecting special segments..."),
|
| 219 |
+
"anchor": (0.60, "Anchor detection..."),
|
| 220 |
+
"matching": (0.80, "Text matching..."),
|
| 221 |
+
"building": (0.90, "Building results..."),
|
| 222 |
+
"done": (1.00, "Done!"),
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
PROGRESS_COMPUTE_TIMESTAMPS = {
|
| 226 |
+
"upload": (0.00, "Uploading audio to MFA space..."),
|
| 227 |
+
"align": (0.20, "Running forced alignment..."),
|
| 228 |
+
"inject": (0.90, "Applying word timestamps..."),
|
| 229 |
+
"done": (1.00, "Done!"),
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
MFA_PROGRESS_SEGMENT_RATE = 0.05 # seconds per segment for progress bar animation
|
| 233 |
+
|
| 234 |
+
# =============================================================================
|
| 235 |
+
# UI settings
|
| 236 |
+
# =============================================================================
|
| 237 |
+
|
| 238 |
+
# Main layout column scales
|
| 239 |
+
LEFT_COLUMN_SCALE = 4
|
| 240 |
+
RIGHT_COLUMN_SCALE = 6
|
| 241 |
+
|
| 242 |
+
# Arabic font stack
|
| 243 |
+
ARABIC_FONT_STACK = "'DigitalKhatt', 'Traditional Arabic'"
|
| 244 |
+
|
| 245 |
+
QURAN_TEXT_SIZE_PX = 24 # Size for Quran text in segment cards
|
| 246 |
+
ARABIC_WORD_SPACING = "0.2em" # Word spacing for Arabic text
|
| 247 |
+
|
| 248 |
+
# =============================================================================
|
| 249 |
+
# Animation settings
|
| 250 |
+
# =============================================================================
|
| 251 |
+
|
| 252 |
+
# Animation granularity
|
| 253 |
+
ANIM_GRANULARITIES = ["Words", "Characters"]
|
| 254 |
+
ANIM_GRANULARITY_DEFAULT = "Words"
|
| 255 |
+
|
| 256 |
+
ANIM_WORD_COLOR = "#49c3b3" # Green highlight for active word
|
| 257 |
+
ANIM_STYLE_ROW_SCALES = (2, 6, 1, 1) # Granularity, Style, Verse Only, Color
|
| 258 |
+
|
| 259 |
+
ANIM_OPACITY_PREV_DEFAULT = 0.3 # Default "before" opacity
|
| 260 |
+
ANIM_OPACITY_AFTER_DEFAULT = 0.3 # Default "after" opacity
|
| 261 |
+
ANIM_OPACITY_STEP = 0.1 # Opacity slider step size
|
| 262 |
+
|
| 263 |
+
# Mega-card text styling sliders
|
| 264 |
+
MEGA_WORD_SPACING_MIN = 0.0
|
| 265 |
+
MEGA_WORD_SPACING_MAX = 1.0
|
| 266 |
+
MEGA_WORD_SPACING_STEP = 0.05
|
| 267 |
+
MEGA_WORD_SPACING_DEFAULT = 0.2 # matches ARABIC_WORD_SPACING
|
| 268 |
+
|
| 269 |
+
MEGA_TEXT_SIZE_MIN = 12
|
| 270 |
+
MEGA_TEXT_SIZE_MAX = 60
|
| 271 |
+
MEGA_TEXT_SIZE_STEP = 2
|
| 272 |
+
MEGA_TEXT_SIZE_DEFAULT = 30 # matches QURAN_TEXT_SIZE_PX
|
| 273 |
+
MEGA_SURAH_LIGATURE_SIZE = 2 # em — surah name ligature font size in megacard
|
| 274 |
+
|
| 275 |
+
MEGA_LINE_SPACING_MIN = 1.5
|
| 276 |
+
MEGA_LINE_SPACING_MAX = 3.0
|
| 277 |
+
MEGA_LINE_SPACING_STEP = 0.1
|
| 278 |
+
MEGA_LINE_SPACING_DEFAULT = 2 # matches mega-card line-height
|
| 279 |
+
|
| 280 |
+
# Window engine settings (all modes use the window engine internally)
|
| 281 |
+
ANIM_WINDOW_PREV_DEFAULT = 4 # Default number of visible previous words/chars
|
| 282 |
+
ANIM_WINDOW_AFTER_DEFAULT = 4 # Default number of visible after words/chars
|
| 283 |
+
ANIM_WINDOW_PREV_MIN = 0
|
| 284 |
+
ANIM_WINDOW_AFTER_MIN = 0
|
| 285 |
+
ANIM_WINDOW_PREV_MAX = 15
|
| 286 |
+
ANIM_WINDOW_AFTER_MAX = 15
|
| 287 |
+
|
| 288 |
+
# Presets map mode names to window engine parameter values
|
| 289 |
+
ANIM_DISPLAY_MODE_DEFAULT = "Reveal"
|
| 290 |
+
ANIM_DISPLAY_MODES = ["Reveal", "Fade", "Spotlight", "Isolate", "Consume", "Custom"]
|
| 291 |
+
ANIM_PRESETS = {
|
| 292 |
+
"Reveal": {
|
| 293 |
+
"prev_opacity": 1.0,
|
| 294 |
+
"prev_words": ANIM_WINDOW_PREV_MAX,
|
| 295 |
+
"after_opacity": 0.0,
|
| 296 |
+
"after_words": 0,
|
| 297 |
+
},
|
| 298 |
+
"Fade": {
|
| 299 |
+
"prev_opacity": 1.0,
|
| 300 |
+
"prev_words": ANIM_WINDOW_PREV_MAX,
|
| 301 |
+
"after_opacity": 0.3,
|
| 302 |
+
"after_words": ANIM_WINDOW_AFTER_MAX,
|
| 303 |
+
},
|
| 304 |
+
"Spotlight": {
|
| 305 |
+
"prev_opacity": 0.3,
|
| 306 |
+
"prev_words": ANIM_WINDOW_PREV_MAX,
|
| 307 |
+
"after_opacity": 0.3,
|
| 308 |
+
"after_words": ANIM_WINDOW_AFTER_MAX,
|
| 309 |
+
},
|
| 310 |
+
"Isolate": {
|
| 311 |
+
"prev_opacity": 0,
|
| 312 |
+
"prev_words": 0,
|
| 313 |
+
"after_opacity": 0,
|
| 314 |
+
"after_words": 0,
|
| 315 |
+
},
|
| 316 |
+
"Consume": {
|
| 317 |
+
"prev_opacity": 0,
|
| 318 |
+
"prev_words": 0,
|
| 319 |
+
"after_opacity": 0.3,
|
| 320 |
+
"after_words": ANIM_WINDOW_AFTER_MAX,
|
| 321 |
+
}
|
| 322 |
+
}
|
data/112.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efe1176e324ffe694ef16825c02f2102cd7f1f0d9bfe66148529aed65e9893fe
|
| 3 |
+
size 432301
|
data/7.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aef46f287881fbc2f110dbb553877c8e3a0e5ab01ba3289803e512de18bc970b
|
| 3 |
+
size 46313600
|
data/84.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e801be0312b45546ceea22735a0d9eb4286b3b9a506ba6e98bc8965341ea9ecf
|
| 3 |
+
size 2480855
|
data/DigitalKhattV2.otf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0935c48269a57c9808e52dfae47864c189396452901c689977156036a72dd217
|
| 3 |
+
size 521832
|
data/Juz' 30.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9493fd2161f098aa0c846998f6e4d45f8dd08517657cb10653132a2ee228e0b
|
| 3 |
+
size 54459943
|
data/digital_khatt_v2_script.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd396239908253a07dc530580fd978c515cbac1b7751b2680296580fb62b247c
|
| 3 |
+
size 14832957
|
data/font_data.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/ligatures.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"surah-1":"surah001","surah-2":"surah002","surah-3":"surah003","surah-4":"surah004","surah-5":"surah005","surah-6":"surah006","surah-7":"surah007","surah-8":"surah008","surah-9":"surah009","surah-10":"surah010","surah-11":"surah011","surah-12":"surah012","surah-13":"surah013","surah-14":"surah014","surah-15":"surah015","surah-16":"surah016","surah-17":"surah017","surah-18":"surah018","surah-19":"surah019","surah-20":"surah020","surah-21":"surah021","surah-22":"surah022","surah-23":"surah023","surah-24":"surah024","surah-25":"surah025","surah-26":"surah026","surah-27":"surah027","surah-28":"surah028","surah-29":"surah029","surah-30":"surah030","surah-31":"surah031","surah-32":"surah032","surah-33":"surah033","surah-34":"surah034","surah-35":"surah035","surah-36":"surah036","surah-37":"surah037","surah-38":"surah038","surah-39":"surah039","surah-40":"surah040","surah-41":"surah041","surah-42":"surah042","surah-43":"surah043","surah-44":"surah044","surah-45":"surah045","surah-46":"surah046","surah-47":"surah047","surah-48":"surah048","surah-49":"surah049","surah-50":"surah050","surah-51":"surah051","surah-52":"surah052","surah-53":"surah053","surah-54":"surah054","surah-55":"surah055","surah-56":"surah056","surah-57":"surah057","surah-58":"surah058","surah-59":"surah059","surah-60":"surah060","surah-61":"surah061","surah-62":"surah062","surah-63":"surah063","surah-64":"surah064","surah-65":"surah065","surah-66":"surah066","surah-67":"surah067","surah-68":"surah068","surah-69":"surah069","surah-70":"surah070","surah-71":"surah071","surah-72":"surah072","surah-73":"surah073","surah-74":"surah074","surah-75":"surah075","surah-76":"surah076","surah-77":"surah077","surah-78":"surah078","surah-79":"surah079","surah-80":"surah080","surah-81":"surah081","surah-82":"surah082","surah-83":"surah083","surah-84":"surah084","surah-85":"surah085","surah-86":"surah086","surah-87":"surah087","surah-88":"surah088","surah-89":"surah089","surah-90":"surah090","surah-91":"surah091","surah
-92":"surah092","surah-93":"surah093","surah-94":"surah094","surah-95":"surah095","surah-96":"surah096","surah-97":"surah097","surah-98":"surah098","surah-99":"surah099","surah-100":"surah100","surah-101":"surah101","surah-102":"surah102","surah-103":"surah103","surah-104":"surah104","surah-105":"surah105","surah-106":"surah106","surah-107":"surah107","surah-108":"surah108","surah-109":"surah109","surah-110":"surah110","surah-111":"surah111","surah-112":"surah112","surah-113":"surah113","surah-114":"surah114"}
|
data/phoneme_cache.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:027283ab3be8a239b99ba4b3ffeb869efddc3da6fce12e02473d3e335dbf3a04
|
| 3 |
+
size 7964064
|
data/phoneme_ngram_index_5.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b522664de41f590fc18fad385f023cb1a85829623cbcf035fe18152be52bc739
|
| 3 |
+
size 6205946
|
data/phoneme_sub_costs.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_meta": {
|
| 3 |
+
"description": "Phoneme substitution costs for DP alignment. Keys are 'phA|phB' (sorted). Values are float costs (0-1). Default substitution cost is 1.0.",
|
| 4 |
+
"version": 1
|
| 5 |
+
},
|
| 6 |
+
|
| 7 |
+
"consonant_gemination": {
|
| 8 |
+
"b|bb": 0.25,
|
| 9 |
+
"t|tt": 0.25,
|
| 10 |
+
"d|dd": 0.25,
|
| 11 |
+
"f|ff": 0.25,
|
| 12 |
+
"h|hh": 0.25,
|
| 13 |
+
"j|jj": 0.25,
|
| 14 |
+
"k|kk": 0.25,
|
| 15 |
+
"l|ll": 0.25,
|
| 16 |
+
"q|qq": 0.25,
|
| 17 |
+
"r|rr": 0.25,
|
| 18 |
+
"s|ss": 0.25,
|
| 19 |
+
"w|ww": 0.25,
|
| 20 |
+
"x|xx": 0.25,
|
| 21 |
+
"z|zz": 0.25,
|
| 22 |
+
"ð|ðð": 0.25,
|
| 23 |
+
"θ|θθ": 0.25,
|
| 24 |
+
"ħ|ħħ": 0.25,
|
| 25 |
+
"ʃ|ʃʃ": 0.25,
|
| 26 |
+
"ʒ|ʒʒ": 0.25,
|
| 27 |
+
"ʕ|ʕʕ": 0.25,
|
| 28 |
+
"sˤ|sˤsˤ": 0.25,
|
| 29 |
+
"dˤ|dˤdˤ": 0.25,
|
| 30 |
+
"tˤ|tˤtˤ": 0.25,
|
| 31 |
+
"ðˤ|ðˤðˤ": 0.25
|
| 32 |
+
},
|
| 33 |
+
|
| 34 |
+
"emphatic_pairs": {
|
| 35 |
+
"a|aˤ": 0.25,
|
| 36 |
+
"a:|aˤ:": 0.25,
|
| 37 |
+
"r|rˤ": 0.25,
|
| 38 |
+
"rˤ|rˤrˤ": 0.25,
|
| 39 |
+
"r|rˤrˤ": 0.25,
|
| 40 |
+
"l|lˤlˤ": 0.25,
|
| 41 |
+
"ll|lˤlˤ": 0.25
|
| 42 |
+
},
|
| 43 |
+
|
| 44 |
+
"nasal_ghunnah": {
|
| 45 |
+
"m|m̃": 0.25,
|
| 46 |
+
"n|ñ": 0.25,
|
| 47 |
+
"n|ŋ": 0.25,
|
| 48 |
+
"ŋ|ñ": 0.25,
|
| 49 |
+
"ŋ|m̃": 0.25,
|
| 50 |
+
"j|j̃": 0.25,
|
| 51 |
+
"w|w̃": 0.25
|
| 52 |
+
},
|
| 53 |
+
|
| 54 |
+
"vowel_length": {
|
| 55 |
+
"a|a:": 0.25,
|
| 56 |
+
"aˤ|aˤ:": 0.25,
|
| 57 |
+
"a|aˤ:": 0.25,
|
| 58 |
+
"aˤ|a:": 0.25,
|
| 59 |
+
"i|i:": 0.25,
|
| 60 |
+
"u|u:": 0.25
|
| 61 |
+
},
|
| 62 |
+
|
| 63 |
+
"common_subs": {
|
| 64 |
+
"n|l": 0.25,
|
| 65 |
+
"n|m": 0.25
|
| 66 |
+
}
|
| 67 |
+
}
|
data/qpc_hafs.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b2f91a19769275d0da57464002beacd8cec396b02b520aa14d17e3b135012a7
|
| 3 |
+
size 11596756
|
data/surah-name-v2.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d4678ef53ef76c361c32c13a9ad26317b8f8219089ab5822aafa6ed5d17502a
|
| 3 |
+
size 580388
|
data/surah_info.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
docs/api.md
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Documentation
|
| 2 |
+
|
| 3 |
+
## Current Endpoints
|
| 4 |
+
|
| 5 |
+
### `POST /process_audio_json`
|
| 6 |
+
|
| 7 |
+
Stateless endpoint. Accepts audio and segmentation parameters, returns aligned JSON output.
|
| 8 |
+
|
| 9 |
+
**Inputs:** audio file, min_silence_ms, min_speech_ms, pad_ms, model_name, device
|
| 10 |
+
|
| 11 |
+
**Returns:** JSON with `segments` array (segment index, timestamps, Quran references, matched text, confidence, errors).
|
| 12 |
+
|
| 13 |
+
**Limitation:** Every call requires re-uploading the audio. No way to resegment or retranscribe without re-sending the full file.
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## Planned: Session-Based Endpoints
|
| 18 |
+
|
| 19 |
+
The Gradio UI already caches intermediate results (preprocessed audio, VAD output, segment boundaries, model name) in `gr.State` so that resegment/retranscribe operations skip expensive steps. But `gr.State` is WebSocket-only — API clients using `gradio_client` can't benefit from this.
|
| 20 |
+
|
| 21 |
+
### Approach: Server-Side Session Store
|
| 22 |
+
|
| 23 |
+
On the first request, the server stores all intermediate data keyed by a UUID (`audio_id`) and returns it in the response. Subsequent requests reference this `audio_id` instead of re-uploading audio.
|
| 24 |
+
|
| 25 |
+
**What gets stored per session:**
|
| 26 |
+
- Preprocessed audio (float32, 16kHz mono) — saved to disk as `.npy`
|
| 27 |
+
- Raw VAD speech intervals — in memory (small)
|
| 28 |
+
- VAD completeness flags — in memory
|
| 29 |
+
- Cleaned segment boundaries — in memory
|
| 30 |
+
- Model name used — in memory
|
| 31 |
+
|
| 32 |
+
**Lifecycle:** Sessions expire after the same TTL as the existing Gradio cache (5 hours). A background thread purges expired sessions periodically. Audio files live under `/tmp/sessions/{audio_id}/`.
|
| 33 |
+
|
| 34 |
+
### `POST /process_audio_session`
|
| 35 |
+
|
| 36 |
+
Full pipeline. Same as `/process_audio_json` but additionally creates a server-side session.
|
| 37 |
+
|
| 38 |
+
**Inputs:** audio file, min_silence_ms, min_speech_ms, pad_ms, model_name, device
|
| 39 |
+
|
| 40 |
+
**Returns:** Same JSON as `/process_audio_json` with an added `audio_id` field.
|
| 41 |
+
|
| 42 |
+
### `POST /resegment_session`
|
| 43 |
+
|
| 44 |
+
Re-cleans VAD boundaries with new segmentation parameters and re-runs ASR. Skips audio upload, preprocessing, and VAD inference.
|
| 45 |
+
|
| 46 |
+
**Inputs:** audio_id, min_silence_ms, min_speech_ms, pad_ms, model_name, device
|
| 47 |
+
|
| 48 |
+
**Returns:** JSON with `segments` array and the same `audio_id`.
|
| 49 |
+
|
| 50 |
+
### `POST /retranscribe_session`
|
| 51 |
+
|
| 52 |
+
Re-runs ASR with a different model on the existing segment boundaries. Skips audio upload, preprocessing, VAD, and resegmentation.
|
| 53 |
+
|
| 54 |
+
**Inputs:** audio_id, model_name, device
|
| 55 |
+
|
| 56 |
+
**Returns:** JSON with `segments` array and the same `audio_id`.
|
| 57 |
+
|
| 58 |
+
### `POST /realign_from_timestamps`
|
| 59 |
+
|
| 60 |
+
Accepts an arbitrary list of `(start, end)` timestamp pairs and runs ASR + phoneme alignment on each slice. Skips VAD entirely — the client defines the segment boundaries directly. This is the core endpoint for timeline-based editing where the user drags segment boundaries manually.
|
| 61 |
+
|
| 62 |
+
**Inputs:** audio_id, timestamps (list of `{start, end}` objects in seconds), model_name, device
|
| 63 |
+
|
| 64 |
+
**Returns:** JSON with `segments` array and the same `audio_id`. Session boundaries are updated to match the provided timestamps.
|
| 65 |
+
|
| 66 |
+
Subsumes `/resegment_session` for most client use cases — the client can split, merge, and drag boundaries however they want, then send the final timestamp list in one call.
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## Planned: Segment Editing Endpoints
|
| 71 |
+
|
| 72 |
+
Fine-grained operations for modifying individual segments without reprocessing the full recitation.
|
| 73 |
+
|
| 74 |
+
### `POST /split_segment`
|
| 75 |
+
|
| 76 |
+
Split one segment at a given timestamp into two. Re-runs alignment on each half independently.
|
| 77 |
+
|
| 78 |
+
**Inputs:** audio_id, segment_index, split_time (seconds)
|
| 79 |
+
|
| 80 |
+
**Returns:** Updated `segments` array with the split segment replaced by two new segments.
|
| 81 |
+
|
| 82 |
+
### `POST /merge_segments`
|
| 83 |
+
|
| 84 |
+
Merge two adjacent segments into one. Re-runs alignment on the combined audio slice.
|
| 85 |
+
|
| 86 |
+
**Inputs:** audio_id, segment_index_a, segment_index_b (must be adjacent)
|
| 87 |
+
|
| 88 |
+
**Returns:** Updated `segments` array with the two segments replaced by one.
|
| 89 |
+
|
| 90 |
+
### `POST /adjust_boundary`
|
| 91 |
+
|
| 92 |
+
Shift a segment's start or end time. Re-runs alignment on the affected segment(s) and its neighbour if boundaries overlap.
|
| 93 |
+
|
| 94 |
+
**Inputs:** audio_id, segment_index, new_start (seconds, optional), new_end (seconds, optional)
|
| 95 |
+
|
| 96 |
+
**Returns:** Updated `segments` array.
|
| 97 |
+
|
| 98 |
+
### `POST /override_segment_text`
|
| 99 |
+
|
| 100 |
+
Manually assign a Quran reference range to a segment, skipping alignment entirely. For when the aligner gets it wrong and the user knows the correct ayah.
|
| 101 |
+
|
| 102 |
+
**Inputs:** audio_id, segment_index, ref_from (e.g. `"2:255:1"`), ref_to (e.g. `"2:255:7"`)
|
| 103 |
+
|
| 104 |
+
**Returns:** Updated segment with the overridden reference and corresponding Quran text.
|
| 105 |
+
|
| 106 |
+
### `POST /bulk_update_segments`
|
| 107 |
+
|
| 108 |
+
Batch update: client sends a full modified segment list (adjusted times, overridden labels). Server validates, persists to session, and optionally re-aligns changed segments.
|
| 109 |
+
|
| 110 |
+
**Inputs:** audio_id, segments (list of `{start, end, ref_from?, ref_to?}`), realign (boolean, default true — re-run ASR on segments whose boundaries changed)
|
| 111 |
+
|
| 112 |
+
**Returns:** Full updated `segments` array.
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## Planned: Word-Level Timing
|
| 117 |
+
|
| 118 |
+
### `POST /compute_word_timestamps`
|
| 119 |
+
|
| 120 |
+
Compute word-level start/end times for every word in every segment. This is the backbone of karaoke-style highlighting and word-by-word caption animation.
|
| 121 |
+
|
| 122 |
+
**Inputs:** audio_id, model_name, device
|
| 123 |
+
|
| 124 |
+
**Returns:** JSON with per-segment word timestamps:
|
| 125 |
+
```json
|
| 126 |
+
{
|
| 127 |
+
"audio_id": "...",
|
| 128 |
+
"segments": [
|
| 129 |
+
{
|
| 130 |
+
"segment": 1,
|
| 131 |
+
"words": [
|
| 132 |
+
{"word": "بِسْمِ", "start": 0.81, "end": 1.12},
|
| 133 |
+
{"word": "اللَّهِ", "start": 1.12, "end": 1.45}
|
| 134 |
+
]
|
| 135 |
+
}
|
| 136 |
+
]
|
| 137 |
+
}
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
## Planned: Export Endpoints
|
| 143 |
+
|
| 144 |
+
Generate subtitle files from session data. All accept `audio_id` and optionally use word-level timestamps if previously computed.
|
| 145 |
+
|
| 146 |
+
### `POST /export_srt`
|
| 147 |
+
|
| 148 |
+
Standard SRT subtitle format. One entry per segment (or per word if `word_level=true`).
|
| 149 |
+
|
| 150 |
+
**Inputs:** audio_id, word_level (boolean, default false)
|
| 151 |
+
|
| 152 |
+
**Returns:** SRT file content.
|
| 153 |
+
|
| 154 |
+
### `POST /export_vtt`
|
| 155 |
+
|
| 156 |
+
WebVTT format. Supports styling cues and is the standard for web video players.
|
| 157 |
+
|
| 158 |
+
**Inputs:** audio_id, word_level (boolean, default false)
|
| 159 |
+
|
| 160 |
+
**Returns:** VTT file content.
|
| 161 |
+
|
| 162 |
+
### `POST /export_ass`
|
| 163 |
+
|
| 164 |
+
ASS/SSA format with Arabic font and styling presets. Most useful for video editors producing styled Quran captions.
|
| 165 |
+
|
| 166 |
+
**Inputs:** audio_id, word_level (boolean, default false), font_name (optional), font_size (optional)
|
| 167 |
+
|
| 168 |
+
**Returns:** ASS file content.
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
## Planned: Quran Lookup Endpoints
|
| 173 |
+
|
| 174 |
+
Utility endpoints for client-side UI (dropdowns, search, manual labelling).
|
| 175 |
+
|
| 176 |
+
### `GET /quran_text`
|
| 177 |
+
|
| 178 |
+
Return Quran text with diacritics for a given reference range.
|
| 179 |
+
|
| 180 |
+
**Inputs:** ref_from (e.g. `"2:255:1"`), ref_to (e.g. `"2:255:7"`)
|
| 181 |
+
|
| 182 |
+
**Returns:** `{"text": "...", "ref_from": "...", "ref_to": "..."}`. All 114 chapters are pre-cached in memory.
|
| 183 |
+
|
| 184 |
+
### `GET /surah_info`
|
| 185 |
+
|
| 186 |
+
List of all surahs with metadata.
|
| 187 |
+
|
| 188 |
+
**Returns:** Array of `{number, name_arabic, name_english, ayah_count, revelation_type}`.
|
| 189 |
+
|
| 190 |
+
---
|
| 191 |
+
|
| 192 |
+
## Planned: Recitation Analytics
|
| 193 |
+
|
| 194 |
+
### `POST /recitation_stats`
|
| 195 |
+
|
| 196 |
+
Derive pace and timing analytics from an existing session's alignment results.
|
| 197 |
+
|
| 198 |
+
**Inputs:** audio_id
|
| 199 |
+
|
| 200 |
+
**Returns:**
|
| 201 |
+
```json
|
| 202 |
+
{
|
| 203 |
+
"audio_id": "...",
|
| 204 |
+
"total_duration_sec": 312.5,
|
| 205 |
+
"total_segments": 7,
|
| 206 |
+
"total_words": 86,
|
| 207 |
+
"words_per_minute": 16.5,
|
| 208 |
+
"avg_segment_duration_sec": 8.2,
|
| 209 |
+
"avg_pause_duration_sec": 1.4,
|
| 210 |
+
"per_segment": [
|
| 211 |
+
{
|
| 212 |
+
"segment": 1,
|
| 213 |
+
"ref_from": "112:1:1",
|
| 214 |
+
"ref_to": "112:1:4",
|
| 215 |
+
"duration_sec": 2.18,
|
| 216 |
+
"word_count": 4,
|
| 217 |
+
"words_per_minute": 110.1,
|
| 218 |
+
"pause_after_sec": 1.82
|
| 219 |
+
}
|
| 220 |
+
]
|
| 221 |
+
}
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
Useful for learning apps tracking student fluency, reciter comparisons, or detecting rushed/slow sections.
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## Planned: Streaming
|
| 229 |
+
|
| 230 |
+
### `POST /process_chunk`
|
| 231 |
+
|
| 232 |
+
Streaming-friendly endpoint for incremental audio processing. The client sends audio chunks as they become available, and the server returns partial alignment results progressively. Designed for live "now playing" displays (e.g. Quran radio showing the current ayah in real time).
|
| 233 |
+
|
| 234 |
+
**Inputs:** audio_id (optional — omit on first chunk to start a new session), audio_chunk (raw audio bytes), is_final (boolean)
|
| 235 |
+
|
| 236 |
+
**Returns:**
|
| 237 |
+
```json
|
| 238 |
+
{
|
| 239 |
+
"audio_id": "...",
|
| 240 |
+
"status": "partial",
|
| 241 |
+
"latest_segments": [
|
| 242 |
+
{
|
| 243 |
+
"segment": 5,
|
| 244 |
+
"ref_from": "36:1:1",
|
| 245 |
+
"ref_to": "36:1:2",
|
| 246 |
+
"matched_text": "يسٓ",
|
| 247 |
+
"time_from": 24.3,
|
| 248 |
+
"time_to": 25.8,
|
| 249 |
+
"confidence": 0.95
|
| 250 |
+
}
|
| 251 |
+
]
|
| 252 |
+
}
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
When `is_final=true`, the server finalises the session and returns the complete aligned output (same structure as `/process_audio_session`).
|
| 256 |
+
|
| 257 |
+
**Chunking notes:** The server buffers audio internally and runs VAD + ASR when enough speech has accumulated to form a segment. Earlier segments are locked in and won't change; only the trailing edge is provisional.
|
| 258 |
+
|
| 259 |
+
---
|
| 260 |
+
|
| 261 |
+
## Planned: Health / Status
|
| 262 |
+
|
| 263 |
+
### `GET /health`
|
| 264 |
+
|
| 265 |
+
Server status for monitoring dashboards and client-side availability checks.
|
| 266 |
+
|
| 267 |
+
**Returns:**
|
| 268 |
+
```json
|
| 269 |
+
{
|
| 270 |
+
"status": "ok",
|
| 271 |
+
"gpu_available": true,
|
| 272 |
+
"gpu_quota_exhausted": false,
|
| 273 |
+
"quota_reset_time": null,
|
| 274 |
+
"active_sessions": 12,
|
| 275 |
+
"models_loaded": ["Base", "Large"],
|
| 276 |
+
"uptime_sec": 84200
|
| 277 |
+
}
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
---
|
| 281 |
+
|
| 282 |
+
## Error Handling
|
| 283 |
+
|
| 284 |
+
If `audio_id` is missing, expired, or invalid, session endpoints return:
|
| 285 |
+
|
| 286 |
+
```json
|
| 287 |
+
{"error": "Session not found or expired", "segments": []}
|
| 288 |
+
```
|
| 289 |
+
|
| 290 |
+
The client should call `/process_audio_session` again to get a fresh session.
|
| 291 |
+
|
| 292 |
+
---
|
| 293 |
+
|
| 294 |
+
## Design Notes
|
| 295 |
+
|
| 296 |
+
- **Thread safety:** Gradio handles concurrent requests via threading. The session store uses a lock around its internal dict.
|
| 297 |
+
- **Storage:** Audio on disk (can be large), metadata in memory (always small). Audio loaded via memory-mapped reads on demand.
|
| 298 |
+
- **No auth needed:** Session IDs are 128-bit random UUIDs — effectively unguessable.
|
| 299 |
+
- **HF Spaces compatibility:** `/tmp` is ephemeral and cleared on restart, which is fine since sessions are transient. The existing `allowed_paths=["/tmp"]` covers the new directory.
|
| 300 |
+
- **Backward compatible:** `/process_audio_json` remains unchanged.
|
docs/usage-logging.md
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Usage Logging
|
| 2 |
+
|
| 3 |
+
## Part 1 — Reference: `recitation_app` Logging
|
| 4 |
+
|
| 5 |
+
Documents the recitation logging system used in `recitation_app` to collect anonymised analysis data on HuggingFace Hub. Included here as a reference for the quran_aligner schema below.
|
| 6 |
+
|
| 7 |
+
### Dataset
|
| 8 |
+
|
| 9 |
+
| Property | Value |
|
| 10 |
+
|----------|-------|
|
| 11 |
+
| Repo | `hetchyy/recitation-logs` (private) |
|
| 12 |
+
| Type | HuggingFace Dataset |
|
| 13 |
+
| Format | Parquet files in `data/` |
|
| 14 |
+
| Push interval | 1 minute |
|
| 15 |
+
|
| 16 |
+
Configured in `config.py`:
|
| 17 |
+
|
| 18 |
+
```python
|
| 19 |
+
USAGE_LOG_DATASET_REPO = "hetchyy/recitation-logs"
|
| 20 |
+
USAGE_LOG_PUSH_INTERVAL_MINUTES = 1
|
| 21 |
+
USAGE_LOG_AUDIO = False # toggleable at runtime
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### Schema
|
| 25 |
+
|
| 26 |
+
Defined in `utils/usage_logger.py` as `_RECITATION_SCHEMA`:
|
| 27 |
+
|
| 28 |
+
| Field | HF Type | Description |
|
| 29 |
+
|-------|---------|-------------|
|
| 30 |
+
| `audio` | `Audio` | Optional FLAC-encoded audio bytes embedded in parquet |
|
| 31 |
+
| `timestamp` | `Value(string)` | ISO 8601 datetime of the analysis |
|
| 32 |
+
| `user_id` | `Value(string)` | SHA-256 hash (12-char) of username or IP+UA |
|
| 33 |
+
| `verse_ref` | `Value(string)` | Quranic reference, e.g. `"1:1"` |
|
| 34 |
+
| `canonical_text` | `Value(string)` | Arabic text of the verse |
|
| 35 |
+
| `segments` | `Value(string)` | JSON array of segment results (see below) |
|
| 36 |
+
| `multi_model` | `Value(bool)` | Whether multiple ASR models were used |
|
| 37 |
+
| `settings` | `Value(string)` | JSON dict of Tajweed settings |
|
| 38 |
+
| `vad_timestamps` | `Value(string)` | JSON list of VAD segment boundaries |
|
| 39 |
+
|
| 40 |
+
#### Segment object (inside `segments` JSON)
|
| 41 |
+
|
| 42 |
+
```json
|
| 43 |
+
{
|
| 44 |
+
"segment_ref": "1:1",
|
| 45 |
+
"canonical_phonemes": "b i s m i ...",
|
| 46 |
+
"detected_phonemes": "b i s m i ..."
|
| 47 |
+
}
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
#### Settings object (inside `settings` JSON)
|
| 51 |
+
|
| 52 |
+
```json
|
| 53 |
+
{
|
| 54 |
+
"tolerance": 0.15,
|
| 55 |
+
"iqlab_sound": "m",
|
| 56 |
+
"ghunnah_length": 2,
|
| 57 |
+
"jaiz_length": 4,
|
| 58 |
+
"wajib_length": 4,
|
| 59 |
+
"arid_length": 2,
|
| 60 |
+
"leen_length": 2
|
| 61 |
+
}
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
### ParquetScheduler
|
| 65 |
+
|
| 66 |
+
Custom subclass of `huggingface_hub.CommitScheduler` (`utils/usage_logger.py`).
|
| 67 |
+
|
| 68 |
+
#### How it works
|
| 69 |
+
|
| 70 |
+
1. **Buffer** — Rows accumulate in an in-memory list via `.append(row)`. Access is protected by a threading lock.
|
| 71 |
+
2. **Flush** — On each scheduler tick (every `USAGE_LOG_PUSH_INTERVAL_MINUTES`):
|
| 72 |
+
- Lock the buffer, swap it out, release the lock.
|
| 73 |
+
- For any `audio` field containing a file path, read the file and convert to `{"path": filename, "bytes": binary_data}`.
|
| 74 |
+
- Build a PyArrow table from the rows.
|
| 75 |
+
- Embed the HF feature schema in parquet metadata:
|
| 76 |
+
```python
|
| 77 |
+
table.replace_schema_metadata(
|
| 78 |
+
{"huggingface": json.dumps({"info": {"features": schema}})}
|
| 79 |
+
)
|
| 80 |
+
```
|
| 81 |
+
- Write to a temp parquet file, then upload via `api.upload_file()` to `data/{uuid4()}.parquet`.
|
| 82 |
+
- Clean up temp audio files.
|
| 83 |
+
|
| 84 |
+
#### Audio encoding
|
| 85 |
+
|
| 86 |
+
When `USAGE_LOG_AUDIO` is enabled:
|
| 87 |
+
|
| 88 |
+
```python
|
| 89 |
+
sf.write(filepath, audio_array, sample_rate, format="FLAC")
|
| 90 |
+
row["audio"] = str(filepath) # ParquetScheduler reads and embeds the bytes
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
The audio is 16kHz mono, encoded as FLAC, and stored as embedded bytes inside the parquet file.
|
| 94 |
+
|
| 95 |
+
### Lazy Initialisation
|
| 96 |
+
|
| 97 |
+
Schedulers are **not** created at import time. They are initialised on first call to `_ensure_schedulers()` using double-checked locking:
|
| 98 |
+
|
| 99 |
+
```python
|
| 100 |
+
_recitation_scheduler = None
|
| 101 |
+
_schedulers_initialized = False
|
| 102 |
+
_init_lock = threading.Lock()
|
| 103 |
+
|
| 104 |
+
def _ensure_schedulers():
|
| 105 |
+
global _recitation_scheduler, _schedulers_initialized
|
| 106 |
+
if _schedulers_initialized:
|
| 107 |
+
return
|
| 108 |
+
with _init_lock:
|
| 109 |
+
if _schedulers_initialized:
|
| 110 |
+
return
|
| 111 |
+
_schedulers_initialized = True
|
| 112 |
+
_recitation_scheduler = ParquetScheduler(
|
| 113 |
+
repo_id=USAGE_LOG_DATASET_REPO,
|
| 114 |
+
schema=_RECITATION_SCHEMA,
|
| 115 |
+
every=USAGE_LOG_PUSH_INTERVAL_MINUTES,
|
| 116 |
+
path_in_repo="data",
|
| 117 |
+
repo_type="dataset",
|
| 118 |
+
private=True,
|
| 119 |
+
)
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
This avoids interfering with ZeroGPU, which is sensitive to early network calls.
|
| 123 |
+
|
| 124 |
+
### Error Logging
|
| 125 |
+
|
| 126 |
+
Errors use a separate `CommitScheduler` (not `ParquetScheduler`) that watches a local directory:
|
| 127 |
+
|
| 128 |
+
- Local path: `/usage_logs/errors/error_log-{uuid4()}.jsonl`
|
| 129 |
+
- Remote path: `data/errors/`
|
| 130 |
+
- Format: JSONL with fields `timestamp`, `user_id`, `verse_ref`, `error_message`
|
| 131 |
+
|
| 132 |
+
Errors are appended to the JSONL file under a file lock. The `CommitScheduler` syncs the directory to Hub periodically.
|
| 133 |
+
|
| 134 |
+
### User Anonymisation
|
| 135 |
+
|
| 136 |
+
```python
|
| 137 |
+
def get_user_id(request) -> str:
|
| 138 |
+
username = getattr(request, "username", None)
|
| 139 |
+
if username:
|
| 140 |
+
return hashlib.sha256(username.encode()).hexdigest()[:12]
|
| 141 |
+
ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
|
| 142 |
+
ua = headers.get("user-agent", "")
|
| 143 |
+
return hashlib.sha256(f"{ip}|{ua}".encode()).hexdigest()[:12]
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
- Logged-in HF users: hash of username
|
| 147 |
+
- Anonymous users: hash of IP + User-Agent
|
| 148 |
+
- Always truncated to 12 hex characters
|
| 149 |
+
|
| 150 |
+
### Fallback
|
| 151 |
+
|
| 152 |
+
If the scheduler fails to initialise (no HF token, network issues), rows are written to a local JSONL file at `usage_logs/recitations_fallback.jsonl` (without audio).
|
| 153 |
+
|
| 154 |
+
### Integration Point
|
| 155 |
+
|
| 156 |
+
Logging is called from the audio processing handler (`ui/handlers/audio_processing.py`) after each analysis completes:
|
| 157 |
+
|
| 158 |
+
```python
|
| 159 |
+
log_analysis(
|
| 160 |
+
user_id, ref, text, segments,
|
| 161 |
+
multi_model=bool(use_multi),
|
| 162 |
+
settings=_settings,
|
| 163 |
+
audio=audio_for_log, # tuple of (sample_rate, np.ndarray) or None
|
| 164 |
+
vad_timestamps=vad_ts, # list of [start, end] pairs
|
| 165 |
+
)
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
Errors are logged separately:
|
| 169 |
+
|
| 170 |
+
```python
|
| 171 |
+
log_error(user_id, ref, "Audio loading failed")
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
### Dependencies
|
| 175 |
+
|
| 176 |
+
- `huggingface_hub` — `CommitScheduler` base class and Hub API
|
| 177 |
+
- `pyarrow` — Parquet table creation and schema metadata
|
| 178 |
+
- `soundfile` — FLAC audio encoding
|
| 179 |
+
|
| 180 |
+
---
|
| 181 |
+
|
| 182 |
+
## Part 2 — `quran_aligner` Logging Schema
|
| 183 |
+
|
| 184 |
+
Schema for logging alignment runs from this project. One row per audio upload. The row is mutated in-place while it sits in the `ParquetScheduler` buffer (before the next push-to-Hub tick). Run-level fields (profiling, reciter stats, quality stats, settings) are **overwritten** to reflect the latest run. Segment results are **appended** so every setting combination is preserved.
|
| 185 |
+
|
| 186 |
+
### Run-level fields
|
| 187 |
+
|
| 188 |
+
#### Identity
|
| 189 |
+
|
| 190 |
+
| Field | HF Type | Description |
|
| 191 |
+
|-------|---------|-------------|
|
| 192 |
+
| `audio` | `Audio` | FLAC-encoded audio (16kHz mono) |
|
| 193 |
+
| `audio_id` | `Value(string)` | `{sha256(audio_bytes)[:16]}:{timestamp}`, e.g. `a3f7b2c91e04d8f2:20260203T141532` |
|
| 194 |
+
| `timestamp` | `Value(string)` | ISO 8601 datetime truncated to seconds, e.g. `2026-02-03T01:50:45` |
|
| 195 |
+
| `user_id` | `Value(string)` | SHA-256 hash (12-char) of IP+UA |
|
| 196 |
+
|
| 197 |
+
The `audio_id` hash prefix enables grouping/deduplication of the same recording across runs; the timestamp suffix makes each run unique. Cost is ~90ms for a 5-minute recording.
|
| 198 |
+
|
| 199 |
+
#### Input metadata
|
| 200 |
+
|
| 201 |
+
| Field | HF Type | Description |
|
| 202 |
+
|-------|---------|-------------|
|
| 203 |
+
| `audio_duration_s` | `Value(float64)` | Total audio duration in seconds |
|
| 204 |
+
| `num_segments` | `Value(int32)` | Number of VAD segments |
|
| 205 |
+
| `surah` | `Value(int32)` | Detected surah (1-114) |
|
| 206 |
+
|
| 207 |
+
#### Segmentation settings
|
| 208 |
+
|
| 209 |
+
| Field | HF Type | Description |
|
| 210 |
+
|-------|---------|-------------|
|
| 211 |
+
| `min_silence_ms` | `Value(int32)` | Minimum silence duration to split |
|
| 212 |
+
| `min_speech_ms` | `Value(int32)` | Minimum speech duration for a valid segment |
|
| 213 |
+
| `pad_ms` | `Value(int32)` | Padding around speech segments |
|
| 214 |
+
| `asr_model` | `Value(string)` | `"Base"` (`hetchyy/r15_95m`) or `"Large"` (`hetchyy/r7`) |
|
| 215 |
+
| `device` | `Value(string)` | `"GPU"` or `"CPU"` |
|
| 216 |
+
|
| 217 |
+
#### Profiling (seconds)
|
| 218 |
+
|
| 219 |
+
| Field | HF Type | Description |
|
| 220 |
+
|-------|---------|-------------|
|
| 221 |
+
| `total_time` | `Value(float64)` | End-to-end pipeline wall time |
|
| 222 |
+
| `vad_queue_time` | `Value(float64)` | VAD queue wait time |
|
| 223 |
+
| `vad_gpu_time` | `Value(float64)` | VAD actual GPU execution |
|
| 224 |
+
| `asr_gpu_time` | `Value(float64)` | ASR actual GPU execution |
|
| 225 |
+
| `dp_total_time` | `Value(float64)` | Total DP alignment across all segments |
|
| 226 |
+
|
| 227 |
+
#### Quality & retry stats
|
| 228 |
+
|
| 229 |
+
| Field | HF Type | Description |
|
| 230 |
+
|-------|---------|-------------|
|
| 231 |
+
| `segments_passed` | `Value(int32)` | Segments with confidence > 0 |
|
| 232 |
+
| `segments_failed` | `Value(int32)` | Segments with confidence <= 0 |
|
| 233 |
+
| `mean_confidence` | `Value(float64)` | Average confidence across all segments |
|
| 234 |
+
| `tier1_retries` | `Value(int32)` | Expanded-window retry attempts |
|
| 235 |
+
| `tier1_passed` | `Value(int32)` | Successful tier 1 retries |
|
| 236 |
+
| `tier2_retries` | `Value(int32)` | Relaxed-threshold retry attempts |
|
| 237 |
+
| `tier2_passed` | `Value(int32)` | Successful tier 2 retries |
|
| 238 |
+
| `reanchors` | `Value(int32)` | Re-anchor events (after consecutive failures) |
|
| 239 |
+
| `special_merges` | `Value(int32)` | Basmala-fused segments |
|
| 240 |
+
|
| 241 |
+
#### Reciter stats
|
| 242 |
+
|
| 243 |
+
Computed from matched segments (those with `word_count > 0`). Already calculated in `app.py:877-922` for console output.
|
| 244 |
+
|
| 245 |
+
| Field | HF Type | Description |
|
| 246 |
+
|-------|---------|-------------|
|
| 247 |
+
| `words_per_minute` | `Value(float64)` | `total_words / (total_speech_s / 60)` |
|
| 248 |
+
| `phonemes_per_second` | `Value(float64)` | `total_phonemes / total_speech_s` |
|
| 249 |
+
| `avg_segment_duration` | `Value(float64)` | Mean duration of matched segments |
|
| 250 |
+
| `std_segment_duration` | `Value(float64)` | Std dev of matched segment durations |
|
| 251 |
+
| `avg_pause_duration` | `Value(float64)` | Mean inter-segment silence gap |
|
| 252 |
+
| `std_pause_duration` | `Value(float64)` | Std dev of pause durations |
|
| 253 |
+
|
| 254 |
+
#### Session flags
|
| 255 |
+
|
| 256 |
+
| Field | HF Type | Description |
|
| 257 |
+
|-------|---------|-------------|
|
| 258 |
+
| `resegmented` | `Value(bool)` | User resegmented with different VAD settings |
|
| 259 |
+
| `retranscribed` | `Value(bool)` | User retranscribed with a different ASR model |
|
| 260 |
+
|
| 261 |
+
#### Segments, timestamps & error
|
| 262 |
+
|
| 263 |
+
| Field | HF Type | Description |
|
| 264 |
+
|-------|---------|-------------|
|
| 265 |
+
| `segments` | `Value(string)` | JSON array of run objects (see below) — **appended** on resegment/retranscribe |
|
| 266 |
+
| `word_timestamps` | `Value(string)` | JSON array of per-segment MFA word timings (see below), null until computed |
|
| 267 |
+
| `error` | `Value(string)` | Top-level error message if the pipeline failed |
|
| 268 |
+
|
| 269 |
+
### Segment runs (inside `segments` JSON)
|
| 270 |
+
|
| 271 |
+
Each run with different settings appends a new run object. The array preserves the full history so every setting combination is available.
|
| 272 |
+
|
| 273 |
+
```json
|
| 274 |
+
[
|
| 275 |
+
{
|
| 276 |
+
"min_silence_ms": 200,
|
| 277 |
+
"min_speech_ms": 1000,
|
| 278 |
+
"pad_ms": 100,
|
| 279 |
+
"asr_model": "Base",
|
| 280 |
+
"segments": [
|
| 281 |
+
{
|
| 282 |
+
"idx": 1,
|
| 283 |
+
"start": 0.512,
|
| 284 |
+
"end": 3.841,
|
| 285 |
+
"duration": 3.329,
|
| 286 |
+
"ref": "2:255:1-2:255:5",
|
| 287 |
+
"confidence": 0.87,
|
| 288 |
+
"word_count": 5,
|
| 289 |
+
"ayah_span": 1,
|
| 290 |
+
"phoneme_count": 42,
|
| 291 |
+
"undersegmented": false,
|
| 292 |
+
"missing_words": false,
|
| 293 |
+
"special_type": null,
|
| 294 |
+
"error": null
|
| 295 |
+
}
|
| 296 |
+
]
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"min_silence_ms": 600,
|
| 300 |
+
"min_speech_ms": 1500,
|
| 301 |
+
"pad_ms": 300,
|
| 302 |
+
"asr_model": "Base",
|
| 303 |
+
"segments": [...]
|
| 304 |
+
}
|
| 305 |
+
]
|
| 306 |
+
```
|
| 307 |
+
|
| 308 |
+
#### Run object
|
| 309 |
+
|
| 310 |
+
| Field | Type | Description |
|
| 311 |
+
|-------|------|-------------|
|
| 312 |
+
| `min_silence_ms` | int | Silence setting used for this run |
|
| 313 |
+
| `min_speech_ms` | int | Speech setting used for this run |
|
| 314 |
+
| `pad_ms` | int | Pad setting used for this run |
|
| 315 |
+
| `asr_model` | string | `"Base"` or `"Large"` |
|
| 316 |
+
| `segments` | array | Per-segment objects for this run |
|
| 317 |
+
|
| 318 |
+
#### Per-segment object
|
| 319 |
+
|
| 320 |
+
| Field | Type | Description |
|
| 321 |
+
|-------|------|-------------|
|
| 322 |
+
| `idx` | int | 1-indexed segment number |
|
| 323 |
+
| `start` | float | Segment start time in seconds |
|
| 324 |
+
| `end` | float | Segment end time in seconds |
|
| 325 |
+
| `duration` | float | `end - start` |
|
| 326 |
+
| `ref` | string | Matched reference `"S:A:W1-S:A:W2"`, empty if failed |
|
| 327 |
+
| `confidence` | float | Alignment confidence [0.0, 1.0] |
|
| 328 |
+
| `word_count` | int | Number of words matched |
|
| 329 |
+
| `ayah_span` | int | Number of ayahs spanned |
|
| 330 |
+
| `phoneme_count` | int | Length of ASR phoneme sequence |
|
| 331 |
+
| `undersegmented` | bool | Flagged if `word_count >= 20`, or if both `ayah_span >= 2` and `duration >= 15s` |
|
| 332 |
+
| `missing_words` | bool | Gaps detected in word alignment |
|
| 333 |
+
| `special_type` | string\|null | `"Basmala"`, `"Isti'adha"`, `"Isti'adha+Basmala"`, or null |
|
| 334 |
+
| `error` | string\|null | Per-segment error message |
|
| 335 |
+
|
| 336 |
+
### Word timestamps (inside `word_timestamps` JSON)
|
| 337 |
+
|
| 338 |
+
Populated when the user computes MFA timestamps. Array of per-segment word timing arrays:
|
| 339 |
+
|
| 340 |
+
```json
|
| 341 |
+
[
|
| 342 |
+
{
|
| 343 |
+
"segment_idx": 1,
|
| 344 |
+
"ref": "2:255:1-2:255:5",
|
| 345 |
+
"words": [
|
| 346 |
+
{"word": "ٱللَّهُ", "start": 0.512, "end": 0.841},
|
| 347 |
+
{"word": "لَآ", "start": 0.870, "end": 1.023}
|
| 348 |
+
]
|
| 349 |
+
}
|
| 350 |
+
]
|
| 351 |
+
```
|
| 352 |
+
|
| 353 |
+
### In-place mutation
|
| 354 |
+
|
| 355 |
+
The row dict is appended to `ParquetScheduler` on the initial run, and a reference is stored in `gr.State`. Subsequent actions (resegment, retranscribe, compute timestamps) mutate the dict in-place before the next push-to-Hub tick (every 1 minute).
|
| 356 |
+
|
| 357 |
+
- **Overwritten on each run:** profiling, quality/retry stats, reciter stats, run-level settings (`min_silence_ms`, `asr_model`, etc.), `num_segments`, `surah`.
|
| 358 |
+
- **Appended on each run:** `segments` JSON array gains a new run object with its settings and per-segment results.
|
| 359 |
+
- **Set once:** `word_timestamps` is populated when the user computes MFA timestamps (null until then).
|
| 360 |
+
- **If the push already fired** before a subsequent action, the mutation only touches the local dict and never reaches the already-uploaded row. The new results are lost for that row — acceptable since the initial run is always captured.
|
| 361 |
+
|
| 362 |
+
### Design rationale
|
| 363 |
+
|
| 364 |
+
- **Settings are denormalised** into each row so config changes can be correlated with quality without joins.
|
| 365 |
+
- **Profiling fields are flat columns**, not nested JSON, so they are directly queryable in the HF dataset viewer and pandas.
|
| 366 |
+
- **Segments are an array of run objects** — each run includes its settings alongside the per-segment results, so different setting combinations are preserved even though run-level fields reflect the latest state.
|
| 367 |
+
- **`mean_confidence` is pre-computed** at the run level for easy filtering and sorting without parsing the segments array.
|
| 368 |
+
- **Audio is always uploaded** as the first column so every run is reproducible and the dataset is playable in the HF viewer.
|
| 369 |
+
- **`audio_id`** combines a content hash with a timestamp — the hash prefix groups re-runs of the same recording, the suffix makes each row unique.
|
| 370 |
+
- **All sources are from existing objects** — `ProfilingData` (segment_processor.py), `SegmentInfo` (segment_processor.py), and `config.py` values. No new computation is required beyond assembling the row.
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=6.5.1
|
| 2 |
+
spaces>=0.44.0
|
| 3 |
+
torch==2.8.0
|
| 4 |
+
transformers==5.0.0
|
| 5 |
+
accelerate==1.11.0
|
| 6 |
+
librosa==0.10.2
|
| 7 |
+
numpy>=1.24.0,<2.0.0
|
| 8 |
+
requests>=2.28.0
|
| 9 |
+
pyarrow>=14.0.0
|
| 10 |
+
soundfile>=0.12.0
|
| 11 |
+
cython>=3.0.0
|
| 12 |
+
recitations_segmenter==1.0.0
|
| 13 |
+
git+https://github.com/Hetchy/Quranic-Phonemizer.git@1b6a8cc
|
scripts/add_open_tanween.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Replace regular tanween in qpc_hafs.json with open tanween where digital_khatt uses them."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
| 7 |
+
|
| 8 |
+
OPEN_TO_REGULAR = {
|
| 9 |
+
"\u08F0": "\u064B", # open fathatan → regular fathatan
|
| 10 |
+
"\u08F1": "\u064C", # open dammatan → regular dammatan
|
| 11 |
+
"\u08F2": "\u064D", # open kasratan → regular kasratan
|
| 12 |
+
}
|
| 13 |
+
REGULAR_TO_OPEN = {v: k for k, v in OPEN_TO_REGULAR.items()}
|
| 14 |
+
|
| 15 |
+
def main():
    """Sync open-tanween marks from the digital_khatt script into qpc_hafs.json.

    For every word present in both files: wherever the khatt text uses an open
    tanween, swap the corresponding regular tanween in the qpc text for the
    open form. Prints a replacement summary plus up to 10 mismatches, then
    rewrites qpc_hafs.json in place.
    """
    khatt_words = json.loads((DATA_DIR / "digital_khatt_v2_script.json").read_text("utf-8"))
    qpc_words = json.loads((DATA_DIR / "qpc_hafs.json").read_text("utf-8"))

    # Per-codepoint count of words touched, and words where khatt has an open
    # tanween but qpc lacks the regular equivalent to replace.
    replaced = {"\u08F0": 0, "\u08F1": 0, "\u08F2": 0}
    unmatched = []

    for key, entry in khatt_words.items():
        if key not in qpc_words:
            continue
        source_text = entry["text"]
        target_text = qpc_words[key]["text"]

        for open_char, regular_char in OPEN_TO_REGULAR.items():
            if open_char not in source_text:
                continue
            if regular_char in target_text:
                target_text = target_text.replace(regular_char, open_char)
                replaced[open_char] += 1
            else:
                unmatched.append((key, open_char, source_text, qpc_words[key]["text"]))

        qpc_words[key]["text"] = target_text

    print("Replacements:")
    tanween_names = {"\u08F0": "fathatan", "\u08F1": "dammatan", "\u08F2": "kasratan"}
    for char, count in replaced.items():
        name = tanween_names[char]
        print(f" open {name} (U+{ord(char):04X}): {count} words")
    print(f" total: {sum(replaced.values())} words")

    if unmatched:
        print(f"\nMismatches ({len(unmatched)}):")
        for key, char, kt, qt in unmatched[:10]:
            print(f" {key}: khatt has U+{ord(char):04X} but qpc missing regular equivalent")
            print(f" khatt: {kt}")
            print(f" qpc: {qt}")

    out_path = DATA_DIR / "qpc_hafs.json"
    out_path.write_text(json.dumps(qpc_words, ensure_ascii=False, indent=2) + "\n", "utf-8")
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()
|
scripts/build_phoneme_cache.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Build phoneme cache for all 114 chapters.
|
| 3 |
+
|
| 4 |
+
Phonemizes the entire Quran in a single call and saves per-chapter
|
| 5 |
+
ChapterReference objects to a pickle file for fast loading at runtime.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python scripts/build_phoneme_cache.py
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import pickle
|
| 12 |
+
import sys
|
| 13 |
+
from collections import defaultdict
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
_project_root = Path(__file__).parent.parent.resolve()
|
| 17 |
+
sys.path.insert(0, str(_project_root))
|
| 18 |
+
|
| 19 |
+
from config import PHONEME_CACHE_PATH
|
| 20 |
+
from src.alignment.phoneme_matcher import ChapterReference, RefWord
|
| 21 |
+
from src.phonemizer_utils import get_phonemizer
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def build_all_chapters() -> dict[int, ChapterReference]:
    """Phonemize entire Quran and build all ChapterReference objects."""
    phonemizer = get_phonemizer()

    print("Phonemizing entire Quran (1-114)...")
    result = phonemizer.phonemize(ref="1-114", stops=["verse"])

    all_words = result._words
    all_phonemes = result._nested
    print(f"Total words: {len(all_words)}")

    # Bucket RefWords per surah, preserving mushaf order.
    per_surah: dict[int, list[RefWord]] = defaultdict(list)
    for word, phones in zip(all_words, all_phonemes):
        location = word.location
        per_surah[location.surah_num].append(
            RefWord(
                text=word.text,
                phonemes=phones,
                surah=location.surah_num,
                ayah=location.ayah_num,
                word_num=location.word_num,
            )
        )

    chapters: dict[int, ChapterReference] = {}
    for surah_num in sorted(per_surah):
        ref_words = per_surah[surah_num]

        total_phones = sum(len(w.phonemes) for w in ref_words)
        avg_phones_per_word = total_phones / len(ref_words) if ref_words else 4.0

        # Flatten all phonemes of the surah; record, for each phone, the index
        # of the word it came from, plus per-word start offsets into the flat
        # array (with a trailing sentinel equal to the total length).
        flat_phonemes: list[str] = []
        flat_phone_to_word: list[int] = []
        word_phone_offsets: list[int] = []
        for word_idx, word in enumerate(ref_words):
            word_phone_offsets.append(len(flat_phonemes))
            flat_phonemes.extend(word.phonemes)
            flat_phone_to_word.extend([word_idx] * len(word.phonemes))
        word_phone_offsets.append(len(flat_phonemes))  # sentinel offset

        chapters[surah_num] = ChapterReference(
            surah=surah_num,
            words=ref_words,
            avg_phones_per_word=avg_phones_per_word,
            flat_phonemes=flat_phonemes,
            flat_phone_to_word=flat_phone_to_word,
            word_phone_offsets=word_phone_offsets,
        )

    print(f"Built {len(chapters)} chapter references")
    return chapters
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def main():
    """Build all chapter references and pickle them to PHONEME_CACHE_PATH."""
    chapters = build_all_chapters()

    cache_path = Path(PHONEME_CACHE_PATH)
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    with open(cache_path, "wb") as fh:
        pickle.dump(chapters, fh, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved to {cache_path}")
    print(f"File size: {cache_path.stat().st_size / 1024 / 1024:.2f} MB")


if __name__ == "__main__":
    main()
|
scripts/build_phoneme_ngram_index.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Build phoneme n-gram index for the entire Quran.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python scripts/build_phoneme_ngram_index.py
|
| 6 |
+
python scripts/build_phoneme_ngram_index.py --ngram-size 4
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import argparse
|
| 10 |
+
import pickle
|
| 11 |
+
import sys
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, List, Tuple
|
| 15 |
+
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
|
| 18 |
+
# Add project root to path
|
| 19 |
+
_project_root = Path(__file__).parent.parent.resolve()
|
| 20 |
+
sys.path.insert(0, str(_project_root))
|
| 21 |
+
|
| 22 |
+
from config import NGRAM_SIZE, NGRAM_INDEX_PATH
|
| 23 |
+
from src.alignment.ngram_index import PhonemeNgramIndex
|
| 24 |
+
from src.phonemizer_utils import get_phonemizer
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def build_index(ngram_size: int) -> PhonemeNgramIndex:
    """Build the phoneme n-gram index from the entire Quran."""
    phonemizer = get_phonemizer()

    print("Phonemizing entire Quran (surahs 1-114)...")
    result = phonemizer.phonemize(ref="1-114", stops=["verse"])

    # Use _words (locations) + _nested (phonemes) directly — avoids slow get_mapping()
    all_words = result._words
    all_phonemes = result._nested
    print(f"Total words from phonemizer: {len(all_words)}")

    # Concatenate word phonemes per (surah, ayah) verse.
    by_verse: Dict[Tuple[int, int], List[str]] = defaultdict(list)
    for word, phones in tqdm(zip(all_words, all_phonemes), total=len(all_words), desc="Grouping words by verse"):
        if not phones:
            continue
        location = word.location
        by_verse[(location.surah_num, location.ayah_num)].extend(phones)

    print(f"Total verses: {len(by_verse)}")

    # Slide a window of `ngram_size` over each verse's phoneme stream.
    # Verses shorter than the window produce an empty range and are skipped.
    positions: Dict[Tuple[str, ...], List[Tuple[int, int]]] = defaultdict(list)
    for (surah, ayah), phones in tqdm(by_verse.items(), desc="Building n-grams"):
        for start in range(len(phones) - ngram_size + 1):
            positions[tuple(phones[start : start + ngram_size])].append((surah, ayah))

    counts: Dict[Tuple[str, ...], int] = {ng: len(locs) for ng, locs in positions.items()}
    # Each occurrence contributes exactly one position, so the grand total is
    # just the sum of per-n-gram counts.
    total_ngrams = sum(counts.values())

    print(f"Total n-gram occurrences: {total_ngrams}")
    print(f"Unique n-grams: {len(positions)}")
    if counts:
        min_count = min(counts.values())
        max_count = max(counts.values())
        print(f"Count range: {min_count} - {max_count}")

    return PhonemeNgramIndex(
        ngram_positions=dict(positions),
        ngram_counts=counts,
        ngram_size=ngram_size,
        total_ngrams=total_ngrams,
    )
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def main():
    """CLI entry point: build the phoneme n-gram index and pickle it to disk."""
    parser = argparse.ArgumentParser(description="Build phoneme n-gram index for Quran")
    parser.add_argument(
        "--ngram-size",
        type=int,
        default=NGRAM_SIZE,
        help=f"N-gram size (default: {NGRAM_SIZE})",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=str(NGRAM_INDEX_PATH),
        help=f"Output path (default: {NGRAM_INDEX_PATH})",
    )
    cli = parser.parse_args()

    index = build_index(cli.ngram_size)

    # Persist with the highest pickle protocol for compactness/speed.
    output_path = Path(cli.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("wb") as fh:
        pickle.dump(index, fh, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved index to {output_path}")
    print(f"File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")


if __name__ == "__main__":
    main()
|
scripts/export_onnx.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export phoneme ASR models to optimized ONNX format and upload to HF Hub.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
pip install optimum[onnxruntime] onnxruntime
|
| 5 |
+
HF_TOKEN=... python scripts/export_onnx.py
|
| 6 |
+
HF_TOKEN=... python scripts/export_onnx.py --quantize # + dynamic INT8
|
| 7 |
+
HF_TOKEN=... python scripts/export_onnx.py --models Base # single model
|
| 8 |
+
|
| 9 |
+
Exports fp32 ONNX with ORT graph optimizations baked in.
|
| 10 |
+
Optionally applies dynamic INT8 quantization for CPU inference.
|
| 11 |
+
|
| 12 |
+
Note: ORTOptimizer's transformer-specific fusions (attention, LayerNorm, GELU)
|
| 13 |
+
do NOT support wav2vec2. We use ORT's general graph optimizations instead
|
| 14 |
+
(constant folding, redundant node elimination, common subexpression elimination).
|
| 15 |
+
Runtime ORT_ENABLE_ALL adds further optimizations at session load time.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import argparse
|
| 19 |
+
import os
|
| 20 |
+
import shutil
|
| 21 |
+
import sys
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
# Add project root to path
|
| 25 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 26 |
+
|
| 27 |
+
from config import PHONEME_ASR_MODELS
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_hf_token():
    """Return a Hugging Face token from the environment or the cached CLI login.

    Checks the ``HF_TOKEN`` env var first, then falls back to the token cached
    by ``huggingface-cli login``. Prints a warning (and returns a falsy value)
    when neither source yields a token.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return token
    # Fall back to the locally cached login, tolerating a missing/old hub lib.
    try:
        from huggingface_hub import HfFolder
        token = HfFolder.get_token()
    except Exception:
        pass
    if not token:
        print("WARNING: No HF token found. Set HF_TOKEN env var or run `huggingface-cli login`.")
    return token
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _optimize_graph(model_path: Path):
    """Bake ORT's general graph optimizations into ``model.onnx`` in place.

    Constant folding, redundant-node elimination, and common-subexpression
    elimination are written back into the model file so they don't need to
    run again at every session load time.
    """
    import onnxruntime as ort

    source_file = str(model_path / "model.onnx")
    target_file = str(model_path / "model_optimized.onnx")

    opts = ort.SessionOptions()
    opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    opts.optimized_model_filepath = target_file

    # Instantiating a session triggers the optimization pass and saves it.
    ort.InferenceSession(source_file, opts, providers=["CPUExecutionProvider"])

    # Swap the optimized graph in place of the original.
    os.replace(target_file, source_file)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def export_model(model_name: str, model_path: str, output_dir: Path, hf_token: str,
                 quantize: bool = False):
    """Export a single model to optimized fp32 ONNX and push to HF Hub.

    Args:
        model_name: Display name of the model (logging only).
        model_path: HF Hub repo id (``owner/name``) of the source model.
        output_dir: Local directory for the exported artifacts; wiped first.
        hf_token: HF auth token used for both download and upload.
        quantize: If True, also apply dynamic INT8 quantization in place.
    """
    from optimum.onnxruntime import ORTModelForCTC

    print(f"\n{'='*60}")
    print(f"Exporting '{model_name}' ({model_path})")
    print(f"{'='*60}")

    # Clean output dir for fresh export
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True)

    # Step 1: Export to ONNX (fp32)
    print(" [1/5] Exporting fp32 ONNX...")
    model = ORTModelForCTC.from_pretrained(model_path, export=True, token=hf_token)
    model.save_pretrained(output_dir)
    print(f" Saved to {output_dir}")

    # Step 2: Apply general ORT graph optimizations
    # (wav2vec2 is not supported by ORTOptimizer's transformer-specific fusions,
    # so we use ORT's built-in graph optimizations directly)
    print(" [2/5] Applying ORT graph optimizations...")
    _optimize_graph(output_dir)
    print(" Graph optimization complete")

    # Step 3: Optional dynamic INT8 quantization
    model_file = output_dir / "model.onnx"
    if quantize:
        print(" [3/5] Applying dynamic INT8 quantization (avx2)...")
        from onnxruntime.quantization import QuantType, quantize_dynamic

        quantized_file = output_dir / "model_quantized.onnx"
        quantize_dynamic(
            model_input=str(model_file),
            model_output=str(quantized_file),
            weight_type=QuantType.QInt8,
        )
        # Replace original with quantized
        os.replace(str(quantized_file), str(model_file))
        print(" INT8 quantization complete")
    else:
        print(" [3/5] Skipping quantization (use --quantize to enable)")

    # Step 4: Verify with dummy forward pass
    print(" [4/5] Verifying model...")
    import numpy as np
    import onnxruntime as ort

    sess = ort.InferenceSession(str(model_file), providers=["CPUExecutionProvider"])
    input_info = sess.get_inputs()[0]
    print(f" Input: name={input_info.name}, type={input_info.type}, shape={input_info.shape}")
    # One second of random noise at 16 kHz exercises the full graph.
    dummy = np.random.randn(1, 16000).astype(np.float32)
    out = sess.run(None, {"input_values": dummy})
    print(f" Output shape: {out[0].shape} (dtype={out[0].dtype})")
    # Release the ORT session/buffers before the (memory-hungry) upload.
    del sess, dummy, out

    # Step 5: Push to HF Hub
    print(" [5/5] Uploading to HF Hub...")
    from huggingface_hub import HfApi

    repo_name = model_path.split("/")[-1]
    hub_repo = f"hetchyy/{repo_name}-onnx"
    api = HfApi(token=hf_token)
    api.create_repo(repo_id=hub_repo, repo_type="model", private=True, exist_ok=True)
    api.upload_folder(folder_path=str(output_dir), repo_id=hub_repo, repo_type="model")
    print(f" Pushed to {hub_repo}")
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def main():
    """Parse CLI flags and export each selected phoneme ASR model."""
    parser = argparse.ArgumentParser(description="Export phoneme ASR models to optimized ONNX")
    parser.add_argument("--quantize", action="store_true",
                        help="Apply dynamic INT8 quantization after graph optimization")
    all_models = list(PHONEME_ASR_MODELS.keys())
    parser.add_argument("--models", nargs="+", choices=all_models,
                        default=all_models,
                        help="Which models to export (default: all)")
    opts = parser.parse_args()

    token = get_hf_token()

    # Exports land in <repo root>/models/onnx_<name>.
    models_root = Path(__file__).parent.parent / "models"
    models_root.mkdir(exist_ok=True)

    for model_name in opts.models:
        export_model(
            model_name,
            PHONEME_ASR_MODELS[model_name],
            models_root / f"onnx_{model_name}",
            token,
            quantize=opts.quantize,
        )

    suffix = " + INT8 quantized" if opts.quantize else ""
    print(f"\nDone. ONNX fp32 optimized{suffix} models exported and uploaded.")


if __name__ == "__main__":
    main()
|
scripts/fix_stop_sign_spacing.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Fix stop sign spacing in qpc_hafs.json.
|
| 4 |
+
|
| 5 |
+
The DigitalKhatt font expects stop signs (U+06D6 to U+06DB) as combining marks
|
| 6 |
+
directly attached to words, but the current data has spaces before them.
|
| 7 |
+
|
| 8 |
+
This script removes spaces before stop signs:
|
| 9 |
+
'رَيْبَ ۛ' → 'رَيْبَۛ'
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import re
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
# Stop sign characters (U+06D6 to U+06DB)
STOP_SIGNS = ''.join(chr(cp) for cp in range(0x06D6, 0x06DC))

# Matches a single space immediately preceding a stop sign.
SPACE_BEFORE_STOP_PATTERN = re.compile(f' ([{STOP_SIGNS}])')


def fix_stop_sign_spacing(text: str) -> str:
    """Return *text* with each space before a stop sign deleted."""
    return SPACE_BEFORE_STOP_PATTERN.sub(lambda m: m.group(1), text)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def main():
    """Rewrite data/qpc_hafs.json with stop-sign spacing normalized in place."""
    data_path = Path(__file__).parent.parent / 'data' / 'qpc_hafs.json'

    print(f"Loading {data_path}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    modified_count = 0
    for key, entry in data.items():
        if 'text' not in entry:
            continue
        original = entry['text']
        fixed = fix_stop_sign_spacing(original)
        if fixed == original:
            continue
        entry['text'] = fixed
        modified_count += 1
        # Echo only the first few examples to keep output readable.
        if modified_count <= 5:
            print(f" {key}: {repr(original)} → {repr(fixed)}")

    if modified_count > 5:
        print(f" ... and {modified_count - 5} more entries")

    print(f"\nModified {modified_count} entries")

    print(f"Saving to {data_path}...")
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("Done!")


if __name__ == '__main__':
    main()
|
setup.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build Cython extensions (DP alignment core)."""
|
| 2 |
+
|
| 3 |
+
from setuptools import setup, Extension
|
| 4 |
+
from Cython.Build import cythonize
|
| 5 |
+
|
| 6 |
+
extensions = [
|
| 7 |
+
Extension(
|
| 8 |
+
"src._dp_core",
|
| 9 |
+
["src/_dp_core.pyx"],
|
| 10 |
+
),
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
setup(
|
| 14 |
+
ext_modules=cythonize(extensions, language_level="3"),
|
| 15 |
+
)
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Core processing module
|
src/_dp_core.pyx
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# cython: boundscheck=False, wraparound=False, cdivision=True
|
| 2 |
+
"""
|
| 3 |
+
Cython-accelerated word-boundary-constrained substring Levenshtein DP.
|
| 4 |
+
|
| 5 |
+
Drop-in replacement for the pure-Python align_with_word_boundaries() in
|
| 6 |
+
phoneme_matcher.py. Callers still pass plain Python lists of strings;
|
| 7 |
+
encoding to integer arrays happens inside this module.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from libc.stdlib cimport malloc, free
|
| 11 |
+
from libc.math cimport INFINITY, fabs
|
| 12 |
+
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
# Phoneme → integer encoding (built lazily on first call)
|
| 15 |
+
# ---------------------------------------------------------------------------
|
| 16 |
+
|
| 17 |
+
# Module-level encoder state: phoneme string → dense integer id, assigned
# in first-seen order.
cdef dict _phoneme_to_id = {}
# Number of distinct phonemes assigned so far (also the next free id).
cdef int _num_phonemes = 0
cdef double *_sub_matrix = NULL  # flat _num_phonemes × _num_phonemes
cdef double _default_sub = 1.0  # cost for pairs absent from the matrix


cdef int _encode_phoneme(str p):
    """Return integer id for *p*, assigning a new one if unseen."""
    global _num_phonemes
    cdef int pid
    # EAFP: the common case is a cache hit, so try the lookup first.
    try:
        pid = _phoneme_to_id[p]
    except KeyError:
        # First sighting: hand out the next sequential id.
        # NOTE: the substitution matrix is NOT resized here; callers that may
        # introduce new ids (cy_align_with_word_boundaries) grow it themselves.
        pid = _num_phonemes
        _phoneme_to_id[p] = pid
        _num_phonemes += 1
    return pid
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def init_substitution_matrix(dict sub_costs, double default_sub):
    """Build the dense substitution-cost matrix from the Python dict.

    Must be called once before the first DP call (phoneme_matcher.py does
    this at import time). Calling it again rebuilds the matrix from scratch.

    Parameters
    ----------
    sub_costs : dict[(str, str), float]
        Phoneme-pair substitution costs (both orderings already present).
    default_sub : float
        Cost used for pairs not in *sub_costs*.
    """
    global _sub_matrix, _default_sub, _num_phonemes

    # Set the default before _rebuild_matrix(), which uses it as fill value.
    _default_sub = default_sub

    # First pass: make sure every phoneme in sub_costs has an id
    for (a, b) in sub_costs:
        _encode_phoneme(a)
        _encode_phoneme(b)

    # Allocate matrix (will be re-allocated if new phonemes appear later)
    _rebuild_matrix(sub_costs)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
cdef void _rebuild_matrix(dict sub_costs):
    """(Re)allocate and fill the dense cost matrix.

    Layout is row-major: cost of substituting id ``a`` with id ``b`` lives at
    ``_sub_matrix[a * size + b]``. Asymmetric costs are supported only if
    *sub_costs* contains both orderings (the docstring of
    init_substitution_matrix says it does).
    """
    global _sub_matrix, _num_phonemes, _default_sub

    cdef int size = _num_phonemes
    cdef int i, j

    if _sub_matrix != NULL:
        free(_sub_matrix)

    _sub_matrix = <double *>malloc(size * size * sizeof(double))
    if _sub_matrix == NULL:
        raise MemoryError("Failed to allocate substitution matrix")

    # Fill with default
    for i in range(size * size):
        _sub_matrix[i] = _default_sub

    # Diagonal = 0 (match)
    for i in range(size):
        _sub_matrix[i * size + i] = 0.0

    # Overrides from dict
    cdef int aid, bid
    cdef double cost
    for (a, b), cost in sub_costs.items():
        aid = _phoneme_to_id.get(a, -1)
        bid = _phoneme_to_id.get(b, -1)
        # Silently skip pairs whose phonemes were never encoded (shouldn't
        # happen after init_substitution_matrix's first pass).
        if aid >= 0 and bid >= 0:
            _sub_matrix[aid * size + bid] = cost
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
cdef inline double _get_sub_cost(int pid, int rid, int size) nogil:
    """Look up substitution cost from the dense matrix.

    Exact id match costs 0.0. Ids at or beyond *size* (phonemes encoded
    after the matrix was built) fall back to the default cost. Runs without
    the GIL so the DP inner loop stays at C speed.
    """
    if pid == rid:
        return 0.0
    if pid < size and rid < size:
        return _sub_matrix[pid * size + rid]
    return _default_sub
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
# Main DP function
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
|
| 107 |
+
def cy_align_with_word_boundaries(
    list P_list,
    list R_list,
    list R_phone_to_word_list,
    int expected_word,
    double prior_weight,
    double cost_sub,
    double cost_del,
    double cost_ins,
):
    """Word-boundary-constrained substring alignment (Cython).

    Aligns predicted phonemes *P_list* against a substring of the reference
    *R_list*; the substring must start and end on word boundaries derived
    from *R_phone_to_word_list* (one word index per reference phoneme).
    Candidates are scored by length-normalised edit distance plus
    ``prior_weight * |start_word - expected_word|``.

    Identical semantics to the pure-Python version. Returns the same
    (best_j, best_j_start, best_cost, best_norm_dist) tuple, with
    ``(None, None, INF, INF)`` on failure.

    NOTE(review): *cost_sub* is never read — substitution costs come from
    the dense matrix built by init_substitution_matrix(); the parameter is
    kept for signature compatibility with the pure-Python implementation.
    """
    cdef int m = len(P_list)
    cdef int n = len(R_list)
    cdef double INF_VAL = INFINITY

    if m == 0 or n == 0:
        return (None, None, float('inf'), float('inf'))

    # ------------------------------------------------------------------
    # Encode string lists → C arrays
    # ------------------------------------------------------------------
    cdef int *P_ids = <int *>malloc(m * sizeof(int))
    cdef int *R_ids = <int *>malloc(n * sizeof(int))
    cdef int *R_w = <int *>malloc(n * sizeof(int))
    if P_ids == NULL or R_ids == NULL or R_w == NULL:
        if P_ids != NULL: free(P_ids)
        if R_ids != NULL: free(R_ids)
        if R_w != NULL: free(R_w)
        raise MemoryError()

    cdef int i, j
    cdef bint need_rebuild = False

    # NOTE(review): a Python exception raised between these mallocs and the
    # final frees (e.g. a non-int in R_phone_to_word_list) would leak the
    # buffers — assumes inputs are well-formed lists; confirm at call sites.
    for i in range(m):
        p = P_list[i]
        if p not in _phoneme_to_id:
            _encode_phoneme(p)
            need_rebuild = True
        P_ids[i] = _phoneme_to_id[p]

    for j in range(n):
        r = R_list[j]
        if r not in _phoneme_to_id:
            _encode_phoneme(r)
            need_rebuild = True
        R_ids[j] = _phoneme_to_id[r]
        R_w[j] = <int>R_phone_to_word_list[j]

    # If new phonemes appeared, rebuild the matrix so ids are covered
    if need_rebuild and _sub_matrix != NULL:
        # We need the original sub_costs dict, but we don't have it here.
        # The safest approach: expand matrix with defaults for new phonemes.
        _grow_matrix()

    cdef int mat_size = _num_phonemes

    # ------------------------------------------------------------------
    # Precompute boundary flags
    # ------------------------------------------------------------------
    cdef char *start_boundary = <char *>malloc((n + 1) * sizeof(char))
    cdef char *end_boundary = <char *>malloc((n + 1) * sizeof(char))
    if start_boundary == NULL or end_boundary == NULL:
        free(P_ids); free(R_ids); free(R_w)
        if start_boundary != NULL: free(start_boundary)
        if end_boundary != NULL: free(end_boundary)
        raise MemoryError()

    # A boundary exists wherever the word index changes between adjacent
    # reference phonemes (plus the sequence ends).
    # start_boundary[j]: can alignment start at column j?
    start_boundary[0] = 1  # column 0 always valid
    for j in range(1, n):
        start_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0
    start_boundary[n] = 0  # can't start at or past end

    # end_boundary[j]: can alignment end at column j?
    end_boundary[0] = 0  # can't end before consuming anything
    for j in range(1, n):
        end_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0
    end_boundary[n] = 1  # end of reference always valid

    # ------------------------------------------------------------------
    # DP arrays (two-row rolling)
    # ------------------------------------------------------------------
    # prev_/curr_start track, per column, which start column the best path
    # to that cell originated from — avoids a full traceback matrix.
    cdef double *prev_cost = <double *>malloc((n + 1) * sizeof(double))
    cdef double *curr_cost = <double *>malloc((n + 1) * sizeof(double))
    cdef int *prev_start = <int *>malloc((n + 1) * sizeof(int))
    cdef int *curr_start = <int *>malloc((n + 1) * sizeof(int))
    if (prev_cost == NULL or curr_cost == NULL or
            prev_start == NULL or curr_start == NULL):
        free(P_ids); free(R_ids); free(R_w)
        free(start_boundary); free(end_boundary)
        if prev_cost != NULL: free(prev_cost)
        if curr_cost != NULL: free(curr_cost)
        if prev_start != NULL: free(prev_start)
        if curr_start != NULL: free(curr_start)
        raise MemoryError()

    # Initialise row 0: free start at every word boundary (substring match).
    for j in range(n + 1):
        if start_boundary[j]:
            prev_cost[j] = 0.0
            prev_start[j] = j
        else:
            prev_cost[j] = INF_VAL
            prev_start[j] = -1

    # ------------------------------------------------------------------
    # Core DP loop (no Python objects touched → runs at C speed)
    # ------------------------------------------------------------------
    cdef double del_option, ins_option, sub_option, sc
    cdef double *tmp_d
    cdef int *tmp_i
    cdef bint col0_start = start_boundary[0]

    for i in range(1, m + 1):
        if col0_start:
            curr_cost[0] = i * cost_del
            curr_start[0] = 0
        else:
            curr_cost[0] = INF_VAL
            curr_start[0] = -1

        for j in range(1, n + 1):
            del_option = prev_cost[j] + cost_del
            ins_option = curr_cost[j - 1] + cost_ins
            sc = _get_sub_cost(P_ids[i - 1], R_ids[j - 1], mat_size)
            sub_option = prev_cost[j - 1] + sc

            # Tie-break order: substitution, then deletion, then insertion.
            if sub_option <= del_option and sub_option <= ins_option:
                curr_cost[j] = sub_option
                curr_start[j] = prev_start[j - 1]
            elif del_option <= ins_option:
                curr_cost[j] = del_option
                curr_start[j] = prev_start[j]
            else:
                curr_cost[j] = ins_option
                curr_start[j] = curr_start[j - 1]

        # Swap rows
        tmp_d = prev_cost; prev_cost = curr_cost; curr_cost = tmp_d
        tmp_i = prev_start; prev_start = curr_start; curr_start = tmp_i

    # ------------------------------------------------------------------
    # Best-match selection (end boundaries only)
    # ------------------------------------------------------------------
    # After the final swap, prev_cost/prev_start hold the last DP row.
    cdef double best_score = INF_VAL
    cdef int best_j = -1
    cdef int best_j_start = -1
    cdef double best_cost_val = INF_VAL
    cdef double best_norm = INF_VAL

    cdef double dist, norm_dist, prior, score
    cdef int j_start_val, ref_len, denom, start_word

    for j in range(1, n + 1):
        if not end_boundary[j]:
            continue
        if prev_cost[j] >= INF_VAL:
            continue

        dist = prev_cost[j]
        j_start_val = prev_start[j]

        # Normalise by the longer of the two aligned lengths.
        ref_len = j - j_start_val
        denom = m if m > ref_len else ref_len
        if denom < 1:
            denom = 1
        norm_dist = dist / denom

        if j_start_val < n:
            start_word = R_w[j_start_val]
        else:
            start_word = R_w[j - 1]

        # Prior penalises candidates starting far from the expected word.
        prior = prior_weight * fabs(<double>(start_word - expected_word))
        score = norm_dist + prior

        if score < best_score:
            best_score = score
            best_j = j
            best_j_start = j_start_val
            best_cost_val = dist
            best_norm = norm_dist

    # ------------------------------------------------------------------
    # Cleanup
    # ------------------------------------------------------------------
    free(P_ids); free(R_ids); free(R_w)
    free(start_boundary); free(end_boundary)
    free(prev_cost); free(curr_cost)
    free(prev_start); free(curr_start)

    if best_j < 0:
        return (None, None, float('inf'), float('inf'))

    return (best_j, best_j_start, best_cost_val, best_norm)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# ---------------------------------------------------------------------------
|
| 310 |
+
# Helper: grow matrix when new phonemes are encountered at runtime
|
| 311 |
+
# ---------------------------------------------------------------------------
|
| 312 |
+
|
| 313 |
+
cdef void _grow_matrix():
    """Expand the substitution matrix to cover newly added phonemes.

    The matrix is re-created at the new size filled with the default
    substitution cost and a 0.0 diagonal. Known-pair costs from the
    original init call are NOT preserved (see inline comments below) —
    acceptable because runtime-new phonemes are extremely rare.
    Allocation failures are silently ignored (lookups then fall back to
    the default via _get_sub_cost's bounds check).
    """
    global _sub_matrix, _num_phonemes

    # NOTE(review): old_size is declared but never used.
    cdef int old_size
    cdef int new_size = _num_phonemes
    cdef double *new_mat

    if _sub_matrix == NULL:
        # No matrix yet — allocate fresh with defaults
        _sub_matrix = <double *>malloc(new_size * new_size * sizeof(double))
        if _sub_matrix == NULL:
            return
        for i in range(new_size * new_size):
            _sub_matrix[i] = _default_sub
        for i in range(new_size):
            _sub_matrix[i * new_size + i] = 0.0
        return

    # Figure out old size from current allocation
    # We track it implicitly: old_size = new_size - (number of phonemes added since last build)
    # Simpler: just rebuild from scratch with defaults + diagonal
    new_mat = <double *>malloc(new_size * new_size * sizeof(double))
    if new_mat == NULL:
        return

    cdef int i, j_idx
    for i in range(new_size * new_size):
        new_mat[i] = _default_sub
    for i in range(new_size):
        new_mat[i * new_size + i] = 0.0

    # Copy old entries (old matrix was some smaller size).
    # We don't know old_size exactly, so we just keep the new defaults.
    # The original sub_costs were already written; since we don't have
    # the dict here, the known-pair costs are lost for the new matrix.
    # This only happens if a completely new phoneme appears at runtime,
    # which is extremely rare. The init call covers all 69 known phonemes.

    free(_sub_matrix)
    _sub_matrix = new_mat
|
src/alignment/__init__.py
ADDED
|
File without changes
|
src/alignment/alignment_pipeline.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Orchestration for phoneme-based alignment and retries."""
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional, Tuple

from config import (
    ANCHOR_SEGMENTS,
    MAX_CONSECUTIVE_FAILURES,
    RETRY_LOOKBACK_WORDS,
    RETRY_LOOKAHEAD_WORDS,
    MAX_EDIT_DISTANCE_RELAXED,
    PHONEME_ALIGNMENT_PROFILING,
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def run_phoneme_matching(
    phoneme_texts: List[List[str]],
    detected_surah: int,
    first_quran_idx: int = 0,
    special_results: Optional[List[tuple]] = None,
    start_pointer: int = 0,
) -> Tuple[List[tuple], dict, set]:
    """
    Phoneme-based segment matching using substring DP.

    Walks the Quran segments in order, aligning each ASR phoneme list against
    a cached per-chapter reference using a moving word pointer. Handles:
    chapter transitions (including global re-anchoring after Surah 1), fused
    Basmala detection on the first segment after a transition, graduated
    retries (Tier 1: expanded search window; Tier 2: expanded window plus
    relaxed edit-distance threshold), and global re-anchoring after
    MAX_CONSECUTIVE_FAILURES failures in a row.

    Args:
        phoneme_texts: List of phoneme lists (each is a list of phoneme strings)
        detected_surah: Surah number from anchor search
        first_quran_idx: Index where Quran segments start (after specials)
        special_results: Results for special segments (Isti'adha/Basmala)
        start_pointer: Initial word pointer from anchor voting

    Returns:
        (results, profiling_dict, gap_segments)
        results: List[(matched_text, score, matched_ref), ...] — one entry per
            input segment; unmatched segments yield ("", 0.0, "")
        profiling_dict: attempt/pass counters (plus timing stats when
            PHONEME_ALIGNMENT_PROFILING is enabled)
        gap_segments: set of result indices adjacent to missing reference words
    """
    from .phoneme_matcher import align_segment, get_matched_text
    from .phoneme_matcher_cache import get_chapter_reference
    from .phoneme_anchor import verse_to_word_index, find_anchor_by_voting
    from .ngram_index import get_ngram_index

    # Only import time if profiling enabled
    if PHONEME_ALIGNMENT_PROFILING:
        import time
        total_start = time.perf_counter()
        ref_build_start = time.perf_counter()

    # Build/get cached chapter reference (includes phonemizer call if not cached)
    chapter_ref = get_chapter_reference(detected_surah)

    if PHONEME_ALIGNMENT_PROFILING:
        ref_build_time = time.perf_counter() - ref_build_start

    # Initialize results with special segments
    results = list(special_results) if special_results else []
    # Parallel list: None for specials/failures, (start_word_idx, end_word_idx) for matches
    word_indices = [None] * len(results)

    # Timing accumulators (only used if profiling enabled)
    if PHONEME_ALIGNMENT_PROFILING:
        dp_times = []
        window_setup_total = 0.0
        result_build_total = 0.0

    # Track whether the next segment might have Basmala fused with verse content
    from .special_segments import SPECIAL_TEXT
    basmala_already_detected = any(
        r[2] in ("Basmala", "Isti'adha+Basmala") for r in (special_results or [])
    )
    is_first_after_transition = not basmala_already_detected

    special_merges = 0

    # Process Quran segments with phoneme alignment
    pointer = start_pointer
    num_segments = 0
    consecutive_failures = 0
    skip_count = 0
    pending_specials = []
    tier1_attempts = 0
    tier1_passed = 0
    tier1_segments = []
    tier2_attempts = 0
    tier2_passed = 0
    tier2_segments = []
    consec_reanchors = 0
    segments_attempted = 0
    segments_passed = 0

    for i, asr_phonemes in enumerate(phoneme_texts[first_quran_idx:]):
        # Handle segments consumed by inter-chapter special detection
        if skip_count > 0:
            results.append(pending_specials.pop(0))
            word_indices.append(None)
            skip_count -= 1
            continue

        segment_idx = first_quran_idx + i + 1  # 1-indexed for display
        segments_attempted += 1

        alignment, timing = align_segment(asr_phonemes, chapter_ref, pointer, segment_idx)
        num_segments += 1

        # Accumulate timing if profiling enabled
        if PHONEME_ALIGNMENT_PROFILING:
            dp_times.append(timing['dp_time'])
            window_setup_total += timing['window_setup_time']
            result_build_total += timing['result_build_time']

        # Chapter transition: pointer past end of chapter
        if alignment is None and pointer >= chapter_ref.num_words:
            from .special_segments import detect_inter_chapter_specials
            remaining_phonemes = phoneme_texts[first_quran_idx + i:]
            inter_specials, num_consumed = detect_inter_chapter_specials(remaining_phonemes)

            if chapter_ref.surah == 1:
                # After Al-Fatiha, the next chapter could be anything — global reanchor
                print(f" [CHAPTER-END] Surah 1 complete at segment {segment_idx}, "
                      f"running global reanchor...")

                # Use segments after specials for anchor voting
                anchor_offset = first_quran_idx + i + num_consumed
                anchor_remaining = phoneme_texts[anchor_offset:]

                reanchor_surah, reanchor_ayah = find_anchor_by_voting(
                    anchor_remaining, get_ngram_index(), ANCHOR_SEGMENTS,
                )

                if reanchor_surah > 0:
                    next_surah = reanchor_surah
                    chapter_ref = get_chapter_reference(next_surah)
                    pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
                    print(f" [GLOBAL-REANCHOR] Anchored to Surah {next_surah}, "
                          f"Ayah {reanchor_ayah}, word {pointer}")
                else:
                    # Fallback: assume chapter 2
                    next_surah = 2
                    chapter_ref = get_chapter_reference(next_surah)
                    pointer = 0
                    print(f" [GLOBAL-REANCHOR] No anchor found, falling back to Surah 2")
            else:
                next_surah = chapter_ref.surah + 1
                if next_surah > 114:
                    pass  # No more chapters — fall through to failure handling
                else:
                    print(f" [CHAPTER-END] Surah {chapter_ref.surah} complete at segment {segment_idx}, "
                          f"transitioning to Surah {next_surah}")
                    chapter_ref = get_chapter_reference(next_surah)
                    pointer = 0

            if next_surah <= 114:
                detected_surah = next_surah
                consecutive_failures = 0

                if num_consumed > 0:
                    has_basmala = any(s[2] in ("Basmala", "Isti'adha+Basmala") for s in inter_specials)
                    is_first_after_transition = not has_basmala
                    # Current segment is a special — append its result
                    results.append(inter_specials[0])
                    word_indices.append(None)
                    # Queue remaining specials for subsequent segments
                    if num_consumed > 1:
                        pending_specials = list(inter_specials[1:])
                        skip_count = num_consumed - 1

                    continue
                else:
                    is_first_after_transition = True
                    # No specials — re-try alignment on this segment against the new chapter
                    alignment, timing = align_segment(asr_phonemes, chapter_ref, pointer, segment_idx)
                    num_segments += 1
                    if PHONEME_ALIGNMENT_PROFILING:
                        dp_times.append(timing['dp_time'])
                        window_setup_total += timing['window_setup_time']
                        result_build_total += timing['result_build_time']
                    # Fall through to existing if/else below

        # Basmala-fused retry: if this is the first segment after a transition
        # and Basmala wasn't detected, the reciter may have merged Basmala with
        # the first verse. Always try prepending Basmala phonemes to R and pick
        # the better result (even if the plain alignment already succeeded).
        if is_first_after_transition:
            is_first_after_transition = False

            basmala_alignment, basmala_timing = align_segment(
                asr_phonemes, chapter_ref, pointer, segment_idx,
                basmala_prefix=True)
            num_segments += 1
            if PHONEME_ALIGNMENT_PROFILING:
                dp_times.append(basmala_timing['dp_time'])
                window_setup_total += basmala_timing['window_setup_time']
                result_build_total += basmala_timing['result_build_time']

            if basmala_alignment and basmala_alignment.basmala_consumed:
                existing_conf = alignment.confidence if alignment else 0.0
                if basmala_alignment.confidence > existing_conf:
                    matched_text = SPECIAL_TEXT["Basmala"] + " " + get_matched_text(chapter_ref, basmala_alignment)
                    result = (matched_text, basmala_alignment.confidence, basmala_alignment.matched_ref)
                    pointer = basmala_alignment.end_word_idx + 1
                    consecutive_failures = 0
                    word_indices.append((basmala_alignment.start_word_idx, basmala_alignment.end_word_idx))
                    results.append(result)
                    special_merges += 1
                    segments_passed += 1
                    print(f" [BASMALA-FUSED] Segment {segment_idx}: Basmala merged with verse "
                          f"(fused conf={basmala_alignment.confidence:.2f} > plain conf={existing_conf:.2f})")
                    continue
            # Basmala-fused didn't win — fall through with original alignment

        if alignment:
            is_first_after_transition = False
            matched_text = get_matched_text(chapter_ref, alignment)
            result = (matched_text, alignment.confidence, alignment.matched_ref)
            pointer = alignment.end_word_idx + 1  # Advance pointer
            consecutive_failures = 0
            word_indices.append((alignment.start_word_idx, alignment.end_word_idx))
            segments_passed += 1
        else:
            # === Graduated retry ===
            # Tier 1: expanded window, same threshold
            tier1_attempts += 1
            tier1_segments.append(segment_idx)
            alignment, timing = align_segment(
                asr_phonemes, chapter_ref, pointer, segment_idx,
                lookback_override=RETRY_LOOKBACK_WORDS,
                lookahead_override=RETRY_LOOKAHEAD_WORDS,
            )
            num_segments += 1
            if PHONEME_ALIGNMENT_PROFILING:
                dp_times.append(timing['dp_time'])
                window_setup_total += timing['window_setup_time']
                result_build_total += timing['result_build_time']

            # Tier 2: expanded window + relaxed threshold
            tier2_entered = False
            if alignment is None:
                tier2_entered = True
                tier2_attempts += 1
                tier2_segments.append(segment_idx)
                alignment, timing = align_segment(
                    asr_phonemes, chapter_ref, pointer, segment_idx,
                    lookback_override=RETRY_LOOKBACK_WORDS,
                    lookahead_override=RETRY_LOOKAHEAD_WORDS,
                    max_edit_distance_override=MAX_EDIT_DISTANCE_RELAXED,
                )
                num_segments += 1
                if PHONEME_ALIGNMENT_PROFILING:
                    dp_times.append(timing['dp_time'])
                    window_setup_total += timing['window_setup_time']
                    result_build_total += timing['result_build_time']

            if alignment:
                # Retry succeeded
                is_first_after_transition = False
                matched_text = get_matched_text(chapter_ref, alignment)
                result = (matched_text, alignment.confidence, alignment.matched_ref)
                pointer = alignment.end_word_idx + 1
                consecutive_failures = 0
                word_indices.append((alignment.start_word_idx, alignment.end_word_idx))
                segments_passed += 1
                if tier2_entered:
                    tier2_passed += 1
                else:
                    tier1_passed += 1
                print(f" [RETRY-OK] Segment {segment_idx}: recovered via expanded window/relaxed threshold")
            else:
                # Real failure after all retries
                result = ("", 0.0, "")
                consecutive_failures += 1
                word_indices.append(None)

                if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                    consec_reanchors += 1
                    # Global re-anchor (not constrained to current surah)
                    remaining_idx = first_quran_idx + i + 1
                    remaining_texts = phoneme_texts[remaining_idx:]
                    if remaining_texts:
                        reanchor_surah, reanchor_ayah = find_anchor_by_voting(
                            remaining_texts, get_ngram_index(), ANCHOR_SEGMENTS,
                        )
                        if reanchor_surah > 0:
                            if reanchor_surah != detected_surah:
                                detected_surah = reanchor_surah
                                chapter_ref = get_chapter_reference(detected_surah)
                            pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
                            print(f" [GLOBAL-REANCHOR] Jumped to Surah {detected_surah}, "
                                  f"Ayah {reanchor_ayah}, word {pointer}")
                            consecutive_failures = 0

        results.append(result)

    # Post-processing: detect consecutive segments with reference gaps
    gap_segments = set()

    prev_matched_idx = None
    for idx in range(len(results)):
        if word_indices[idx] is None:
            continue

        if prev_matched_idx is not None:
            prev_end = word_indices[prev_matched_idx][1]
            curr_start = word_indices[idx][0]
            gap = curr_start - prev_end - 1

            if gap > 0:
                gap_segments.add(prev_matched_idx)
                gap_segments.add(idx)

                print(f" [GAP] {gap} word(s) missing between segments "
                      f"{prev_matched_idx + 1} and {idx + 1}")

        prev_matched_idx = idx

    # Edge case: missing words at start of expected range
    first_matched = next((i for i, w in enumerate(word_indices) if w is not None), None)
    if first_matched is not None:
        first_start = word_indices[first_matched][0]
        if first_start > start_pointer:
            gap_segments.add(first_matched)
            print(f" [GAP] {first_start - start_pointer} word(s) missing before first segment {first_matched + 1}")

    # Edge case: missing words at end of current verse
    # Only flag if the last matched segment is also the final segment overall.
    # If there are trailing no-match segments after it, those account for the
    # remaining audio — the words aren't missing, they just failed to align.
    # Compare against the verse boundary (not chapter end), since a recitation
    # doesn't necessarily cover the entire chapter.
    last_matched = next((i for i in range(len(word_indices) - 1, -1, -1) if word_indices[i] is not None), None)
    if last_matched is not None and last_matched == len(word_indices) - 1:
        last_end = word_indices[last_matched][1]
        last_ayah = chapter_ref.words[last_end].ayah
        # Find the last word index that belongs to the same verse
        verse_end = last_end
        while verse_end + 1 < chapter_ref.num_words and chapter_ref.words[verse_end + 1].ayah == last_ayah:
            verse_end += 1
        if last_end < verse_end:
            gap_segments.add(last_matched)
            print(f" [GAP] {verse_end - last_end} word(s) missing after last segment {last_matched + 1}")

    # Build profiling dict
    if PHONEME_ALIGNMENT_PROFILING:
        total_time = time.perf_counter() - total_start
        profiling = {
            "total_time": total_time,
            "ref_build_time": ref_build_time,
            "dp_total_time": sum(dp_times),
            "dp_min_time": min(dp_times) if dp_times else 0.0,
            "dp_max_time": max(dp_times) if dp_times else 0.0,
            "window_setup_time": window_setup_total,
            "result_build_time": result_build_total,
            "num_segments": num_segments,
            "tier1_attempts": tier1_attempts,
            "tier1_passed": tier1_passed,
            "tier1_segments": tier1_segments,
            "tier2_attempts": tier2_attempts,
            "tier2_passed": tier2_passed,
            "tier2_segments": tier2_segments,
            "consec_reanchors": consec_reanchors,
            "segments_attempted": segments_attempted,
            "segments_passed": segments_passed,
            "special_merges": special_merges,
        }
    else:
        profiling = {
            "num_segments": num_segments,
            "tier1_attempts": tier1_attempts,
            "tier1_passed": tier1_passed,
            "tier1_segments": tier1_segments,
            "tier2_attempts": tier2_attempts,
            "tier2_passed": tier2_passed,
            "tier2_segments": tier2_segments,
            "consec_reanchors": consec_reanchors,
            "segments_attempted": segments_attempted,
            "segments_passed": segments_passed,
            "special_merges": special_merges,
        }

    return results, profiling, gap_segments
|
src/alignment/ngram_index.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Phoneme n-gram index: dataclass and cached loader.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pickle
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from config import NGRAM_INDEX_PATH
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class PhonemeNgramIndex:
    """Pre-computed n-gram index for the entire Quran.

    Loaded from a pickle at runtime (see get_ngram_index below); presumably
    built offline by scripts/build_phoneme_ngram_index.py — confirm before
    relying on that. Keys are tuples of phoneme strings of length ngram_size.
    """

    # n-gram -> list of (surah, ayah) positions where it occurs
    ngram_positions: Dict[Tuple[str, ...], List[Tuple[int, int]]]

    # n-gram -> total occurrence count (for rarity weighting)
    ngram_counts: Dict[Tuple[str, ...], int]

    # Metadata
    ngram_size: int  # length of every key tuple in the two dicts above
    total_ngrams: int  # total occurrences across the whole corpus
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
_INDEX: Optional[PhonemeNgramIndex] = None


def get_ngram_index() -> PhonemeNgramIndex:
    """Return the process-wide phoneme n-gram index, loading it on first use.

    The unpickled index is cached in the module-level ``_INDEX`` so the
    (potentially large) pickle is read from disk at most once per process.
    """
    global _INDEX
    if _INDEX is not None:
        return _INDEX

    print(f"[NGRAM] Loading index from {NGRAM_INDEX_PATH}...")
    with open(NGRAM_INDEX_PATH, "rb") as f:
        _INDEX = pickle.load(f)
    print(f"[NGRAM] Loaded: {len(_INDEX.ngram_positions)} unique {_INDEX.ngram_size}-grams, "
          f"{_INDEX.total_ngrams} total occurrences")
    return _INDEX
|
src/alignment/phoneme_anchor.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Phoneme n-gram voting for global anchor detection.
|
| 3 |
+
|
| 4 |
+
Replaces Whisper-based text matching for chapter/verse identification.
|
| 5 |
+
Each ASR n-gram that matches the Quran index votes for (surah, ayah)
|
| 6 |
+
weighted by rarity (1/count). The highest vote total wins the surah.
|
| 7 |
+
Then we find the best contiguous run of voted ayahs in that surah and
|
| 8 |
+
anchor to the first ayah of that run.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
from typing import Dict, List, Tuple
|
| 13 |
+
|
| 14 |
+
from config import ANCHOR_DEBUG, ANCHOR_RARITY_WEIGHTING, ANCHOR_RUN_TRIM_RATIO
|
| 15 |
+
from .ngram_index import PhonemeNgramIndex
|
| 16 |
+
from .phoneme_matcher import ChapterReference
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _find_best_contiguous_run(
    ayah_weights: Dict[int, float],
) -> Tuple[int, int, float]:
    """
    Find the contiguous run of consecutive ayahs with the highest total weight.

    After picking the heaviest run, leading and trailing ayahs whose individual
    weight falls below ANCHOR_RUN_TRIM_RATIO of the run's strongest ayah are
    trimmed off, so the anchor lands on a well-supported verse.

    Args:
        ayah_weights: {ayah_number: vote_weight} for a single surah

    Returns:
        (start_ayah, end_ayah, total_weight) of the best run;
        (0, 0, 0.0) when ayah_weights is empty.
    """
    if not ayah_weights:
        return (0, 0, 0.0)

    ordered = sorted(ayah_weights)

    # Partition the voted ayahs into maximal runs of consecutive numbers,
    # summing each run's weight as we go.
    runs: List[Tuple[int, int, float]] = []
    lo = hi = ordered[0]
    total = ayah_weights[lo]
    for ayah in ordered[1:]:
        if ayah - hi == 1:
            total += ayah_weights[ayah]
        else:
            runs.append((lo, hi, total))
            lo = ayah
            total = ayah_weights[ayah]
        hi = ayah
    runs.append((lo, hi, total))

    # The heaviest run wins.
    start, end, weight = max(runs, key=lambda run: run[2])

    # Trim weakly-voted edges relative to the run's strongest ayah.
    cutoff = ANCHOR_RUN_TRIM_RATIO * max(ayah_weights[a] for a in range(start, end + 1))

    while start < end and ayah_weights[start] < cutoff:
        weight -= ayah_weights[start]
        start += 1

    while end > start and ayah_weights[end] < cutoff:
        weight -= ayah_weights[end]
        end -= 1

    return (start, end, weight)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def find_anchor_by_voting(
    phoneme_texts: List[List[str]],
    ngram_index: PhonemeNgramIndex,
    n_segments: int,
) -> Tuple[int, int]:
    """
    Vote on (surah, ayah) using n-gram rarity weighting.

    Two-phase selection:
    1. Raw voting determines the winning surah (highest total weight across all ayahs)
    2. Within that surah, find the best contiguous run of ayahs and return
       the first ayah of that run as the anchor point.

    Args:
        phoneme_texts: Phoneme lists for segments (starting from first Quran segment)
        ngram_index: Pre-built n-gram index
        n_segments: Number of segments to use for voting

    Returns:
        (surah, ayah) of best match, or (0, 0) if nothing found
    """
    # Concatenate first N non-empty segments
    combined: List[str] = []
    segments_used = 0
    for phonemes in phoneme_texts[:n_segments]:
        if phonemes:
            combined.extend(phonemes)
            segments_used += 1

    n = ngram_index.ngram_size

    if ANCHOR_DEBUG:
        print(f"\n{'=' * 60}")
        print(f"ANCHOR VOTING DEBUG")
        print(f"{'=' * 60}")
        print(f" Segments used: {segments_used}/{n_segments}")
        print(f" Combined phonemes: {len(combined)}")
        print(f" N-gram size: {n}")
        if combined:
            print(f" ASR phonemes: {' '.join(combined[:30])}{'...' if len(combined) > 30 else ''}")

    # Extract n-grams from ASR
    # NOTE: if the combined stream has fewer than n phonemes the range is
    # empty, no n-grams are produced, and the function returns (0, 0) below.
    asr_ngrams = [
        tuple(combined[i : i + n])
        for i in range(len(combined) - n + 1)
    ]

    if ANCHOR_DEBUG:
        print(f" ASR n-grams extracted: {len(asr_ngrams)}")

    # =========================================================================
    # Phase 1: Raw voting — accumulate (surah, ayah) votes
    # =========================================================================
    votes: Dict[Tuple[int, int], float] = defaultdict(float)
    matched_ngrams = 0
    missed_ngrams = 0

    for ng in asr_ngrams:
        if ng not in ngram_index.ngram_positions:
            missed_ngrams += 1
            continue

        matched_ngrams += 1
        # Rare n-grams are more discriminative, so weight inversely by corpus
        # frequency when ANCHOR_RARITY_WEIGHTING is on; otherwise flat votes.
        weight = (1.0 / ngram_index.ngram_counts[ng]) if ANCHOR_RARITY_WEIGHTING else 1.0

        for surah, ayah in ngram_index.ngram_positions[ng]:
            votes[(surah, ayah)] += weight

    if ANCHOR_DEBUG:
        print(f" N-grams matched: {matched_ngrams}/{len(asr_ngrams)} "
              f"({missed_ngrams} missed)")
        print(f" Distinct (surah, ayah) voted for: {len(votes)}")

    if not votes:
        if ANCHOR_DEBUG:
            print(f" RESULT: No votes cast — returning (0, 0)")
            print(f"{'=' * 60}\n")
        return (0, 0)

    # =========================================================================
    # Phase 1b: Determine winning surah (by total weight across all ayahs)
    # =========================================================================
    surah_totals: Dict[int, float] = defaultdict(float)
    for (s, a), w in votes.items():
        surah_totals[s] += w

    winning_surah = max(surah_totals, key=surah_totals.get)

    if ANCHOR_DEBUG:
        ranked_surahs = sorted(surah_totals.items(), key=lambda kv: kv[1], reverse=True)
        print(f"\n Surah vote totals (top 5):")
        print(f" {'Surah':>5} {'Total Weight':>12}")
        print(f" {'-' * 20}")
        for s, w in ranked_surahs[:5]:
            marker = " <-- winner" if s == winning_surah else ""
            print(f" {s:>5} {w:>12.3f}{marker}")

    # =========================================================================
    # Phase 2: Within winning surah, find best contiguous ayah run
    # =========================================================================
    ayah_weights: Dict[int, float] = {}
    for (s, a), w in votes.items():
        if s == winning_surah:
            ayah_weights[a] = w

    run_start, run_end, run_weight = _find_best_contiguous_run(ayah_weights)

    if ANCHOR_DEBUG:
        # Show per-ayah votes in this surah
        print(f"\n Surah {winning_surah} ayah votes:")
        print(f" {'Ayah':>5} {'Weight':>8} {'In Best Run':>11}")
        print(f" {'-' * 28}")
        for a in sorted(ayah_weights.keys()):
            in_run = "***" if run_start <= a <= run_end else ""
            print(f" {a:>5} {ayah_weights[a]:>8.3f} {in_run:>11}")

        print(f"\n Best contiguous run (after trim): ayahs {run_start}-{run_end} "
              f"(weight={run_weight:.3f}, trim_ratio={ANCHOR_RUN_TRIM_RATIO})")
        print(f" RESULT: Surah {winning_surah}, Ayah {run_start} (start of run)")
        print(f"{'=' * 60}\n")

    return (winning_surah, run_start)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def reanchor_within_surah(
    phoneme_texts: List[List[str]],
    ngram_index: PhonemeNgramIndex,
    surah: int,
    n_segments: int,
) -> int:
    """
    Re-anchor within a known surah after consecutive DP failures.

    Runs the same n-gram voting as find_anchor_by_voting, except that only
    occurrences falling inside the given surah accumulate weight. The anchor
    is the first ayah of the best contiguous run of voted ayahs.

    Args:
        phoneme_texts: Remaining unprocessed phoneme lists
        ngram_index: Pre-built n-gram index
        surah: Current surah (fixed)
        n_segments: How many segments to use for voting

    Returns:
        ayah number to re-anchor to (0 = failed)
    """
    # Merge the first n_segments non-empty segments into one phoneme stream.
    combined: List[str] = []
    segments_used = 0
    for seg in phoneme_texts[:n_segments]:
        if not seg:
            continue
        combined.extend(seg)
        segments_used += 1

    n = ngram_index.ngram_size

    if ANCHOR_DEBUG:
        print(f"\n{'=' * 60}")
        print(f"RE-ANCHOR WITHIN SURAH {surah}")
        print(f"{'=' * 60}")
        print(f" Segments used: {segments_used}/{n_segments}")
        print(f" Combined phonemes: {len(combined)}")

    # Slide a window of size n over the merged stream.
    asr_ngrams = [tuple(combined[i : i + n]) for i in range(len(combined) - n + 1)]

    # Vote — accumulate weight only for positions inside the target surah.
    ayah_weights: Dict[int, float] = defaultdict(float)
    matched_ngrams = 0

    for ng in asr_ngrams:
        positions = ngram_index.ngram_positions.get(ng)
        if positions is None:
            continue
        matched_ngrams += 1
        weight = (1.0 / ngram_index.ngram_counts[ng]) if ANCHOR_RARITY_WEIGHTING else 1.0
        for s, a in positions:
            if s == surah:
                ayah_weights[a] += weight

    if ANCHOR_DEBUG:
        print(f" N-grams matched: {matched_ngrams}/{len(asr_ngrams)}")
        print(f" Ayahs with votes: {len(ayah_weights)}")

    if not ayah_weights:
        if ANCHOR_DEBUG:
            print(f" RESULT: No votes — returning 0")
            print(f"{'=' * 60}\n")
        return 0

    run_start, run_end, run_weight = _find_best_contiguous_run(dict(ayah_weights))

    if ANCHOR_DEBUG:
        print(f" Best contiguous run (after trim): ayahs {run_start}-{run_end} "
              f"(weight={run_weight:.3f}, trim_ratio={ANCHOR_RUN_TRIM_RATIO})")
        print(f" RESULT: Ayah {run_start}")
        print(f"{'=' * 60}\n")

    return run_start
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def verse_to_word_index(chapter_ref: ChapterReference, ayah: int) -> int:
    """
    Find word index of the first word in a given ayah.

    Args:
        chapter_ref: Pre-built chapter reference
        ayah: Verse number to find

    Returns:
        Word index into chapter_ref.words, or 0 if not found
    """
    # First word whose ayah matches; default to 0 when the verse is absent.
    return next(
        (idx for idx, word in enumerate(chapter_ref.words) if word.ayah == ayah),
        0,
    )
|
src/alignment/phoneme_asr.py
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phoneme ASR using wav2vec2 CTC model."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
|
| 9 |
+
from config import (
|
| 10 |
+
PHONEME_ASR_MODELS, PHONEME_ASR_MODEL_DEFAULT, DTYPE, IS_HF_SPACE, TORCH_COMPILE,
|
| 11 |
+
BATCHING_STRATEGY, INFERENCE_BATCH_SIZE,
|
| 12 |
+
MAX_BATCH_SECONDS, MAX_PAD_WASTE, MIN_BATCH_SIZE,
|
| 13 |
+
)
|
| 14 |
+
from ..zero_gpu import ZERO_GPU_AVAILABLE, is_quota_exhausted
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Process-wide model cache keyed by model_name; each entry holds the loaded
# CTC model, its processor, and the device-type string it currently lives on.
# Populated by load_phoneme_asr(); mutated in place by the move_* helpers.
_cache = {}  # model_name -> {"model": Model, "processor": Processor, "device": str}

# Torch dtype derived from the project-wide DTYPE config flag ("float16" or not).
_TORCH_DTYPE = torch.float16 if DTYPE == "float16" else torch.float32
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _get_hf_token():
|
| 23 |
+
"""Get HF token from env var or stored login."""
|
| 24 |
+
token = os.environ.get("HF_TOKEN")
|
| 25 |
+
if not token:
|
| 26 |
+
try:
|
| 27 |
+
from huggingface_hub import HfFolder
|
| 28 |
+
token = HfFolder.get_token()
|
| 29 |
+
except Exception:
|
| 30 |
+
pass
|
| 31 |
+
return token
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _get_device_and_dtype():
    """Get the best available device and dtype.

    On HF Spaces with ZeroGPU, returns CPU to defer CUDA init
    until inside a @gpu_decorator function.
    """
    # CUDA only when we are NOT on a Space/ZeroGPU host and a GPU is present;
    # every other combination deliberately stays on CPU.
    use_cuda = not (IS_HF_SPACE or ZERO_GPU_AVAILABLE) and torch.cuda.is_available()
    return torch.device("cuda" if use_cuda else "cpu"), _TORCH_DTYPE
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def load_phoneme_asr(model_name=PHONEME_ASR_MODEL_DEFAULT):
    """Load phoneme ASR model on CPU. Returns (model, processor).

    Models are loaded once and cached per model_name. Both base and large
    can be cached simultaneously. Use move_phoneme_asr_to_gpu() inside
    GPU-decorated functions to move to CUDA.
    """
    # Fast path: model already loaded for this name.
    if model_name in _cache:
        entry = _cache[model_name]
        return entry["model"], entry["processor"]

    # Imported lazily so merely importing this module stays cheap.
    import logging
    from transformers import AutoModelForCTC, AutoProcessor

    # Suppress verbose transformers logging during load
    logging.getLogger("transformers").setLevel(logging.WARNING)

    model_path = PHONEME_ASR_MODELS[model_name]
    print(f"Loading phoneme ASR: {model_path} ({model_name})")

    # Use HF_TOKEN for private model access
    hf_token = _get_hf_token()

    # On HF Spaces / ZeroGPU this resolves to CPU (see _get_device_and_dtype).
    device, dtype = _get_device_and_dtype()

    model = AutoModelForCTC.from_pretrained(
        model_path, token=hf_token, attn_implementation="sdpa"
    )
    model.to(device, dtype=dtype)
    model.eval()
    # torch.compile is skipped on Spaces/ZeroGPU where the device may change later.
    if TORCH_COMPILE and not (IS_HF_SPACE or ZERO_GPU_AVAILABLE):
        model = torch.compile(model, mode="reduce-overhead")

    processor = AutoProcessor.from_pretrained(model_path, token=hf_token)

    # Record the entry so move_phoneme_asr_to_gpu/_to_cpu can relocate it later.
    _cache[model_name] = {
        "model": model,
        "processor": processor,
        "device": device.type,
    }

    print(f"Phoneme ASR ({model_name}) loaded on {device}")
    return model, processor
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def move_phoneme_asr_to_gpu(model_name=None):
    """Move cached phoneme ASR model(s) to GPU.

    Args:
        model_name: Move only this model. If None, move all cached models.

    Call this inside @gpu_decorator functions on HF Spaces.
    Idempotent: checks current device before moving.
    Skips if quota exhausted or CUDA unavailable.
    """
    if is_quota_exhausted() or not torch.cuda.is_available():
        return

    targets = [model_name] if model_name else list(_cache.keys())
    cuda_device = torch.device("cuda")

    for target in targets:
        entry = _cache.get(target)
        if entry is None:
            continue
        cached = entry["model"]
        # Idempotence: only relocate models not already on CUDA.
        if next(cached.parameters()).device.type == "cuda":
            continue
        entry["model"] = cached.to(cuda_device, dtype=_TORCH_DTYPE)
        entry["device"] = "cuda"
        print(f"[PHONEME ASR] Moved '{target}' to CUDA")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def move_phoneme_asr_to_cpu(model_name=None):
    """Move cached phoneme ASR model(s) back to CPU.

    Args:
        model_name: Move only this model. If None, move all cached models.

    Called when GPU lease fails or quota is exhausted so that
    CPU fallback inference can proceed.
    Idempotent: checks current device before moving.
    """
    targets = [model_name] if model_name else list(_cache.keys())
    cpu_device = torch.device("cpu")

    for target in targets:
        entry = _cache.get(target)
        if entry is None:
            continue
        cached = entry["model"]
        # Idempotence: only relocate models not already on CPU.
        if next(cached.parameters()).device.type == "cpu":
            continue
        entry["model"] = cached.to(cpu_device, dtype=_TORCH_DTYPE)
        entry["device"] = "cpu"
        print(f"[PHONEME ASR] Moved '{target}' to CPU")
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def ids_to_phoneme_list(ids: List[int], tokenizer, pad_id: int) -> List[str]:
    """
    Convert token IDs to phoneme list with CTC collapse.

    CTC decoding:
    1. Remove pad/blank tokens
    2. Collapse consecutive duplicates
    3. Filter out word delimiter "|"
    """
    # Map every ID to its token string up front (nothing is skipped yet,
    # because the duplicate-collapse step must see pads/delimiters too).
    tokens = tokenizer.convert_ids_to_tokens(ids)
    if not tokens:
        return []

    # Resolve the string form of the pad/blank token.
    blank = tokenizer.convert_ids_to_tokens([pad_id])[0] if pad_id is not None else "[PAD]"

    phonemes: List[str] = []
    last_seen = None
    for tok in tokens:
        # CTC collapse: a token equal to its predecessor is a repeat frame.
        if tok == last_seen:
            continue
        # Emit only real phonemes — never the blank or the word delimiter.
        if tok != blank and tok != "|":
            phonemes.append(tok)
        last_seen = tok

    return phonemes
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def build_batches_naive(sorted_indices: List[int], batch_size: int) -> List[List[int]]:
    """Fixed-count batching (original behavior)."""
    batches: List[List[int]] = []
    cursor = 0
    total = len(sorted_indices)
    while cursor < total:
        batches.append(sorted_indices[cursor:cursor + batch_size])
        cursor += batch_size
    return batches
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def build_batches(sorted_indices: List[int], durations: List[float]) -> List[List[int]]:
    """Build dynamic batches from duration-sorted indices.

    Constraints:
    - sum(durations) per batch <= MAX_BATCH_SECONDS
    - pad waste fraction <= MAX_PAD_WASTE (1 - sum/[n*max], measures wasted tensor compute)
    - batch won't be cut below MIN_BATCH_SIZE (avoids underutilization)
    """
    batches: List[List[int]] = []
    batch: List[int] = []
    batch_seconds = 0.0

    for idx in sorted_indices:
        dur = durations[idx]

        # Seed a fresh batch with the first candidate unconditionally.
        if not batch:
            batch = [idx]
            batch_seconds = dur
            continue

        # Indices arrive sorted ascending, so the candidate is the longest
        # element, hence the padded length of the whole batch.
        candidate_total = batch_seconds + dur
        candidate_count = len(batch) + 1
        waste = 1.0 - candidate_total / (candidate_count * dur) if dur > 0 else 0.0

        over_seconds = candidate_total > MAX_BATCH_SECONDS
        over_waste = waste > MAX_PAD_WASTE

        if (over_seconds or over_waste) and len(batch) >= MIN_BATCH_SIZE:
            # Close the current batch and start a new one with this index.
            batches.append(batch)
            batch = [idx]
            batch_seconds = dur
        else:
            batch.append(idx)
            batch_seconds = candidate_total

    if batch:
        batches.append(batch)

    return batches
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def _transcribe_batch_pytorch(
    segment_audios: List[np.ndarray],
    durations: List[float],
    batches: List[List[int]],
    model,
    processor,
    tokenizer,
    pad_id: int,
    device: torch.device,
    dtype: torch.dtype,
) -> tuple:
    """PyTorch inference path (GPU or CPU fallback).

    Runs CTC inference batch by batch and greedily decodes the argmax IDs
    into phoneme lists. Results are scattered back to the ORIGINAL segment
    order via the index lists in `batches`, so callers see one phoneme list
    per input segment regardless of duration sorting.

    Returns:
        (results, batch_profiling) — results[i] is the phoneme list for
        segment_audios[i]; batch_profiling holds per-batch timing and
        duration statistics (see the dict literal below for keys).
    """
    # Pre-size the output so positions can be filled out of order.
    results: List[List[str]] = [[] for _ in segment_audios]
    batch_profiling = []

    for batch_num_idx, batch_idx in enumerate(batches):
        batch_audios = [segment_audios[i] for i in batch_idx]
        batch_durations = [durations[i] for i in batch_idx]

        batch_num = batch_num_idx + 1
        t0 = time.time()

        # Feature extraction + GPU transfer
        t_feat_start = time.time()
        inputs = processor(
            batch_audios,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs.input_values.to(device=device, dtype=dtype)
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device=device)
        feat_time = time.time() - t_feat_start

        # Model inference
        t_infer_start = time.time()
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
            logits = outputs.logits
        if device.type == "cuda":
            # Synchronize so infer_time measures actual GPU work, not launch time.
            torch.cuda.synchronize()
        infer_time = time.time() - t_infer_start

        # CTC greedy decode
        t_decode_start = time.time()
        predicted_ids = torch.argmax(logits, dim=-1)

        for j in range(predicted_ids.shape[0]):
            ids_list = predicted_ids[j].cpu().tolist()
            phoneme_list = ids_to_phoneme_list(ids_list, tokenizer, pad_id)
            # Scatter back to the segment's original position.
            results[batch_idx[j]] = phoneme_list
        decode_time = time.time() - t_decode_start

        # Drop tensor references promptly to free (GPU) memory between batches.
        del input_values, attention_mask, outputs, logits, predicted_ids

        batch_time = time.time() - t0

        batch_profiling.append({
            "batch_num": batch_num,
            "size": len(batch_audios),
            "time": batch_time,
            "feat_time": feat_time,
            "infer_time": infer_time,
            "decode_time": decode_time,
            "min_dur": min(batch_durations),
            "max_dur": max(batch_durations),
            "avg_dur": sum(batch_durations) / len(batch_durations),
            "total_seconds": sum(batch_durations),
            "pad_waste": 1.0 - sum(batch_durations) / (len(batch_durations) * max(batch_durations)) if max(batch_durations) > 0 else 0.0,
        })

    return results, batch_profiling
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def transcribe_batch(segment_audios: List[np.ndarray], sample_rate: int, model_name: str = PHONEME_ASR_MODEL_DEFAULT) -> tuple:
    """Transcribe audio segments to phoneme lists, sorted by duration for efficiency.

    Args:
        segment_audios: List of audio arrays
        sample_rate: Audio sample rate (currently unused — durations are
            computed assuming 16 kHz; audio is resampled at source)
        model_name: Which ASR model to use ("base" or "large")

    Returns:
        (results, batch_profiling, sorting_time, batch_build_time) where
        results is List[List[str]] in the original segment order,
        batch_profiling is a list of dicts with per-batch timing and duration
        stats, and the last two are setup timings in seconds.
    """
    if not segment_audios:
        return [], [], 0.0, 0.0

    model, processor = load_phoneme_asr(model_name)
    if model is None:
        # Model unavailable: one empty phoneme list per segment.
        return [[] for _ in segment_audios], [], 0.0, 0.0

    # Inference must follow wherever the cached model currently lives.
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype
    tokenizer = processor.tokenizer
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # Compute durations (audio assumed to be 16kHz — resampled at source)
    durations = [len(audio) / 16000.0 for audio in segment_audios]

    # Sort indices by duration, then build dynamic batches
    t_sort = time.time()
    sorted_indices = sorted(range(len(segment_audios)), key=lambda i: durations[i])
    sorting_time = time.time() - t_sort

    t_batch_build = time.time()
    if BATCHING_STRATEGY == "dynamic":
        batches = build_batches(sorted_indices, durations)
    else:
        batches = build_batches_naive(sorted_indices, INFERENCE_BATCH_SIZE)
    batch_build_time = time.time() - t_batch_build

    backend = "PyTorch" + (f" ({device.type})" if device.type != "cpu" else " (CPU)")
    print(f"[PHONEME ASR] Using {backend}")
    results, batch_profiling = _transcribe_batch_pytorch(
        segment_audios, durations, batches,
        model, processor, tokenizer, pad_id, device, dtype,
    )

    # batch_profiling is non-empty here because segment_audios is non-empty.
    sizes = [p["size"] for p in batch_profiling]
    print(f"[PHONEME ASR] {len(segment_audios)} segments in {len(batch_profiling)} batches "
          f"(sizes: {min(sizes)}-{max(sizes)}, sort: {sorting_time:.3f}s, batch build: {batch_build_time:.3f}s)")
    return results, batch_profiling, sorting_time, batch_build_time
|
src/alignment/phoneme_matcher.py
ADDED
|
@@ -0,0 +1,590 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Phoneme-based alignment using substring Levenshtein DP.
|
| 3 |
+
|
| 4 |
+
This module implements the core alignment algorithm for matching ASR phoneme
|
| 5 |
+
sequences to reference Quranic text phonemes with word-boundary constraints.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Dict, List, Optional, Tuple
|
| 11 |
+
|
| 12 |
+
from config import (
|
| 13 |
+
DATA_PATH,
|
| 14 |
+
LOOKBACK_WORDS,
|
| 15 |
+
LOOKAHEAD_WORDS,
|
| 16 |
+
MAX_EDIT_DISTANCE,
|
| 17 |
+
START_PRIOR_WEIGHT,
|
| 18 |
+
COST_SUBSTITUTION,
|
| 19 |
+
COST_DELETION,
|
| 20 |
+
COST_INSERTION,
|
| 21 |
+
PHONEME_ALIGNMENT_DEBUG,
|
| 22 |
+
PHONEME_ALIGNMENT_PROFILING,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
from ..phonemizer_utils import get_phonemizer
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# =============================================================================
|
| 29 |
+
# Phoneme Substitution Cost Lookup
|
| 30 |
+
# =============================================================================
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _load_substitution_costs() -> Dict[Tuple[str, str], float]:
    """Load phoneme pair substitution costs from JSON data file.

    Stores both orderings (a,b) and (b,a) so lookups need only a plain tuple.
    """
    costs_file = DATA_PATH / "phoneme_sub_costs.json"
    if not costs_file.exists():
        # Missing data file → no pair-specific costs; callers fall back to defaults.
        return {}

    raw = json.loads(costs_file.read_text())

    table: Dict[Tuple[str, str], float] = {}
    for section_name, section in raw.items():
        if section_name == "_meta":
            continue  # metadata section carries no pair costs
        for pair_key, raw_cost in section.items():
            left, right = pair_key.split("|")
            value = float(raw_cost)
            # Symmetric: store both orderings so lookup order never matters.
            table[(left, right)] = value
            table[(right, left)] = value
    return table
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Pair-specific substitution costs, loaded once at module import.
_SUBSTITUTION_COSTS: Dict[Tuple[str, str], float] = _load_substitution_costs()

# Try to load Cython-accelerated DP; fall back to pure Python silently.
try:
    from .._dp_core import cy_align_with_word_boundaries, init_substitution_matrix
    # Seed the extension's substitution matrix from the JSON-derived table so
    # the Cython path prices phoneme pairs identically to the Python path.
    init_substitution_matrix(_SUBSTITUTION_COSTS, COST_SUBSTITUTION)
    _USE_CYTHON_DP = True
except ImportError:
    # Extension not built for this platform — align_with_word_boundaries
    # uses the pure-Python DP fallback instead.
    _USE_CYTHON_DP = False
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_sub_cost(p: str, r: str, default: float) -> float:
    """Look up substitution cost for a phoneme pair.

    Returns 0.0 for exact match, pair-specific cost if defined, otherwise default.
    """
    # Exact matches are free; everything else consults the symmetric cost table.
    return 0.0 if p == r else _SUBSTITUTION_COSTS.get((p, r), default)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# =============================================================================
|
| 77 |
+
# Data Structures
|
| 78 |
+
# =============================================================================
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@dataclass
class RefWord:
    """Reference word with phoneme metadata."""
    text: str            # Arabic text
    phonemes: List[str]  # Phoneme list for this word
    surah: int           # Surah number
    ayah: int            # Verse number within surah
    word_num: int        # Word number within verse (1-indexed)

    @property
    def location(self) -> str:
        """Format as 'surah:ayah:word'."""
        return ":".join(str(part) for part in (self.surah, self.ayah, self.word_num))
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@dataclass
class ChapterReference:
    """Pre-built reference data for a chapter.

    Built once per surah by build_chapter_reference() and reused across
    segment alignments against that surah.
    """
    surah: int                   # Surah number this reference covers
    words: List[RefWord]         # All words of the chapter, in order
    avg_phones_per_word: float   # Mean phonemes per word (4.0 fallback when empty)

    # Pre-flattened phoneme data (avoids rebuilding per segment)
    flat_phonemes: List[str]  # All phonemes concatenated
    flat_phone_to_word: List[int]  # Word index for each phoneme (GLOBAL indices)
    word_phone_offsets: List[int]  # Prefix sum: word i starts at offset[i]

    @property
    def num_words(self) -> int:
        """Total number of words in the chapter."""
        return len(self.words)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@dataclass
class AlignmentResult:
    """Result of aligning a segment.

    Word indices refer to positions in the owning ChapterReference.words;
    j indices refer to phoneme positions within the reference window R
    that was passed to the aligner.
    """
    start_word_idx: int  # Index into ChapterReference.words
    end_word_idx: int    # Index into ChapterReference.words (inclusive)
    edit_cost: float     # Raw edit distance (may be non-integer with substitution costs)
    confidence: float    # 1.0 - (edit_cost / max(asr_len, ref_len))

    # For debugging
    j_start: int  # Start phoneme index in R window
    best_j: int   # End phoneme index in R window (exclusive)

    # Resolved word references
    start_word: RefWord
    end_word: RefWord

    # Whether Basmala prefix was consumed by the alignment
    basmala_consumed: bool = False

    @property
    def ref_from(self) -> str:
        """Start reference as 'surah:ayah:word'."""
        return self.start_word.location

    @property
    def ref_to(self) -> str:
        """End reference as 'surah:ayah:word'."""
        return self.end_word.location

    @property
    def matched_ref(self) -> str:
        """Combined reference as 'start-end'."""
        return f"{self.ref_from}-{self.ref_to}"
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# =============================================================================
|
| 149 |
+
# Helper Functions
|
| 150 |
+
# =============================================================================
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def parse_location(location: str) -> Tuple[int, int, int]:
    """Parse 'surah:ayah:word' into (surah, ayah, word_num)."""
    fields = location.split(":")
    surah, ayah, word_num = int(fields[0]), int(fields[1]), int(fields[2])
    return surah, ayah, word_num
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def get_matched_text(chapter_ref: ChapterReference, result: AlignmentResult) -> str:
    """Get Arabic text for aligned words."""
    # end_word_idx is inclusive, hence the +1 on the range bound.
    span = range(result.start_word_idx, result.end_word_idx + 1)
    return ' '.join(chapter_ref.words[i].text for i in span)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _word_loc(words: List, idx: int) -> str:
|
| 166 |
+
"""Format word index as 'ayah:word_num'."""
|
| 167 |
+
if idx < 0 or idx >= len(words):
|
| 168 |
+
return f"?({idx})"
|
| 169 |
+
w = words[idx]
|
| 170 |
+
return f"{w.ayah}:{w.word_num}"
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def print_debug_info(
    P: List[str],
    R: List[str],
    result: Optional[AlignmentResult],
    segment_idx: int,
    pointer: int,
    win_start: int,
    win_end: int,
    words: Optional[List] = None,
) -> None:
    """Print detailed alignment debug info.

    No-op unless PHONEME_ALIGNMENT_DEBUG is enabled. Shows the reference
    window, the ASR phonemes, and — when a result is present — the matched
    word span, recovered phonemes, edit cost, and confidence.
    """
    if not PHONEME_ALIGNMENT_DEBUG:
        return

    print("\n" + "━" * 60)
    print(f"[PHONEME ALIGN] Segment {segment_idx}")
    print("─" * 60)
    loc_range = ""
    if words:
        # Human-readable 'ayah:word' range for the window bounds.
        loc_range = f" = {_word_loc(words, win_start)}-{_word_loc(words, win_end)}"
    print(f" Window: words [{win_start}-{win_end}]{loc_range} "
          f"({win_end - win_start} words, {len(R)} phonemes)")
    ptr_loc = ""
    if words:
        ptr_loc = f" = {_word_loc(words, pointer)}"
    print(f" Expected start: word {pointer}{ptr_loc}")
    print()
    # Truncate long reference windows to keep the log readable.
    if len(R) <= 40:
        print(f" R: {' '.join(R)}")
    else:
        print(f" R: {' '.join(R[:20])} ... {' '.join(R[-20:])}")
    print(f" P: {' '.join(P)} ({len(P)} phonemes)")
    print()

    if result:
        recovered = R[result.j_start:result.best_j]
        print(f" ✓ MATCH: words [{result.start_word_idx}-{result.end_word_idx}] "
              f"({result.end_word_idx - result.start_word_idx + 1} words)")
        print(f" Recovered: {' '.join(recovered)} ({len(recovered)} phonemes)")
        print(f" Edit cost: {result.edit_cost}")
        print(f" Confidence: {result.confidence:.2f}")
    else:
        print(f" ✗ NO MATCH (no candidates passed threshold)")

    print("━" * 60)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# =============================================================================
|
| 221 |
+
# Chapter Reference Building
|
| 222 |
+
# =============================================================================
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def build_chapter_reference(surah_num: int) -> ChapterReference:
    """Build phoneme reference for entire chapter."""
    phonemizer = get_phonemizer()

    # Phonemize the whole surah, applying stopping rules at verse boundaries.
    phonemized = phonemizer.phonemize(
        ref=str(surah_num),
        stops=["verse"]
    )

    # The mapping provides word metadata with phonemes already as List[str].
    mapping = phonemized.get_mapping()

    ref_words = []
    for mapped in mapping.words:
        s, a, wn = parse_location(mapped.location)
        ref_words.append(RefWord(
            text=mapped.text,
            phonemes=mapped.phonemes,  # direct access, no string parsing needed
            surah=s,
            ayah=a,
            word_num=wn,
        ))

    # Average phoneme count per word (4.0 fallback for an empty chapter).
    phone_total = sum(len(w.phonemes) for w in ref_words)
    avg = phone_total / len(ref_words) if ref_words else 4.0

    # Flatten once so per-segment windowing never rebuilds these arrays.
    flat: List[str] = []
    phone_to_word: List[int] = []
    offsets: List[int] = []
    for word_idx, w in enumerate(ref_words):
        offsets.append(len(flat))  # start offset of this word's phonemes
        flat.extend(w.phonemes)
        phone_to_word.extend([word_idx] * len(w.phonemes))
    # Sentinel: one past the last phoneme, for convenient slicing.
    offsets.append(len(flat))

    return ChapterReference(
        surah=surah_num,
        words=ref_words,
        avg_phones_per_word=avg,
        flat_phonemes=flat,
        flat_phone_to_word=phone_to_word,
        word_phone_offsets=offsets,
    )
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
# =============================================================================
|
| 279 |
+
# Word-Boundary-Constrained Alignment (DP)
|
| 280 |
+
# =============================================================================
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def align_with_word_boundaries(
    P: List[str],
    R: List[str],
    R_phone_to_word: List[int],
    expected_word: int = 0,
    prior_weight: float = START_PRIOR_WEIGHT,
    cost_sub: float = COST_SUBSTITUTION,
    cost_del: float = COST_DELETION,
    cost_ins: float = COST_INSERTION,
) -> Tuple[Optional[int], Optional[int], float, float]:
    """
    Word-boundary-constrained substring alignment.

    Combines DP computation with best-match selection:
    - Start: only word-start positions allowed (INF cost otherwise)
    - End: only word-end positions evaluated as candidates

    Args:
        P: ASR phoneme sequence
        R: Reference phoneme window
        R_phone_to_word: Maps phoneme index -> word index (GLOBAL indices)
        expected_word: Expected starting word index (for position prior)
        prior_weight: Penalty per word distance from expected
        cost_sub: Substitution cost
        cost_del: Deletion cost (delete from P)
        cost_ins: Insertion cost (insert from R)

    Returns:
        (best_j, best_j_start, best_cost, best_norm_dist) or (None, None, INF, INF)
    """
    if _USE_CYTHON_DP:
        return cy_align_with_word_boundaries(
            P, R, R_phone_to_word,
            expected_word, prior_weight,
            cost_sub, cost_del, cost_ins,
        )

    # --- Pure Python fallback ---
    m, n = len(P), len(R)
    INF = float('inf')

    if m == 0 or n == 0:
        return None, None, INF, INF

    # DP column semantics:
    #   Column j represents "consumed j phonemes" / boundary after phoneme j-1.
    #   Column 0 = before any phonemes, column n = after all phonemes.
    #   Phoneme indices are 0..n-1, DP columns are 0..n.
    #
    # Boundary flags are precomputed ONCE here instead of being re-derived via
    # predicate calls inside the DP loops (hoists loop-invariant work out of
    # the O(m*n) hot path).
    #   start_ok[j]: alignment may START at column j (word-start columns only)
    #   end_ok[j]:   alignment may END at column j (word-end columns only)
    start_ok = [False] * (n + 1)
    end_ok = [False] * (n + 1)
    start_ok[0] = True   # Column 0 is always a valid start (first word)
    end_ok[n] = True     # Column n (end of reference) is always a valid end
    for j in range(1, n):
        # Phoneme j beginning a new word means column j is simultaneously a
        # valid start (a word begins at j) and a valid end (the word covering
        # phoneme j-1 just finished).
        if R_phone_to_word[j] != R_phone_to_word[j - 1]:
            start_ok[j] = True
            end_ok[j] = True

    # Initialize: free start ONLY at word boundaries
    prev_cost = [0.0 if start_ok[j] else INF for j in range(n + 1)]
    prev_start = [j if start_ok[j] else -1 for j in range(n + 1)]

    curr_cost = [0.0] * (n + 1)
    curr_start = [0] * (n + 1)

    # DP computation (two-row rolling buffers; prev_* is row i-1)
    for i in range(1, m + 1):
        # Column 0 is always a valid start here (n > 0 was checked above),
        # so the original "else INF" branch could never trigger.
        curr_cost[0] = i * cost_del
        curr_start[0] = 0

        for j in range(1, n + 1):
            del_option = prev_cost[j] + cost_del
            ins_option = curr_cost[j - 1] + cost_ins
            sub_option = prev_cost[j - 1] + get_sub_cost(P[i - 1], R[j - 1], cost_sub)

            # Tie-break order preserved from the original: sub > del > ins
            if sub_option <= del_option and sub_option <= ins_option:
                curr_cost[j] = sub_option
                curr_start[j] = prev_start[j - 1]
            elif del_option <= ins_option:
                curr_cost[j] = del_option
                curr_start[j] = prev_start[j]
            else:
                curr_cost[j] = ins_option
                curr_start[j] = curr_start[j - 1]

        prev_cost, curr_cost = curr_cost, prev_cost
        prev_start, curr_start = curr_start, prev_start

    # After DP: prev_cost/prev_start now contain the final row (after m
    # iterations). Evaluate only valid end-boundary columns as candidates.
    best_score = INF  # Score includes float norm_dist + prior, so keep as float
    best_j = None
    best_j_start = None
    best_cost = INF
    best_norm_dist = INF

    for j in range(1, n + 1):
        # Skip non-end-boundary positions
        if not end_ok[j]:
            continue

        # Skip infinite cost (no valid alignment ends here)
        if prev_cost[j] >= INF:
            continue

        dist = prev_cost[j]
        j_start = prev_start[j]

        # Normalized edit distance over the longer of the two spans
        ref_len = j - j_start
        denom = max(m, ref_len, 1)
        norm_dist = dist / denom

        # Position prior: penalize starts far from the expected word
        start_word = R_phone_to_word[j_start] if j_start < n else R_phone_to_word[j - 1]
        prior = prior_weight * abs(start_word - expected_word)
        score = norm_dist + prior

        if score < best_score:
            best_score = score
            best_j = j
            best_j_start = j_start
            best_cost = dist
            best_norm_dist = norm_dist

    return best_j, best_j_start, best_cost, best_norm_dist
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# =============================================================================
|
| 421 |
+
# Per-Segment Alignment
|
| 422 |
+
# =============================================================================
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def align_segment(
    asr_phonemes: List[str],
    chapter_ref: ChapterReference,
    pointer: int,
    segment_idx: int = 0,
    basmala_prefix: bool = False,
    lookback_override: Optional[int] = None,
    lookahead_override: Optional[int] = None,
    max_edit_distance_override: Optional[float] = None,
) -> Tuple[Optional[AlignmentResult], dict]:
    """
    Align ASR phonemes to reference using substring Levenshtein DP.

    Args:
        asr_phonemes: Phoneme sequence from ASR for this segment
        chapter_ref: Pre-built chapter reference data
        pointer: First unprocessed word index (0 at start of chapter)
        segment_idx: Segment number for debug output
        basmala_prefix: If True, prepend Basmala phonemes to the R window
            so the DP can consume a fused Basmala+verse segment
        lookback_override: Override LOOKBACK_WORDS for this call
        lookahead_override: Override LOOKAHEAD_WORDS for this call
        max_edit_distance_override: Override MAX_EDIT_DISTANCE for this call

    Returns:
        (AlignmentResult or None, timing_dict). ``timing_dict`` always has
        keys 'window_setup_time', 'dp_time', 'result_build_time'; they stay
        0.0 unless PHONEME_ALIGNMENT_PROFILING is enabled.
    """
    timing = {'window_setup_time': 0.0, 'dp_time': 0.0, 'result_build_time': 0.0}

    # Only import time if profiling is enabled
    if PHONEME_ALIGNMENT_PROFILING:
        import time

    P = asr_phonemes
    m = len(P)

    # Nothing recognized in this segment — nothing to align
    if m == 0:
        return None, timing

    words = chapter_ref.words
    avg_phones = chapter_ref.avg_phones_per_word
    num_words = chapter_ref.num_words

    # === WINDOW SETUP ===
    if PHONEME_ALIGNMENT_PROFILING:
        t0 = time.perf_counter()

    # 1. Estimate word count from phoneme count
    est_words = max(1, round(m / avg_phones))

    # 2. Define search window (word indices); the window extends lb words
    #    behind the pointer and est_words + la words ahead of it.
    lb = lookback_override if lookback_override is not None else LOOKBACK_WORDS
    la = lookahead_override if lookahead_override is not None else LOOKAHEAD_WORDS
    win_start = max(0, pointer - lb)
    win_end = min(num_words, pointer + est_words + la)

    # End of chapter check
    if win_start >= num_words:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['window_setup_time'] = time.perf_counter() - t0
        if PHONEME_ALIGNMENT_DEBUG:
            print(f"[PHONEME ALIGN] Segment {segment_idx}: Past end of chapter")
        return None, timing

    # 3. Slice pre-flattened phoneme window (offsets map word idx -> flat idx,
    #    with a sentinel at the end, so this slicing never goes out of range)
    phone_start = chapter_ref.word_phone_offsets[win_start]
    phone_end = chapter_ref.word_phone_offsets[win_end]

    R = chapter_ref.flat_phonemes[phone_start:phone_end]
    R_phone_to_word = chapter_ref.flat_phone_to_word[phone_start:phone_end]

    # Optionally prepend Basmala phonemes so the DP can consume fused Basmala+verse.
    # Prefix phonemes are tagged with a sentinel word index so they can be
    # recognized (and stripped) when mapping the match back to real words.
    BASMALA_SENTINEL = -1
    prefix_phonemes = None
    if basmala_prefix:
        from .special_segments import SPECIAL_PHONEMES
        prefix_phonemes = SPECIAL_PHONEMES["Basmala"]

    if prefix_phonemes is not None:
        prefix_len = len(prefix_phonemes)
        R = list(prefix_phonemes) + list(R)
        R_phone_to_word = [BASMALA_SENTINEL] * prefix_len + list(R_phone_to_word)

    n = len(R)

    if n == 0:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['window_setup_time'] = time.perf_counter() - t0
        if PHONEME_ALIGNMENT_DEBUG:
            print(f"[PHONEME ALIGN] Segment {segment_idx}: Empty reference window")
        return None, timing

    if PHONEME_ALIGNMENT_PROFILING:
        timing['window_setup_time'] = time.perf_counter() - t0

    # === DP ===
    if PHONEME_ALIGNMENT_PROFILING:
        t0 = time.perf_counter()

    # 4. Run word-boundary-constrained alignment (DP + selection in one pass)
    best_j, j_start, best_cost, norm_dist = align_with_word_boundaries(
        P, R, R_phone_to_word,
        expected_word=pointer,
        prior_weight=START_PRIOR_WEIGHT
    )

    if PHONEME_ALIGNMENT_PROFILING:
        timing['dp_time'] = time.perf_counter() - t0

    # === RESULT BUILD ===
    if PHONEME_ALIGNMENT_PROFILING:
        t0 = time.perf_counter()

    # No valid word-boundary alignment at all
    if best_j is None:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['result_build_time'] = time.perf_counter() - t0
        print_debug_info(P, R, None, segment_idx, pointer, win_start, win_end, words)
        return None, timing

    # 5. Check acceptance threshold (reject low-quality matches)
    threshold = max_edit_distance_override if max_edit_distance_override is not None else MAX_EDIT_DISTANCE
    if norm_dist > threshold:
        if PHONEME_ALIGNMENT_PROFILING:
            timing['result_build_time'] = time.perf_counter() - t0
        print_debug_info(P, R, None, segment_idx, pointer, win_start, win_end, words)
        return None, timing

    # 6. Confidence is 1 - normalized distance
    confidence = 1.0 - norm_dist

    # 7. Map phoneme indices to word indices
    start_word_idx = R_phone_to_word[j_start]
    end_word_idx = R_phone_to_word[best_j - 1]

    # Handle prefix: if alignment starts in the prefix region, find the first real word
    basmala_consumed = False
    if prefix_phonemes is not None and start_word_idx == BASMALA_SENTINEL:
        basmala_consumed = True
        for k in range(j_start, best_j):
            if R_phone_to_word[k] != BASMALA_SENTINEL:
                start_word_idx = R_phone_to_word[k]
                break
        else:
            # Entire match is just Basmala with no verse content — reject
            if PHONEME_ALIGNMENT_PROFILING:
                timing['result_build_time'] = time.perf_counter() - t0
            return None, timing

    result = AlignmentResult(
        start_word_idx=start_word_idx,
        end_word_idx=end_word_idx,
        edit_cost=best_cost,
        confidence=confidence,
        j_start=j_start,
        best_j=best_j,
        start_word=words[start_word_idx],
        end_word=words[end_word_idx],
        basmala_consumed=basmala_consumed,
    )

    if PHONEME_ALIGNMENT_PROFILING:
        timing['result_build_time'] = time.perf_counter() - t0

    # Debug output (print_debug_info presumably gates on the debug flag
    # internally — it is called unconditionally here)
    print_debug_info(P, R, result, segment_idx, pointer, win_start, win_end, words)

    return result, timing
|
src/alignment/phoneme_matcher_cache.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cache for ChapterReference objects.
|
| 3 |
+
|
| 4 |
+
Loads pre-built chapter references from a pickle file (built by
|
| 5 |
+
scripts/build_phoneme_cache.py) to avoid runtime phonemization.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import pickle
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from .phoneme_matcher import ChapterReference
|
| 13 |
+
|
| 14 |
+
# Global cache: surah number -> ChapterReference
|
| 15 |
+
_chapter_cache: dict[int, "ChapterReference"] = {}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_chapter_reference(surah: int) -> "ChapterReference":
    """
    Look up a chapter reference, building it on demand if absent.

    Args:
        surah: Surah number (1-114)

    Returns:
        ChapterReference with pre-built phoneme data
    """
    try:
        # Fast path: reference already in the in-process cache
        return _chapter_cache[surah]
    except KeyError:
        # Fallback: build at runtime if cache wasn't preloaded
        from .phoneme_matcher import build_chapter_reference
        print(f"[CACHE] WARNING: Building reference for Surah {surah} at runtime "
              "(phoneme cache not loaded — run scripts/build_phoneme_cache.py)")
        ref = build_chapter_reference(surah)
        _chapter_cache[surah] = ref
        return ref
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def preload_all_chapters() -> None:
    """Load all 114 chapter references from the pre-built cache file."""
    from config import PHONEME_CACHE_PATH

    # No pre-built pickle: phonemize every chapter at runtime (slow path)
    if not PHONEME_CACHE_PATH.exists():
        print(f"[CACHE] WARNING: {PHONEME_CACHE_PATH} not found, "
              "falling back to runtime phonemization")
        print("[CACHE] Run: python scripts/build_phoneme_cache.py")
        for surah_num in range(1, 115):
            get_chapter_reference(surah_num)
        print("[CACHE] All 114 chapters built at runtime")
        return

    # Fast path: one pickle load populates the whole cache
    print(f"[CACHE] Loading phoneme cache from {PHONEME_CACHE_PATH}...")
    with open(PHONEME_CACHE_PATH, "rb") as fh:
        loaded: dict[int, "ChapterReference"] = pickle.load(fh)
    _chapter_cache.update(loaded)
    print(f"[CACHE] Loaded {len(loaded)} chapters from cache")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def clear_chapter_cache() -> None:
    """Clear cache (for memory management).

    Subsequent get_chapter_reference() calls will rebuild entries at
    runtime unless preload_all_chapters() is called again first.
    """
    _chapter_cache.clear()
    print("[CACHE] Cleared chapter cache")
|
src/alignment/special_segments.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Phoneme-based special segment detection for Basmala and Isti'adha.
|
| 3 |
+
|
| 4 |
+
These are common recitation openers that need special handling:
|
| 5 |
+
- Isti'adha: "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم" (I seek refuge in Allah)
|
| 6 |
+
- Basmala: "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم" (In the name of Allah)
|
| 7 |
+
|
| 8 |
+
Detection uses phoneme edit distance for robustness against ASR errors.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from typing import List, Tuple, Optional
|
| 14 |
+
|
| 15 |
+
# =============================================================================
|
| 16 |
+
# Constants
|
| 17 |
+
# =============================================================================
|
| 18 |
+
|
| 19 |
+
from config import MAX_SPECIAL_EDIT_DISTANCE
|
| 20 |
+
|
| 21 |
+
# Special phoneme sequences
|
| 22 |
+
# IPA-style phoneme sequences for the two special recitation openers.
# Phonemes use multi-character tokens (e.g. "ll", "rˤrˤ", long vowels "a:"),
# matching the tokenization used elsewhere in this package.
SPECIAL_PHONEMES = {
    "Isti'adha": [
        "ʔ", "a", "ʕ", "u:", "ð", "u", "b", "i", "ll", "a:", "h", "i",
        "m", "i", "n", "a", "ʃʃ", "a", "j", "tˤ", "aˤ:", "n", "i",
        "rˤrˤ", "aˤ", "ʒ", "i:", "m"
    ],
    "Basmala": [
        "b", "i", "s", "m", "i", "ll", "a:", "h", "i", "rˤrˤ", "aˤ",
        "ħ", "m", "a:", "n", "i", "rˤrˤ", "aˤ", "ħ", "i:", "m"
    ],
}

# Combined = Isti'adha + Basmala (for detecting both fused in one segment)
COMBINED_PHONEMES = SPECIAL_PHONEMES["Isti'adha"] + SPECIAL_PHONEMES["Basmala"]

# Arabic text for display (returned as matched_text in detection results)
SPECIAL_TEXT = {
    "Isti'adha": "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم",
    "Basmala": "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم",
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# =============================================================================
|
| 45 |
+
# Levenshtein Distance
|
| 46 |
+
# =============================================================================
|
| 47 |
+
|
| 48 |
+
def levenshtein_distance(seq1: List[str], seq2: List[str]) -> int:
    """
    Standard Levenshtein edit distance between two sequences.

    Counts the minimum number of single-element insertions, deletions,
    and substitutions required to transform seq1 into seq2.

    Args:
        seq1: First sequence (list of phonemes)
        seq2: Second sequence (list of phonemes)

    Returns:
        Edit distance (number of insertions, deletions, substitutions)
    """
    # Degenerate cases: distance to an empty sequence is the other's length
    if not seq1:
        return len(seq2)
    if not seq2:
        return len(seq1)

    # Wagner-Fischer with a single rolling row: O(len(seq2)) memory
    previous = list(range(len(seq2) + 1))
    for row, left in enumerate(seq1, start=1):
        current = [row]
        for col, right in enumerate(seq2, start=1):
            if left == right:
                # Exact match: carry the diagonal value, no edit
                current.append(previous[col - 1])
            else:
                current.append(1 + min(
                    previous[col],       # Deletion
                    current[col - 1],    # Insertion
                    previous[col - 1],   # Substitution
                ))
        previous = current

    return previous[-1]
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def phoneme_edit_distance(asr_phonemes: List[str], ref_phonemes: List[str]) -> float:
    """
    Normalized edit distance between two phoneme sequences.

    Args:
        asr_phonemes: ASR output phoneme sequence
        ref_phonemes: Reference phoneme sequence

    Returns:
        Normalized edit distance in [0.0, 1.0]:
        0.0 = identical, 1.0 = completely different (or either input empty)
    """
    # An empty sequence cannot meaningfully match anything
    if not asr_phonemes or not ref_phonemes:
        return 1.0

    longest = max(len(asr_phonemes), len(ref_phonemes))
    return levenshtein_distance(asr_phonemes, ref_phonemes) / longest
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# =============================================================================
|
| 108 |
+
# Special Segment Detection
|
| 109 |
+
# =============================================================================
|
| 110 |
+
|
| 111 |
+
def detect_special_segments(
    phoneme_texts: List[List[str]],
    vad_segments: List,
    segment_audios: List,
) -> Tuple[List, List, List[Tuple[str, float, str]], int]:
    """
    Detect special segments (Isti'adha/Basmala) using phoneme edit distance.

    Detection order (first match wins):
    1. Try COMBINED (Isti'adha + Basmala) on segment 0 → split segment if match
    2. Else try Isti'adha on segment 0 → if match, try Basmala on segment 1
    3. Else try Basmala on segment 0
    4. Else no specials

    Args:
        phoneme_texts: List of phoneme lists from ASR
        vad_segments: List of VadSegment objects
        segment_audios: List of audio arrays (assumed index-parallel with
            vad_segments — TODO confirm at call sites)

    Returns:
        (updated_vad_segments, updated_audios, special_results, first_quran_idx)

        special_results: List of tuples (matched_text, score, ref) for compatibility
        first_quran_idx: Index where Quran segments start (after specials)
    """
    # Import here to avoid circular imports
    from ..segment_types import VadSegment

    # Nothing to inspect — pass everything through untouched
    if not phoneme_texts or not vad_segments or not segment_audios:
        return vad_segments, segment_audios, [], 0

    special_results: List[Tuple[str, float, str]] = []

    # Segment 0 phonemes (already a list)
    seg0_phonemes = phoneme_texts[0] if phoneme_texts[0] else []

    # ==========================================================================
    # 1. Try COMBINED (Isti'adha + Basmala fused into one segment)
    # ==========================================================================
    combined_dist = phoneme_edit_distance(seg0_phonemes, COMBINED_PHONEMES)

    if combined_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Combined Isti'adha+Basmala detected (dist={combined_dist:.2f})")

        # Split segment 0 by midpoint — a heuristic half/half split, not an
        # acoustic boundary between the two openers
        seg = vad_segments[0]
        audio = segment_audios[0]
        mid_time = (seg.start_time + seg.end_time) / 2.0
        mid_sample = max(1, len(audio) // 2)  # max(1, ...) keeps slice non-empty

        # Create two new segments
        new_vads = [
            VadSegment(start_time=seg.start_time, end_time=mid_time, segment_idx=0),
            VadSegment(start_time=mid_time, end_time=seg.end_time, segment_idx=1),
        ]
        new_audios = [
            audio[:mid_sample],
            audio[mid_sample:],
        ]

        # Add remaining segments with reindexed segment_idx (shifted by +1
        # because one segment became two)
        for i, vs in enumerate(vad_segments[1:], start=2):
            new_vads.append(VadSegment(
                start_time=vs.start_time,
                end_time=vs.end_time,
                segment_idx=i
            ))
        new_audios.extend(segment_audios[1:])

        # Special results for both halves share one confidence (1 - distance)
        confidence = 1.0 - combined_dist
        special_results = [
            (SPECIAL_TEXT["Isti'adha"], confidence, "Isti'adha"),
            (SPECIAL_TEXT["Basmala"], confidence, "Basmala"),
        ]

        return new_vads, new_audios, special_results, 2

    # ==========================================================================
    # 2. Try Isti'adha on segment 0
    # ==========================================================================
    istiadha_dist = phoneme_edit_distance(seg0_phonemes, SPECIAL_PHONEMES["Isti'adha"])

    if istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Isti'adha detected on segment 0 (dist={istiadha_dist:.2f})")
        special_results.append(
            (SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist, "Isti'adha")
        )

        # Try Basmala on segment 1 (common pattern: opener pair in sequence)
        if len(phoneme_texts) >= 2 and phoneme_texts[1]:
            seg1_phonemes = phoneme_texts[1]
            basmala_dist = phoneme_edit_distance(seg1_phonemes, SPECIAL_PHONEMES["Basmala"])

            if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
                print(f"[SPECIAL] Basmala detected on segment 1 (dist={basmala_dist:.2f})")
                special_results.append(
                    (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
                )
                return vad_segments, segment_audios, special_results, 2
            else:
                print(f"[SPECIAL] No Basmala on segment 1 (dist={basmala_dist:.2f})")

        # Only Isti'adha matched — Quran content starts at segment 1
        return vad_segments, segment_audios, special_results, 1

    # ==========================================================================
    # 3. Try Basmala on segment 0
    # ==========================================================================
    basmala_dist = phoneme_edit_distance(seg0_phonemes, SPECIAL_PHONEMES["Basmala"])

    if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Basmala detected on segment 0 (dist={basmala_dist:.2f})")
        special_results.append(
            (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
        )
        return vad_segments, segment_audios, special_results, 1

    # ==========================================================================
    # 4. No specials detected
    # ==========================================================================
    print(f"[SPECIAL] No special segments detected "
          f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")

    return vad_segments, segment_audios, [], 0
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def detect_inter_chapter_specials(
    phoneme_texts: List[List[str]],
) -> Tuple[List[Tuple[str, float, str]], int]:
    """
    Detect special segments between chapters (phoneme-only, no audio splitting).

    Detection cascade — first match wins:
      1. Combined Isti'adha+Basmala on segment 0
      2. Isti'adha on segment 0; if it matches, also check Basmala on segment 1
      3. Basmala on segment 0
      4. Nothing

    Returns:
        (special_results, num_consumed)
        special_results: List of (matched_text, score, ref) tuples
        num_consumed: Number of segments consumed as specials
    """
    if not phoneme_texts or not phoneme_texts[0]:
        return [], 0

    head = phoneme_texts[0]

    # --- 1. Combined opener (both phrases fused in segment 0) ---
    d_combined = phoneme_edit_distance(head, COMBINED_PHONEMES)
    if d_combined <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Combined Isti'adha+Basmala detected (dist={d_combined:.2f})")
        joined = SPECIAL_TEXT["Isti'adha"] + " " + SPECIAL_TEXT["Basmala"]
        return [(joined, 1.0 - d_combined, "Isti'adha+Basmala")], 1

    # --- 2. Isti'adha on segment 0, Basmala possibly on segment 1 ---
    d_isti = phoneme_edit_distance(head, SPECIAL_PHONEMES["Isti'adha"])
    if d_isti <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Isti'adha detected (dist={d_isti:.2f})")
        found = [(SPECIAL_TEXT["Isti'adha"], 1.0 - d_isti, "Isti'adha")]
        used = 1

        if len(phoneme_texts) >= 2 and phoneme_texts[1]:
            d_basmala = phoneme_edit_distance(phoneme_texts[1], SPECIAL_PHONEMES["Basmala"])
            if d_basmala <= MAX_SPECIAL_EDIT_DISTANCE:
                print(f"[INTER-CHAPTER] Basmala detected on next segment (dist={d_basmala:.2f})")
                found.append((SPECIAL_TEXT["Basmala"], 1.0 - d_basmala, "Basmala"))
                used = 2
            else:
                print(f"[INTER-CHAPTER] No Basmala on next segment (dist={d_basmala:.2f})")

        return found, used

    # --- 3. Basmala alone on segment 0 ---
    d_basmala = phoneme_edit_distance(head, SPECIAL_PHONEMES["Basmala"])
    if d_basmala <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Basmala detected (dist={d_basmala:.2f})")
        return [(SPECIAL_TEXT["Basmala"], 1.0 - d_basmala, "Basmala")], 1

    # --- 4. Nothing matched ---
    print(f"[INTER-CHAPTER] No special segments detected "
          f"(istiadha={d_isti:.2f}, basmala={d_basmala:.2f})")
    return [], 0
|
src/phonemizer_utils.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phonemizer integration for reference phonemes."""
|
| 2 |
+
|
| 3 |
+
_pm = None
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_phonemizer():
    """Return the process-wide Phonemizer, constructing it on first use."""
    global _pm
    if _pm is not None:
        return _pm
    # Lazy import: constructing Phonemizer is deferred until actually needed
    from core.phonemizer import Phonemizer
    _pm = Phonemizer()
    return _pm
|
src/quran_index.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
QuranIndex: Pre-indexed Quran words for reference lookup and display.
|
| 3 |
+
|
| 4 |
+
Uses dual-script loading:
|
| 5 |
+
- QPC Hafs (qpc_hafs.json) for computation (indices, word counts, lookups)
|
| 6 |
+
- Digital Khatt (digital_khatt_v2_script.json) for display (renders correctly with DK font)
|
| 7 |
+
|
| 8 |
+
Stop signs in Digital Khatt are combining marks attached to words, while QPC Hafs
|
| 9 |
+
has spaces before stop signs. The DigitalKhatt font renders DK text correctly.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Optional
|
| 18 |
+
|
| 19 |
+
from config import QURAN_SCRIPT_PATH_COMPUTE, QURAN_SCRIPT_PATH_DISPLAY
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Verse marker prefix to filter out (end-of-ayah markers)
|
| 23 |
+
VERSE_MARKER_PREFIX = ''
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class WordInfo:
    """Information about a single Quran word.

    ``surah``/``ayah``/``word`` are the word's Quran coordinates (keys run
    from 1:1:1 through 114:6:3); ``global_idx`` is its 0-based position in
    the flat, Quran-wide word array built by :meth:`QuranIndex.load`.
    """
    global_idx: int  # Position in flat word array (0-based)
    surah: int  # Surah (chapter) number
    ayah: int  # Ayah (verse) number within the surah
    word: int  # Word number within the ayah
    text: str  # QPC Hafs text (computation)
    display_text: str  # Digital Khatt text (display)
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass
class QuranIndex:
    """
    Pre-indexed Quran words for reference lookup and display.

    Used to convert matched references (e.g. "2:255:1-2:255:5") back to
    original Arabic text with verse markers for UI rendering.
    """
    words: list[WordInfo]  # All words in Quran order
    word_lookup: dict[tuple[int, int, int], int]  # (surah, ayah, word) -> global_idx

    @classmethod
    def load(cls, compute_path: Optional[Path] = None, display_path: Optional[Path] = None) -> "QuranIndex":
        """
        Load and index the Quran from dual script sources.

        Uses QPC Hafs as primary (determines word structure) and Digital Khatt
        for display text. Falls back to QPC text if DK entry is missing.

        Filters out verse markers - they're not real words.

        Args:
            compute_path: QPC Hafs JSON; defaults to QURAN_SCRIPT_PATH_COMPUTE.
            display_path: Digital Khatt JSON; defaults to QURAN_SCRIPT_PATH_DISPLAY.
        """
        if compute_path is None:
            compute_path = QURAN_SCRIPT_PATH_COMPUTE
        if display_path is None:
            display_path = QURAN_SCRIPT_PATH_DISPLAY

        with open(compute_path, "r", encoding="utf-8") as f:
            compute_data = json.load(f)
        with open(display_path, "r", encoding="utf-8") as f:
            display_data = json.load(f)

        words: list[WordInfo] = []
        word_lookup: dict[tuple[int, int, int], int] = {}

        # Sort by location key to ensure order (1:1:1, 1:1:2, ..., 114:6:3)
        sorted_keys = sorted(compute_data.keys(), key=_parse_location_key)

        for key in sorted_keys:
            entry = compute_data[key]
            text = entry["text"]

            # Skip verse markers (QPC shouldn't have any, but safety check).
            # BUGFIX: require a non-empty prefix. str.startswith("") is True
            # for EVERY string, so an empty marker constant would otherwise
            # silently drop all 77k+ words.
            if VERSE_MARKER_PREFIX and text.startswith(VERSE_MARKER_PREFIX):
                continue

            surah = int(entry["surah"])
            ayah = int(entry["ayah"])
            word = int(entry["word"])

            # Get display text from Digital Khatt, fallback to QPC text
            dk_entry = display_data.get(key)
            display_text = dk_entry["text"] if dk_entry else text

            word_info = WordInfo(
                global_idx=len(words),
                surah=surah,
                ayah=ayah,
                word=word,
                text=text,
                display_text=display_text,
            )
            words.append(word_info)
            word_lookup[(surah, ayah, word)] = word_info.global_idx

        print(f"[QuranIndex] Loaded {len(words)} words")

        return cls(
            words=words,
            word_lookup=word_lookup,
        )

    def ref_to_indices(self, ref: str) -> Optional[tuple[int, int]]:
        """
        Convert a ref like '2:255:1-2:255:5' or '2:255:5' to global start/end indices.

        Returns None for empty/malformed refs or locations not in the index.
        """
        if not ref or ":" not in ref:
            return None
        try:
            if "-" in ref:
                start_ref, end_ref = ref.split("-")
            else:
                start_ref = end_ref = ref

            def _lookup(r: str) -> Optional[int]:
                parts = r.split(":")
                if len(parts) < 3:
                    return None
                return self.word_lookup.get((int(parts[0]), int(parts[1]), int(parts[2])))

            start_idx = _lookup(start_ref)
            end_idx = _lookup(end_ref)
            if start_idx is None or end_idx is None:
                return None
            return start_idx, end_idx
        except ValueError:
            # Best-effort parsing: non-numeric parts or an unexpected number
            # of dashes simply yield "no match" rather than raising.
            return None
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _parse_location_key(key: str) -> tuple[int, int, int]:
|
| 136 |
+
"""Parse location key like '2:255:3' into (surah, ayah, word) for sorting."""
|
| 137 |
+
parts = key.split(":")
|
| 138 |
+
return (int(parts[0]), int(parts[1]), int(parts[2]))
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# Global singleton - loaded on first access
_quran_index_cache: Optional[QuranIndex] = None


def get_quran_index() -> QuranIndex:
    """Return the process-wide QuranIndex, loading it lazily on first access."""
    global _quran_index_cache
    if _quran_index_cache is not None:
        return _quran_index_cache
    _quran_index_cache = QuranIndex.load()
    return _quran_index_cache
|
src/segment_processor.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility façade for segmentation pipeline modules."""
|
| 2 |
+
|
| 3 |
+
from .segment_types import VadSegment, SegmentInfo, ProfilingData
|
| 4 |
+
from .segmenter.segmenter_model import load_segmenter, ensure_models_on_gpu, ensure_models_on_cpu
|
| 5 |
+
from .segmenter.segmenter_aoti import test_vad_aoti_export, apply_aoti_compiled
|
| 6 |
+
from .segmenter.vad import detect_speech_segments
|
| 7 |
+
from .alignment.alignment_pipeline import run_phoneme_matching
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"VadSegment",
|
| 11 |
+
"SegmentInfo",
|
| 12 |
+
"ProfilingData",
|
| 13 |
+
"load_segmenter",
|
| 14 |
+
"ensure_models_on_gpu",
|
| 15 |
+
"ensure_models_on_cpu",
|
| 16 |
+
"detect_speech_segments",
|
| 17 |
+
"run_phoneme_matching",
|
| 18 |
+
"test_vad_aoti_export",
|
| 19 |
+
"apply_aoti_compiled",
|
| 20 |
+
]
|
src/segment_types.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data types for the segmentation pipeline."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class VadSegment:
    """Raw VAD segment with timing info."""
    start_time: float  # Segment start time on the audio timeline
    end_time: float  # Segment end time on the audio timeline
    segment_idx: int  # Ordinal position of this segment in the VAD output
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class SegmentInfo:
    """Processed segment with transcription and matching results."""
    start_time: float  # Segment start time on the audio timeline
    end_time: float  # Segment end time on the audio timeline
    transcribed_text: str  # Raw ASR transcription for this segment
    matched_text: str  # Reference text the transcription was matched against
    matched_ref: str  # e.g. "2:255:1-2:255:5"
    match_score: float  # Score of the match (higher = better; scale set by the matcher)
    error: Optional[str] = None  # Error message when processing this segment failed
    has_missing_words: bool = False  # Flag: match suggests some words are missing
    potentially_undersegmented: bool = False  # Flag: segment may cover more than one unit
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class ProfilingData:
    """Profiling metrics for the processing pipeline.

    All ``*_time`` fields are durations in seconds (see :meth:`_fmt` and the
    ``:.3f}s`` formatting in :meth:`summary`).
    """
    # Preprocessing
    resample_time: float = 0.0  # Audio resampling time
    # VAD profiling
    vad_model_load_time: float = 0.0
    vad_model_move_time: float = 0.0
    vad_inference_time: float = 0.0
    vad_gpu_time: float = 0.0  # Actual GPU lease execution time
    vad_wall_time: float = 0.0  # Wall-clock time (includes queue wait)
    # Phoneme ASR profiling
    asr_time: float = 0.0  # Wav2vec wall-clock time (includes queue wait)
    asr_gpu_time: float = 0.0  # Actual GPU lease execution time
    asr_model_move_time: float = 0.0  # ASR model GPU move time
    asr_sorting_time: float = 0.0  # Duration-sorting time
    asr_batch_build_time: float = 0.0  # Dynamic batch construction time
    asr_batch_profiling: Optional[list] = None  # Per-batch timing details (list of dicts; None until populated)
    # Global anchor profiling
    anchor_time: float = 0.0  # N-gram voting anchor detection
    # Phoneme alignment profiling
    phoneme_total_time: float = 0.0  # Overall phoneme matching time
    phoneme_ref_build_time: float = 0.0  # Time to build chapter reference
    phoneme_dp_total_time: float = 0.0  # Total DP time across all segments
    phoneme_dp_min_time: float = 0.0  # Min DP time per segment
    phoneme_dp_max_time: float = 0.0  # Max DP time per segment
    phoneme_window_setup_time: float = 0.0  # Total window slicing time
    phoneme_result_build_time: float = 0.0  # Total result construction time
    phoneme_num_segments: int = 0  # Number of segments aligned
    match_wall_time: float = 0.0  # Total matching wall-clock time
    # Retry / reanchor counters
    tier1_attempts: int = 0
    tier1_passed: int = 0
    tier1_segments: Optional[list] = None  # Segments listed on the tier-1 "passed segments" summary line
    tier2_attempts: int = 0
    tier2_passed: int = 0
    tier2_segments: Optional[list] = None  # Segments listed on the tier-2 "passed segments" summary line
    consec_reanchors: int = 0
    segments_attempted: int = 0
    segments_passed: int = 0
    special_merges: int = 0
    # Result building profiling
    result_build_time: float = 0.0  # Total result building time
    result_audio_encode_time: float = 0.0  # Audio-to-data-URL encoding
    # Total pipeline time
    total_time: float = 0.0  # End-to-end pipeline time

    @property
    def phoneme_dp_avg_time(self) -> float:
        """Average DP time per segment (0.0 when no segments were aligned)."""
        if self.phoneme_num_segments == 0:
            return 0.0
        return self.phoneme_dp_total_time / self.phoneme_num_segments

    @staticmethod
    def _fmt(seconds) -> str:
        """Format seconds as m:ss.fff when >= 60s, else as s.fffs."""
        if seconds >= 60:
            m, s = divmod(seconds, 60)
            return f"{int(m)}:{s:06.3f}"
        return f"{seconds:.3f}s"

    def summary(self) -> str:
        """Return a formatted profiling summary."""
        _fmt = self._fmt
        lines = [
            "\n" + "=" * 60,
            "PROFILING SUMMARY",
            "=" * 60,
            f" Preprocessing:",
            f" Resample: {self.resample_time:.3f}s",
            f" VAD: wall {_fmt(self.vad_wall_time)}",
            f" GPU Time: {self.vad_gpu_time:.3f}s (queue {self.vad_wall_time - self.vad_gpu_time:.3f}s)",
            f" Model Load: {self.vad_model_load_time:.3f}s",
            f" Model Move: {self.vad_model_move_time:.3f}s",
            f" Inference: {self.vad_inference_time:.3f}s",
            f" Phoneme ASR: wall {_fmt(self.asr_time)}",
            f" GPU Time: {self.asr_gpu_time:.3f}s (queue {self.asr_time - self.asr_gpu_time:.3f}s)",
            f" Model Move: {self.asr_model_move_time:.3f}s",
            f" Sorting: {self.asr_sorting_time:.3f}s",
            f" Batch Build: {self.asr_batch_build_time:.3f}s",
            f" Batches: {len(self.asr_batch_profiling) if self.asr_batch_profiling else 0}",
        ]
        if self.asr_batch_profiling:
            # One detail line per ASR batch: size, time, duration spread,
            # average/total seconds and padding waste ratio.
            for b in self.asr_batch_profiling:
                lines.append(
                    f" Batch {b['batch_num']:>2}: {b['size']:>3} segs | "
                    f"{b['time']:.3f}s | "
                    f"{b['min_dur']:.2f}-{b['max_dur']:.2f}s "
                    f"(A {b['avg_dur']:.2f}s, T {b['total_seconds']:.1f}s, W {b['pad_waste']:.0%})"
                )
        lines += [
            f" Global Anchor:",
            f" N-gram Voting: {self.anchor_time:.3f}s",
            f" Phoneme Alignment: wall {_fmt(self.match_wall_time)}",
            f" Ref Build: {self.phoneme_ref_build_time:.3f}s",
            f" Window Setup: {self.phoneme_window_setup_time:.3f}s",
            f" DP Total: {self.phoneme_dp_total_time:.3f}s",
            f" Segments: {self.phoneme_num_segments}",
            f" DP Avg/segment: {1000*self.phoneme_dp_avg_time:.3f}ms",
            f" DP Min: {1000*self.phoneme_dp_min_time:.3f}ms",
            f" DP Max: {1000*self.phoneme_dp_max_time:.3f}ms",
        ]
        # Guard against division by zero when nothing was attempted.
        pct = 100 * self.segments_passed / self.segments_attempted if self.segments_attempted else 0
        t1_segs = self.tier1_segments or []
        t2_segs = self.tier2_segments or []
        lines += [
            f" Alignment Stats:",
            f" Attempted: {self.segments_attempted}",
            f" Passed: {self.segments_passed} ({pct:.1f}%)",
            f" Tier 1 Retries: {self.tier1_passed}/{self.tier1_attempts} passed segments: {t1_segs}",
            f" Tier 2 Retries: {self.tier2_passed}/{self.tier2_attempts} passed segments: {t2_segs}",
            f" Reanchors (consec failures): {self.consec_reanchors}",
            f" Special Merges: {self.special_merges}",
            "-" * 60,
        ]
        # Sum only the top-level wall times; "unaccounted" exposes overhead
        # that none of the profiled stages captured.
        profiled_sum = (self.resample_time + self.vad_wall_time + self.asr_time
                        + self.anchor_time + self.match_wall_time + self.result_build_time)
        unaccounted = self.total_time - profiled_sum
        lines += [
            f" PROFILED SUM: {_fmt(profiled_sum)}",
            f" TOTAL (wall): {_fmt(self.total_time)} (unaccounted: {_fmt(unaccounted)})",
            "=" * 60,
        ]
        return "\n".join(lines)
|
src/segmenter/__init__.py
ADDED
|
File without changes
|
src/segmenter/segmenter_aoti.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AoTInductor compilation utilities for the VAD segmenter."""
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from config import (
|
| 6 |
+
AOTI_ENABLED, AOTI_MIN_AUDIO_MINUTES, AOTI_MAX_AUDIO_MINUTES,
|
| 7 |
+
AOTI_HUB_REPO, AOTI_HUB_ENABLED,
|
| 8 |
+
)
|
| 9 |
+
from .segmenter_model import _segmenter_cache
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# =============================================================================
|
| 13 |
+
# AoT Compilation Test
|
| 14 |
+
# =============================================================================
|
| 15 |
+
|
| 16 |
+
_aoti_cache = {
|
| 17 |
+
"exported": None,
|
| 18 |
+
"compiled": None,
|
| 19 |
+
"tested": False,
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def is_aoti_applied() -> bool:
|
| 24 |
+
"""Return True if a compiled AoTI model has been applied."""
|
| 25 |
+
return bool(_aoti_cache.get("applied"))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _get_aoti_hub_filename():
    """Build the Hub artifact name, encoding the min/max audio duration bounds."""
    return "vad_aoti_{}min_{}min.pt2".format(AOTI_MIN_AUDIO_MINUTES, AOTI_MAX_AUDIO_MINUTES)
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _try_load_aoti_from_hub(model):
    """
    Try to load a pre-compiled AoTI model from Hub.
    Returns True if successful, False otherwise.

    On success the compiled graph replaces ``model.forward`` in place and the
    module-level ``_aoti_cache`` is updated ("compiled", "applied"). Requires
    the HF_TOKEN environment variable and AOTI_HUB_ENABLED.
    """
    import os
    import time

    if not AOTI_HUB_ENABLED:
        print("[AoTI] Hub persistence disabled")
        return False

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("[AoTI] HF_TOKEN not set, cannot access Hub")
        return False

    filename = _get_aoti_hub_filename()
    # NOTE(review): this message prints a literal "(unknown)" even though
    # `filename` was just computed - presumably it was meant to be
    # interpolated here; left unchanged.
    print(f"[AoTI] Checking Hub for pre-compiled model: {AOTI_HUB_REPO}/(unknown)")

    try:
        from huggingface_hub import hf_hub_download, HfApi

        # Check if file exists in repo
        api = HfApi(token=token)
        try:
            files = api.list_repo_files(AOTI_HUB_REPO, token=token)
            if filename not in files:
                print(f"[AoTI] Compiled model not found on Hub (available: {files})")
                return False
        except Exception as e:
            # Listing can fail on missing repo / auth problems; treat as cache miss.
            print(f"[AoTI] Could not list Hub repo: {e}")
            return False

        # Download the compiled graph
        t0 = time.time()
        compiled_graph_file = hf_hub_download(
            AOTI_HUB_REPO, filename, token=token
        )
        download_time = time.time() - t0
        print(f"[AoTI] Downloaded from Hub in {download_time:.1f}s: {compiled_graph_file}")

        # Load using ZeroGPU AOTI utilities
        from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights, drain_module_parameters

        state_dict = model.state_dict()
        zerogpu_weights = ZeroGPUWeights({name: weight for name, weight in state_dict.items()})
        compiled = ZeroGPUCompiledModel(compiled_graph_file, zerogpu_weights)

        # Replace forward method so inference goes through the compiled graph,
        # then drop the now-duplicated module parameters.
        setattr(model, "forward", compiled)
        drain_module_parameters(model)

        _aoti_cache["compiled"] = compiled
        _aoti_cache["applied"] = True
        print(f"[AoTI] Loaded and applied compiled model from Hub")
        return True

    except Exception as e:
        # Any failure falls back to fresh compilation in the caller.
        print(f"[AoTI] Failed to load from Hub: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return False
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _push_aoti_to_hub(compiled):
    """
    Push compiled AoTI model to Hub for future reuse.

    Writes the compiled archive (a BytesIO on ``compiled.archive_file``) to a
    temp file and uploads it under the duration-encoded filename. Returns
    True on success, False otherwise; requires HF_TOKEN and AOTI_HUB_ENABLED.
    """
    import os
    import time
    import tempfile

    if not AOTI_HUB_ENABLED:
        print("[AoTI] Hub persistence disabled, skipping upload")
        return False

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("[AoTI] HF_TOKEN not set, cannot upload to Hub")
        return False

    filename = _get_aoti_hub_filename()
    # NOTE(review): "(unknown)" in the message is likely a lost `{filename}`
    # interpolation; left unchanged.
    print(f"[AoTI] Uploading compiled model to Hub: {AOTI_HUB_REPO}/(unknown)")

    try:
        from huggingface_hub import HfApi, create_repo

        api = HfApi(token=token)

        # Create repo if it doesn't exist
        try:
            create_repo(AOTI_HUB_REPO, exist_ok=True, token=token)
        except Exception as e:
            # Non-fatal: repo may already exist or creation may be restricted.
            print(f"[AoTI] Repo creation note: {e}")

        # Get the archive file from the compiled object
        archive = compiled.archive_file
        if archive is None:
            print("[AoTI] Compiled object has no archive_file, cannot upload")
            return False

        t0 = time.time()

        # Write archive to temp file and upload
        with tempfile.TemporaryDirectory() as tmpdir:
            output_path = os.path.join(tmpdir, filename)

            # archive is a BytesIO object
            with open(output_path, "wb") as f:
                f.write(archive.getvalue())

            info = api.upload_file(
                repo_id=AOTI_HUB_REPO,
                path_or_fileobj=output_path,
                path_in_repo=filename,
                commit_message=f"Add compiled VAD model ({AOTI_MIN_AUDIO_MINUTES}-{AOTI_MAX_AUDIO_MINUTES} min)",
                token=token,
            )

        upload_time = time.time() - t0
        print(f"[AoTI] Uploaded to Hub in {upload_time:.1f}s: {info}")
        return True

    except Exception as e:
        # Upload is best-effort; a failure never breaks the caller.
        print(f"[AoTI] Failed to upload to Hub: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return False
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def test_vad_aoti_export():
    """
    Test torch.export AoT compilation for VAD model using spaces.aoti_capture.
    Must be called AFTER model is on GPU (inside GPU-decorated function).

    Checks Hub for pre-compiled model first. If found, loads it directly.
    Otherwise, compiles fresh and uploads to Hub for future reuse.

    Uses aoti_capture to capture the EXACT call signature from a real inference
    call to segment_recitations, ensuring the export matches what the model
    actually receives during inference.

    Returns dict with test results and timing.
    """
    import time

    # Result record returned on every exit path.
    results = {
        "export_success": False,
        "export_time": 0.0,
        "compile_success": False,
        "compile_time": 0.0,
        "hub_loaded": False,
        "hub_uploaded": False,
        "error": None,
    }

    if not AOTI_ENABLED:
        results["error"] = "AoTI disabled in config"
        print("[AoTI] Disabled via AOTI_ENABLED=False")
        return results

    # Only run once per process; subsequent calls short-circuit.
    if _aoti_cache["tested"]:
        print("[AoTI] Already tested this session, skipping")
        return {"skipped": True, **results}

    _aoti_cache["tested"] = True

    # Check model is loaded and on GPU
    if not _segmenter_cache["loaded"] or _segmenter_cache["model"] is None:
        results["error"] = "Model not loaded"
        print(f"[AoTI] {results['error']}")
        return results

    model = _segmenter_cache["model"]
    processor = _segmenter_cache["processor"]
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype

    if device.type != "cuda":
        results["error"] = f"Model not on GPU (device={device})"
        print(f"[AoTI] {results['error']}")
        return results

    print(f"[AoTI] Testing torch.export on VAD model (device={device}, dtype={dtype})")

    # Import spaces for aoti_capture
    try:
        import spaces
    except ImportError:
        results["error"] = "spaces module not available"
        print(f"[AoTI] {results['error']}")
        return results

    # Try to load pre-compiled model from Hub first
    if _try_load_aoti_from_hub(model):
        results["hub_loaded"] = True
        results["compile_success"] = True
        print("[AoTI] Using pre-compiled model from Hub")
        return results

    # No cached model found - compile fresh
    print("[AoTI] No cached model on Hub, compiling fresh...")

    # Convert config minutes to samples (16kHz audio)
    SAMPLES_PER_MINUTE = 16000 * 60
    min_samples = int(AOTI_MIN_AUDIO_MINUTES * SAMPLES_PER_MINUTE)
    # NOTE(review): max_samples is computed but never used below - the max
    # bound is derived later from the captured frame rate instead.
    max_samples = int(AOTI_MAX_AUDIO_MINUTES * SAMPLES_PER_MINUTE)

    # Create test audio for capture - use min duration to save memory
    # MUST be on CPU - segment_recitations moves to GPU internally
    test_audio = torch.randn(min_samples, device="cpu")
    print(f"[AoTI] Test audio: {min_samples} samples ({AOTI_MIN_AUDIO_MINUTES} min)")

    # Capture the exact args/kwargs used by segment_recitations
    try:
        from recitations_segmenter import segment_recitations

        print("[AoTI] Capturing call signature via aoti_capture...")
        with spaces.aoti_capture(model) as call:
            segment_recitations(
                [test_audio], model, processor,
                device=device, dtype=dtype, batch_size=1,
            )

        print(f"[AoTI] Captured args: {len(call.args)} positional, {list(call.kwargs.keys())} kwargs")

    except Exception as e:
        results["error"] = f"aoti_capture failed: {type(e).__name__}: {e}"
        print(f"[AoTI] {results['error']}")
        import traceback
        traceback.print_exc()
        return results

    # Build dynamic shapes from captured tensors
    # The sequence dimension (T) varies with audio length
    try:
        from torch.export import export, Dim

        # Derive frame rate from captured tensor (model's actual output rate)
        # Find the first 2D+ tensor to get the captured frame count
        captured_frames = None
        for val in list(call.kwargs.values()) + list(call.args):
            if isinstance(val, torch.Tensor) and val.dim() >= 2:
                captured_frames = val.shape[1]
                break

        if captured_frames is None:
            raise ValueError("No 2D+ tensor found in captured args/kwargs")

        # Calculate frames per minute from captured data
        frames_per_minute = captured_frames / AOTI_MIN_AUDIO_MINUTES
        min_frames = captured_frames  # Already at min duration
        max_frames = int(AOTI_MAX_AUDIO_MINUTES * frames_per_minute)
        dynamic_T = Dim("T", min=min_frames, max=max_frames)
        print(f"[AoTI] Captured {captured_frames} frames for {AOTI_MIN_AUDIO_MINUTES} min = {frames_per_minute:.1f} frames/min")
        print(f"[AoTI] Dynamic shape range: {min_frames}-{max_frames} frames")

        # Build dynamic_shapes dict matching the captured signature
        dynamic_shapes_args = []
        for arg in call.args:
            if isinstance(arg, torch.Tensor) and arg.dim() >= 2:
                # Assume sequence dim is dim 1 for 2D+ tensors
                dynamic_shapes_args.append({1: dynamic_T})
            else:
                dynamic_shapes_args.append(None)

        dynamic_shapes_kwargs = {}
        for key, val in call.kwargs.items():
            if isinstance(val, torch.Tensor) and val.dim() >= 2:
                dynamic_shapes_kwargs[key] = {1: dynamic_T}
            else:
                dynamic_shapes_kwargs[key] = None

        print(f"[AoTI] Dynamic shapes - args: {dynamic_shapes_args}, kwargs: {list(dynamic_shapes_kwargs.keys())}")

        t0 = time.time()
        # Export using captured signature - guarantees match with inference
        exported = export(
            model,
            args=call.args,
            kwargs=call.kwargs,
            dynamic_shapes=(dynamic_shapes_args, dynamic_shapes_kwargs) if dynamic_shapes_args else dynamic_shapes_kwargs,
            strict=False,
        )
        results["export_time"] = time.time() - t0
        results["export_success"] = True
        _aoti_cache["exported"] = exported
        print(f"[AoTI] torch.export SUCCESS in {results['export_time']:.1f}s")

    except Exception as e:
        results["error"] = f"torch.export failed: {type(e).__name__}: {e}"
        print(f"[AoTI] {results['error']}")
        import traceback
        traceback.print_exc()
        return results

    # Attempt spaces.aoti_compile
    try:
        t0 = time.time()
        compiled = spaces.aoti_compile(exported)
        results["compile_time"] = time.time() - t0
        results["compile_success"] = True
        _aoti_cache["compiled"] = compiled
        print(f"[AoTI] spaces.aoti_compile SUCCESS in {results['compile_time']:.1f}s")

        # Return compiled object - apply happens OUTSIDE GPU lease (in main process)
        results["compiled"] = compiled
        print(f"[AoTI] Compiled object ready for apply")

        # Upload to Hub for future reuse
        if _push_aoti_to_hub(compiled):
            results["hub_uploaded"] = True

    except Exception as e:
        results["error"] = f"aoti_compile failed: {type(e).__name__}: {e}"
        print(f"[AoTI] {results['error']}")
        import traceback
        traceback.print_exc()

    return results
| 354 |
+
|
| 355 |
+
|
| 356 |
+
def apply_aoti_compiled(compiled):
    """Install an AoTI-compiled graph onto the cached VAD segmenter model.

    Must be called OUTSIDE GPU lease, in main process. Returns True on
    success, False when there is nothing to apply or the apply fails.
    """
    if compiled is None:
        print("[AoTI] No compiled object to apply")
        return False

    target = _segmenter_cache.get("model")
    if target is None:
        print("[AoTI] Model not loaded, cannot apply")
        return False

    try:
        import spaces
        spaces.aoti_apply(compiled, target)
        # Record session state so is_aoti_applied() reflects the install.
        _aoti_cache["compiled"] = compiled
        _aoti_cache["applied"] = True
        print(f"[AoTI] Compiled model applied to VAD (model_id={id(target)})")
        return True
    except Exception as e:
        print(f"[AoTI] Apply failed: {e}")
        return False
|
src/segmenter/segmenter_model.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model lifecycle and device management for the VAD segmenter."""
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from config import SEGMENTER_MODEL, DTYPE, IS_HF_SPACE, TORCH_COMPILE
|
| 6 |
+
from ..zero_gpu import ZERO_GPU_AVAILABLE, is_quota_exhausted, is_user_forced_cpu
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# =============================================================================
|
| 10 |
+
# Model caches
|
| 11 |
+
# =============================================================================
|
| 12 |
+
|
| 13 |
+
_segmenter_cache = {"model": None, "processor": None, "loaded": False, "load_time": 0.0, "device": None}
|
| 14 |
+
_env_logged = False
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _log_env_once():
|
| 18 |
+
"""Log library and GPU versions once for debugging HF Space mismatches."""
|
| 19 |
+
global _env_logged
|
| 20 |
+
if _env_logged:
|
| 21 |
+
return
|
| 22 |
+
_env_logged = True
|
| 23 |
+
try:
|
| 24 |
+
import importlib.metadata as _im
|
| 25 |
+
|
| 26 |
+
def _ver(pkg: str) -> str:
|
| 27 |
+
try:
|
| 28 |
+
return _im.version(pkg)
|
| 29 |
+
except Exception:
|
| 30 |
+
return "unknown"
|
| 31 |
+
|
| 32 |
+
print(f"[ENV] torch={torch.__version__} cuda={torch.version.cuda} cudnn={torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'none'}")
|
| 33 |
+
print(f"[ENV] transformers={_ver('transformers')} recitations_segmenter={_ver('recitations_segmenter')}")
|
| 34 |
+
if torch.cuda.is_available():
|
| 35 |
+
print(f"[ENV] GPU={torch.cuda.get_device_name(0)}")
|
| 36 |
+
except Exception as e:
|
| 37 |
+
print(f"[ENV] Failed to log env: {e}")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
_TORCH_DTYPE = torch.float16 if DTYPE == "float16" else torch.float32


def _get_device_and_dtype():
    """Pick the initial placement device, paired with the configured dtype.

    HF Spaces / ZeroGPU environments always start on CPU; the model is
    moved to CUDA later, inside a GPU lease.
    """
    use_cuda = torch.cuda.is_available() and not (IS_HF_SPACE or ZERO_GPU_AVAILABLE)
    return torch.device("cuda" if use_cuda else "cpu"), _TORCH_DTYPE
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def ensure_models_on_gpu(asr_model_name=None):
    """Move cached models onto CUDA from inside an active GPU lease.

    Call this INSIDE a GPU-decorated function, after the ZeroGPU lease is
    acquired. No-ops (returning 0.0) when the user forced CPU mode, quota
    is exhausted, or CUDA is unavailable. Idempotent: a model already on
    CUDA is not moved again.

    Args:
        asr_model_name: If provided, move only this phoneme-ASR model to
            GPU as well. None skips ASR movement (e.g. a VAD-only lease).

    Returns:
        float: Seconds spent relocating models.
    """
    import time
    from ..alignment.phoneme_asr import move_phoneme_asr_to_gpu

    if is_user_forced_cpu() or is_quota_exhausted() or not torch.cuda.is_available():
        return 0.0

    started = time.time()
    cuda = torch.device("cuda")

    # Relocate the VAD segmenter if it is loaded and not already on CUDA.
    segmenter = _segmenter_cache["model"] if _segmenter_cache["loaded"] else None
    if segmenter is not None and next(segmenter.parameters()).device.type != "cuda":
        print("[GPU] Moving segmenter to CUDA...")
        segmenter.to(cuda, dtype=_TORCH_DTYPE)
        _segmenter_cache["model"] = segmenter
        _segmenter_cache["device"] = "cuda"
        print("[GPU] Segmenter on CUDA")

    # Only the requested phoneme ASR model follows.
    if asr_model_name is not None:
        move_phoneme_asr_to_gpu(asr_model_name)

    return time.time() - started
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def ensure_models_on_cpu():
    """Return every cached model to CPU so fallback inference can proceed.

    Called when a GPU lease fails or the ZeroGPU quota is exhausted.
    Idempotent: models already on CPU are left untouched.
    """
    from ..alignment.phoneme_asr import move_phoneme_asr_to_cpu

    cpu = torch.device("cpu")

    # Segmenter first: only move it when it actually sits on another device.
    if _segmenter_cache["loaded"] and _segmenter_cache["model"] is not None:
        segmenter = _segmenter_cache["model"]
        if next(segmenter.parameters()).device.type != "cpu":
            print("[CPU] Moving segmenter to CPU...")
            segmenter.to(cpu, dtype=_TORCH_DTYPE)
            _segmenter_cache["model"] = segmenter
            _segmenter_cache["device"] = "cpu"
            print("[CPU] Segmenter on CPU")

    # Then the phoneme ASR model(s).
    move_phoneme_asr_to_cpu()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def load_segmenter():
    """Load (once) and cache the VAD segmenter on its initial device.

    Subsequent calls return the cached objects with a 0.0 load time. Use
    ensure_models_on_gpu() inside GPU-decorated functions to move the
    cached model onto CUDA.

    Returns:
        (model, processor, load_time); (None, None, 0.0) on failure.
    """
    if _segmenter_cache["loaded"]:
        return _segmenter_cache["model"], _segmenter_cache["processor"], 0.0

    import time

    t0 = time.time()
    try:
        from transformers import AutoModelForAudioFrameClassification, AutoFeatureExtractor

        print(f"Loading segmenter: {SEGMENTER_MODEL}")
        device, dtype = _get_device_and_dtype()

        model = AutoModelForAudioFrameClassification.from_pretrained(SEGMENTER_MODEL)
        model.to(device, dtype=dtype)
        model.eval()
        # torch.compile only outside HF Space / ZeroGPU environments.
        if TORCH_COMPILE and not (IS_HF_SPACE or ZERO_GPU_AVAILABLE):
            model = torch.compile(model, mode="reduce-overhead")

        processor = AutoFeatureExtractor.from_pretrained(SEGMENTER_MODEL)
    except Exception as e:
        print(f"Failed to load segmenter: {e}")
        return None, None, 0.0

    elapsed = time.time() - t0
    _segmenter_cache["model"] = model
    _segmenter_cache["processor"] = processor
    _segmenter_cache["loaded"] = True
    _segmenter_cache["load_time"] = elapsed
    _segmenter_cache["device"] = device.type

    print(f"Segmenter loaded on {device} in {elapsed:.2f}s")
    return model, processor, elapsed
|
src/segmenter/vad.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""VAD inference utilities."""
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from .segmenter_aoti import is_aoti_applied
|
| 9 |
+
from .segmenter_model import load_segmenter, _log_env_once
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def detect_speech_segments(
    audio: np.ndarray,
    sample_rate: int,
    min_silence_ms: int,
    min_speech_ms: int,
    pad_ms: int
) -> tuple[List[Tuple[float, float]], dict, object, object]:
    """
    Detect speech segments in audio using VAD.

    Args:
        audio: Audio waveform (mono, float32)
        sample_rate: Sample rate of audio
        min_silence_ms: Minimum silence duration to split segments
        min_speech_ms: Minimum speech duration for a valid segment
        pad_ms: Padding around speech segments

    Returns:
        Tuple of (intervals, profiling_dict, raw_speech_intervals, raw_is_complete) where:
        - intervals: List of (start_time, end_time) tuples in seconds
        - profiling_dict: {"model_load_time": float, "inference_time": float}
        - raw_speech_intervals: Raw VAD intervals before cleaning (for resegmentation),
          or None when VAD could not run
        - raw_is_complete: Raw VAD completeness flags (for resegmentation), or None

    Raises:
        Exception: Any VAD inference error is re-raised so gpu_with_fallback
        can retry on CPU.
    """
    import time

    model, processor, model_load_time = load_segmenter()
    if model is None:
        # Fallback: treat whole audio as one segment
        return [(0, len(audio) / sample_rate)], {"model_load_time": 0.0, "inference_time": 0.0}, None, None

    inference_start = time.time()
    _log_env_once()

    try:
        from recitations_segmenter import segment_recitations, clean_speech_intervals

        audio_tensor = torch.from_numpy(audio).float()

        # Run inference on whatever device/dtype the cached model currently uses.
        device = next(model.parameters()).device
        dtype = next(model.parameters()).dtype

        # Log AoTI status
        if is_aoti_applied():
            print("[VAD] Using AOTInductor-compiled model")

        # Run segmentation
        outputs = segment_recitations(
            [audio_tensor], model, processor,
            device=device, dtype=dtype, batch_size=1,
        )

        if not outputs:
            inference_time = time.time() - inference_start
            return [(0, len(audio) / sample_rate)], {"model_load_time": model_load_time, "inference_time": inference_time}, None, None

        # Clean speech intervals with user parameters
        clean_out = clean_speech_intervals(
            outputs[0].speech_intervals,
            outputs[0].is_complete,
            min_silence_duration_ms=min_silence_ms,
            min_speech_duration_ms=min_speech_ms,
            pad_duration_ms=pad_ms,
            return_seconds=True,
        )

        inference_time = time.time() - inference_start
        intervals = clean_out.clean_speech_intervals.tolist()

        raw_count = len(outputs[0].speech_intervals)
        final_count = len(intervals)
        removed = raw_count - final_count
        print(f"[VAD] Raw model intervals: {raw_count}, after cleaning: {final_count} "
              f"({removed} removed by silence merge + min_speech={min_speech_ms}ms filter)")

        raw_speech_intervals = outputs[0].speech_intervals
        raw_is_complete = outputs[0].is_complete

        return [(start, end) for start, end in intervals], {"model_load_time": model_load_time, "inference_time": inference_time}, raw_speech_intervals, raw_is_complete

    except Exception as e:
        print(f"VAD error: {e}")
        import traceback
        traceback.print_exc()
        # Let gpu_with_fallback handle retries on CPU
        raise
|
src/zero_gpu.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities for integrating Hugging Face Spaces ZeroGPU without breaking
|
| 3 |
+
local or non-ZeroGPU environments.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
from typing import Callable, TypeVar
|
| 8 |
+
from functools import wraps
|
| 9 |
+
|
| 10 |
+
T = TypeVar("T", bound=Callable)

# Default values in case the spaces package is unavailable (e.g., local runs).
ZERO_GPU_AVAILABLE = False

# Track whether we've fallen back to CPU due to quota exhaustion
_gpu_quota_exhausted = False
_quota_reset_time = None  # e.g. "13:53:59"
_user_forced_cpu = False

try:
    import spaces  # type: ignore

    gpu_decorator = spaces.GPU  # pragma: no cover
    ZERO_GPU_AVAILABLE = True
except Exception:
    def gpu_decorator(*args, **kwargs):
        """Stand-in for ``spaces.GPU`` when the package is absent.

        Works as both bare ``@gpu_decorator`` and called
        ``@gpu_decorator(duration=...)``; either way the wrapped function
        is returned unchanged.
        """
        bare_usage = bool(args) and callable(args[0]) and not kwargs
        if bare_usage:
            return args[0]

        def identity(func: T) -> T:
            return func

        return identity
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def is_quota_exhausted() -> bool:
    """Report whether the ZeroGPU quota has already run out this session."""
    return _gpu_quota_exhausted
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def is_user_forced_cpu() -> bool:
    """Report whether the user explicitly switched the app to CPU mode."""
    return _user_forced_cpu
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_quota_reset_time() -> str | None:
    """Quota reset time parsed from the last quota error (e.g. '13:53:59'), else None."""
    return _quota_reset_time
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def reset_quota_flag():
    """Clear quota/CPU-override state so GPU execution can be attempted again."""
    global _gpu_quota_exhausted, _quota_reset_time, _user_forced_cpu
    _gpu_quota_exhausted, _quota_reset_time, _user_forced_cpu = False, None, False
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def force_cpu_mode():
    """Route all GPU-decorated work to the CPU and relocate models now."""
    global _user_forced_cpu
    _user_forced_cpu = True
    _move_models_to_cpu()
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _move_models_to_cpu():
|
| 72 |
+
"""Move all models back to CPU for fallback inference."""
|
| 73 |
+
try:
|
| 74 |
+
from .segmenter.segmenter_model import ensure_models_on_cpu
|
| 75 |
+
ensure_models_on_cpu()
|
| 76 |
+
except Exception as e:
|
| 77 |
+
print(f"[CPU] Failed to move models to CPU: {e}")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def gpu_with_fallback(duration=60):
    """
    Decorator that wraps a GPU function with automatic CPU fallback.

    If ZeroGPU quota is exceeded, the quota flag is set, models are moved
    back to CPU, and the function is re-run on CPU. Any other exception
    propagates to the caller (timeout-like errors are additionally logged)
    so failures are never silently converted into a None return.

    The decorated function should call ensure_models_on_gpu() internally,
    which checks is_quota_exhausted() to decide whether to move to CUDA.

    Usage:
        @gpu_with_fallback(duration=60)
        def my_gpu_func(data):
            ensure_models_on_gpu()  # Moves to CUDA if quota not exhausted
            # ... inference using model's current device ...
    """
    def decorator(func: T) -> T:
        # GPU-wrapped variant (identity outside ZeroGPU Spaces).
        if ZERO_GPU_AVAILABLE:
            gpu_func = gpu_decorator(duration=duration)(func)
        else:
            gpu_func = func

        @wraps(func)
        def wrapper(*args, **kwargs):
            global _gpu_quota_exhausted, _quota_reset_time

            # If user explicitly chose CPU mode, skip GPU entirely
            if _user_forced_cpu:
                print("[CPU] User selected CPU mode")
                return func(*args, **kwargs)

            # If quota already exhausted, go straight to CPU
            if _gpu_quota_exhausted:
                print("[GPU] Quota exhausted, using CPU fallback")
                _move_models_to_cpu()
                return func(*args, **kwargs)

            # Try GPU first
            try:
                return gpu_func(*args, **kwargs)
            except Exception as e:
                # ZeroGPU raises gradio.Error with title="ZeroGPU quota exceeded"
                is_quota_error = getattr(e, 'title', '') == "ZeroGPU quota exceeded"
                if not is_quota_error:
                    is_quota_error = 'quota' in str(e).lower()

                if is_quota_error:
                    print(f"[GPU] Quota exceeded, falling back to CPU: {e}")
                    _gpu_quota_exhausted = True
                    # Parse reset time from message like "Try again in 13:53:59"
                    match = re.search(r'Try again in (\d+:\d{2}:\d{2})', str(e))
                    if match:
                        _quota_reset_time = match.group(1)
                    _move_models_to_cpu()
                    return func(*args, **kwargs)

                # Non-quota failure: log timeout-like errors for diagnosis,
                # then ALWAYS re-raise — never swallow the exception.
                err_lower = str(e).lower()
                if 'timeout' in err_lower or 'duration' in err_lower or 'time limit' in err_lower:
                    print(f"[GPU] Timeout error in {func.__name__}: {e}")
                raise

        return wrapper
    return decorator
|
utils/__init__.py
ADDED
|
File without changes
|
utils/usage_logger.py
ADDED
|
@@ -0,0 +1,593 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Usage logger that pushes alignment runs to a HF Dataset repo.
|
| 3 |
+
|
| 4 |
+
Uses a ParquetScheduler (subclass of CommitScheduler) to buffer rows in memory
|
| 5 |
+
and periodically write+upload parquet files with embedded audio to the Hub.
|
| 6 |
+
Error logs use a separate CommitScheduler with JSONL files.
|
| 7 |
+
Falls back to local-only logging if schedulers can't initialize.
|
| 8 |
+
|
| 9 |
+
Scheduler creation is deferred to first use so that background threads don't
|
| 10 |
+
interfere with ZeroGPU's startup function scan.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import hashlib
|
| 14 |
+
import io
|
| 15 |
+
import json
|
| 16 |
+
import threading
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 20 |
+
from uuid import uuid4
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
# =========================================================================
|
| 25 |
+
# Directory setup
|
| 26 |
+
# =========================================================================
|
| 27 |
+
|
| 28 |
+
LOG_DIR = Path("usage_logs")
|
| 29 |
+
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
| 30 |
+
|
| 31 |
+
ERROR_DIR = LOG_DIR / "errors"
|
| 32 |
+
ERROR_DIR.mkdir(parents=True, exist_ok=True)
|
| 33 |
+
|
| 34 |
+
ERROR_LOG_PATH = ERROR_DIR / f"error_log-{uuid4()}.jsonl"
|
| 35 |
+
|
| 36 |
+
# =========================================================================
|
| 37 |
+
# ParquetScheduler class definition (no instances created at import time)
|
| 38 |
+
# =========================================================================
|
| 39 |
+
|
| 40 |
+
_HAS_DEPS = False
|
| 41 |
+
try:
|
| 42 |
+
import pyarrow as pa
|
| 43 |
+
import pyarrow.parquet as pq
|
| 44 |
+
from huggingface_hub import CommitScheduler
|
| 45 |
+
from config import USAGE_LOG_DATASET_REPO, USAGE_LOG_PUSH_INTERVAL_MINUTES
|
| 46 |
+
|
| 47 |
+
_HAS_DEPS = True
|
| 48 |
+
except Exception:
|
| 49 |
+
pass
|
| 50 |
+
|
| 51 |
+
# HF features schema (column order matters — audio first for HF viewer widget)
|
| 52 |
+
_ALIGNER_SCHEMA: Dict[str, Dict[str, str]] = {
|
| 53 |
+
# Identity
|
| 54 |
+
"audio": {"_type": "Audio"},
|
| 55 |
+
"audio_id": {"_type": "Value", "dtype": "string"},
|
| 56 |
+
"timestamp": {"_type": "Value", "dtype": "string"},
|
| 57 |
+
"user_id": {"_type": "Value", "dtype": "string"},
|
| 58 |
+
# Input metadata
|
| 59 |
+
"audio_duration_s": {"_type": "Value", "dtype": "float64"},
|
| 60 |
+
"num_segments": {"_type": "Value", "dtype": "int32"},
|
| 61 |
+
"surah": {"_type": "Value", "dtype": "int32"},
|
| 62 |
+
# Segmentation settings
|
| 63 |
+
"min_silence_ms": {"_type": "Value", "dtype": "int32"},
|
| 64 |
+
"min_speech_ms": {"_type": "Value", "dtype": "int32"},
|
| 65 |
+
"pad_ms": {"_type": "Value", "dtype": "int32"},
|
| 66 |
+
"asr_model": {"_type": "Value", "dtype": "string"},
|
| 67 |
+
"device": {"_type": "Value", "dtype": "string"},
|
| 68 |
+
# Profiling
|
| 69 |
+
"total_time": {"_type": "Value", "dtype": "float64"},
|
| 70 |
+
"vad_queue_time": {"_type": "Value", "dtype": "float64"},
|
| 71 |
+
"vad_gpu_time": {"_type": "Value", "dtype": "float64"},
|
| 72 |
+
"asr_gpu_time": {"_type": "Value", "dtype": "float64"},
|
| 73 |
+
"dp_total_time": {"_type": "Value", "dtype": "float64"},
|
| 74 |
+
# Quality & retry
|
| 75 |
+
"segments_passed": {"_type": "Value", "dtype": "int32"},
|
| 76 |
+
"segments_failed": {"_type": "Value", "dtype": "int32"},
|
| 77 |
+
"mean_confidence": {"_type": "Value", "dtype": "float64"},
|
| 78 |
+
"tier1_retries": {"_type": "Value", "dtype": "int32"},
|
| 79 |
+
"tier1_passed": {"_type": "Value", "dtype": "int32"},
|
| 80 |
+
"tier2_retries": {"_type": "Value", "dtype": "int32"},
|
| 81 |
+
"tier2_passed": {"_type": "Value", "dtype": "int32"},
|
| 82 |
+
"reanchors": {"_type": "Value", "dtype": "int32"},
|
| 83 |
+
"special_merges": {"_type": "Value", "dtype": "int32"},
|
| 84 |
+
# Reciter stats
|
| 85 |
+
"words_per_minute": {"_type": "Value", "dtype": "float64"},
|
| 86 |
+
"phonemes_per_second": {"_type": "Value", "dtype": "float64"},
|
| 87 |
+
"avg_segment_duration": {"_type": "Value", "dtype": "float64"},
|
| 88 |
+
"std_segment_duration": {"_type": "Value", "dtype": "float64"},
|
| 89 |
+
"avg_pause_duration": {"_type": "Value", "dtype": "float64"},
|
| 90 |
+
"std_pause_duration": {"_type": "Value", "dtype": "float64"},
|
| 91 |
+
# Session flags
|
| 92 |
+
"resegmented": {"_type": "Value", "dtype": "bool"},
|
| 93 |
+
"retranscribed": {"_type": "Value", "dtype": "bool"},
|
| 94 |
+
# Segments, timestamps & error
|
| 95 |
+
"segments": {"_type": "Value", "dtype": "string"},
|
| 96 |
+
"word_timestamps": {"_type": "Value", "dtype": "string"},
|
| 97 |
+
"char_timestamps": {"_type": "Value", "dtype": "string"},
|
| 98 |
+
"error": {"_type": "Value", "dtype": "string"},
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
if _HAS_DEPS:
    class ParquetScheduler(CommitScheduler):
        """Buffers rows in memory and uploads a parquet file each interval.

        Audio values are stored as file paths in the row dict; on push they are
        read as bytes and embedded in the parquet using the HF Audio struct.
        """

        def __init__(
            self,
            *,
            repo_id: str,
            schema: Optional[Dict[str, Dict[str, str]]] = None,
            every: Union[int, float] = 5,
            path_in_repo: Optional[str] = "data",
            repo_type: Optional[str] = "dataset",
            private: bool = False,
        ) -> None:
            super().__init__(
                repo_id=repo_id,
                folder_path="dummy",  # not used — we upload directly
                every=every,
                path_in_repo=path_in_repo,
                repo_type=repo_type,
                private=private,
            )
            # In-memory row buffer, guarded by the lock provided by CommitScheduler.
            self._rows: List[Dict[str, Any]] = []
            # Optional pre-declared HF features schema; columns not present here
            # are inferred per-row in push_to_hub().
            self._schema = schema

        def append(self, row: Dict[str, Any]) -> None:
            # Thread-safe enqueue; the buffer is drained by push_to_hub().
            with self.lock:
                self._rows.append(row)

        def push_to_hub(self) -> None:
            # CommitScheduler invokes this on its schedule (see module docstring).
            # Swap the buffer out under the lock so producers are not blocked
            # during the slow parquet build + upload below.
            with self.lock:
                rows = self._rows
                self._rows = []
            if not rows:
                return

            print(f"[USAGE_LOG] Pushing {len(rows)} alignment row(s) to Hub.")

            # Copy the declared schema so per-push inference never mutates it.
            schema: Dict[str, Dict] = dict(self._schema) if self._schema else {}
            paths_to_cleanup: List[Path] = []

            # Infer schema for unseen columns and inline media files as bytes.
            for row in rows:
                for key, value in row.items():
                    if key not in schema:
                        schema[key] = _infer_schema(key, value)

                    if value is not None and schema[key].get("_type") in ("Image", "Audio"):
                        file_path = Path(value)
                        if file_path.is_file():
                            # Embed the file content; "path" keeps only the name.
                            row[key] = {
                                "path": file_path.name,
                                "bytes": file_path.read_bytes(),
                            }
                            paths_to_cleanup.append(file_path)
                        else:
                            # Media file vanished before push; store NULL instead.
                            row[key] = None

            # Pad rows so every row carries every schema column.
            for row in rows:
                for feature in schema:
                    if feature not in row:
                        row[feature] = None

            table = pa.Table.from_pylist(rows)
            # Embed HF "features" metadata so the dataset viewer can render
            # Audio/Image columns as widgets.
            table = table.replace_schema_metadata(
                {"huggingface": json.dumps({"info": {"features": schema}})}
            )

            archive = None
            try:
                import tempfile
                # delete=False so the file survives close() until we unlink it.
                archive = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
                pq.write_table(table, archive.name)
                self.api.upload_file(
                    repo_id=self.repo_id,
                    repo_type=self.repo_type,
                    revision=self.revision,
                    path_in_repo=f"{self.path_in_repo}/{uuid4()}.parquet",
                    path_or_fileobj=archive.name,
                )
                print("[USAGE_LOG] Parquet commit completed.")
            except Exception as e:
                # Upload failures are logged; these rows are not re-queued.
                print(f"[USAGE_LOG] Failed to upload parquet: {e}")
            finally:
                if archive:
                    archive.close()
                    Path(archive.name).unlink(missing_ok=True)

            # Remove local media files whose bytes were embedded above.
            for path in paths_to_cleanup:
                path.unlink(missing_ok=True)
|
| 194 |
+
|
| 195 |
+
def _infer_schema(key: str, value: Any) -> Dict[str, str]:
|
| 196 |
+
if "image" in key:
|
| 197 |
+
return {"_type": "Image"}
|
| 198 |
+
if "audio" in key:
|
| 199 |
+
return {"_type": "Audio"}
|
| 200 |
+
if isinstance(value, bool):
|
| 201 |
+
return {"_type": "Value", "dtype": "bool"}
|
| 202 |
+
if isinstance(value, int):
|
| 203 |
+
return {"_type": "Value", "dtype": "int64"}
|
| 204 |
+
if isinstance(value, float):
|
| 205 |
+
return {"_type": "Value", "dtype": "float64"}
|
| 206 |
+
if isinstance(value, bytes):
|
| 207 |
+
return {"_type": "Value", "dtype": "binary"}
|
| 208 |
+
return {"_type": "Value", "dtype": "string"}
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# =========================================================================
|
| 212 |
+
# Lazy scheduler initialization (deferred to first use)
|
| 213 |
+
# =========================================================================
|
| 214 |
+
|
| 215 |
+
_aligner_scheduler = None
|
| 216 |
+
_error_scheduler = None
|
| 217 |
+
_schedulers_initialized = False
|
| 218 |
+
_init_lock = threading.Lock()
|
| 219 |
+
_fallback_lock = threading.Lock()
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def _ensure_schedulers() -> None:
    """Create the Hub schedulers on first use (thread-safe, runs at most once).

    Uses double-checked locking so the common already-initialized path is
    lock-free. If dependencies are missing or construction fails, both
    schedulers remain None and the module stays in local-only mode.
    """
    global _aligner_scheduler, _error_scheduler, _schedulers_initialized
    if _schedulers_initialized:
        return
    with _init_lock:
        if _schedulers_initialized:
            return
        # Mark as done up-front: a failed init is not retried.
        _schedulers_initialized = True
        if not _HAS_DEPS:
            print("[USAGE_LOG] Dependencies missing (local-only mode).")
            return
        try:
            _aligner_scheduler = ParquetScheduler(
                repo_id=USAGE_LOG_DATASET_REPO,
                repo_type="dataset",
                path_in_repo="data",
                schema=_ALIGNER_SCHEMA,
                every=USAGE_LOG_PUSH_INTERVAL_MINUTES,
                private=True,
            )
            _error_scheduler = CommitScheduler(
                repo_id=USAGE_LOG_DATASET_REPO,
                repo_type="dataset",
                folder_path=ERROR_DIR,
                path_in_repo="data/errors",
                every=USAGE_LOG_PUSH_INTERVAL_MINUTES,
                private=True,
            )
        except Exception as e:
            print(f"[USAGE_LOG] Scheduler init failed (local-only mode): {e}")
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# =========================================================================
|
| 255 |
+
# Helpers
|
| 256 |
+
# =========================================================================
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _get_error_lock():
    """Return the lock guarding error-log writes.

    Prefers the error scheduler's own lock; falls back to the module-level
    lock when running in local-only mode.
    """
    _ensure_schedulers()
    scheduler = _error_scheduler
    return scheduler.lock if scheduler is not None else _fallback_lock
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def get_user_id(request) -> str:
    """SHA-256 hash (12-char) of IP+UA from a gr.Request, or 'unknown'."""
    try:
        hdrs = request.headers
        # Prefer the first hop of X-Forwarded-For, then X-Real-IP.
        forwarded = hdrs.get("x-forwarded-for", "").split(",")[0].strip()
        client_ip = forwarded or hdrs.get("x-real-ip", "") or ""
        agent = hdrs.get("user-agent", "")
        digest = hashlib.sha256(f"{client_ip}|{agent}".encode()).hexdigest()
        return digest[:12]
    except Exception:
        # Anything missing/malformed (including request=None) → anonymous.
        return "unknown"
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _compute_audio_id(audio: np.ndarray, ts: datetime) -> str:
|
| 282 |
+
"""Content hash (16-char) + compact timestamp."""
|
| 283 |
+
audio_hash = hashlib.sha256(audio.tobytes()).hexdigest()[:16]
|
| 284 |
+
return f"{audio_hash}:{ts.strftime('%Y%m%dT%H%M%S')}"
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def _encode_audio_flac(audio: np.ndarray, sample_rate: int, audio_id: str) -> str:
    """Encode audio to a temp FLAC file; returns the file path."""
    # Imported lazily so the module loads without soundfile installed.
    import soundfile as sf

    out_dir = LOG_DIR / "tmp_audio"
    out_dir.mkdir(parents=True, exist_ok=True)
    # ':' in the id comes from the timestamp separator; not filename-safe.
    safe_id = audio_id.replace(":", "-")
    out_path = out_dir / f"{safe_id}.flac"
    sf.write(str(out_path), audio, sample_rate, format="FLAC")
    return str(out_path)
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _sync_row_to_scheduler(row: Dict[str, Any]) -> None:
    """Ensure *row* is represented in the scheduler buffer.

    gr.State may deserialize the dict (creating a copy), and push_to_hub
    detaches rows from the buffer. This helper finds the original row by
    audio_id and updates it, or re-appends if it was already pushed.
    """
    scheduler = _aligner_scheduler
    if scheduler is None:
        return
    audio_id = row.get("audio_id")
    if not audio_id:
        return
    with scheduler.lock:
        match = next(
            (r for r in scheduler._rows if r.get("audio_id") == audio_id),
            None,
        )
        if match is not None:
            # In-place update handles the gr.State deserialized-copy case.
            match.update(row)
        else:
            # Row was already pushed — re-append (audio file may be gone, that's ok)
            scheduler._rows.append(row)
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
# =========================================================================
|
| 322 |
+
# Public logging API
|
| 323 |
+
# =========================================================================
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def log_alignment(
    *,
    audio: np.ndarray,
    sample_rate: int,
    request=None,
    # Input metadata
    audio_duration_s: float,
    num_segments: int,
    surah: int,
    # Settings
    min_silence_ms: int,
    min_speech_ms: int,
    pad_ms: int,
    asr_model: str,
    device: str,
    # Profiling
    total_time: float,
    vad_queue_time: float,
    vad_gpu_time: float,
    asr_gpu_time: float,
    dp_total_time: float,
    # Quality & retry
    segments_passed: int,
    segments_failed: int,
    mean_confidence: float,
    tier1_retries: int,
    tier1_passed: int,
    tier2_retries: int,
    tier2_passed: int,
    reanchors: int,
    special_merges: int,
    # Reciter stats
    words_per_minute: float,
    phonemes_per_second: float,
    avg_segment_duration: float,
    std_segment_duration: float,
    avg_pause_duration: float,
    std_pause_duration: float,
    # Segments
    log_segments: List[dict],
) -> Optional[Dict[str, Any]]:
    """Log an alignment run. Returns the row dict reference for in-place mutation.

    The returned dict can be stored in gr.State and mutated on
    resegment/retranscribe/timestamps before the scheduler pushes.

    Args:
        audio: Waveform samples; hashed (with the timestamp) into audio_id
            and encoded to a temp FLAC file for upload.
        sample_rate: Sample rate used when encoding the FLAC copy.
        request: Optional gr.Request used to derive an anonymous user_id.
        log_segments: Per-segment dicts for this run, stored inside the
            JSON-encoded "segments" field.

    Returns:
        The buffered row dict on success, or None on failure — logging is
        best-effort and never raises to the caller.
    """
    _ensure_schedulers()
    try:
        ts = datetime.now()
        audio_id = _compute_audio_id(audio, ts)
        user_id = get_user_id(request) if request else "unknown"

        # Build the segments JSON: array of run objects (later runs from
        # resegment/retranscribe are appended by update_alignment_row).
        segments_runs = [{
            "min_silence_ms": int(min_silence_ms),
            "min_speech_ms": int(min_speech_ms),
            "pad_ms": int(pad_ms),
            "asr_model": asr_model,
            "segments": log_segments,
        }]

        # Encode audio to FLAC temp file (scheduler embeds bytes on push)
        audio_path = _encode_audio_flac(audio, sample_rate, audio_id)

        row: Dict[str, Any] = {
            "audio": audio_path,
            "audio_id": audio_id,
            "timestamp": ts.isoformat(timespec="seconds"),
            "user_id": user_id,
            # Input metadata
            "audio_duration_s": audio_duration_s,
            "num_segments": num_segments,
            "surah": surah,
            # Settings (latest)
            "min_silence_ms": int(min_silence_ms),
            "min_speech_ms": int(min_speech_ms),
            "pad_ms": int(pad_ms),
            "asr_model": asr_model,
            "device": device,
            # Profiling
            "total_time": total_time,
            "vad_queue_time": vad_queue_time,
            "vad_gpu_time": vad_gpu_time,
            "asr_gpu_time": asr_gpu_time,
            "dp_total_time": dp_total_time,
            # Quality & retry
            "segments_passed": segments_passed,
            "segments_failed": segments_failed,
            "mean_confidence": mean_confidence,
            "tier1_retries": tier1_retries,
            "tier1_passed": tier1_passed,
            "tier2_retries": tier2_retries,
            "tier2_passed": tier2_passed,
            "reanchors": reanchors,
            "special_merges": special_merges,
            # Reciter stats
            "words_per_minute": words_per_minute,
            "phonemes_per_second": phonemes_per_second,
            "avg_segment_duration": avg_segment_duration,
            "std_segment_duration": std_segment_duration,
            "avg_pause_duration": avg_pause_duration,
            "std_pause_duration": std_pause_duration,
            # Session flags (flipped later by update_alignment_row)
            "resegmented": False,
            "retranscribed": False,
            # Segments & error
            "segments": json.dumps(segments_runs),
            "word_timestamps": None,
            "char_timestamps": None,
            "error": None,
        }

        # Prefer the Hub scheduler; fall back to a local JSONL when in
        # local-only mode (missing deps or failed scheduler init).
        if _aligner_scheduler is not None:
            _aligner_scheduler.append(row)
        else:
            _write_fallback(row)

        return row

    except Exception as e:
        print(f"[USAGE_LOG] Failed to log alignment: {e}")
        return None
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def update_alignment_row(
    row: Dict[str, Any],
    *,
    action: str,
    # Input metadata (overwritten)
    audio_duration_s: float,
    num_segments: int,
    surah: int,
    # Settings for this run
    min_silence_ms: int,
    min_speech_ms: int,
    pad_ms: int,
    asr_model: str,
    device: str,
    # Profiling
    total_time: float,
    vad_queue_time: float,
    vad_gpu_time: float,
    asr_gpu_time: float,
    dp_total_time: float,
    # Quality & retry
    segments_passed: int,
    segments_failed: int,
    mean_confidence: float,
    tier1_retries: int,
    tier1_passed: int,
    tier2_retries: int,
    tier2_passed: int,
    reanchors: int,
    special_merges: int,
    # Reciter stats
    words_per_minute: float,
    phonemes_per_second: float,
    avg_segment_duration: float,
    std_segment_duration: float,
    avg_pause_duration: float,
    std_pause_duration: float,
    # Segments
    log_segments: List[dict],
) -> None:
    """Mutate an existing row dict in-place and ensure it's in the scheduler buffer.

    Run-level fields are overwritten with the latest run's values, while
    the JSON "segments" field accumulates one entry per run. After
    mutation, syncs the row into the scheduler's buffer so the update
    is captured even if gr.State returned a deserialized copy or if the
    original row was already pushed to Hub. Best-effort: never raises.

    Args:
        row: The dict returned by log_alignment(), stored in gr.State.
        action: "resegment" or "retranscribe".
    """
    try:
        # Overwrite run-level fields
        row["audio_duration_s"] = audio_duration_s
        row["num_segments"] = num_segments
        row["surah"] = surah
        row["min_silence_ms"] = int(min_silence_ms)
        row["min_speech_ms"] = int(min_speech_ms)
        row["pad_ms"] = int(pad_ms)
        row["asr_model"] = asr_model
        row["device"] = device
        row["total_time"] = total_time
        row["vad_queue_time"] = vad_queue_time
        row["vad_gpu_time"] = vad_gpu_time
        row["asr_gpu_time"] = asr_gpu_time
        row["dp_total_time"] = dp_total_time
        row["segments_passed"] = segments_passed
        row["segments_failed"] = segments_failed
        row["mean_confidence"] = mean_confidence
        row["tier1_retries"] = tier1_retries
        row["tier1_passed"] = tier1_passed
        row["tier2_retries"] = tier2_retries
        row["tier2_passed"] = tier2_passed
        row["reanchors"] = reanchors
        row["special_merges"] = special_merges
        row["words_per_minute"] = words_per_minute
        row["phonemes_per_second"] = phonemes_per_second
        row["avg_segment_duration"] = avg_segment_duration
        row["std_segment_duration"] = std_segment_duration
        row["avg_pause_duration"] = avg_pause_duration
        row["std_pause_duration"] = std_pause_duration

        # Set session flag (flags are sticky across multiple runs)
        if action == "resegment":
            row["resegmented"] = True
        elif action == "retranscribe":
            row["retranscribed"] = True

        # Append new run to segments array
        segments_runs = json.loads(row.get("segments") or "[]")
        segments_runs.append({
            "min_silence_ms": int(min_silence_ms),
            "min_speech_ms": int(min_speech_ms),
            "pad_ms": int(pad_ms),
            "asr_model": asr_model,
            "segments": log_segments,
        })
        row["segments"] = json.dumps(segments_runs)

        # Sync with scheduler buffer — the row from gr.State may be a
        # deserialized copy, or the original may have already been pushed.
        _sync_row_to_scheduler(row)

    except Exception as e:
        print(f"[USAGE_LOG] Failed to update alignment row: {e}")
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
def update_word_timestamps(
    row: Dict[str, Any],
    word_timestamps_json: str,
    char_timestamps_json: Optional[str] = None,
) -> None:
    """Set word and char timestamps fields on an existing row and sync to scheduler."""
    try:
        row["word_timestamps"] = word_timestamps_json
        # Leave char timestamps untouched when not supplied.
        if char_timestamps_json is not None:
            row["char_timestamps"] = char_timestamps_json
        # Propagate into the scheduler buffer (handles gr.State copies).
        _sync_row_to_scheduler(row)
    except Exception as e:
        # Best-effort: logging failures must never break the UI flow.
        print(f"[USAGE_LOG] Failed to update word timestamps: {e}")
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def log_error(user_id: str, error_message: str) -> None:
    """Log a pipeline error to JSONL."""
    try:
        entry = {
            "timestamp": datetime.now().isoformat(timespec="seconds"),
            "user_id": user_id,
            "error_message": error_message or "",
        }
        with _get_error_lock():
            with ERROR_LOG_PATH.open("a") as f:
                f.write(json.dumps(entry) + "\n")
    except Exception:
        # Error logging is best-effort; never raise from here.
        pass
|
| 584 |
+
|
| 585 |
+
|
| 586 |
+
def _write_fallback(row: Dict[str, Any]) -> None:
    """Local-only fallback: write JSONL (without audio)."""
    fallback_path = LOG_DIR / "alignments_fallback.jsonl"
    with _fallback_lock:
        with fallback_path.open("a") as f:
            # Drop the audio path — fallback rows carry metadata only.
            slim_row = dict(row)
            slim_row.pop("audio", None)
            f.write(json.dumps(slim_row) + "\n")
|