Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -38,6 +38,12 @@ except Exception as e:
|
|
| 38 |
print(f"Error loading pyannote pipeline: {type(e).__name__}: {e}. Diarization will be skipped.")
|
| 39 |
diarization_pipeline = None
|
| 40 |
global_diarizer = diarization_pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
model_name = "medium"
|
| 42 |
class TimelineItem(BaseModel):
|
| 43 |
start: float
|
|
@@ -191,26 +197,44 @@ def analyze_audio(audio_file: str,
|
|
| 191 |
result = model.transcribe(audio_loaded, batch_size=4 )
|
| 192 |
language_code = result.get("language") or result.get("detected_language") or "en"
|
| 193 |
results.languageCode = language_code
|
|
|
|
| 194 |
print(f"Detected language: {language_code}. Aligning transcription...")
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
warn(results, "ALIGN_SKIP", "Alignment unavailable; using raw Whisper segments.")
|
| 201 |
-
diarize_output = None
|
| 202 |
-
if global_diarizer is not None:
|
| 203 |
-
print("Performing speaker diarization (Requires HF_TOKEN)...")
|
| 204 |
try:
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
except Exception as e:
|
| 209 |
-
warn(results, "
|
| 210 |
-
diarize_output = None
|
| 211 |
else:
|
| 212 |
-
warn(results, "
|
| 213 |
-
print("Assigning speakers to words...")
|
| 214 |
try:
|
| 215 |
diarize_segments_for_assignment = []
|
| 216 |
if diarize_output is not None and hasattr(diarize_output, "itertracks"):
|
|
|
|
| 38 |
print(f"Error loading pyannote pipeline: {type(e).__name__}: {e}. Diarization will be skipped.")
|
| 39 |
diarization_pipeline = None
|
| 40 |
global_diarizer = diarization_pipeline
|
| 41 |
+
ALIGN_MODEL_MAP = {
|
| 42 |
+
"ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
|
| 43 |
+
"pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi",
|
| 44 |
+
"sd": "Abdullah104/wav2vec2-large-xls-r-300m-sindhi-kaggle",
|
| 45 |
+
"ps": "ihanif/wav2vec2-xls-r-300m-pashto",
|
| 46 |
+
}
|
| 47 |
model_name = "medium"
|
| 48 |
class TimelineItem(BaseModel):
|
| 49 |
start: float
|
|
|
|
| 197 |
result = model.transcribe(audio_loaded, batch_size=4 )
|
| 198 |
language_code = result.get("language") or result.get("detected_language") or "en"
|
| 199 |
results.languageCode = language_code
|
| 200 |
+
global global_align_model_cache
|
| 201 |
print(f"Detected language: {language_code}. Aligning transcription...")
|
| 202 |
+
aligned = {"segments": result["segments"]}
|
| 203 |
+
align_model = None
|
| 204 |
+
metadata = None
|
| 205 |
+
if language_code not in global_align_model_cache:
|
| 206 |
+
align_model_name = ALIGN_MODEL_MAP.get(language_code)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
try:
|
| 208 |
+
if align_model_name:
|
| 209 |
+
print(f"Loading custom alignment model for {language_code}: {align_model_name}...")
|
| 210 |
+
align_model, metadata = whisperx.load_align_model(
|
| 211 |
+
language_code=language_code,
|
| 212 |
+
model_name=align_model_name,
|
| 213 |
+
device=device
|
| 214 |
+
)
|
| 215 |
+
global_align_model_cache[language_code] = (align_model, metadata)
|
| 216 |
+
print(f"Alignment model loaded/cached for language: {language_code}")
|
| 217 |
+
|
| 218 |
+
except Exception as e:
|
| 219 |
+
warn(results, "ALIGN_LOAD_FAIL", f"Failed to load alignment model for {language_code}: {e}. Alignment skipped.")
|
| 220 |
+
global_align_model_cache[language_code] = (None, None) # Cache the failure/skip
|
| 221 |
+
else:
|
| 222 |
+
align_model, metadata = global_align_model_cache[language_code]
|
| 223 |
+
if align_model:
|
| 224 |
+
print(f"Alignment model loaded from cache for language: {language_code}")
|
| 225 |
+
if align_model:
|
| 226 |
+
try:
|
| 227 |
+
aligned = whisperx.align(
|
| 228 |
+
result["segments"],
|
| 229 |
+
align_model,
|
| 230 |
+
metadata,
|
| 231 |
+
audio_loaded,
|
| 232 |
+
device
|
| 233 |
+
)
|
| 234 |
except Exception as e:
|
| 235 |
+
warn(results, "ALIGN_RUN_FAIL", f"Alignment execution failed: {type(e).__name__}: {e}. Using raw segments.")
|
|
|
|
| 236 |
else:
|
| 237 |
+
warn(results, "ALIGN_SKIP", "Alignment model unavailable; using raw Whisper segments.")
|
|
|
|
| 238 |
try:
|
| 239 |
diarize_segments_for_assignment = []
|
| 240 |
if diarize_output is not None and hasattr(diarize_output, "itertracks"):
|