Spaces:

hafsaabd82
/

Audio-Analyzer

Sleeping

App Files Files Community

hafsaabd82 committed on Dec 4, 2025

Commit

9fb2b44

verified ·

1 Parent(s): 4991c43

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -16

app.py CHANGED Viewed

@@ -38,6 +38,12 @@ except Exception as e:
     print(f"Error loading pyannote pipeline: {type(e).__name__}: {e}. Diarization will be skipped.")
     diarization_pipeline = None
 global_diarizer = diarization_pipeline
 model_name = "medium"
 class TimelineItem(BaseModel):
     start: float
@@ -191,26 +197,44 @@ def analyze_audio(audio_file: str,
         result = model.transcribe(audio_loaded, batch_size=4 )
         language_code = result.get("language") or result.get("detected_language") or "en"
         results.languageCode = language_code
         print(f"Detected language: {language_code}. Aligning transcription...")
-        try:
-            align_model, metadata = whisperx.load_align_model(language_code=language_code, device=device)
-            aligned = whisperx.align(result["segments"], align_model, metadata, audio_loaded, device)
-        except Exception:
-            aligned = {"segments": result["segments"]}
-            warn(results, "ALIGN_SKIP", "Alignment unavailable; using raw Whisper segments.")
-        diarize_output = None
-        if global_diarizer is not None:
-            print("Performing speaker diarization (Requires HF_TOKEN)...")
             try:
-                diarize_output = global_diarizer(audio_for_model)
-                for segment, _, label in diarize_output.itertracks(yield_label=True):
-                    print(f"start={segment.start:.1f}s stop={segment.end:.1f}s {label}")
             except Exception as e:
-                warn(results, "DIAR_SKIP", f"Error during diarization (likely token/model failure): {type(e).__name__}: {e}. Skipping diarization.")
-                diarize_output = None
         else:
-            warn(results, "DIAR_SKIP", "HF_TOKEN not set. Skipping speaker diarization.")
-        print("Assigning speakers to words...")
         try:
             diarize_segments_for_assignment = []
             if diarize_output is not None and hasattr(diarize_output, "itertracks"):

     print(f"Error loading pyannote pipeline: {type(e).__name__}: {e}. Diarization will be skipped.")
     diarization_pipeline = None
 global_diarizer = diarization_pipeline
+ALIGN_MODEL_MAP = {
+    "ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
+    "pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi",
+    "sd": "Abdullah104/wav2vec2-large-xls-r-300m-sindhi-kaggle",
+    "ps": "ihanif/wav2vec2-xls-r-300m-pashto",
+}
 model_name = "medium"
 class TimelineItem(BaseModel):
     start: float
         result = model.transcribe(audio_loaded, batch_size=4 )
         language_code = result.get("language") or result.get("detected_language") or "en"
         results.languageCode = language_code
+        global global_align_model_cache
         print(f"Detected language: {language_code}. Aligning transcription...")
+        aligned = {"segments": result["segments"]}
+        align_model = None
+        metadata = None
+        if language_code not in global_align_model_cache:
+            align_model_name = ALIGN_MODEL_MAP.get(language_code)
             try:
+                if align_model_name:
+                    print(f"Loading custom alignment model for {language_code}: {align_model_name}...")
+                align_model, metadata = whisperx.load_align_model(
+                    language_code=language_code,
+                    model_name=align_model_name,
+                    device=device
+                )
+                global_align_model_cache[language_code] = (align_model, metadata)
+                print(f"Alignment model loaded/cached for language: {language_code}")
+            except Exception as e:
+                warn(results, "ALIGN_LOAD_FAIL", f"Failed to load alignment model for {language_code}: {e}. Alignment skipped.")
+                global_align_model_cache[language_code] = (None, None) # Cache the failure/skip
+        else:
+            align_model, metadata = global_align_model_cache[language_code]
+            if align_model:
+                 print(f"Alignment model loaded from cache for language: {language_code}")
+        if align_model:
+            try:
+                aligned = whisperx.align(
+                    result["segments"],
+                    align_model,
+                    metadata,
+                    audio_loaded,
+                    device
+                )
             except Exception as e:
+                warn(results, "ALIGN_RUN_FAIL", f"Alignment execution failed: {type(e).__name__}: {e}. Using raw segments.")
         else:
+            warn(results, "ALIGN_SKIP", "Alignment model unavailable; using raw Whisper segments.")
         try:
             diarize_segments_for_assignment = []
             if diarize_output is not None and hasattr(diarize_output, "itertracks"):