Spaces:

hafsaabd82
/

Audio-Analyzer

Sleeping

App Files Files Community

hafsaabd82 committed on Dec 4, 2025

Commit

8d49f81

verified ·

1 Parent(s): 1e73f01

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -13

app.py CHANGED Viewed

@@ -158,20 +158,26 @@ def preprocess_audio(input_path,
         y = rms_normalize(y, target_rms=target_rms)
     sf.write(output_path, y, sr, subtype=output_subtype)
     return output_path
-def analyze_audio(audio_file: str,
-                  reference_rttm_file: Optional[str] = None,
-                  preprocess: bool = True,
                   preprocess_params: Optional[Dict[str, Any]] = None) -> AnalysisResults:
     results = AnalysisResults()
-    global global_align_model_cache, ALIGN_MODEL_MAP
-    ends: List[float] = []
     rows: List[Dict[str, Any]] = []
     if not os.path.exists(audio_file):
         results.message = f"Error: Input audio file '{audio_file}' not found."
         return results
     audio_for_model = audio_file
     temp_preproc = None
     if preprocess:
         params = {
             "target_sr": 16000, "normalize_rms": True, "target_rms": 0.08,
@@ -190,22 +196,28 @@ def analyze_audio(audio_file: str,
             warn(results, "PREP_FAIL", f"Preprocessing failed: {e}. Falling back to original audio.")
             audio_for_model = audio_file
             temp_preproc = None
     start_ml_time = time.time()
     try:
         print(f"Loading Whisper model '{model_name}' on {device}...")
         model = whisperx.load_model(model_name, device, compute_type="float32")
         audio_loaded = whisperx.load_audio(audio_for_model)
         print("Transcribing audio...")
         result = model.transcribe(audio_loaded, batch_size=4 )
         language_code = result.get("language") or result.get("detected_language") or "en"
         results.languageCode = language_code
-        global global_align_model_cache
         print(f"Detected language: {language_code}. Aligning transcription...")
-        aligned = {"segments": result["segments"]}
         align_model = None
         metadata = None
         if language_code not in global_align_model_cache:
             align_model_name = ALIGN_MODEL_MAP.get(language_code)
             try:
                 if align_model_name:
                     print(f"Loading custom alignment model for {language_code}: {align_model_name}...")
@@ -216,16 +228,18 @@ def analyze_audio(audio_file: str,
                 )
                 global_align_model_cache[language_code] = (align_model, metadata)
                 print(f"Alignment model loaded/cached for language: {language_code}")
             except Exception as e:
                 warn(results, "ALIGN_LOAD_FAIL", f"Failed to load alignment model for {language_code}: {e}. Alignment skipped.")
-                global_align_model_cache[language_code] = (None, None) # Cache the failure/skip
         else:
             align_model, metadata = global_align_model_cache[language_code]
             if align_model:
                  print(f"Alignment model loaded from cache for language: {language_code}")
         if align_model:
             try:
                 aligned = whisperx.align(
                     result["segments"],
                     align_model,
@@ -236,7 +250,26 @@ def analyze_audio(audio_file: str,
             except Exception as e:
                 warn(results, "ALIGN_RUN_FAIL", f"Alignment execution failed: {type(e).__name__}: {e}. Using raw segments.")
         else:
-            warn(results, "ALIGN_SKIP", "Alignment model unavailable; using raw Whisper segments.")
         try:
             diarize_segments_for_assignment = []
             if diarize_output is not None and hasattr(diarize_output, "itertracks"):
@@ -247,6 +280,7 @@ def analyze_audio(audio_file: str,
                         "speaker": normalize_speaker(label)
                     })
                 print(f"DEBUG: Converted {len(diarize_segments_for_assignment)} diarization segments.")
             if diarize_segments_for_assignment:
                 diarize_df = pd.DataFrame(diarize_segments_for_assignment)
                 final = whisperx.assign_word_speakers(diarize_df, aligned)
@@ -254,12 +288,15 @@ def analyze_audio(audio_file: str,
                 warn(results, "ASSIGN_FAIL", "Diarization segments were empty or unavailable. Defaulting all to Speaker_1.")
                 final = aligned
                 for seg in final.get("segments", []):
-                    seg["speaker"] = "Speaker_1"
         except Exception as e:
             warn(results, "ASSIGN_SPEAKERS_ERROR", f"Error assigning speakers: {type(e).__name__}: {e}. Falling back to unassigned segments.")
             final = aligned
             for seg in final.get("segments", []):
                 seg["speaker"] = "Speaker_1"
         def _get_time_field(d: Dict[str, Any], keys: List[str]) -> Optional[float]:
             """Try multiple possible keys and coerce to native float, returning None if not possible."""
             for k in keys:
@@ -275,6 +312,7 @@ def analyze_audio(audio_file: str,
                     except (TypeError, ValueError):
                         continue
             return None
         for seg in final.get("segments", []):
             seg_speaker = normalize_speaker(seg.get("speaker") or seg.get("speaker_label") or "Speaker_1")
             word_list = seg.get("words") or seg.get("tokens") or seg.get("items") or []
@@ -301,20 +339,21 @@ def analyze_audio(audio_file: str,
                     word_start = _get_time_field(seg, ["start", "s"])
                 if word_end is None:
                     word_end = _get_time_field(seg, ["end", "e"])
                 if word_start is None:
                     continue
                 if word_end is None:
                     word_end = word_start
                 word_speaker = normalize_speaker(w.get("speaker") or seg_speaker)
                 word_text = (w.get("text") or w.get("word") or w.get("label") or "").strip()
                 rows.append({
                     "start": float(word_start),
                     "end": float(word_end),
                     "text": str(word_text),
                     "speaker": str(word_speaker),
                 })
         rows = sorted(rows, key=lambda r: r.get("start", 0.0))
         results.timelineData = rows
         for w in rows:
@@ -322,9 +361,21 @@ def analyze_audio(audio_file: str,
             f_e = force_float(e)
             if f_e is not None:
                 ends.append(f_e)
     except Exception as e:
         results.message = f"Error during ML processing: {type(e).__name__}: {e}"
         return results
     finally:
         if temp_preproc and os.path.exists(temp_preproc):
             os.remove(temp_preproc)

         y = rms_normalize(y, target_rms=target_rms)
     sf.write(output_path, y, sr, subtype=output_subtype)
     return output_path
+def analyze_audio(audio_file: str,
+                  reference_rttm_file: Optional[str] = None,
+                  preprocess: bool = True,
                   preprocess_params: Optional[Dict[str, Any]] = None) -> AnalysisResults:
     results = AnalysisResults()
+    # Ensure access to global variables for reading/writing
+    global global_align_model_cache, ALIGN_MODEL_MAP
+    ends: List[float] = []
     rows: List[Dict[str, Any]] = []
     if not os.path.exists(audio_file):
         results.message = f"Error: Input audio file '{audio_file}' not found."
         return results
     audio_for_model = audio_file
     temp_preproc = None
+    # --- Preprocessing ---
     if preprocess:
         params = {
             "target_sr": 16000, "normalize_rms": True, "target_rms": 0.08,
             warn(results, "PREP_FAIL", f"Preprocessing failed: {e}. Falling back to original audio.")
             audio_for_model = audio_file
             temp_preproc = None
     start_ml_time = time.time()
     try:
+        # --- Transcription ---
         print(f"Loading Whisper model '{model_name}' on {device}...")
         model = whisperx.load_model(model_name, device, compute_type="float32")
         audio_loaded = whisperx.load_audio(audio_for_model)
         print("Transcribing audio...")
         result = model.transcribe(audio_loaded, batch_size=4 )
         language_code = result.get("language") or result.get("detected_language") or "en"
         results.languageCode = language_code
+        aligned = {"segments": result["segments"]} # Default fallback
+        # --- Alignment Loading and Execution (Language-Specific) ---
         print(f"Detected language: {language_code}. Aligning transcription...")
         align_model = None
         metadata = None
         if language_code not in global_align_model_cache:
             align_model_name = ALIGN_MODEL_MAP.get(language_code)
             try:
                 if align_model_name:
                     print(f"Loading custom alignment model for {language_code}: {align_model_name}...")
                 )
                 global_align_model_cache[language_code] = (align_model, metadata)
                 print(f"Alignment model loaded/cached for language: {language_code}")
             except Exception as e:
                 warn(results, "ALIGN_LOAD_FAIL", f"Failed to load alignment model for {language_code}: {e}. Alignment skipped.")
+                global_align_model_cache[language_code] = (None, None)
         else:
             align_model, metadata = global_align_model_cache[language_code]
             if align_model:
                  print(f"Alignment model loaded from cache for language: {language_code}")
         if align_model:
             try:
+                print("Performing word-level alignment...")
                 aligned = whisperx.align(
                     result["segments"],
                     align_model,
             except Exception as e:
                 warn(results, "ALIGN_RUN_FAIL", f"Alignment execution failed: {type(e).__name__}: {e}. Using raw segments.")
         else:
+            warn(results, "ALIGN_SKIP", "Alignment model unavailable; using raw Whisper segments.")
+        # --- Diarization Execution ---
+        diarize_output = None
+        if global_diarizer is not None:
+            print("Performing speaker diarization (Requires HF_TOKEN)...")
+            try:
+                diarize_output = global_diarizer(audio_for_model)
+                for segment, _, label in diarize_output.itertracks(yield_label=True):
+                    print(f"start={segment.start:.1f}s stop={segment.end:.1f}s {label}")
+            except Exception as e:
+                warn(results, "DIAR_SKIP", f"Error during diarization (likely token/model failure): {type(e).__name__}: {e}. Skipping diarization.")
+                diarize_output = None
+        else:
+            warn(results, "DIAR_SKIP", "HF_TOKEN not set or Diarization Pipeline failed to load globally. Skipping speaker diarization.")
+        # --- Speaker Assignment ---
+        print("Assigning speakers to words...")
         try:
             diarize_segments_for_assignment = []
             if diarize_output is not None and hasattr(diarize_output, "itertracks"):
                         "speaker": normalize_speaker(label)
                     })
                 print(f"DEBUG: Converted {len(diarize_segments_for_assignment)} diarization segments.")
             if diarize_segments_for_assignment:
                 diarize_df = pd.DataFrame(diarize_segments_for_assignment)
                 final = whisperx.assign_word_speakers(diarize_df, aligned)
                 warn(results, "ASSIGN_FAIL", "Diarization segments were empty or unavailable. Defaulting all to Speaker_1.")
                 final = aligned
                 for seg in final.get("segments", []):
+                    seg["speaker"] = "Speaker_1"
         except Exception as e:
             warn(results, "ASSIGN_SPEAKERS_ERROR", f"Error assigning speakers: {type(e).__name__}: {e}. Falling back to unassigned segments.")
             final = aligned
             for seg in final.get("segments", []):
                 seg["speaker"] = "Speaker_1"
+        # --- Timeline Generation ---
         def _get_time_field(d: Dict[str, Any], keys: List[str]) -> Optional[float]:
             """Try multiple possible keys and coerce to native float, returning None if not possible."""
             for k in keys:
                     except (TypeError, ValueError):
                         continue
             return None
         for seg in final.get("segments", []):
             seg_speaker = normalize_speaker(seg.get("speaker") or seg.get("speaker_label") or "Speaker_1")
             word_list = seg.get("words") or seg.get("tokens") or seg.get("items") or []
                     word_start = _get_time_field(seg, ["start", "s"])
                 if word_end is None:
                     word_end = _get_time_field(seg, ["end", "e"])
                 if word_start is None:
                     continue
                 if word_end is None:
                     word_end = word_start
                 word_speaker = normalize_speaker(w.get("speaker") or seg_speaker)
                 word_text = (w.get("text") or w.get("word") or w.get("label") or "").strip()
                 rows.append({
                     "start": float(word_start),
                     "end": float(word_end),
                     "text": str(word_text),
                     "speaker": str(word_speaker),
                 })
         rows = sorted(rows, key=lambda r: r.get("start", 0.0))
         results.timelineData = rows
         for w in rows:
             f_e = force_float(e)
             if f_e is not None:
                 ends.append(f_e)
     except Exception as e:
         results.message = f"Error during ML processing: {type(e).__name__}: {e}"
         return results
+    finally:
+        if temp_preproc and os.path.exists(temp_preproc):
+            os.remove(temp_preproc)
+        results.duration = force_float(max(ends) if ends else 0.0) or 0.0
+        end_ml_time = time.time()
+        print(f"ML Processing finished in {end_ml_time - start_ml_time:.2f} seconds.")
+    results.success = True
+    return results
     finally:
         if temp_preproc and os.path.exists(temp_preproc):
             os.remove(temp_preproc)