hafsaabd82 commited on
Commit
eb8b754
·
verified ·
1 Parent(s): a1ffca1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +361 -96
app.py CHANGED
@@ -1,126 +1,391 @@
1
- import os
2
  import tempfile
3
  import whisperx
4
- from pyannote.audio import Pipeline
5
  import pandas as pd
6
  import librosa
 
 
 
 
 
 
7
  from fastapi import FastAPI, UploadFile, File, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
- import torch
10
- if not hasattr(torch.utils._pytree, "register_pytree_node"):
11
- torch.utils._pytree.register_pytree_node = torch.utils._pytree._register_pytree_node
12
- import traceback
 
 
 
 
 
 
13
 
14
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
 
16
  app.add_middleware(
17
  CORSMiddleware,
18
- allow_origins=["https://frontend-audio-analyzer.vercel.app/"],
 
19
  allow_methods=["*"],
20
- allow_headers=["*"]
21
  )
22
- device = "cpu"
23
- compute_type = "float16" if device == "cuda" else "float32"
24
- hf_token = os.environ.get("HF_TOKEN")
25
- try:
26
- whisper_model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
27
- except Exception as e:
28
- print(f"Error loading WhisperX model: {e}")
29
- whisper_model = None
30
- try:
31
- diarize_pipeline = Pipeline.from_pretrained(
32
- "pyannote/speaker-diarization-2.1",
33
- use_auth_token=hf_token
34
- )
35
- except Exception as e:
36
- print(f"Error loading Pyannote pipeline. Check HF_TOKEN: {e}")
37
- diarize_pipeline = None
38
- try:
39
- align_model, metadata = whisperx.load_align_model(
40
- language_code=None,
41
- device=device
42
- )
43
- except Exception as e:
44
- print(f"Error loading WhisperX alignment model: {e}")
45
- align_model, metadata = None, None
46
- @app.post("/process-audio")
47
- async def process_audio(file: UploadFile = File(...)):
48
- if not whisper_model or not diarize_pipeline or not align_model:
49
- raise HTTPException(status_code=503, detail="Model loading failed on server.")
50
- if not file.filename.endswith((".wav", ".mp3", ".m4a", ".flac")):
51
- raise HTTPException(status_code=400, detail="Invalid audio format")
52
- audio_path = None
 
53
  try:
54
- with tempfile.NamedTemporaryFile(delete=False, suffix=file.filename) as tmp:
55
- tmp.write(await file.read())
56
- audio_path = tmp.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  try:
58
- audio = whisperx.load_audio(audio_path)
59
- duration = librosa.get_duration(path=audio_path)
60
- except Exception as e:
61
- raise HTTPException(status_code=400, detail=f"Failed to load or process audio file: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  try:
63
- result = whisper_model.transcribe(audio, batch_size=8)
 
64
  except Exception as e:
65
- raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
66
-
67
- language = result.get("language", "unknown")
 
 
 
 
 
 
 
 
 
 
 
68
  try:
69
- aligned_result = whisperx.align(
70
- result["segments"],
71
- align_model,
72
- metadata,
73
- audio,
74
- device
75
- )
76
- except Exception as e:
77
- raise HTTPException(status_code=500, detail=f"Alignment failed: {e}")
 
 
 
 
 
 
 
 
 
78
  try:
79
- diarization = diarize_pipeline(audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  except Exception as e:
81
- raise HTTPException(status_code=500, detail=f"Diarization failed: {e}")
82
- diar_map = []
83
- for turn in diarization.itertracks(yield_label=True):
84
- segment, _, speaker_label = turn
85
- diar_map.append({
86
- "start": segment.start,
87
- "end": segment.end,
88
- "speaker": speaker_label
89
- })
90
-
91
- diar_df = pd.DataFrame(diar_map)
92
- timeline = []
93
- for seg in aligned_result["segments"]:
94
- if "words" not in seg:
95
- continue
96
- for word in seg["words"]:
97
- if word["start"] is None or word["end"] is None:
 
 
 
 
 
 
 
 
 
 
98
  continue
99
- match = diar_df[
100
- (diar_df.start <= word["start"]) &
101
- (diar_df.end >= word["end"])
102
- ]
103
- speaker = match.iloc[0].speaker if not match.empty else "Unknown"
104
- timeline.append({
105
- "start": round(word["start"], 3),
106
- "end": round(word["end"], 3),
107
- "text": word["word"],
108
- "speaker": speaker
109
  })
110
- timeline = sorted(timeline, key=lambda x: x["start"])
111
- return {
112
- "duration": duration,
113
- "language": language,
114
- "timeline_data": timeline
115
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  except HTTPException:
117
  raise
118
  except Exception as e:
119
- traceback.print_exc()
120
- raise HTTPException(status_code=500, detail=f"An unexpected server error occurred during processing: {e}")
121
  finally:
122
  if audio_path and os.path.exists(audio_path):
123
  os.remove(audio_path)
 
 
 
124
  @app.get("/")
125
  def root():
126
  return {"message": "Audio Analyzer Backend is running."}
 
 
1
import os
import shutil
import tempfile
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import whisperx
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from scipy.signal import butter, filtfilt
from whisperx import diarize

try:
    import noisereduce as nr
    HAVE_NOISEREDUCE = True
except ImportError:
    HAVE_NOISEREDUCE = False

# pyannote.core is not imported in this build; these placeholders keep the
# (optional) annotation-conversion path importable.
Annotation: Any = None
Segment: Any = None
 
25
# --- Runtime configuration -------------------------------------------------
# Prefer GPU when one is visible to torch; otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# HF_TOKEN gates the (pyannote-backed) diarization models.
token = os.environ.get("HF_TOKEN")
if not token:
    print("Warning: HF_TOKEN not set. Diarization will be skipped.")
perform_diarization = bool(token)

# Whisper checkpoint size used for transcription.
model_name = "medium"
31
class TimelineItem(BaseModel):
    """One entry on the word-level timeline returned to the client."""
    # Times are seconds from the start of the uploaded audio.
    start: float
    end: float
    # None when diarization was skipped or failed.
    speaker: Union[str, None] = None
    text: str
36
class AnalysisResult(BaseModel):
    """Response schema for the /upload endpoint."""
    # Total audio duration in seconds (max word end time).
    duration: float
    # Whisper-detected language code, e.g. "en".
    language: str
    # Diarization scoring metrics; only populated when a reference is scored.
    der: Union[float, None] = None
    speaker_error: Union[float, None] = None
    missed_speech: Union[float, None] = None
    false_alarm: Union[float, None] = None
    timeline_data: List[TimelineItem]
44
 
45
# Application instance; CORS is restricted to the deployed frontend origin.
app = FastAPI(title="Audio Analyzer Backend")
app.add_middleware(
    CORSMiddleware,
    # Exact origin, no trailing slash — browsers compare origins literally.
    allow_origins=["https://frontend-audio-analyzer.vercel.app"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
53
@dataclass
class AnalysisResults:
    """Internal mutable result object threaded through the analysis pipeline."""
    # Word/segment rows: dicts with "start", "end", "text", "speaker".
    timelineData: List[Dict[str, Any]] = field(default_factory=list)
    duration: float = 0.0            # seconds; derived from the max word end time
    languageCode: str = "unknown"    # Whisper-detected language
    # Diarization scoring metrics (populated only when a reference is scored).
    diarizationErrorRate: Optional[float] = None
    speakerError: Optional[float] = None
    missedSpeech: Optional[float] = None
    falseAlarm: Optional[float] = None
    # Deduplicated "<CODE>: <detail>" warning strings collected during the run.
    warnings: List[str] = field(default_factory=list)
    # Set by the pipeline on completion; the API layer 500s when this is False.
    success: bool = False
    message: str = "Analysis initiated."
65
def warn(results: "AnalysisResults", code: str, detail: str) -> None:
    """Record a deduplicated '<CODE>: <detail>' warning on *results*."""
    entry = f"{code}: {detail}"
    if entry in results.warnings:
        return
    results.warnings.append(entry)
69
def set_message(results: "AnalysisResults", msg: str) -> None:
    """Set the status message, or append pipe-separated if one is already set.

    The placeholder default message is always replaced outright.
    """
    default = "Analysis initiated."
    current = results.message
    if current and current != default:
        results.message = f"{current} | {msg}"
    else:
        results.message = msg
75
def normalize_speaker(lbl: str) -> str:
    """Map raw 'SPEAKER_x' / 'speaker_x' labels to the canonical 'Speaker_x'."""
    text = str(lbl)
    for raw in ("SPEAKER_", "speaker_"):
        text = text.replace(raw, "Speaker_")
    return text
78
def temp_wav_path() -> str:
    """Create an empty temporary .wav file and return its path.

    The file is NOT deleted automatically; the caller owns cleanup.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        return handle.name
    finally:
        handle.close()
81
def force_float(value: Optional[Any]) -> Optional[float]:
    """Coerce *value* to a native Python float.

    Returns None for None, non-numeric input, NaN, and +/-Inf — so the result
    is always JSON-safe.
    """
    if value is None:
        return None
    try:
        result = float(value)
    except (TypeError, ValueError, AttributeError):
        return None
    return None if (np.isnan(result) or np.isinf(result)) else result
92
+
93
def butter_filter(y, sr, lowpass=None, highpass=None, order=4):
    """Apply optional zero-phase Butterworth high-pass/low-pass filters to *y*.

    Cutoff frequencies that are falsy or outside (0, Nyquist) are silently
    ignored, so passing both as None returns *y* unchanged.
    """
    nyquist = 0.5 * sr
    if highpass and 0 < highpass < nyquist:
        b, a = butter(order, highpass / nyquist, btype="highpass", analog=False)
        y = filtfilt(b, a, y)
    if lowpass and 0 < lowpass < nyquist:
        b, a = butter(order, lowpass / nyquist, btype="lowpass", analog=False)
        y = filtfilt(b, a, y)
    return y
102
+
103
def rms_normalize(y, target_rms=0.8, eps=1e-6):
    """Scale array *y* so its RMS level matches *target_rms*.

    Near-silent input (RMS below *eps*) is returned unchanged to avoid
    amplifying pure noise or dividing by ~zero.
    """
    rms = (y ** 2).mean() ** 0.5
    if rms < eps:
        return y
    return y * (target_rms / (rms + eps))
109
+
110
def preprocess_audio(input_path,
                     target_sr=16000,
                     normalize_rms=True,
                     target_rms=0.08,
                     denoise=False,
                     highpass=None,
                     lowpass=None,
                     output_subtype="PCM_16",
                     verbose=False) -> str:
    """Mono-mix, resample, filter, optionally denoise, and RMS-normalize audio.

    Writes the processed signal to a fresh temp WAV and returns its path;
    the caller owns deletion. Raises FileNotFoundError if *input_path*
    does not exist.

    NOTE(review): *verbose* is currently unused; kept for interface
    compatibility.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input audio not found: {input_path}")
    output_path = temp_wav_path()

    samples, sr = sf.read(input_path, dtype='float64')
    # Collapse multi-channel audio to mono before any DSP.
    y = librosa.to_mono(samples.T) if samples.ndim > 1 else samples

    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    if highpass or lowpass:
        y = butter_filter(y, sr, highpass=highpass, lowpass=lowpass)

    if denoise and HAVE_NOISEREDUCE:
        # Best-effort spectral denoise using the first 0.5 s as noise profile.
        try:
            noise_len = int(min(len(y), int(0.5 * sr)))
            y = nr.reduce_noise(y=y, sr=sr, y_noise=y[:noise_len],
                                prop_decrease=0.9, verbose=False)
        except Exception:
            pass  # denoising is optional; continue with the unfiltered signal

    if normalize_rms:
        y = rms_normalize(y, target_rms=target_rms)

    sf.write(output_path, y, sr, subtype=output_subtype)
    return output_path
143
def diarization_to_annotation(diarize_output) -> Optional[Any]:
    """Convert diarization output into a pyannote ``Annotation``.

    Accepts an iterable of ``{'segment': ..., 'label': ...}`` dicts,
    ``{'start', 'end', 'speaker'}`` dicts, or objects exposing
    ``.start``/``.end`` (and optionally ``.speaker``/``.label``).

    Returns None when pyannote types are unavailable (the module-level
    ``Annotation``/``Segment`` placeholders are None) or on any conversion
    failure.

    BUG FIX: the original body had lost its ``def`` header (leaving a
    module-level ``try:``/``return``, a SyntaxError), indexed every segment
    before checking it was a dict, and added {start, end, speaker} dicts
    twice via a duplicated membership branch.
    """
    if Annotation is None or Segment is None:
        # pyannote.core is not imported in this build; callers must treat
        # None as "no annotation available".
        return None
    try:
        ann = Annotation()
        if not hasattr(diarize_output, '__iter__'):
            return ann
        for seg in diarize_output:
            if isinstance(seg, dict):
                if 'segment' in seg and 'label' in seg:
                    s = float(seg['segment'].start)
                    e = float(seg['segment'].end)
                    lbl = normalize_speaker(seg['label'])
                elif all(k in seg for k in ("start", "end", "speaker")):
                    s = float(seg.get("start", 0.0))
                    e = float(seg.get("end", s))
                    lbl = normalize_speaker(seg.get("speaker", "Speaker_1"))
                else:
                    continue  # unrecognized dict shape
                ann[Segment(s, e)] = lbl
            elif hasattr(seg, 'start') and hasattr(seg, 'end'):
                s = float(seg.start)
                e = float(seg.end)
                lbl = normalize_speaker(
                    getattr(seg, 'speaker', getattr(seg, 'label', 'Speaker_1')))
                ann[Segment(s, e)] = lbl
        return ann
    except Exception as e:
        print(f"Error in diarization_to_annotation: {e}")
        return None
171
def _get_time_field(d: Dict[str, Any], keys: List[str]) -> Optional[float]:
    """Return the first of *keys* present in *d* that coerces to a finite
    native float; None when no key yields a usable value."""
    for k in keys:
        if k not in d:
            continue
        try:
            v = d[k]
            if v is None:
                continue
            f = float(v)
            if np.isnan(f) or np.isinf(f):
                return None
            return f
        except (TypeError, ValueError):
            continue
    return None


def _assign_speakers(diarize_output, aligned):
    """Merge diarization output into the aligned transcription.

    Accepts a whisperx diarization DataFrame, a pyannote-style object with
    ``itertracks``, or a list of {start, end, speaker} dicts. With no
    diarization output, every segment is labelled 'Speaker_1'.
    """
    if diarize_output is None:
        spans: Any = [{
            "start": seg.get("start", 0),
            "end": seg.get("end", seg.get("start", 0)),
            "speaker": "Speaker_1",
        } for seg in aligned.get("segments", [])]
    elif hasattr(diarize_output, "itertracks"):
        spans = [{
            "start": float(segment.start),
            "end": float(segment.end),
            "speaker": normalize_speaker(label),
        } for segment, _, label in diarize_output.itertracks(yield_label=True)]
    else:
        spans = diarize_output

    if isinstance(spans, list):
        if not spans:
            for seg in aligned.get("segments", []):
                seg["speaker"] = "Speaker_1"
            return aligned
        # BUG FIX: whisperx.assign_word_speakers expects a DataFrame with
        # start/end/speaker columns, not a plain list of dicts.
        spans = pd.DataFrame(spans)
    return whisperx.assign_word_speakers(spans, aligned)


def _build_timeline(final: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten speaker-annotated segments into a start-sorted word timeline."""
    rows: List[Dict[str, Any]] = []
    for seg in final.get("segments", []):
        seg_speaker = normalize_speaker(
            seg.get("speaker") or seg.get("speaker_label") or "Speaker_1")
        word_list = seg.get("words") or seg.get("tokens") or seg.get("items") or []
        if not word_list:
            # Segment-level fallback when word timestamps are missing.
            seg_start = _get_time_field(seg, ["start", "s", "timestamp", "t0"])
            seg_end = _get_time_field(seg, ["end", "e", "t1"])
            if seg_start is None:
                continue
            rows.append({
                "start": float(seg_start),
                "end": float(seg_end if seg_end is not None else seg_start),
                "text": str(seg.get("text", "")).strip(),
                "speaker": str(seg_speaker),
            })
            continue
        for w in word_list:
            if not isinstance(w, dict):
                continue
            w_start = _get_time_field(w, ["start", "s", "timestamp", "t0"])
            w_end = _get_time_field(w, ["end", "e", "t1"])
            # Borrow the segment's timing when the word has none.
            if w_start is None:
                w_start = _get_time_field(seg, ["start", "s"])
            if w_end is None:
                w_end = _get_time_field(seg, ["end", "e"])
            if w_start is None:
                continue
            if w_end is None:
                w_end = w_start
            rows.append({
                "start": float(w_start),
                "end": float(w_end),
                "text": str((w.get("text") or w.get("word") or w.get("label") or "").strip()),
                "speaker": str(normalize_speaker(w.get("speaker") or seg_speaker)),
            })
    return sorted(rows, key=lambda r: r.get("start", 0.0))


def analyze_audio(audio_file: str,
                  reference_rttm_file: Optional[str] = None,
                  preprocess: bool = True,
                  preprocess_params: Optional[Dict[str, Any]] = None) -> "AnalysisResults":
    """Transcribe, align, and (when HF_TOKEN is set) diarize *audio_file*.

    Parameters
    ----------
    audio_file: path of the audio file to analyze.
    reference_rttm_file: reserved for DER scoring; currently unused.
    preprocess: run resample/normalize preprocessing before inference.
    preprocess_params: overrides for :func:`preprocess_audio` keyword args.

    Returns an ``AnalysisResults``; processing errors never raise — they are
    reported through ``results.message`` with ``results.success`` False.

    BUG FIXES vs previous revision:
    - ``results.success`` was never set to True, so the API layer returned
      500 for every request, including successful analyses.
    - ``diarize(audio)`` called the ``whisperx.diarize`` *module* (not
      callable), so diarization always failed and was skipped; a
      ``DiarizationPipeline`` is now instantiated and invoked.
    """
    results = AnalysisResults()
    if not os.path.exists(audio_file):
        results.message = f"Error: Input audio file '{audio_file}' not found."
        return results

    audio_for_model = audio_file
    temp_preproc = None
    if preprocess:
        params = {
            "target_sr": 16000, "normalize_rms": True, "target_rms": 0.08,
            "denoise": False, "highpass": None, "lowpass": None,
            "output_subtype": "PCM_16", "verbose": False
        }
        if isinstance(preprocess_params, dict):
            params.update(preprocess_params)
        if params.get("denoise") and not HAVE_NOISEREDUCE:
            warn(results, "DENOISE_SKIP", "Denoise requested but noisereduce not installed; skipping denoise.")
            params["denoise"] = False
        try:
            temp_preproc = preprocess_audio(audio_file, **params)
            audio_for_model = temp_preproc
        except Exception as e:
            warn(results, "PREP_FAIL", f"Preprocessing failed: {e}. Falling back to original audio.")
            audio_for_model = audio_file
            temp_preproc = None

    start_ml_time = time.time()
    ends: List[float] = []
    try:
        print(f"Loading Whisper model '{model_name}' on {device}...")
        model = whisperx.load_model(model_name, device, compute_type="float32")
        audio_loaded = whisperx.load_audio(audio_for_model)

        print("Transcribing audio...")
        result = model.transcribe(audio_loaded, batch_size=4)
        language_code = result.get("language") or result.get("detected_language") or "en"
        results.languageCode = language_code

        print(f"Detected language: {language_code}. Aligning transcription...")
        try:
            align_model, metadata = whisperx.load_align_model(language_code=language_code, device=device)
            aligned = whisperx.align(result["segments"], align_model, metadata, audio_loaded, device)
        except Exception:
            aligned = {"segments": result["segments"]}
            warn(results, "ALIGN_SKIP", "Alignment unavailable; using raw Whisper segments.")

        diarize_output = None
        if perform_diarization:
            print("Performing speaker diarization (Requires HF_TOKEN)...")
            try:
                diarize_pipeline = diarize.DiarizationPipeline(use_auth_token=token, device=device)
                diarize_output = diarize_pipeline(audio_for_model)
            except Exception as e:
                warn(results, "DIAR_SKIP", f"Error during diarization (likely token/model failure): {type(e).__name__}: {e}. Skipping diarization.")
                diarize_output = None
        else:
            warn(results, "DIAR_SKIP", "HF_TOKEN not set. Skipping speaker diarization.")

        print("Assigning speakers to words...")
        try:
            final = _assign_speakers(diarize_output, aligned)
        except Exception as e:
            warn(results, "ASSIGN_SPEAKERS_ERROR", f"Error assigning speakers: {e}. Falling back to unassigned segments.")
            final = aligned
            for seg in final.get("segments", []):
                seg["speaker"] = "Speaker_1"

        rows = _build_timeline(final)
        results.timelineData = rows
        ends = [f for f in (force_float(r.get("end")) for r in rows) if f is not None]
    except Exception as e:
        results.message = f"Error during ML processing: {type(e).__name__}: {e}"
        return results
    finally:
        if temp_preproc and os.path.exists(temp_preproc):
            os.remove(temp_preproc)

    results.duration = force_float(max(ends) if ends else 0.0) or 0.0
    results.success = True
    set_message(results, "Analysis completed.")
    end_ml_time = time.time()
    print(f"ML Processing finished in {end_ml_time - start_ml_time:.2f} seconds.")
    return results
337
@app.post("/upload", response_model=AnalysisResult)
async def upload_file(audio_file: UploadFile = File(...)):
    """Accept an uploaded audio file, run the analysis pipeline, and return
    the word-level timeline.

    Responds 500 with the pipeline's own message when processing fails; the
    temp copy of the upload is always removed.

    FIXES vs previous revision: "FAILURE MESSAGE" was printed on every
    request, including successes; the suffix extraction mangled dotless
    filenames (now via os.path.splitext); removed the redundant duration
    None-check (force_float already handles it).
    """
    start_time = time.time()
    audio_path: Optional[str] = None
    try:
        print("Incoming upload:", getattr(audio_file, "filename", None))

        # Preserve the original extension so downstream decoders can sniff it.
        _, ext = os.path.splitext(audio_file.filename or "")
        with tempfile.NamedTemporaryFile(suffix=ext or ".tmp", delete=False) as tmp_audio:
            shutil.copyfileobj(audio_file.file, tmp_audio)
            audio_path = tmp_audio.name
        print(f"Received audio file: {audio_file.filename} (saved to {audio_path}), size: {os.path.getsize(audio_path)} bytes")

        preprocessing_config = {"denoise": False}
        print(f"Starting ML processing with audio: {audio_path}, preprocess_params: {preprocessing_config}")

        analysis_result = analyze_audio(
            audio_file=audio_path,
            preprocess_params=preprocessing_config
        )
        if not analysis_result.success:
            # Surface the pipeline's diagnostic to both logs and the client.
            print("FAILURE MESSAGE:", analysis_result.message)
            raise HTTPException(status_code=500, detail=analysis_result.message)

        return AnalysisResult(
            duration=force_float(analysis_result.duration) or 0.0,
            language=analysis_result.languageCode,
            timeline_data=[
                TimelineItem(
                    start=force_float(item.get('start')) or 0.0,
                    end=force_float(item.get('end')) or 0.0,
                    speaker=str(item.get('speaker')) if item.get('speaker') else None,
                    text=str(item.get('text', "")),
                )
                for item in analysis_result.timelineData
            ],
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error during upload process: {type(e).__name__}: {e}")
    finally:
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
        end_time = time.time()
        print(f"API Request processed in {end_time - start_time:.2f} seconds.")
388
+
389
@app.get("/")
def root():
    """Health-check endpoint used to verify the service is up."""
    return {"message": "Audio Analyzer Backend is running."}