Spaces:

psychxD
/

voice_analysis_api

Runtime error

App Files Files Community

psychxD commited on Mar 29

Commit

db4323e

verified ·

1 Parent(s): 3ca9f65

Upload 2 files

Browse files

Files changed (2) hide show

app.py +298 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,298 @@

+"""
+Voice Analysis API for Salesforce
+==================================
+Endpoints:
+  /analyze - Full analysis (diarization + overlap + voice metrics)
+Returns JSON that Salesforce can parse.
+Models used:
+  - pyannote/speaker-diarization-3.1 (who spoke when)
+  - pyannote/overlapped-speech-detection (coaching detection)
+"""
+import gradio as gr
+import os
+import json
+import torch
+from pyannote.audio import Pipeline
+from pyannote.audio.pipelines import OverlappedSpeechDetection
+import scipy.io.wavfile as wavfile
+import numpy as np
+# ============================================================
+# CONFIGURATION
+# ============================================================
+HF_TOKEN = os.environ.get("HF_TOKEN")
+if not HF_TOKEN:
+    print("WARNING: HF_TOKEN not set. Gated models will fail.")
+# ============================================================
+# LOAD MODELS (runs once at startup)
+# ============================================================
+print("Loading diarization model...")
+try:
+    diarization_pipeline = Pipeline.from_pretrained(
+        "pyannote/speaker-diarization-3.1",
+        use_auth_token=HF_TOKEN
+    )
+    print("✅ Diarization model loaded")
+except Exception as e:
+    print(f"❌ Diarization model failed: {e}")
+    diarization_pipeline = None
+print("Loading overlap detection model...")
+try:
+    overlap_pipeline = Pipeline.from_pretrained(
+        "pyannote/overlapped-speech-detection",
+        use_auth_token=HF_TOKEN
+    )
+    print("✅ Overlap detection model loaded")
+except Exception as e:
+    print(f"❌ Overlap detection failed: {e}")
+    overlap_pipeline = None
+# ============================================================
+# ANALYSIS FUNCTIONS
+# ============================================================
+def analyze_diarization(audio_path):
+    """
+    Identifies different speakers and their timestamps.
+    Returns list of segments with speaker labels.
+    """
+    if diarization_pipeline is None:
+        return {"error": "Diarization model not loaded"}
+    try:
+        diarization = diarization_pipeline(audio_path)
+        segments = []
+        for turn, _, speaker in diarization.itertracks(yield_label=True):
+            segments.append({
+                "speaker": speaker,
+                "start": round(turn.start, 2),
+                "end": round(turn.end, 2),
+                "duration": round(turn.end - turn.start, 2)
+            })
+        # Identify borrower (assumes agent speaks first)
+        speakers = list(set([s["speaker"] for s in segments]))
+        agent_speaker = segments[0]["speaker"] if segments else None
+        borrower_speaker = None
+        for s in speakers:
+            if s != agent_speaker:
+                borrower_speaker = s
+                break
+        return {
+            "segments": segments,
+            "speaker_count": len(speakers),
+            "agent_speaker": agent_speaker,
+            "borrower_speaker": borrower_speaker,
+            "total_segments": len(segments)
+        }
+    except Exception as e:
+        return {"error": str(e)}
+def analyze_overlap(audio_path):
+    """
+    Detects overlapping speech (multiple people talking at once).
+    Used for coaching detection.
+    """
+    if overlap_pipeline is None:
+        return {"error": "Overlap detection model not loaded"}
+    try:
+        overlap = overlap_pipeline(audio_path)
+        overlap_segments = []
+        for segment, _, label in overlap.itertracks(yield_label=True):
+            overlap_segments.append({
+                "start": round(segment.start, 2),
+                "end": round(segment.end, 2),
+                "duration": round(segment.end - segment.start, 2)
+            })
+        total_overlap_duration = sum([s["duration"] for s in overlap_segments])
+        return {
+            "overlap_segments": overlap_segments,
+            "overlap_count": len(overlap_segments),
+            "total_overlap_duration": round(total_overlap_duration, 2)
+        }
+    except Exception as e:
+        return {"error": str(e)}
+def detect_coaching(diarization_result, overlap_result):
+    """
+    Cross-references overlap with borrower segments.
+    Overlap during borrower's speech = potential coaching.
+    """
+    coaching_flags = []
+    if "error" in diarization_result or "error" in overlap_result:
+        return {
+            "coaching_detected": False,
+            "error": "Could not analyze - model error"
+        }
+    borrower_speaker = diarization_result.get("borrower_speaker")
+    if not borrower_speaker:
+        return {
+            "coaching_detected": False,
+            "reason": "Could not identify borrower"
+        }
+    # Get borrower segments
+    borrower_segments = [
+        s for s in diarization_result["segments"]
+        if s["speaker"] == borrower_speaker
+    ]
+    # Get overlap segments
+    overlap_segments = overlap_result.get("overlap_segments", [])
+    # Check if any overlap falls within borrower's speaking time
+    for overlap in overlap_segments:
+        for borrower_seg in borrower_segments:
+            # Check if overlap is during borrower's speech
+            if (overlap["start"] >= borrower_seg["start"] and
+                overlap["start"] <= borrower_seg["end"]):
+                coaching_flags.append({
+                    "overlap_time": f"{overlap['start']}-{overlap['end']}",
+                    "during_borrower_segment": f"{borrower_seg['start']}-{borrower_seg['end']}",
+                    "duration": overlap["duration"]
+                })
+    return {
+        "coaching_detected": len(coaching_flags) > 0,
+        "coaching_instances": len(coaching_flags),
+        "coaching_flags": coaching_flags,
+        "borrower_segments_analyzed": len(borrower_segments)
+    }
+def analyze_voice_metrics(audio_path):
+    """
+    Basic voice analysis - pause detection, speaking rate.
+    For hesitation indicators.
+    """
+    try:
+        # Read audio file
+        sample_rate, audio_data = wavfile.read(audio_path)
+        # Convert to mono if stereo
+        if len(audio_data.shape) > 1:
+            audio_data = audio_data.mean(axis=1)
+        # Calculate basic metrics
+        duration = len(audio_data) / sample_rate
+        # Simple energy-based silence detection
+        energy = np.abs(audio_data).astype(float)
+        threshold = np.mean(energy) * 0.1
+        silence_samples = np.sum(energy < threshold)
+        silence_ratio = silence_samples / len(audio_data)
+        return {
+            "duration_seconds": round(duration, 2),
+            "silence_ratio": round(silence_ratio, 3),
+            "has_long_pauses": silence_ratio > 0.3
+        }
+    except Exception as e:
+        return {"error": str(e)}
+# ============================================================
+# MAIN ANALYSIS FUNCTION
+# ============================================================
+def full_analysis(audio_file):
+    """
+    Complete audio analysis - called by Gradio/API.
+    Returns JSON with all results.
+    """
+    if audio_file is None:
+        return json.dumps({"error": "No audio file provided"}, indent=2)
+    results = {
+        "status": "success",
+        "analysis": {}
+    }
+    try:
+        # Run all analyses
+        print(f"Analyzing: {audio_file}")
+        # 1. Diarization
+        print("Running diarization...")
+        diarization_result = analyze_diarization(audio_file)
+        results["analysis"]["diarization"] = diarization_result
+        # 2. Overlap detection
+        print("Running overlap detection...")
+        overlap_result = analyze_overlap(audio_file)
+        results["analysis"]["overlap"] = overlap_result
+        # 3. Coaching detection (cross-reference)
+        print("Analyzing coaching...")
+        coaching_result = detect_coaching(diarization_result, overlap_result)
+        results["analysis"]["coaching"] = coaching_result
+        # 4. Voice metrics
+        print("Analyzing voice metrics...")
+        voice_result = analyze_voice_metrics(audio_file)
+        results["analysis"]["voice_metrics"] = voice_result
+        # 5. Summary
+        results["summary"] = {
+            "speaker_count": diarization_result.get("speaker_count", 0),
+            "coaching_detected": coaching_result.get("coaching_detected", False),
+            "coaching_instances": coaching_result.get("coaching_instances", 0),
+            "has_long_pauses": voice_result.get("has_long_pauses", False),
+            "total_overlap_duration": overlap_result.get("total_overlap_duration", 0)
+        }
+        print("Analysis complete!")
+    except Exception as e:
+        results["status"] = "error"
+        results["error"] = str(e)
+    return json.dumps(results, indent=2)
+# ============================================================
+# GRADIO INTERFACE
+# ============================================================
+demo = gr.Interface(
+    fn=full_analysis,
+    inputs=gr.Audio(type="filepath", label="Upload Audio (MP3, WAV, M4A)"),
+    outputs=gr.JSON(label="Analysis Results"),
+    title="🎙️ Voice Analysis API for Salesforce",
+    description="""
+    Upload a call recording to analyze:
+    - **Speaker Diarization**: Who spoke when
+    - **Coaching Detection**: Overlapping speech during borrower's responses
+    - **Voice Metrics**: Pause detection, silence ratio
+    Returns JSON that Salesforce can parse via Apex callout.
+    """,
+    examples=[],
+    allow_flagging="never"
+)
+# Launch with API enabled
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+torchaudio
+pyannote.audio
+gradio>=4.0.0
+pydub
+scipy