Spaces:

psychxD
/

voice_analysis_api

Runtime error

File size: 9,396 Bytes

"""
Voice Analysis API for Salesforce
==================================
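Endpoints:
  /analyze - Full analysis (diarization + overlap + voice metrics)
             NOTE: the gr.Interface in this file does not set `api_name`, so
             Gradio exposes the function under its default API name
             ("predict"), not "/analyze" - confirm what the caller uses.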
  
Returns JSON that Salesforce can parse.

Models used:
  - pyannote/speaker-diarization-3.1 (who spoke when)
  - pyannote/overlapped-speech-detection (coaching detection)
"""

import gradio as gr
import os
import json
import torch
from pyannote.audio import Pipeline
import numpy as np

# ============================================================
# CONFIGURATION
# ============================================================

# Hugging Face access token, read once from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# A missing or empty token is only reported; startup continues regardless.
if HF_TOKEN is None or HF_TOKEN == "":
    print("WARNING: HF_TOKEN not set. Gated models will fail.")

# ============================================================
# LOAD MODELS (runs once at startup)
# ============================================================

def _load_pipeline(model_id, loaded_message, failure_prefix):
    """
    Load one pyannote pipeline without ever raising.

    Prints `loaded_message` on success, or `failure_prefix` followed by the
    error on failure. Returns the pipeline, or None when it could not be
    loaded, so the analysis functions can report "model not loaded".
    """
    try:
        pipeline = Pipeline.from_pretrained(model_id, use_auth_token=HF_TOKEN)
        # pyannote.audio 3.x is expected to return None (instead of raising)
        # when a gated checkpoint cannot be downloaded; without this check
        # the success message would be printed for a pipeline that is None.
        if pipeline is None:
            raise RuntimeError(
                f"'{model_id}' returned no pipeline (check HF_TOKEN and that "
                "the model's access conditions were accepted)"
            )
    except Exception as e:
        print(f"{failure_prefix}: {e}")
        return None

    print(loaded_message)
    return pipeline


print("Loading diarization model...")
diarization_pipeline = _load_pipeline(
    "pyannote/speaker-diarization-3.1",
    "✅ Diarization model loaded",
    "❌ Diarization model failed",
)

print("Loading overlap detection model...")
overlap_pipeline = _load_pipeline(
    "pyannote/overlapped-speech-detection",
    "✅ Overlap detection model loaded",
    "❌ Overlap detection failed",
)


# ============================================================
# ANALYSIS FUNCTIONS
# ============================================================

def analyze_diarization(audio_path):
    """
    Identifies different speakers and their timestamps.

    Runs the module-level diarization pipeline on `audio_path` and returns a
    dict with:
      - "segments": one entry per speaker turn (speaker, start, end,
        duration; times rounded to 2 decimals)
      - "speaker_count": number of distinct speaker labels
      - "agent_speaker": speaker of the first segment (None if no segments)
      - "borrower_speaker": the next distinct speaker to appear after the
        agent (None if there is no second speaker)
      - "total_segments": number of segments

    On failure returns {"error": <message>} instead of raising.
    """
    if diarization_pipeline is None:
        return {"error": "Diarization model not loaded"}

    try:
        diarization = diarization_pipeline(audio_path)

        segments = [
            {
                "speaker": speaker,
                "start": round(turn.start, 2),
                "end": round(turn.end, 2),
                "duration": round(turn.end - turn.start, 2),
            }
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

        # Distinct speakers in order of first appearance. A set would give an
        # arbitrary order, making the borrower pick non-deterministic when
        # more than two speakers are detected.
        speakers = list(dict.fromkeys(s["speaker"] for s in segments))

        # Identify borrower (assumes agent speaks first; this relies on the
        # segments being yielded in chronological order).
        agent_speaker = speakers[0] if speakers else None
        borrower_speaker = speakers[1] if len(speakers) > 1 else None

        return {
            "segments": segments,
            "speaker_count": len(speakers),
            "agent_speaker": agent_speaker,
            "borrower_speaker": borrower_speaker,
            "total_segments": len(segments),
        }

    except Exception as e:
        return {"error": str(e)}


def analyze_overlap(audio_path):
    """
    Finds stretches where more than one person is talking at once.

    Runs the module-level overlap pipeline on `audio_path` and returns a dict
    with the overlapping regions (start, end, duration; rounded to 2
    decimals), how many there are, and their summed duration. Feeds the
    coaching detection. On failure returns {"error": <message>}.
    """
    if overlap_pipeline is None:
        return {"error": "Overlap detection model not loaded"}

    try:
        annotation = overlap_pipeline(audio_path)

        regions = [
            {
                "start": round(span.start, 2),
                "end": round(span.end, 2),
                "duration": round(span.end - span.start, 2),
            }
            for span, _track, _label in annotation.itertracks(yield_label=True)
        ]

        # Sum the already-rounded per-region durations, then round the total.
        overlap_seconds = sum(region["duration"] for region in regions)

        summary = {}
        summary["overlap_segments"] = regions
        summary["overlap_count"] = len(regions)
        summary["total_overlap_duration"] = round(overlap_seconds, 2)
        return summary

    except Exception as e:
        return {"error": str(e)}


def detect_coaching(diarization_result, overlap_result):
    """
    Cross-references overlap with borrower segments.
    Overlap during borrower's speech = potential coaching.

    Args:
        diarization_result: dict as returned by analyze_diarization
            ("segments", "borrower_speaker", or "error").
        overlap_result: dict as returned by analyze_overlap
            ("overlap_segments", or "error").

    Returns:
        A dict that always contains "coaching_detected". On success it also
        has "coaching_instances", "coaching_flags" (one entry per
        overlap/borrower-segment pair) and "borrower_segments_analyzed".
        If either input carries an "error" key, or no borrower could be
        identified, a short dict with "error" / "reason" is returned instead.
    """
    if "error" in diarization_result or "error" in overlap_result:
        return {
            "coaching_detected": False,
            "error": "Could not analyze - model error"
        }

    borrower_speaker = diarization_result.get("borrower_speaker")

    if not borrower_speaker:
        return {
            "coaching_detected": False,
            "reason": "Could not identify borrower"
        }

    borrower_segments = [
        s for s in diarization_result.get("segments", [])
        if s["speaker"] == borrower_speaker
    ]
    overlap_segments = overlap_result.get("overlap_segments", [])

    coaching_flags = []
    for overlap in overlap_segments:
        for borrower_seg in borrower_segments:
            # Overlap begins while the borrower is speaking.
            starts_inside = (
                borrower_seg["start"] <= overlap["start"] <= borrower_seg["end"]
            )
            # Overlap began just before the borrower's turn and runs into it.
            # The two models place boundaries independently, so an overlap
            # may start slightly ahead of the diarized segment; checking only
            # the start point would miss it.
            spills_in = overlap["start"] < borrower_seg["start"] < overlap["end"]

            if starts_inside or spills_in:
                coaching_flags.append({
                    "overlap_time": f"{overlap['start']}-{overlap['end']}",
                    "during_borrower_segment": f"{borrower_seg['start']}-{borrower_seg['end']}",
                    "duration": overlap["duration"]
                })

    return {
        "coaching_detected": len(coaching_flags) > 0,
        "coaching_instances": len(coaching_flags),
        "coaching_flags": coaching_flags,
        "borrower_segments_analyzed": len(borrower_segments)
    }


def analyze_voice_metrics(audio_path):
    """
    Basic voice analysis - silence ratio as a hesitation indicator.

    Loads `audio_path` at 16 kHz via librosa and treats every sample whose
    absolute amplitude is below 10% of the mean absolute amplitude as
    silence.

    Returns a dict with "duration_seconds" (rounded to 2 decimals),
    "silence_ratio" (rounded to 3 decimals) and "has_long_pauses" (True when
    more than 30% of samples are silent). All values are built-in Python
    types so the dict can be passed to json.dumps. On failure the same keys
    are returned with zero/False values plus an "error" message.
    """
    try:
        import librosa

        # Load audio
        y, sr = librosa.load(audio_path, sr=16000)

        # Zero samples would make the mean and the ratio below 0/0 -> NaN.
        if len(y) == 0:
            return {
                "error": "Audio contains no samples",
                "duration_seconds": 0,
                "silence_ratio": 0,
                "has_long_pauses": False
            }

        duration = float(len(y) / sr)

        # Simple energy-based silence detection
        energy = np.abs(y)
        threshold = np.mean(energy) * 0.1
        silence_samples = np.sum(energy < threshold)
        # float()/bool() matter: NumPy comparisons yield numpy.bool_, which
        # json.dumps refuses to serialize.
        silence_ratio = float(silence_samples / len(y))

        return {
            "duration_seconds": round(duration, 2),
            "silence_ratio": round(silence_ratio, 3),
            "has_long_pauses": bool(silence_ratio > 0.3)
        }

    except Exception as e:
        return {"error": str(e), "duration_seconds": 0, "silence_ratio": 0, "has_long_pauses": False}


# ============================================================
# MAIN ANALYSIS FUNCTION
# ============================================================

def _json_default(value):
    """Convert NumPy scalars/arrays to built-in types for json.dumps."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def full_analysis(audio_file):
    """
    Complete audio analysis - called by Gradio/API.

    Runs diarization, overlap detection, coaching detection and voice
    metrics on `audio_file` (a file path) and returns a JSON string with:
      - "status": "success", or "error" if an unexpected exception escaped
        (per-analysis failures are reported as "error" keys inside
        "analysis" and do not change the status)
      - "analysis": the raw result dict of each step
      - "summary": the headline values pulled from those results

    When `audio_file` is None a JSON string with only an "error" key is
    returned.
    """
    if audio_file is None:
        return json.dumps({"error": "No audio file provided"}, indent=2)

    results = {
        "status": "success",
        "analysis": {}
    }

    try:
        # Run all analyses
        print(f"Analyzing: {audio_file}")

        # 1. Diarization
        print("Running diarization...")
        diarization_result = analyze_diarization(audio_file)
        results["analysis"]["diarization"] = diarization_result

        # 2. Overlap detection
        print("Running overlap detection...")
        overlap_result = analyze_overlap(audio_file)
        results["analysis"]["overlap"] = overlap_result

        # 3. Coaching detection (cross-reference)
        print("Analyzing coaching...")
        coaching_result = detect_coaching(diarization_result, overlap_result)
        results["analysis"]["coaching"] = coaching_result

        # 4. Voice metrics
        print("Analyzing voice metrics...")
        voice_result = analyze_voice_metrics(audio_file)
        results["analysis"]["voice_metrics"] = voice_result

        # 5. Summary
        results["summary"] = {
            "speaker_count": diarization_result.get("speaker_count", 0),
            "coaching_detected": coaching_result.get("coaching_detected", False),
            "coaching_instances": coaching_result.get("coaching_instances", 0),
            "has_long_pauses": voice_result.get("has_long_pauses", False),
            "total_overlap_duration": overlap_result.get("total_overlap_duration", 0)
        }

        print("Analysis complete!")

    except Exception as e:
        results["status"] = "error"
        results["error"] = str(e)

    # Serialization happens after the try block above, so guard it as well:
    # NumPy scalars (e.g. numpy.bool_ from a comparison) are converted by
    # _json_default, and anything still unserializable becomes an error
    # payload instead of an unhandled exception.
    try:
        return json.dumps(results, indent=2, default=_json_default)
    except (TypeError, ValueError) as e:
        return json.dumps(
            {"status": "error", "error": f"Could not serialize results: {e}"},
            indent=2,
        )


# ============================================================
# GRADIO INTERFACE
# ============================================================

# Markdown text shown beneath the title of the web UI.
_DESCRIPTION = """
    Upload a call recording to analyze:
    - **Speaker Diarization**: Who spoke when
    - **Coaching Detection**: Overlapping speech during borrower's responses  
    - **Voice Metrics**: Pause detection, silence ratio
    
    Returns JSON that Salesforce can parse via Apex callout.
    """

# The upload widget hands full_analysis a path on disk; the result is shown
# in a JSON viewer.
_audio_input = gr.Audio(type="filepath", label="Upload Audio (MP3, WAV, M4A)")
_json_output = gr.JSON(label="Analysis Results")

demo = gr.Interface(
    fn=full_analysis,
    inputs=_audio_input,
    outputs=_json_output,
    title="🎙️ Voice Analysis API for Salesforce",
    description=_DESCRIPTION,
    examples=[],
    allow_flagging="never",
)

# Start the Gradio server with default launch settings.
demo.launch()