Spaces:

MIP-Tech
/

Speach-To-Text

Sleeping

File size: 6,992 Bytes

0db822c

"""
Gradio UI for Misr Italia Properties Speech-to-Text Pipeline
"""
import os
import sys
import json
import logging
from pathlib import Path

# Add root to python path
root = Path(__file__).parent.parent
sys.path.insert(0, str(root))

import gradio as gr
from dotenv import load_dotenv

from src.inference.transcribe import WhisperTranscriber
from src.inference.analyze_call import CallAnalyzer, clean_transcript

# Configure logging at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables from the .env file in the project root.
# NOTE(review): presumably this is where the OpenAI credentials used by
# CallAnalyzer come from — confirm against src/inference/analyze_call.
load_dotenv(root / ".env")

# Initialize models globally so they load on server startup instead of per-request
logger.info("Loading Whisper Model...")
# Checkpoint location, relative to the project root.
DEFAULT_MODEL = "outputs/checkpoints/merged_model"
model_path = str(root / DEFAULT_MODEL)
# Fall back to the "openai/whisper-large-v3" identifier when the local
# checkpoint path does not exist.
if not Path(model_path).exists():
    model_path = "openai/whisper-large-v3"

# NOTE(review): device=None presumably lets WhisperTranscriber choose the
# device itself — confirm in src/inference/transcribe.
transcriber = WhisperTranscriber(model_path=model_path, device=None)

logger.info("Initializing CallAnalyzer with OpenAI...")
# `analyzer` stays None when construction fails, so the failure is only logged
# here; process_call checks `analyzer` before running the analysis step.
analyzer = None
try:
    analyzer = CallAnalyzer()
except Exception as e:
    logger.error("Failed to init CallAnalyzer: %s", e)


def _bulleted(items):
    """Render *items* as one "- item" line per entry; missing/empty input gives ""."""
    if not items:
        return ""
    if not isinstance(items, (list, tuple)):
        # A bare string would otherwise be iterated character by character,
        # and other scalars would not be iterable at all.
        items = [items]
    return "\n".join(f"- {item}" for item in items)


def _format_units(units):
    """Render unit numbers as a comma-separated string; None gives ""."""
    if units is None:
        return ""
    if isinstance(units, (list, tuple)):
        # str() each entry so numeric unit numbers do not break the join.
        return ", ".join(str(unit) for unit in units)
    return str(units)


def process_call(audio_file, enable_analysis):
    """Transcribe an uploaded call and optionally enrich it with the analyzer.

    Args:
        audio_file: Path of the uploaded recording (the UI passes a filepath).
            A falsy value short-circuits with a "No audio uploaded." message.
        enable_analysis: When truthy and the module-level ``analyzer`` is
            available, the transcript is sent through ``analyzer.analyze``.

    Returns:
        A 13-tuple matching the outputs wired to the submit button, in order:
        raw transcript, cleaned transcript, agent name, customer name,
        unit number(s), project name, department, call type, satisfaction,
        urgency flag, pain points, action items, next steps.
        On missing audio or a transcription failure the first two entries hold
        the message and the remaining eleven are ``None``.

    Exceptions from transcription and analysis are caught, logged, and surfaced
    in the returned values instead of propagating to the caller.
    """
    if not audio_file:
        return "No audio uploaded.", "No audio uploaded.", *[None] * 11

    # Run transcription (VAD + Whisper; OpenAI handles speaker separation)
    try:
        transcript = transcriber.transcribe(audio_file)
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Transcription error: %s", e)
        err = f"Transcription error: {e}"
        return err, err, *[None] * 11

    # Analysis defaults, used as-is when analysis is disabled or fails.
    parsed = {
        "cleaned_transcript": transcript,
        "agent_name": "",
        "customer_name": "",
        "unit_number": [],
        "project_name": "",
        "department_mentioned": "",
        "call_type": "",
        "customer_satisfaction": 0,
        "is_urgent": False,
        "pain_points": [],
        "action_items_promised": [],
        "next_steps": [],
    }

    # Fallback: the cleaned transcript equals the raw one unless analysis succeeds.
    openai_transcript = transcript

    if enable_analysis and analyzer:
        try:
            parsed.update(analyzer.analyze(transcript).model_dump())
        except Exception as e:
            logger.exception("OpenAI Analysis error: %s", e)
            # The error is shown in the agent-name field so the user sees it.
            parsed["agent_name"] = f"Error: {e}"
        else:
            # Guard against an empty/None cleaned transcript from the analysis.
            openai_transcript = parsed.get("cleaned_transcript") or transcript
    elif enable_analysis:
        logger.warning(
            "Analysis requested but CallAnalyzer is unavailable; "
            "returning the raw transcript only."
        )

    return (
        transcript,
        openai_transcript,
        parsed.get("agent_name"),
        parsed.get("customer_name"),
        _format_units(parsed.get("unit_number")),
        parsed.get("project_name"),
        parsed.get("department_mentioned"),
        parsed.get("call_type"),
        parsed.get("customer_satisfaction"),
        parsed.get("is_urgent"),
        _bulleted(parsed.get("pain_points")),
        _bulleted(parsed.get("action_items_promised")),
        _bulleted(parsed.get("next_steps")),
    )


def build_ui():
    """Assemble the Gradio Blocks interface for the call analyzer.

    Returns:
        A ``(demo, theme, js)`` tuple: the Blocks app, the Monochrome theme
        object, and a JavaScript snippet. The theme and snippet are returned
        rather than applied here so the caller can hand them to ``launch``.
    """
    # JavaScript that reloads the page with `__theme=dark` in the query string
    # whenever that parameter is not already set to 'dark'.
    force_dark_js = """
    function refresh() {
        const url = new URL(window.location);
        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """

    slate_theme = gr.themes.Monochrome(primary_hue="slate", neutral_hue="slate")

    with gr.Blocks(title="Misr Italia Properties - Call Analyzer") as demo:
        gr.Markdown("# 🏢 Misr Italia Properties - Call Center AI")
        gr.Markdown(
            "Upload an audio recording of a customer call to automatically transcribe, "
            "perform intelligent speaker separation, and extract business intelligence "
            "via OpenAI GPT-4o-mini."
        )

        with gr.Row():
            # Left column: upload controls and the two transcript tabs.
            with gr.Column(scale=1):
                recording = gr.Audio(
                    type="filepath",
                    label="Upload Call Recording (WAV/MP3)",
                )
                with gr.Row():
                    analysis_toggle = gr.Checkbox(
                        label="Enable OpenAI Analysis",
                        value=True,
                    )

                run_button = gr.Button("Analyze Call", variant="primary")

                with gr.Tabs():
                    with gr.TabItem("🎙️ Raw Whisper"):
                        raw_box = gr.Textbox(
                            label="Raw Whisper Transcript",
                            lines=18,
                            placeholder="Raw output from Whisper will appear here…",
                        )
                    with gr.TabItem("✨ OpenAI Cleaned"):
                        cleaned_box = gr.Textbox(
                            label="OpenAI Cleaned Transcript (with speaker turns)",
                            lines=18,
                            placeholder="OpenAI-separated and corrected transcript will appear here…",
                        )

            # Right column: structured fields extracted from the call.
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Extracted Call Insights")
                with gr.Row():
                    urgent_flag = gr.Checkbox(label="🚨 IS URGENT")
                    satisfaction = gr.Number(label="⭐ Customer Satisfaction (1-5)")
                    call_type = gr.Textbox(label="📞 Call Type")
                with gr.Row():
                    agent_name = gr.Textbox(label="Agent Name")
                    customer_name = gr.Textbox(label="Customer Name")
                with gr.Row():
                    project_name = gr.Textbox(label="Project Name")
                    unit_numbers = gr.Textbox(label="Unit Number(s)")

                department = gr.Textbox(label="Department Mentioned")

                pain_points = gr.Textbox(label="💥 Pain Points", lines=3)
                action_items = gr.Textbox(label="✅ Action Items", lines=3)
                next_steps = gr.Textbox(label="⏭️ Next Steps", lines=3)

        # Order must match the tuple returned by process_call.
        result_fields = [
            raw_box,
            cleaned_box,
            agent_name,
            customer_name,
            unit_numbers,
            project_name,
            department,
            call_type,
            satisfaction,
            urgent_flag,
            pain_points,
            action_items,
            next_steps,
        ]
        run_button.click(
            fn=process_call,
            inputs=[recording, analysis_toggle],
            outputs=result_fields,
        )

    return demo, slate_theme, force_dark_js

if __name__ == "__main__":
    # build_ui returns the Blocks app together with the theme and JS snippet,
    # which are applied here at launch time.
    demo_app, ui_theme, startup_js = build_ui()
    demo_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        theme=ui_theme,
        js=startup_js,
    )