TeleAgent / app.py
S-K-yadav's picture
..
844d508
Raw
History Blame Contribute Delete
15.8 kB
"""
app.py β€” Telecalling Agent β€” Gradio 6 UI
Layout
──────
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ πŸ“ž AI Telecalling Agent [status badge] β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ 🎀 LIVE CALL β”‚ πŸ“‹ EXTRACTED DATA β”‚
β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ [intent markdown table] β”‚
β”‚ β”‚ Audio stream β”‚ β”‚ β”‚
β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ [Start] [End] β”‚ πŸ€– AGENT RESPONSE β”‚
β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ [spoken response box] β”‚
β”‚ β”‚ Transcript β”‚ β”‚ β”‚
β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ βœ… BOOKING CONFIRMED β”‚
β”‚ β”‚ [booking details box] β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ πŸ“ CALL LOG β”‚
β”‚ [dataframe β€” recent calls] β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
"""
import logging
import os
import json
import gradio as gr
import numpy as np
from pipeline.transcriber import get_transcriber
from pipeline.intent_parser import get_intent_parser
from pipeline.evaluater import get_evaluator
from config import APP_TITLE, APP_DESCRIPTION, SERVER_PORT, SERVER_NAME
from pipeline.orchestrator import CallSession, PipelineUpdate
from db import init_db
# Load HuggingFace config and set token early.
#
# This is strictly best-effort: a missing, malformed, or oddly shaped
# hf_config.json must never prevent the app from starting.  In all of those
# cases HF_TOKEN is simply left as it already is in the environment.
try:
    with open("hf_config.json", "r", encoding="utf-8") as f:
        hf_cfg = json.load(f)
except FileNotFoundError:
    pass  # no config file: rely on HF_TOKEN from the environment, if set
except ValueError as e:
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError
    # subclasses.  Use a named logger rather than logging.warning(): the
    # module-level helper would implicitly call basicConfig() and turn the
    # explicit logging configuration further down into a no-op.
    logging.getLogger(__name__).warning(
        "Ignoring unreadable hf_config.json: %s", e
    )
else:
    # Walk huggingface -> hub -> token, tolerating any level that is missing
    # or is not a mapping (e.g. the file holds a list, or a key maps to null).
    hf_token = hf_cfg
    for _hf_key in ("huggingface", "hub", "token"):
        hf_token = hf_token.get(_hf_key, "") if isinstance(hf_token, dict) else ""
    # "${HF_TOKEN}" is an unexpanded template placeholder, not a real token.
    # os.environ only accepts strings, so non-string values are skipped too.
    if isinstance(hf_token, str) and hf_token and hf_token != "${HF_TOKEN}":
        os.environ["HF_TOKEN"] = hf_token
# Configure process-wide logging once, at import time.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Cap chatty third-party loggers at WARNING so INFO output stays readable.
for _noisy_logger in ("httpx", "huggingface_hub", "transformers.modeling_utils"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Initialize database on startup (runs as soon as this module is imported).
init_db()
# ── CSS ───────────────────────────────────────────────────────────────────────
CSS = """
/* Global */
.gradio-container { font-family: 'Inter', sans-serif; max-width: 1200px; }
/* Status badge */
#status-badge textarea {
font-size: 0.9rem;
font-weight: 600;
text-align: center;
border-radius: 20px;
padding: 4px 12px;
background: #f0fdf4;
border: 1px solid #86efac;
color: #166534;
}
/* Agent response */
#agent-box textarea {
font-size: 1.05rem;
font-style: italic;
background: #eff6ff;
border: 1px solid #93c5fd;
border-radius: 8px;
color: #1e3a5f;
min-height: 80px;
}
/* Booking confirmed */
#booking-box textarea {
background: #f0fdf4;
border: 1px solid #4ade80;
border-radius: 8px;
color: #14532d;
font-weight: 500;
}
/* Transcript */
#transcript-box textarea {
font-family: monospace;
font-size: 0.85rem;
background: #1e1e2e;
color: #cdd6f4;
border-radius: 8px;
min-height: 180px;
}
/* VAD indicator dot */
#vad-dot {
text-align: center;
font-size: 1.2rem;
}
/* Call buttons */
.call-btn-start { background: #16a34a !important; color: white !important; }
.call-btn-end { background: #dc2626 !important; color: white !important; }
.call-btn-reset { background: #6b7280 !important; color: white !important; }
/* Intent table inside Markdown */
#intent-panel table { width: 100%; border-collapse: collapse; font-size: 0.88rem; }
#intent-panel th, #intent-panel td {
padding: 5px 10px;
border: 1px solid #e2e8f0;
text-align: left;
}
#intent-panel tr:nth-child(even) { background: #f8fafc; }
/* Call log */
#call-log { font-size: 0.82rem; }
"""
# ── UI helpers ────────────────────────────────────────────────────────────────
def _format_transcript(lines: list[str]) -> str:
if not lines:
return "(waiting for speech…)"
return "\n".join(f"[{i+1}] {l}" for i, l in enumerate(lines))
def _format_booking(info: dict | None) -> str:
if not info:
return ""
return (
f"βœ… Booking #{info['booking_id']} confirmed!\n"
f" πŸ“… {info['date']} πŸ• {info['time']} "
f"({info['duration']} min)\n"
f" πŸ‘€ {info['caller']} πŸ“ž {info['type'].replace('_', ' ').title()}"
)
def _call_log_rows(records: list[dict]) -> list[list]:
rows = []
for r in records:
ts = r.get("timestamp", "")[:16].replace("T", " ")
rows.append([
r.get("id", ""),
ts,
r.get("caller_name") or "β€”",
r.get("intent") or "β€”",
r.get("decision") or "β€”",
r.get("status") or "β€”",
])
return rows
# ── Gradio App ────────────────────────────────────────────────────────────────
def build_app() -> gr.Blocks:
    """Assemble the telecalling UI and wire up its event handlers.

    Layout: a header with a status badge, a two-column main row (live-call
    controls and transcript on the left; extracted data, agent response and
    booking status on the right) and a call-log table underneath.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # NOTE(review): the module docstring targets Gradio 6, which reportedly
    # moves `css` / `theme` from Blocks() to launch() -- confirm these kwargs
    # still take effect on the installed Gradio version.
    # NOTE(review): several UI strings below contain what looks like
    # mis-encoded emoji / punctuation; they are kept as-is here -- confirm
    # against the original file encoding.
    with gr.Blocks(css=CSS, title=APP_TITLE, theme=gr.themes.Soft()) as demo:
        # ── Per-session state ──────────────────────────────────────────────
        # gr.State holds one CallSession object per browser tab.  It starts
        # as None and is created lazily by _get_or_create_session().
        session_state = gr.State(value=None)

        # ── Header ─────────────────────────────────────────────────────────
        gr.Markdown(f"# {APP_TITLE}\n_{APP_DESCRIPTION}_")
        status_badge = gr.Textbox(
            value="🟒 Ready β€” press Start Call",
            label="",
            interactive=False,
            elem_id="status-badge",
        )

        # ── Main row ───────────────────────────────────────────────────────
        with gr.Row():
            # ── Left column: call controls + transcript ────────────────────
            with gr.Column(scale=1):
                gr.Markdown("### 🎀 Live Call")
                audio_input = gr.Audio(
                    sources=["microphone"],
                    streaming=True,
                    type="numpy",
                    label="Microphone input",
                    interactive=True,
                    elem_id="audio-input",
                )
                gr.Markdown(
                    "_Tip: click the microphone widget to grant browser permission, then speak. "
                    "The call will start automatically on the first live audio input, or you can press πŸ“ž Start Call._"
                )
                vad_dot = gr.Markdown("⚫ _mic idle_", elem_id="vad-dot")
                with gr.Row():
                    btn_start = gr.Button(
                        "πŸ“ž Start Call", variant="primary",
                        elem_classes=["call-btn-start"],
                    )
                    btn_end = gr.Button(
                        "πŸ“΅ End Call", variant="stop",
                        elem_classes=["call-btn-end"],
                    )
                    btn_reset = gr.Button(
                        "πŸ”„ Reset", variant="secondary",
                        elem_classes=["call-btn-reset"],
                    )
                transcript_box = gr.Textbox(
                    label="πŸ“ Live Transcript",
                    value="(waiting for speech…)",
                    lines=8,
                    max_lines=20,
                    interactive=False,
                    elem_id="transcript-box",
                )

            # ── Right column: intent + agent response + booking ────────────
            with gr.Column(scale=1):
                gr.Markdown("### πŸ“‹ Extracted Data")
                intent_panel = gr.Markdown(
                    "_No data yet β€” waiting for first utterance…_",
                    elem_id="intent-panel",
                )
                gr.Markdown("### πŸ€– Agent Response")
                agent_box = gr.Textbox(
                    value="",
                    label="",
                    lines=3,
                    interactive=False,
                    elem_id="agent-box",
                    placeholder="Agent will respond here…",
                )
                # Hidden until a booking is confirmed (see _unpack below).
                booking_box = gr.Textbox(
                    value="",
                    label="πŸ“… Booking Status",
                    lines=3,
                    interactive=False,
                    elem_id="booking-box",
                    visible=False,
                )

        # ── Call log ───────────────────────────────────────────────────────
        gr.Markdown("### πŸ“ Call Log")
        call_log_table = gr.Dataframe(
            headers=["ID", "Timestamp", "Caller", "Intent", "Decision", "Status"],
            datatype=["number", "str", "str", "str", "str", "str"],
            value=[],
            interactive=False,
            elem_id="call-log",
            row_count=(5, "dynamic"),
        )

        # ── Helper: unpack PipelineUpdate -> tuple of component values ─────
        def _unpack(u: PipelineUpdate):
            """Return values in the exact order of ALL_OUTPUTS below."""
            vad_label = "πŸ”΄ _Speaking…_" if u.vad_speaking else "⚫ _mic idle_"
            booking_text = _format_booking(u.booking_confirmed)
            return (
                u.status,                                # status_badge
                vad_label,                               # vad_dot
                _format_transcript(u.transcript_lines),  # transcript_box
                u.intent_md,                             # intent_panel
                u.agent_response,                        # agent_box
                # One update carries both the text and the visibility, so
                # booking_box only has to appear once in the outputs list.
                gr.update(value=booking_text, visible=bool(booking_text)),
                _call_log_rows(u.call_log),              # call_log_table
            )

        # ── All output components in one list (matches _unpack order) ─────
        ALL_OUTPUTS = [
            status_badge,
            vad_dot,
            transcript_box,
            intent_panel,
            agent_box,
            booking_box,
            call_log_table,
        ]
        # Every handler returns the (possibly new) session first.
        HANDLER_OUTPUTS = [session_state] + ALL_OUTPUTS

        # ── Session factory ────────────────────────────────────────────────
        def _get_or_create_session(state):
            """Return ``state``, creating a fresh CallSession if it is None."""
            if state is None:
                state = CallSession()
            return state

        # ── Button callbacks ───────────────────────────────────────────────
        def on_start(state):
            """Start a call on this tab's session and refresh every output."""
            state = _get_or_create_session(state)
            update = state.start_call()
            return (state, *_unpack(update))

        def on_end(state):
            """End the current call and refresh every output."""
            state = _get_or_create_session(state)
            update = state.end_call()
            return (state, *_unpack(update))

        def on_reset(state):
            """Reset the session and refresh every output."""
            state = _get_or_create_session(state)
            update = state.reset()
            return (state, *_unpack(update))

        btn_start.click(on_start, inputs=[session_state], outputs=HANDLER_OUTPUTS)
        btn_end.click(on_end, inputs=[session_state], outputs=HANDLER_OUTPUTS)
        btn_reset.click(on_reset, inputs=[session_state], outputs=HANDLER_OUTPUTS)

        # ── Audio streaming callback ───────────────────────────────────────
        # Fires every `stream_every` seconds with (sample_rate, np.ndarray).
        # We pass the current session state in and get it back (updated).
        def on_audio_stream(audio_chunk, state):
            """
            Called by Gradio every 0.5 s while the mic is active.
            audio_chunk: (sample_rate: int, data: np.ndarray) | None
            """
            state = _get_or_create_session(state)

            # The first real chunk starts the call implicitly, so the user
            # does not have to press Start Call before speaking.
            if audio_chunk is not None and not state.call_active:
                logger.info("Auto-starting call session on first live audio input.")
                state.start_call()

            # Nothing to process (no chunk, or the call is still inactive):
            # just echo the session's current state back to the UI.
            if audio_chunk is None or not state.call_active:
                return (state, *_unpack(state._build_update()))

            sample_rate, audio_np = audio_chunk
            # Ensure float32 mono.
            # NOTE(review): this is a plain dtype cast with no rescaling --
            # confirm process_audio_chunk expects the raw sample magnitudes.
            audio_np = np.array(audio_np, dtype=np.float32)
            if audio_np.ndim == 2:
                # Average across axis 1 to collapse a 2-D chunk to 1-D.
                audio_np = audio_np.mean(axis=1)
            update = state.process_audio_chunk(sample_rate, audio_np)
            return (state, *_unpack(update))

        audio_input.stream(
            fn=on_audio_stream,
            inputs=[audio_input, session_state],
            outputs=HANDLER_OUTPUTS,
            stream_every=0.5,   # seconds: half-second chunks
            time_limit=3600,    # allow up to 1-hour calls
        )
    return demo
# ── Entry point ───────────────────────────────────────────────────────────────
def _prefetch_asr_model() -> None:
    """Call the transcriber's prefetch() once at startup.

    Any failure is logged and otherwise ignored, so the app still starts and
    falls back to lazy loading.
    """
    try:
        get_transcriber().prefetch()
    except Exception as prefetch_error:
        logger.error(
            "ASR prefetch failed at startup; continuing with lazy loading: %s",
            prefetch_error,
        )


if __name__ == "__main__":
    logger.info("Starting Gradio app; prefetching deployed ASR model if needed...")
    _prefetch_asr_model()

    app = build_app()
    launch_options = {
        "server_name": SERVER_NAME,
        "server_port": SERVER_PORT,
        "show_error": True,
    }
    app.launch(**launch_options)