Spaces:
Running
Running
File size: 15,780 Bytes
9a02b57 eb67b74 9a02b57 83c1aed 9a02b57 844d508 9a02b57 844d508 9a02b57 844d508 9a02b57 3e394c4 9a02b57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 | """
app.py — Telecalling Agent — Gradio 6 UI

Layout
──────
┌─────────────────────────────────────────────────────┐
│  📞 AI Telecalling Agent           [status badge]   │
├──────────────────────┬──────────────────────────────┤
│  🎤 LIVE CALL        │  📊 EXTRACTED DATA           │
│  ┌────────────────┐  │  [intent markdown table]     │
│  │  Audio stream  │  │                              │
│  └────────────────┘  ├──────────────────────────────┤
│  [Start]  [End]      │  🤖 AGENT RESPONSE           │
│  ┌────────────────┐  │  [spoken response box]       │
│  │  Transcript    │  │                              │
│  └────────────────┘  │  ✅ BOOKING CONFIRMED        │
│                      │  [booking details box]       │
├──────────────────────┴──────────────────────────────┤
│  📋 CALL LOG                                        │
│  [dataframe — recent calls]                         │
└─────────────────────────────────────────────────────┘
"""
import logging
import os
import json
import gradio as gr
import numpy as np
from pipeline.transcriber import get_transcriber
from pipeline.intent_parser import get_intent_parser
from pipeline.evaluater import get_evaluator
from config import APP_TITLE, APP_DESCRIPTION, SERVER_PORT, SERVER_NAME
from pipeline.orchestrator import CallSession, PipelineUpdate
from db import init_db
# Load the HuggingFace token from hf_config.json (when present) and export it
# as HF_TOKEN. Expected shape: {"huggingface": {"hub": {"token": "..."}}}.
try:
    with open("hf_config.json", "r", encoding="utf-8") as f:
        hf_cfg = json.load(f)
    hf_token = hf_cfg.get("huggingface", {}).get("hub", {}).get("token", "")
except (FileNotFoundError, json.JSONDecodeError, AttributeError):
    # Missing file, invalid JSON, or JSON that is not nested objects: this is
    # best-effort, so fall back to whatever HF_TOKEN the environment provides.
    hf_token = ""

# Skip the unexpanded "${HF_TOKEN}" template placeholder, and anything that is
# not a string (os.environ only accepts str values).
if isinstance(hf_token, str) and hf_token and hf_token != "${HF_TOKEN}":
    os.environ["HF_TOKEN"] = hf_token
# Root logging: INFO and above, with timestamp, level and logger name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

# Raise the threshold of chatty third-party loggers to WARNING.
for _noisy_logger in ("httpx", "huggingface_hub", "transformers.modeling_utils"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Initialize database on startup (runs at import time).
init_db()
# ── CSS ───────────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=CSS) in build_app(). The `#id`
# selectors match the elem_id values given to components there (status-badge,
# agent-box, booking-box, transcript-box, vad-dot, intent-panel, call-log) and
# the `.call-btn-*` selectors match the elem_classes on the three call buttons.
CSS = """
/* Global */
.gradio-container { font-family: 'Inter', sans-serif; max-width: 1200px; }
/* Status badge */
#status-badge textarea {
font-size: 0.9rem;
font-weight: 600;
text-align: center;
border-radius: 20px;
padding: 4px 12px;
background: #f0fdf4;
border: 1px solid #86efac;
color: #166534;
}
/* Agent response */
#agent-box textarea {
font-size: 1.05rem;
font-style: italic;
background: #eff6ff;
border: 1px solid #93c5fd;
border-radius: 8px;
color: #1e3a5f;
min-height: 80px;
}
/* Booking confirmed */
#booking-box textarea {
background: #f0fdf4;
border: 1px solid #4ade80;
border-radius: 8px;
color: #14532d;
font-weight: 500;
}
/* Transcript */
#transcript-box textarea {
font-family: monospace;
font-size: 0.85rem;
background: #1e1e2e;
color: #cdd6f4;
border-radius: 8px;
min-height: 180px;
}
/* VAD indicator dot */
#vad-dot {
text-align: center;
font-size: 1.2rem;
}
/* Call buttons */
.call-btn-start { background: #16a34a !important; color: white !important; }
.call-btn-end { background: #dc2626 !important; color: white !important; }
.call-btn-reset { background: #6b7280 !important; color: white !important; }
/* Intent table inside Markdown */
#intent-panel table { width: 100%; border-collapse: collapse; font-size: 0.88rem; }
#intent-panel th, #intent-panel td {
padding: 5px 10px;
border: 1px solid #e2e8f0;
text-align: left;
}
#intent-panel tr:nth-child(even) { background: #f8fafc; }
/* Call log */
#call-log { font-size: 0.82rem; }
"""
# ── UI helpers ────────────────────────────────────────────────────────────────
def _format_transcript(lines: list[str]) -> str:
if not lines:
return "(waiting for speechβ¦)"
return "\n".join(f"[{i+1}] {l}" for i, l in enumerate(lines))
def _format_booking(info: dict | None) -> str:
if not info:
return ""
return (
f"β
Booking #{info['booking_id']} confirmed!\n"
f" π
{info['date']} π {info['time']} "
f"({info['duration']} min)\n"
f" π€ {info['caller']} π {info['type'].replace('_', ' ').title()}"
)
def _call_log_rows(records: list[dict]) -> list[list]:
rows = []
for r in records:
ts = r.get("timestamp", "")[:16].replace("T", " ")
rows.append([
r.get("id", ""),
ts,
r.get("caller_name") or "β",
r.get("intent") or "β",
r.get("decision") or "β",
r.get("status") or "β",
])
return rows
# ── Gradio App ────────────────────────────────────────────────────────────────
def build_app() -> gr.Blocks:
    """Assemble the telecalling UI and wire its event handlers.

    The layout is a header with a status badge, a two-column main row (live
    call controls and transcript on the left; extracted data, agent response
    and booking status on the right) and a call-log table underneath. A
    ``CallSession`` is created lazily on first use and kept in a ``gr.State``.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    # NOTE(review): emoji, dashes and ellipses in the UI strings below were
    # restored from mis-encoded text. Where only the lead byte survived, the
    # glyph was chosen from context — confirm against the intended design.
    # NOTE(review): the Gradio 6 migration notes describe `css` / `theme` as
    # launch() parameters rather than Blocks() ones — confirm for the
    # installed version before moving them.
    with gr.Blocks(css=CSS, title=APP_TITLE, theme=gr.themes.Soft()) as demo:

        # ── Per-session state ──────────────────────────────────────────────
        # Starts as None; handlers replace it with a CallSession on first use.
        session_state = gr.State(value=None)

        # ── Header ─────────────────────────────────────────────────────────
        gr.Markdown(f"# {APP_TITLE}\n_{APP_DESCRIPTION}_")
        status_badge = gr.Textbox(
            value="🟢 Ready — press Start Call",
            label="",
            interactive=False,
            elem_id="status-badge",
        )

        # ── Main row ───────────────────────────────────────────────────────
        with gr.Row():

            # ── Left column: call controls + transcript ────────────────────
            with gr.Column(scale=1):
                gr.Markdown("### 🎤 Live Call")
                audio_input = gr.Audio(
                    sources=["microphone"],
                    streaming=True,
                    type="numpy",
                    label="Microphone input",
                    interactive=True,
                    elem_id="audio-input",
                )
                gr.Markdown(
                    "_Tip: click the microphone widget to grant browser permission, then speak. "
                    "The call will start automatically on the first live audio input, or you can press 📞 Start Call._"
                )
                vad_dot = gr.Markdown("⚫ _mic idle_", elem_id="vad-dot")
                with gr.Row():
                    btn_start = gr.Button(
                        "📞 Start Call",
                        variant="primary",
                        elem_classes=["call-btn-start"],
                    )
                    btn_end = gr.Button(
                        "📵 End Call",
                        variant="stop",
                        elem_classes=["call-btn-end"],
                    )
                    btn_reset = gr.Button(
                        "🔄 Reset",
                        variant="secondary",
                        elem_classes=["call-btn-reset"],
                    )
                transcript_box = gr.Textbox(
                    label="📝 Live Transcript",
                    # Same placeholder the handlers emit for an empty transcript.
                    value=_format_transcript([]),
                    lines=8,
                    max_lines=20,
                    interactive=False,
                    elem_id="transcript-box",
                )

            # ── Right column: intent + agent response + booking ────────────
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Extracted Data")
                intent_panel = gr.Markdown(
                    "_No data yet — waiting for first utterance…_",
                    elem_id="intent-panel",
                )
                gr.Markdown("### 🤖 Agent Response")
                agent_box = gr.Textbox(
                    value="",
                    label="",
                    lines=3,
                    interactive=False,
                    elem_id="agent-box",
                    placeholder="Agent will respond here…",
                )
                # Hidden until a handler reports a confirmed booking.
                booking_box = gr.Textbox(
                    value="",
                    label="📅 Booking Status",
                    lines=3,
                    interactive=False,
                    elem_id="booking-box",
                    visible=False,
                )

        # ── Call log ───────────────────────────────────────────────────────
        gr.Markdown("### 📋 Call Log")
        call_log_table = gr.Dataframe(
            headers=["ID", "Timestamp", "Caller", "Intent", "Decision", "Status"],
            datatype=["number", "str", "str", "str", "str", "str"],
            value=[],
            interactive=False,
            elem_id="call-log",
            row_count=(5, "dynamic"),
        )

        # ── Output components, one slot each (order must match _unpack) ────
        ALL_OUTPUTS = [
            status_badge,
            vad_dot,
            transcript_box,
            intent_panel,
            agent_box,
            booking_box,
            call_log_table,
        ]
        # Every handler also returns the session object as its first output.
        BTN_OUTPUTS = [session_state, *ALL_OUTPUTS]

        # ── Helper: PipelineUpdate → tuple of component values ─────────────
        def _unpack(u: PipelineUpdate) -> tuple:
            """Return component values in the exact order of ALL_OUTPUTS."""
            vad_label = "🔴 _Speaking…_" if u.vad_speaking else "⚫ _mic idle_"
            booking_text = _format_booking(u.booking_confirmed)
            return (
                u.status,                                 # status_badge
                vad_label,                                # vad_dot
                _format_transcript(u.transcript_lines),   # transcript_box
                u.intent_md,                              # intent_panel
                u.agent_response,                         # agent_box
                # One update sets both text and visibility, so booking_box
                # needs only a single entry in the outputs list.
                gr.update(value=booking_text, visible=bool(booking_text)),
                _call_log_rows(u.call_log),               # call_log_table
            )

        # ── Session factory ────────────────────────────────────────────────
        def _get_or_create_session(state):
            """Return *state*, creating a fresh CallSession when it is None."""
            return CallSession() if state is None else state

        # ── Button callbacks ───────────────────────────────────────────────
        def on_start(state):
            state = _get_or_create_session(state)
            return (state, *_unpack(state.start_call()))

        def on_end(state):
            state = _get_or_create_session(state)
            return (state, *_unpack(state.end_call()))

        def on_reset(state):
            state = _get_or_create_session(state)
            return (state, *_unpack(state.reset()))

        for button, handler in (
            (btn_start, on_start),
            (btn_end, on_end),
            (btn_reset, on_reset),
        ):
            button.click(handler, inputs=[session_state], outputs=BTN_OUTPUTS)

        # ── Audio streaming callback ───────────────────────────────────────
        def on_audio_stream(audio_chunk, state):
            """Handle one streamed microphone chunk.

            Args:
                audio_chunk: ``(sample_rate, samples)`` or ``None``.
                state: The session's CallSession, or ``None`` before first use.

            Returns:
                The session followed by the values for ALL_OUTPUTS.
            """
            state = _get_or_create_session(state)

            if audio_chunk is not None and not state.call_active:
                logger.info("Auto-starting call session on first live audio input.")
                state.start_call()

            # Nothing to process: the call is (still) inactive or there is no
            # audio. Re-emit the current state so the UI stays in sync.
            if not state.call_active or audio_chunk is None:
                return (state, *_unpack(state._build_update()))

            sample_rate, audio_np = audio_chunk
            # Convert to float32 and average channels when 2-D.
            # NOTE(review): integer samples are cast, not rescaled — confirm
            # process_audio_chunk expects that.
            audio_np = np.array(audio_np, dtype=np.float32)
            if audio_np.ndim == 2:
                audio_np = audio_np.mean(axis=1)

            update = state.process_audio_chunk(sample_rate, audio_np)
            return (state, *_unpack(update))

        audio_input.stream(
            fn=on_audio_stream,
            inputs=[audio_input, session_state],
            outputs=BTN_OUTPUTS,
            stream_every=0.5,   # seconds — half-second chunks
            time_limit=3600,    # allow up to 1-hour calls
        )

    return demo
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    logger.info("Starting Gradio app; prefetching deployed ASR model if needed...")

    # Try to prefetch the ASR model. A failure is logged and startup carries
    # on; per the log message the model is then loaded lazily.
    try:
        transcriber = get_transcriber()
        transcriber.prefetch()
    except Exception as exc:
        logger.error(
            "ASR prefetch failed at startup; continuing with lazy loading: %s",
            exc,
        )

    # Build the UI and serve it on the configured host and port.
    app = build_app()
    launch_options = {
        "server_name": SERVER_NAME,
        "server_port": SERVER_PORT,
        "show_error": True,
    }
    app.launch(**launch_options)