# KevinMerchant13's picture
# Phase 7: initial deploy (cpu-basic)
# 35c0d38 verified
"""Gradio entry point.
Single app that runs both locally (`python app.py`) and on Hugging Face Spaces.
Builds a gr.ChatInterface with a radio toggle between the OSS (Qwen) and frontier
(Claude) assistants, wired through the full Phase 3 pipeline:
input guardrail -> memory-backed assistant (+ tools) -> output guardrail
A small status footer under each reply shows which assistant answered, which
tools fired, and whether either guardrail triggered.
"""
from __future__ import annotations
import uuid
import gradio as gr
from langchain_core.messages import AIMessage
from src.guardrails import (
INPUT_REFUSAL,
OUTPUT_REFUSAL,
check_input,
moderate_output,
)
from src.memory import build_conversational, get_session_history
from src.observability import flush, observe, trace_attributes
# Labels shown in the radio toggle. The selected label is also what `respond`
# receives as `assistant_choice`, so these strings double as the cache keys in
# `_conversationals` and as the trace tag / `assistant_type` metadata value.
FRONTIER = "Claude (frontier)"
OSS = "Qwen2.5-1.5B (open-source)"
# One session id per app launch -> a fresh conversation each run. (For a
# multi-user deployment we'd mint a per-browser-session id instead; fine for
# this single-user demo / eval harness.)
# NOTE: because this is a module-level constant, every chat turn handled by
# this process uses the same memory session and the same trace session id.
SESSION_ID = uuid.uuid4().hex
# Memory-wrapped assistants, built on first use and cached, keyed by the radio
# label. Building Qwen here triggers its ~3 GB load, so we only do it when that
# assistant is first picked.
_conversationals: dict = {}
def _get_conversational(choice: str):
    """Return the memory-wrapped assistant for `choice`, building it on first use.

    The result is cached in the module-level `_conversationals` dict under the
    radio label, so each assistant is constructed at most once per process.
    The assistant classes are imported lazily inside this function so that
    neither backend is loaded until it is actually requested.

    Any label other than `FRONTIER` is treated as the open-source assistant.
    """
    # Fast path: already built for this label.
    try:
        return _conversationals[choice]
    except KeyError:
        pass

    # Slow path: import only the backend we need, then wrap it with memory.
    if choice == FRONTIER:
        from src.assistants.frontier import ClaudeAssistant as assistant_cls
    else:
        from src.assistants.oss import QwenAssistant as assistant_cls

    wrapped = build_conversational(assistant_cls())
    _conversationals[choice] = wrapped
    return wrapped
def _footer(assistant: str, tools_used: list[str], in_blocked: bool, out_blocked: bool) -> str:
"""Build the small status line shown under each reply."""
tools = ", ".join(dict.fromkeys(tools_used)) if tools_used else "none"
input_status = "BLOCKED" if in_blocked else "ok"
output_status = "BLOCKED" if out_blocked else "ok"
return (
f"\n\n---\n"
f"*assistant: {assistant} | tools: {tools} | "
f"guardrails -- input: {input_status}, output: {output_status}*"
)
@observe(name="chat_turn")
def respond(message: str, history: list[dict], assistant_choice: str) -> str:
    """ChatInterface callback running the full guardrail + memory + tools pipeline.

    Note: conversation context comes from persistent memory (SQLite via
    RunnableWithMessageHistory), not from Gradio's `history` arg, so we ignore it.

    The whole turn is one Langfuse trace, tagged with the session id and which
    assistant answered; the model/tool/moderation spans nest underneath.

    Args:
        message: The user's latest message.
        history: Gradio's chat history for this conversation. Unused.
        assistant_choice: Radio label selecting which assistant answers.

    Returns:
        The reply text (or the relevant refusal message when a guardrail
        blocks) followed by the status footer.

    Raises:
        Whatever the guardrails, the assistant, or the history store raise;
        buffered traces are still flushed before the exception propagates.
    """
    with trace_attributes(
        session_id=SESSION_ID,
        tags=[assistant_choice],
        metadata={"assistant_type": assistant_choice},
    ):
        try:
            # --- Layer 1: input guardrail (before the model sees anything) ---
            in_check = check_input(message)
            if in_check.blocked:
                return INPUT_REFUSAL + _footer(assistant_choice, [], True, False)

            # --- Generate with memory + tools ---
            conv = _get_conversational(assistant_choice)
            result: AIMessage = conv.invoke(
                {"input": message},
                config={"configurable": {"session_id": SESSION_ID}},
            )
            text = result.content
            tools_used = result.additional_kwargs.get("tools_used", [])

            # --- Layer 2: output moderation ---
            out_check = moderate_output(text)
            if out_check.blocked:
                # Replace what the user sees AND what gets remembered, so the
                # unsafe text doesn't leak into future context. The last stored
                # message is swapped for the refusal. (Rare path; rewrite is cheap.)
                history_store = get_session_history(SESSION_ID)
                msgs = history_store.messages
                history_store.clear()
                history_store.add_messages(msgs[:-1] + [AIMessage(content=OUTPUT_REFUSAL)])
                text = OUTPUT_REFUSAL

            return text + _footer(assistant_choice, tools_used, False, out_check.blocked)
        finally:
            # Send buffered traces to Langfuse at the end of each turn. Doing it
            # in `finally` covers the early refusal return and, importantly,
            # turns that fail with an exception -- otherwise the trace for the
            # failing turn would stay buffered.
            flush()
def build_demo() -> gr.ChatInterface:
    """Assemble the chat UI: a ChatInterface plus an assistant-selection radio.

    The radio is passed as an additional input, so its current value arrives
    as the third argument (`assistant_choice`) of `respond` on every turn.
    The frontier assistant is selected by default.
    """
    blurb = (
        "Compare an open-source assistant (Qwen2.5-1.5B) against a frontier "
        "assistant (Claude Sonnet 4.5). Both have short-term memory, a "
        "calculator + web-search tool, and input/output guardrails. Pick one "
        "below and chat — the status line under each reply shows what fired."
    )

    model_toggle = gr.Radio(
        label="Assistant",
        info="Switch between the frontier (Claude) and open-source (Qwen) models.",
        choices=[FRONTIER, OSS],
        value=FRONTIER,
    )

    # Gradio 6 uses the "messages" history format by default (no `type=` arg).
    chat_ui = gr.ChatInterface(
        fn=respond,
        title="OSS vs. Frontier Assistant",
        description=blurb,
        additional_inputs=[model_toggle],
    )
    return chat_ui
# Built at import time so the module always exposes a ready-made `demo` object,
# whether it is imported by a host or executed directly.
demo = build_demo()

if __name__ == "__main__":
    # Local run: `python app.py`.
    demo.launch()