Spaces:

arjun10g
/

bankmind

Sleeping

App Files Files Community

arjun10g committed on Apr 30

Commit

f623ec4

verified ·

1 Parent(s): 30d4760

Deploy BankMind

Browse files

Files changed (4) hide show

app.py +6 -2
app/main.py +173 -47
app/query_pipeline.py +172 -3
pipelines/shared/llm.py +32 -0

app.py CHANGED Viewed

@@ -15,14 +15,18 @@ sys.path.insert(0, str(ROOT))
 import gradio as gr
-from app.main import build_app
 if __name__ == "__main__":
     demo = build_app()
     demo.launch(
         server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
         server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
         show_error=True,
-        theme=gr.themes.Soft(),
     )

 import gradio as gr
+from app.main import _BANKY_CSS, _BANKY_THEME, build_app
 if __name__ == "__main__":
     demo = build_app()
+    # Explicit queue so streaming events + multiple clients don't block each
+    # other (and tab switches don't freeze when a chat turn is in flight).
+    demo.queue(default_concurrency_limit=4, max_size=16)
     demo.launch(
         server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
         server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
         show_error=True,
+        theme=_BANKY_THEME,
+        css=_BANKY_CSS,
     )

app/main.py CHANGED Viewed

@@ -30,7 +30,7 @@ from app.charts import (
     retrieval_stage_2_figure,
     retrieval_stage_3_figure,
 )
-from app.query_pipeline import run_query
 COMPLIANCE_STRATEGIES = ["regulatory_boundary", "semantic", "hierarchical"]
@@ -253,55 +253,97 @@ def _build_qa_tab(module: str, strategies: list[str], default_strategy: str):
     # ---- Chat handlers ----------------------------------------------------
     def _on_send(user_msg, hist_pairs, strat, d, m, r, t, k, fk, gen, mx, chat_msgs):
-        """Append user msg, run pipeline, append assistant msg.
-        gr.Chatbot defaults to the messages format on modern Gradio: a list of
-        {"role": "user"/"assistant", "content": "..."} dicts. hist_pairs is our
-        parallel (user, assistant) tuple list used by the LLM rewriter.
         """
         user_msg = (user_msg or "").strip()
         if not user_msg:
-            return (chat_msgs or []), hist_pairs, gr.update(), "", "", "_(empty input)_", "_(no chunks)_", None
-        result = run_query(
-            query=user_msg,
-            module=module,
-            chunk_strategy=strat,
-            embedding_dim=int(d),
-            retrieval_method=m,
-            reranker=r,
-            query_transform=t,
-            top_k=int(k),
-            final_k=int(fk),
-            generate_answer=bool(gen),
-            chat_history=hist_pairs or [],
-            max_answer_tokens=int(mx),
-        )
-        assistant_msg = result.answer if result.answer else "_(generation is off in pipeline configuration)_"
         new_chat_msgs = (chat_msgs or []) + [
             {"role": "user", "content": user_msg},
-            {"role": "assistant", "content": assistant_msg},
         ]
-        new_pairs = (hist_pairs or []) + [(user_msg, assistant_msg)]
-        config_line = (
-            f"`{result.config_summary}`  ·  query_id=`{result.query_id or '—'}`"
-        )
-        if result.rewritten_query and result.rewritten_query != user_msg:
-            config_line += f"\n\n_Follow-up rewritten as:_ `{result.rewritten_query}`"
-        return (
-            new_chat_msgs,
-            new_pairs,
-            gr.update(value=""),                  # clear input
-            _format_timings(result.timings),
-            config_line,
-            _format_guardrails(result.guardrail_report),
-            _format_chunks(result.chunks),
-            result,
-        )
     def _on_clear():
         return [], [], None, "", "", "_(send a message to see guardrails)_", "_(send a message to see retrieved passages)_"
@@ -349,11 +391,94 @@ def _build_perf_tab(module: str):
 # Build the app
 # =============================================================================
 def build_app() -> gr.Blocks:
     with gr.Blocks(title="BankMind") as demo:
-        gr.Markdown(
-            "# 🏦 BankMind\n"
-            "_Multi-domain RAG for financial intelligence: compliance and credit._"
         )
         with gr.Tabs():
             with gr.Tab("⚖️ Compliance Q&A"):
@@ -392,5 +517,6 @@ def build_app() -> gr.Blocks:
 if __name__ == "__main__":
     app = build_app()
     app.launch(server_name="127.0.0.1", server_port=7860, show_error=True,
-               theme=gr.themes.Soft())

     retrieval_stage_2_figure,
     retrieval_stage_3_figure,
 )
+from app.query_pipeline import run_query, run_query_stream
 COMPLIANCE_STRATEGIES = ["regulatory_boundary", "semantic", "hierarchical"]
     # ---- Chat handlers ----------------------------------------------------
     def _on_send(user_msg, hist_pairs, strat, d, m, r, t, k, fk, gen, mx, chat_msgs):
+        """Streaming chat handler. Yields progressive Gradio updates as tokens
+        stream in, so the UI never freezes during generation.
+        When `gen` is False (Generate answer toggled off), runs sync retrieve
+        only — no LLM call, free.
+        Every yield is an 8-tuple: (chat messages, history pairs, input-box
+        update, timings text, config line, guardrails text, chunks text,
+        result object or None) — presumably matching the outputs wired to
+        this handler elsewhere in _build_qa_tab; verify against the caller.
         """
         user_msg = (user_msg or "").strip()
         if not user_msg:
+            # Nothing to send: emit the current chat state with placeholder
+            # panel text and stop without touching the history pairs.
+            yield (chat_msgs or [], hist_pairs, gr.update(), "", "",
+                   "_(empty input)_", "_(no chunks)_", None)
+            return
+        # Free path: no generation. Falls back to the non-streaming run_query.
+        if not gen:
+            result = run_query(
+                query=user_msg, module=module, chunk_strategy=strat,
+                embedding_dim=int(d), retrieval_method=m, reranker=r,
+                query_transform=t, top_k=int(k), final_k=int(fk),
+                generate_answer=False, chat_history=hist_pairs or [],
+                max_answer_tokens=int(mx),
+            )
+            # The fixed notice is shown in the chat and also stored in the
+            # history pairs as this turn's assistant reply.
+            assistant_msg = "_(generation is off in pipeline configuration)_"
+            new_chat_msgs = (chat_msgs or []) + [
+                {"role": "user", "content": user_msg},
+                {"role": "assistant", "content": assistant_msg},
+            ]
+            new_pairs = (hist_pairs or []) + [(user_msg, assistant_msg)]
+            cfg = f"`{result.config_summary}`  ·  query_id=`{result.query_id or '—'}`"
+            if result.rewritten_query and result.rewritten_query != user_msg:
+                cfg += f"\n\n_Follow-up rewritten as:_ `{result.rewritten_query}`"
+            yield (new_chat_msgs, new_pairs, gr.update(value=""),
+                   _format_timings(result.timings), cfg,
+                   _format_guardrails(result.guardrail_report),
+                   _format_chunks(result.chunks), result)
+            return
+        # Streaming path. Show user message + an empty assistant placeholder
+        # immediately, then progressively fill the assistant message as tokens
+        # arrive.
         new_chat_msgs = (chat_msgs or []) + [
             {"role": "user", "content": user_msg},
+            {"role": "assistant", "content": "…"},
         ]
+        # Initial yield: clear input, lock in the user message, show "thinking".
+        yield (new_chat_msgs, hist_pairs or [], gr.update(value=""),
+               "_…retrieving…_", "", "_(running guardrails after generation)_",
+               "_(loading)_", None)
+        # NOTE(review): last_setup_result is assigned in the "setup" branch
+        # below but never read anywhere in this handler.
+        last_setup_result = None
+        accumulated = ""
+        for event_type, payload in run_query_stream(
+            query=user_msg, module=module, chunk_strategy=strat,
+            embedding_dim=int(d), retrieval_method=m, reranker=r,
+            query_transform=t, top_k=int(k), final_k=int(fk),
+            chat_history=hist_pairs or [], max_answer_tokens=int(mx),
+        ):
+            if event_type == "setup":
+                # Retrieval is finished: fill the timings, config and chunks
+                # panels now. No query_id is shown yet; it is only added to
+                # the config line in the "done" branch.
+                last_setup_result = payload
+                cfg = (
+                    f"`{payload.config_summary}`"
+                    + (f"\n\n_Follow-up rewritten as:_ `{payload.rewritten_query}`"
+                       if payload.rewritten_query and payload.rewritten_query != user_msg
+                       else "")
+                )
+                yield (new_chat_msgs, hist_pairs or [], gr.update(),
+                       _format_timings(payload.timings),
+                       cfg,
+                       "_(running guardrails after generation)_",
+                       _format_chunks(payload.chunks),
+                       None)
+            elif event_type == "token":
+                # payload is the cumulative answer text so far (not a delta),
+                # so it is assigned rather than appended.
+                accumulated = payload
+                # Update the LAST assistant message in place
+                new_chat_msgs[-1] = {"role": "assistant", "content": accumulated or "…"}
+                # Only the chat changes per token; the argument-less
+                # gr.update() values are intended to leave the panels as-is.
+                yield (new_chat_msgs, hist_pairs or [], gr.update(),
+                       gr.update(), gr.update(), gr.update(), gr.update(), None)
+            elif event_type == "done":
+                final_result = payload
+                # Prefer the pipeline's final answer, then the streamed text,
+                # then a fixed placeholder.
+                final_answer = final_result.answer or accumulated or "_(no answer)_"
+                new_chat_msgs[-1] = {"role": "assistant", "content": final_answer}
+                # History pairs are extended only here, once the final answer
+                # is known; earlier yields pass the previous history through.
+                new_pairs = (hist_pairs or []) + [(user_msg, final_answer)]
+                cfg = f"`{final_result.config_summary}`  ·  query_id=`{final_result.query_id or '—'}`"
+                if final_result.rewritten_query and final_result.rewritten_query != user_msg:
+                    cfg += f"\n\n_Follow-up rewritten as:_ `{final_result.rewritten_query}`"
+                yield (new_chat_msgs, new_pairs, gr.update(),
+                       _format_timings(final_result.timings),
+                       cfg,
+                       _format_guardrails(final_result.guardrail_report),
+                       _format_chunks(final_result.chunks),
+                       final_result)
+                return
     def _on_clear():
         return [], [], None, "", "", "_(send a message to see guardrails)_", "_(send a message to see retrieved passages)_"
 # Build the app
 # =============================================================================
+# App-wide Gradio theme: gr.themes.Base with an amber primary hue and slate
+# secondary/neutral hues. Each override below is set to the same value for the
+# plain key and its *_dark twin — presumably so the UI renders identically in
+# light and dark mode (confirm intent).
+_BANKY_THEME = gr.themes.Base(
+    primary_hue=gr.themes.colors.amber,
+    secondary_hue=gr.themes.colors.slate,
+    neutral_hue=gr.themes.colors.slate,
+    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "monospace"],
+).set(
+    body_background_fill="#0b1220",
+    body_background_fill_dark="#0b1220",
+    body_text_color="#e5e7eb",
+    body_text_color_dark="#e5e7eb",
+    background_fill_primary="#111827",
+    background_fill_primary_dark="#111827",
+    background_fill_secondary="#0f172a",
+    background_fill_secondary_dark="#0f172a",
+    block_background_fill="#0f172a",
+    block_background_fill_dark="#0f172a",
+    block_border_color="#1f2937",
+    block_border_color_dark="#1f2937",
+    block_label_background_fill="#0b1220",
+    block_label_background_fill_dark="#0b1220",
+    block_label_text_color="#fbbf24",
+    block_label_text_color_dark="#fbbf24",
+    border_color_accent="#fbbf24",
+    border_color_accent_dark="#fbbf24",
+    border_color_primary="#1f2937",
+    border_color_primary_dark="#1f2937",
+    button_primary_background_fill="#fbbf24",
+    button_primary_background_fill_dark="#fbbf24",
+    button_primary_background_fill_hover="#f59e0b",
+    button_primary_background_fill_hover_dark="#f59e0b",
+    button_primary_text_color="#0b1220",
+    button_primary_text_color_dark="#0b1220",
+    button_secondary_background_fill="#1f2937",
+    button_secondary_background_fill_dark="#1f2937",
+    button_secondary_text_color="#e5e7eb",
+    button_secondary_text_color_dark="#e5e7eb",
+    input_background_fill="#0b1220",
+    input_background_fill_dark="#0b1220",
+    input_border_color="#1f2937",
+    input_border_color_dark="#1f2937",
+    input_border_color_focus="#fbbf24",
+    input_border_color_focus_dark="#fbbf24",
+    color_accent_soft="#1f2937",
+    color_accent_soft_dark="#1f2937",
+)
+# Extra stylesheet passed to launch() together with the theme. The
+# #title-banner and .tagline rules style the HTML header emitted at the top of
+# build_app(); the remaining selectors target Gradio-rendered markup
+# (presumably stable class names — verify after Gradio upgrades).
+_BANKY_CSS = """
+.gradio-container { max-width: 1280px !important; }
+h1, h2, h3, h4 { letter-spacing: -0.01em; font-weight: 600; }
+h1 { font-size: 1.75rem !important; }
+.tabitem { padding-top: 0.5rem; }
+/* Tighter accordion headers */
+.label-wrap > span { font-weight: 600; letter-spacing: 0.01em; }
+/* Markdown body text */
+.prose { line-height: 1.55; }
+.prose code, .prose pre { background: #0b1220 !important; border: 1px solid #1f2937; border-radius: 6px; }
+.prose table { font-size: 0.92em; }
+.prose th { background: #0b1220 !important; color: #fbbf24 !important; font-weight: 600; }
+.prose td { border-color: #1f2937 !important; }
+/* Chatbot polish */
+.message.user { background: #1e293b !important; }
+.message.bot, .message.assistant { background: #0f172a !important; border: 1px solid #1f2937; }
+/* Subtle gold accent on the title bar */
+#title-banner {
+  border-left: 3px solid #fbbf24;
+  padding-left: 0.85rem;
+  margin: 0.25rem 0 1rem 0;
+}
+#title-banner h1 { margin: 0; font-size: 1.5rem !important; }
+#title-banner .tagline { color: #94a3b8; font-size: 0.95rem; margin-top: 0.15rem; }
+"""
 def build_app() -> gr.Blocks:
     with gr.Blocks(title="BankMind") as demo:
+        gr.HTML(
+            """
+            <div id="title-banner">
+              <h1>🏦 BankMind</h1>
+              <div class="tagline">Multi-domain RAG for financial intelligence: regulatory compliance and credit risk.</div>
+            </div>
+            """
         )
         with gr.Tabs():
             with gr.Tab("⚖️ Compliance Q&A"):
 if __name__ == "__main__":
     app = build_app()
+    app.queue(default_concurrency_limit=4, max_size=16)
     app.launch(server_name="127.0.0.1", server_port=7860, show_error=True,
+               theme=_BANKY_THEME, css=_BANKY_CSS)

app/query_pipeline.py CHANGED Viewed

@@ -19,7 +19,7 @@ from typing import Optional
 from pipelines.shared.fusion import convex_combination, hierarchical, rrf
 from pipelines.shared.guardrails import GuardrailReport, check as run_guardrails
-from pipelines.shared.llm import claude_text
 from pipelines.shared.query_logger import chunk_for_log, log_query
 from pipelines.shared.query_transformer import apply_transform
 from pipelines.shared.reranker import rerank
@@ -123,7 +123,9 @@ def _format_history(history: list[tuple[str, str]], *, max_turns: int = 6) -> st
 def _rewrite_followup(query: str, history: list[tuple[str, str]]) -> str:
     """Rewrite a possibly-ambiguous follow-up into a standalone query.
-    No-op when history is empty. On any LLM error, returns the original query.
     """
     if not history:
         return query
@@ -131,10 +133,10 @@ def _rewrite_followup(query: str, history: list[tuple[str, str]]) -> str:
         rewritten = claude_text(
             _REWRITE_USER.format(history=_format_history(history), query=query),
             system=_REWRITE_SYSTEM,
             max_tokens=200,
             temperature=0.0,
         )
-        # Defensive: strip surrounding quotes / leading/trailing whitespace
         rewritten = rewritten.strip().strip('"').strip("'").strip()
         return rewritten or query
     except Exception:
@@ -305,3 +307,170 @@ def run_query(
         query_id=qid,
         rewritten_query=rewritten_query,
     )

 from pipelines.shared.fusion import convex_combination, hierarchical, rrf
 from pipelines.shared.guardrails import GuardrailReport, check as run_guardrails
+from pipelines.shared.llm import FAST_MODEL, claude_text, claude_text_stream
 from pipelines.shared.query_logger import chunk_for_log, log_query
 from pipelines.shared.query_transformer import apply_transform
 from pipelines.shared.reranker import rerank
 def _rewrite_followup(query: str, history: list[tuple[str, str]]) -> str:
     """Rewrite a possibly-ambiguous follow-up into a standalone query.
+    Uses the fast/cheap model (Haiku by default) since paraphrasing is a
+    utility task and Sonnet is overkill. No-op when history is empty.
+    On any LLM error, returns the original query.
     """
     if not history:
         return query
         rewritten = claude_text(
             _REWRITE_USER.format(history=_format_history(history), query=query),
             system=_REWRITE_SYSTEM,
+            model=FAST_MODEL,
             max_tokens=200,
             temperature=0.0,
         )
         rewritten = rewritten.strip().strip('"').strip("'").strip()
         return rewritten or query
     except Exception:
         query_id=qid,
         rewritten_query=rewritten_query,
     )
+# === Streaming variant for the chat UI ======================================
+# Yields three kinds of events:
+#   ("setup",  QueryResult)    -> retrieval + rerank done, generation starts
+#   ("token",  str)            -> cumulative answer text (after each delta)
+#   ("done",   QueryResult)    -> final guardrail-checked + logged result
+# This lets the UI render the chunks panel and timings as soon as retrieval is
+# done, then stream tokens into the chat, then apply guardrails when generation
+# completes.
+def run_query_stream(
+    *,
+    query: str,
+    module: str,
+    chunk_strategy: str,
+    embedding_dim: int,
+    retrieval_method: str,
+    reranker: str,
+    query_transform: str,
+    top_k: int = 10,
+    final_k: int = 5,
+    chat_history: Optional[list[tuple[str, str]]] = None,
+    max_answer_tokens: int = 900,
+):
+    """Run retrieval, then stream the generated answer as (event, payload) tuples.
+    Events, in order: one ("setup", QueryResult) once retrieval and reranking
+    are done, zero or more ("token", str) carrying the cumulative answer text,
+    and one final ("done", QueryResult) after guardrails and logging.
+    A blank query short-circuits to a single ("done", ...) with answer=None.
+    When chat_history is non-empty the follow-up is rewritten first; the
+    rewritten text drives transform, retrieval and reranking, while the
+    original query is what goes into the generation prompt.
+    Raises ValueError for an unrecognised retrieval_method.
+    """
+    if not query.strip():
+        yield ("done", QueryResult(answer=None, chunks=[], timings={},
+                                   transformed_queries=[], config_summary="(empty query)"))
+        return
+    timings: dict[str, float] = {}
+    retr = _retriever()
+    history = chat_history or []
+    # 1) Follow-up rewrite (Haiku, fast)
+    rewritten_query = None
+    retrieval_query = query
+    if history:
+        t0 = time.perf_counter()
+        rewritten_query = _rewrite_followup(query, history)
+        retrieval_query = rewritten_query
+        timings["rewrite_ms"] = (time.perf_counter() - t0) * 1000
+    # 2) Optional pre-retrieval transform
+    t0 = time.perf_counter()
+    try:
+        tr = apply_transform(
+            query_transform, retrieval_query, module=module,
+            retriever=retr, chunk_strategy=chunk_strategy,
+            embedding_dim=embedding_dim,
+        )
+    except Exception as e:
+        # Any transform failure degrades to a single-query pass-through; the
+        # error text is kept in extras rather than raised.
+        from pipelines.shared.query_transformer import TransformResult
+        tr = TransformResult(queries=[retrieval_query], transform_name="none-fallback",
+                             extras={"error": str(e)})
+    timings["transform_ms"] = (time.perf_counter() - t0) * 1000
+    # 3) Retrieval: one search per (possibly transformed) query, dispatched on
+    # retrieval_method.
+    def _retrieve_one(q: str) -> list[ScoredChunk]:
+        if retrieval_method == "dense":
+            return retr.search(query=q, module=module, chunk_strategy=chunk_strategy,
+                               mode="dense", embedding_dim=embedding_dim, top_k=top_k)
+        if retrieval_method in ("bm25", "splade"):
+            return retr.search(query=q, module=module, chunk_strategy=chunk_strategy,
+                               mode="sparse", sparse_name=retrieval_method, top_k=top_k)
+        if retrieval_method == "hybrid_rrf":
+            return retr.search(query=q, module=module, chunk_strategy=chunk_strategy,
+                               mode="hybrid", embedding_dim=embedding_dim, top_k=top_k)
+        # The two fusion variants below request 50 candidates from the
+        # separate-channel search and then fuse down to top_k.
+        if retrieval_method == "hybrid_convex":
+            d, s, _ = retr.search_separate_channels(
+                query=q, module=module, chunk_strategy=chunk_strategy,
+                embedding_dim=embedding_dim, top_k=50,
+            )
+            return convex_combination(d, s, alpha=0.7, top_k=top_k)
+        if retrieval_method == "hybrid_hier":
+            d, s, _ = retr.search_separate_channels(
+                query=q, module=module, chunk_strategy=chunk_strategy,
+                embedding_dim=embedding_dim, top_k=50,
+            )
+            return hierarchical(q, d, s, top_k=top_k)
+        raise ValueError(f"unknown retrieval_method: {retrieval_method}")
+    t0 = time.perf_counter()
+    if len(tr.queries) == 1:
+        retrieved = _retrieve_one(tr.queries[0])
+    else:
+        # Several transformed queries: retrieve each and fuse the lists with RRF.
+        retrieved = rrf([_retrieve_one(q) for q in tr.queries], top_k=top_k)
+    timings["retrieve_ms"] = (time.perf_counter() - t0) * 1000
+    # 4) Optional rerank down to final_k.
+    t0 = time.perf_counter()
+    if reranker != "none":
+        try:
+            top = rerank(retrieval_query, retrieved, name=reranker, top_n=final_k)
+        except Exception as e:
+            # Reranker failures fall back to plain retrieval order.
+            # NOTE(review): this stores a string in a dict annotated
+            # dict[str, float]; the total_ms sum below skips it because the
+            # key does not end in "_ms".
+            top = retrieved[:final_k]
+            timings["reranker_error"] = str(e)
+    else:
+        top = retrieved[:final_k]
+    timings["rerank_ms"] = (time.perf_counter() - t0) * 1000
+    # Setup event: chunks are known, timings up to this point are known.
+    # dict(timings) hands out a copy, so the entries added later do not show
+    # up in the setup payload.
+    setup_result = QueryResult(
+        answer=None,
+        chunks=top,
+        timings=dict(timings),
+        transformed_queries=tr.queries if query_transform != "none" else [],
+        config_summary=(
+            f"module={module}  strategy={chunk_strategy}  dim={embedding_dim}  "
+            f"retrieval={retrieval_method}  reranker={reranker}  transform={query_transform}  "
+            f"top_k={top_k}  final_k={final_k}  chat_turns={len(history)}"
+        ),
+        rewritten_query=rewritten_query,
+    )
+    yield ("setup", setup_result)
+    # Stream generation
+    role = "compliance officer" if module == "compliance" else "credit analyst"
+    # Passages are numbered from 1 and each is capped at 1800 characters.
+    passages = "\n\n".join(
+        f"[{i+1}] (doc: {c.payload.get('doc_id','?')}, section: {c.payload.get('section_title','')})\n{c.content[:1800]}"
+        for i, c in enumerate(top)
+    )
+    # The prompt uses the user's original query (not the rewritten one); with
+    # history, the formatted history is supplied alongside it.
+    if history:
+        user_prompt = _USER_TURN_TEMPLATE.format(
+            history=_format_history(history), query=query, passages=passages,
+        )
+    else:
+        user_prompt = _NO_HISTORY_TEMPLATE.format(query=query, passages=passages)
+    t0 = time.perf_counter()
+    accumulated = ""
+    for partial in claude_text_stream(
+        user_prompt,
+        system=_SYSTEM_PROMPT.format(role=role),
+        max_tokens=max_answer_tokens,
+        temperature=0.0,
+    ):
+        # Each item is the cumulative text so far, so overwrite, don't append.
+        accumulated = partial
+        yield ("token", accumulated)
+    timings["generate_ms"] = (time.perf_counter() - t0) * 1000
+    # Guardrails + log
+    t0 = time.perf_counter()
+    guardrail_report = run_guardrails(module, accumulated, top, query)
+    timings["guardrails_ms"] = (time.perf_counter() - t0) * 1000
+    # Sum only the *_ms entries; any other key (e.g. reranker_error) is skipped.
+    timings["total_ms"] = sum(v for k, v in timings.items() if k.endswith("_ms"))
+    config_dict = {
+        "module": module, "chunk_strategy": chunk_strategy,
+        "embedding_dim": embedding_dim, "retrieval_method": retrieval_method,
+        "reranker": reranker, "query_transform": query_transform,
+        "top_k": top_k, "final_k": final_k, "generate_answer": True,
+        "chat_turns": len(history), "rewritten_query": rewritten_query,
+        "streaming": True,
+    }
+    # The value returned by log_query is surfaced as the result's query_id.
+    qid = log_query(
+        query=query, config=config_dict, timings=timings,
+        transformed_queries=tr.queries if query_transform != "none" else [],
+        top_chunks=[chunk_for_log(c) for c in top], answer=accumulated,
+        guardrail_report=guardrail_report.to_dict(),
+    )
+    yield ("done", QueryResult(
+        answer=accumulated, chunks=top, timings=timings,
+        transformed_queries=tr.queries if query_transform != "none" else [],
+        config_summary=setup_result.config_summary,
+        guardrail_report=guardrail_report, query_id=qid,
+        rewritten_query=rewritten_query,
+    ))

pipelines/shared/llm.py CHANGED Viewed

@@ -29,6 +29,8 @@ from dotenv import load_dotenv
 load_dotenv()
 DEFAULT_MODEL = os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-6")
 DEFAULT_MAX_TOKENS = 1024
@@ -84,6 +86,36 @@ def claude_text(
     return "".join(parts).strip()
 _JSON_FENCE = re.compile(r"```(?:json)?\s*([\s\S]*?)```")

 load_dotenv()
 DEFAULT_MODEL = os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-6")
+# Light, fast model for utility tasks like follow-up rewriting
+FAST_MODEL = os.environ.get("CLAUDE_FAST_MODEL", "claude-haiku-4-5-20251001")
 DEFAULT_MAX_TOKENS = 1024
     return "".join(parts).strip()
+def claude_text_stream(
+    prompt: str,
+    *,
+    system: str = "",
+    model: str = DEFAULT_MODEL,
+    max_tokens: int = DEFAULT_MAX_TOKENS,
+    temperature: float = 0.0,
+):
+    """Generator yielding the response text as it arrives.
+    Each yield is the *cumulative* output so far (suitable for piping straight
+    into Gradio's Chatbot streaming), so a consumer only needs to keep the
+    most recent value.
+    Errors are reported in-band instead of being raised: if the call fails
+    before any text arrives, a single error message is yielded; if it fails
+    part-way through, the final yield is the text streamed so far followed by
+    the error message, so the partial answer is not discarded.
+    An empty `system` falls back to a generic assistant prompt.
+    """
+    # Initialised outside the try so the except branch can always see how much
+    # text had been streamed when the failure happened.
+    accumulated = ""
+    try:
+        client = _get_client()
+        with client.messages.stream(
+            model=model,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            system=system or "You are a helpful assistant.",
+            messages=[{"role": "user", "content": prompt}],
+        ) as stream:
+            for delta in stream.text_stream:
+                accumulated += delta
+                yield accumulated
+    except Exception as e:
+        failure = f"_(generation failed: {type(e).__name__}: {e})_"
+        # Keep the cumulative contract: never replace already-streamed text
+        # with the bare error, append the failure note to it instead.
+        yield f"{accumulated}\n\n{failure}" if accumulated else failure
 _JSON_FENCE = re.compile(r"```(?:json)?\s*([\s\S]*?)```")