#!/usr/bin/env python3
"""Chat-style UI (single-line input + history) for the local "Universal Brain" stack.

**Default:** generative LM + TinyModel encoder + FAQ RAG + SQLite memory. **`--lm-only`**
turns off encoder/RAG/memory.

**Natural language:** the model **routes** each line to an intent (summarize, retrieve, remember,
plain chat, …). Slash commands (`/help`, `/status`, …) still work as shortcuts.

Requirements:
  pip install -r optional-requirements-horizon2.txt

Examples:
  python scripts/universal_brain_chat.py
  python scripts/universal_brain_chat.py --no-smart-route
  python scripts/universal_brain_chat.py --lm-only --smoke

Say what you want in plain language, or type `/help`.
"""

from __future__ import annotations

import argparse
import json
import os
import sqlite3
import sys
import warnings
from pathlib import Path

_scripts = Path(__file__).resolve().parent
_REPO = _scripts.parent
DEFAULT_MEMORY_DB = str(_REPO / ".tmp" / "ub_chat_memory.sqlite")
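# Make sibling scripts (horizon2_core, horizon3_store, ...) importable when run directly.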
if str(_scripts) not in sys.path:
    sys.path.insert(0, str(_scripts))

from horizon2_core import (  # noqa: E402
    DEFAULT_CHAT_SYSTEM,
    DEFAULT_INSTRUCTION_MODEL,
    SMOKE_MODEL_ID,
    LoadedLM,
    build_user_prompt,
    format_for_model,
    generate_chat_reply,
    generate_completion,
    load_causal_lm,
    pick_device,
)
from horizon3_store import clear_session, connect, init_schema, list_for_scope, put  # noqa: E402
from rag_faq_smoke import _pick_model, hybrid_retrieve, load_chunks  # noqa: E402
from tinymodel_runtime import TinyModelRuntime  # noqa: E402

HELP_TEXT = """**How to use**
- **Normal language:** ask in plain English (or mixed); the app **infers** what you want (summarize, search FAQ, save a note, etc.).
- **Shortcuts:** slash commands still work (`/help`, `/status`, …).

**Intents the router understands** (examples, not exact wording):
- Ordinary chat / questions
- **Summarize** this text — provide the passage in the same message
- **Rewrite** professionally / rephrase
- **Answer using only** these facts — include both facts and question
- **Search** the FAQ / **find** in the knowledge base
- **Classify** (topic model) this paragraph
- **Remember** / note / store: **long-term** vs **this session only**
- **Show** saved notes; **clear** session notes
- **Status** of loaded models

**Classifier** uses AG News–style labels on default Hub weights (World, Business, Sports, Sci/Tech).

If routing misfires, try rephrasing or use a slash command; **`--no-smart-route`** disables inference (chat only, plus `/…`)."""

ROUTER_SYSTEM = """You are an intent router for a desktop AI assistant. The user speaks naturally (any language). Output EXACTLY one JSON object, one line, no markdown fences, no explanation.

Schema:
{"intent":"<name>","text":"","question":"","context":""}

intent must be one of:
- chat — general talk, advice, open questions, follow-ups; put the FULL user message in "text"
- summarize — user wants a shorter summary; put source in "text"
- reformulate — rewrite/clarify/professional tone; source in "text"
- grounded — answer only from given facts; put QUESTION in "question", FACTS in "context" (if user mixes both in one blob, split sensibly)
- retrieve — search FAQ/knowledge; put search query in "text"
- classify — show topic-classifier probabilities; put passage in "text"
- remember — save a durable note; put note body in "text"
- session_note — save a session-only note; put note in "text"
- list_memories — user wants to see saved notes
- clear_session — user wants session-only notes deleted
- status — loaded components / debug info
- help — explain available capabilities

Rules:
- Default to "chat" when unsure; copy the entire user message into "text".
- Do not invent facts for "grounded": if no clear facts/context, use "chat" instead.
- Extract minimal "text" for tool intents (do not repeat system chatter)."""

VALID_INTENTS = frozenset(
    {
        "chat",
        "summarize",
        "reformulate",
        "grounded",
        "retrieve",
        "classify",
        "remember",
        "session_note",
        "list_memories",
        "clear_session",
        "status",
        "help",
    }
)

_INTENT_ALIASES = {
    "memory": "list_memories",
    "memories": "list_memories",
    "notes": "list_memories",
    "search": "retrieve",
    "faq": "retrieve",
    "lookup": "retrieve",
}
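
# Example of a schema-conforming router reply for "summarize this: <text>":
#   {"intent": "summarize", "text": "<text>", "question": "", "context": ""}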


def _classifier_result_markdown(probs: dict[str, float]) -> str:
    ranked = sorted(probs.items(), key=lambda x: -x[1])
    top_lab, top_p = ranked[0]
    lines = [
        "### Classifier (TinyModel)\n",
        f"**Winner:** `{top_lab}` · **p = {top_p:.4f}**\n",
        "\n| rank | label | p |\n|:---:|:---|---:|",
    ]
    for i, (lab, p) in enumerate(ranked[:12], 1):
        mark = " **←**" if i == 1 else ""
        lines.append(f"| {i} | {lab}{mark} | {p:.4f} |")
    return "\n".join(lines)


def _ensure_gradio_can_reach_localhost() -> None:
    """Gradio probes localhost via httpx; HTTP(S)_PROXY can break that on Windows/VPN."""
    extras = ("localhost", "127.0.0.1", "::1")
    for var in ("NO_PROXY", "no_proxy"):
        raw = os.environ.get(var, "")
        parts = [p.strip() for p in raw.replace(";", ",").split(",") if p.strip()]
        for h in extras:
            if h not in parts:
                parts.append(h)
        os.environ[var] = ",".join(parts)


def _patch_gradio_localhost_probe() -> None:
    """Gradio's built-in `url_ok` uses httpx with env proxies; on Windows/VPN, HEAD to
    127.0.0.1 often fails even though the app is up. Use direct (no-proxy) requests.
    """
    import time as time_mod
    import warnings as warn_mod

    import gradio.networking as gn
    import httpx

    def url_ok(url: str) -> bool:
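        # Any of these statuses proves the server answered, even behind auth (401) or redirects (3xx).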
        ok_codes = (200, 204, 401, 302, 303, 307)
        for _ in range(5):
            try:
                with warn_mod.catch_warnings():
                    # The suppression must wrap the client calls; catch_warnings resets filters on exit.
                    warn_mod.filterwarnings("ignore")
                    with httpx.Client(
                        timeout=5,
                        verify=False,
                        trust_env=False,
                        follow_redirects=True,
                    ) as client:
                        r = client.head(url)
                        if r.status_code in ok_codes:
                            return True
                        r = client.get(url)
                        if r.status_code in ok_codes:
                            return True
            except (ConnectionError, OSError, httpx.HTTPError, httpx.TimeoutException):
                pass
            time_mod.sleep(0.4)
        return False

    gn.url_ok = url_ok  # type: ignore[assignment]


def _clip(s: str, n: int) -> str:
    s = (s or "").strip()
    if len(s) <= n:
        return s
    return s[: n - 3] + "..."


def _extract_json_object(s: str) -> dict | None:
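    """Parse a dict from model output, tolerating chatter around the outermost braces."""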
    s = (s or "").strip()
    try:
        d = json.loads(s)
        return d if isinstance(d, dict) else None
    except json.JSONDecodeError:
        pass
    start = s.find("{")
    end = s.rfind("}")
    if start >= 0 and end > start:
        try:
            d = json.loads(s[start : end + 1])
            return d if isinstance(d, dict) else None
        except json.JSONDecodeError:
            return None
    return None


def _normalize_intent(raw: str) -> str:
    x = (raw or "chat").strip().lower().replace("-", "_")
    x = _INTENT_ALIASES.get(x, x)
    return x if x in VALID_INTENTS else "chat"


def infer_route(
    lm: LoadedLM,
    user_message: str,
    *,
    seed: int,
    max_new_tokens: int,
) -> dict[str, str]:
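    """LM-based intent routing; malformed or unrecognized output degrades to plain "chat"."""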
    u = (
        f"USER_MESSAGE (verbatim):\n{user_message}\n\n"
        "Output the JSON object now."
    )
    if getattr(lm.tokenizer, "chat_template", None):
        prompt = lm.tokenizer.apply_chat_template(
            [{"role": "system", "content": ROUTER_SYSTEM}, {"role": "user", "content": u}],
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = f"{ROUTER_SYSTEM}\n\n{u}\nJSON:"
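    # Greedy decoding keeps routing deterministic; sampling is reserved for user-facing text.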
    raw, _, _, _ = generate_completion(
        lm,
        prompt,
        max_new_tokens=max_new_tokens,
        seed=seed,
        do_sample=False,
    )
    data = _extract_json_object(raw) or {}
    intent = _normalize_intent(str(data.get("intent", "chat")))
    return {
        "intent": intent,
        "text": str(data.get("text", "")).strip(),
        "question": str(data.get("question", "")).strip(),
        "context": str(data.get("context", "")).strip(),
    }


def _format_status(
    *,
    meta_mid: str,
    meta_encoder: str,
    meta_rag_path: str | None,
    rag_chunks: list[str] | None,
    meta_mem_db: str | None,
    scope_key: str,
) -> str:
    rag_n = len(rag_chunks) if rag_chunks else 0
    lines = [
        "### Status\n",
        f"- **Generative:** `{meta_mid}`",
        f"- **Encoder:** {meta_encoder}",
        f"- **RAG corpus:** {_clip(meta_rag_path or '—', 80)} · **chunks:** {rag_n}",
        f"- **Memory DB:** `{meta_mem_db or 'off'}` · **scope:** `{scope_key}`",
    ]
    return "\n".join(lines)


def run_routed_tool(
    route: dict[str, str],
    *,
    msg: str,
    lm: LoadedLM,
    mem_conn: sqlite3.Connection | None,
    scope_key: str,
    encoder: TinyModelRuntime | None,
    rag_chunks: list[str] | None,
    rag_top_k: int,
    task_max_new_tokens: int,
    seed: int,
    meta_mid: str,
    meta_encoder: str,
    meta_mem_db: str | None,
    meta_rag_path: str | None,
) -> str:
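    """Execute a routed intent; an empty return means no tool produced output, so the caller falls back to plain chat."""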
    intent = route["intent"]
    text = route["text"]
    question = route["question"]
    context = route["context"]

    if intent == "help":
        return HELP_TEXT
    if intent == "status":
        return _format_status(
            meta_mid=meta_mid,
            meta_encoder=meta_encoder,
            meta_rag_path=meta_rag_path,
            rag_chunks=rag_chunks,
            meta_mem_db=meta_mem_db,
            scope_key=scope_key,
        )
    if intent == "classify":
        if not encoder:
            return "Classifier is not loaded (try without `--lm-only` / `--no-encoder`)."
        passage = text or msg
        if not passage:
            return "Tell me what text to classify."
        return _classifier_result_markdown(encoder.classify([passage])[0])
    if intent == "retrieve":
        if not encoder or not rag_chunks:
            return "FAQ search needs encoder + corpus (defaults on unless disabled)."
        q = text or msg
        if not q:
            return "What should I search for?"
        hr = hybrid_retrieve(encoder, q, rag_chunks, top_k=rag_top_k)
        if not hr:
            return "(No matching chunks.)"
        out = ["### Retrieved chunks\n"]
        for i, (sc, _idx, txt) in enumerate(hr, 1):
            out.append(f"**#{i}** score={sc:.4f}\n{_clip(txt, 700)}\n")
        return "\n".join(out)

    if intent in ("summarize", "reformulate", "grounded"):
        if intent == "grounded":
            qn = question or text
            ctx = context
            if not qn or not ctx:
                bod = text or msg
                # One-blob fallback (weak heuristic): text up to the first "?" becomes the question, the rest the context.
                if "?" in bod:
                    qn = bod.split("?", 1)[0] + "?"
                    ctx = bod.split("?", 1)[1].strip() or bod
                else:
                    return (
                        "For a grounded answer I need **facts** and a **question**. "
                        "Say both in one message (e.g. facts first, then your question)."
                    )
            try:
                up = build_user_prompt("grounded", qn.strip(), context=ctx.strip())
            except ValueError as e:
                return str(e)
        else:
            src = text or msg
            if not src:
                return "What text should I process?"
            task = "summarize" if intent == "summarize" else "reformulate"
            up = build_user_prompt(task, src)
        prompt = format_for_model(lm.tokenizer, up)
        out, _, _, sec = generate_completion(
            lm,
            prompt,
            max_new_tokens=task_max_new_tokens,
            seed=seed,
            do_sample=True,
        )
        return f"**{intent}** ({sec:.2f}s)\n\n{out or '(empty)'}"

    if intent in ("remember", "session_note", "list_memories", "clear_session"):
        if mem_conn is None:
            return "Memory is off (enable default DB or drop `--no-memory`)."
        if intent == "remember":
            note = text or msg
            if not note:
                return "What should I remember?"
            put(mem_conn, scope_key=scope_key, kind="long_term", content=note)
            return "Saved to **long-term** memory."
        if intent == "session_note":
            note = text or msg
            if not note:
                return "What should I store for this session?"
            put(mem_conn, scope_key=scope_key, kind="session", content=note)
            return "Saved to **session** memory."
        if intent == "list_memories":
            items = list_for_scope(mem_conn, scope_key)
            if not items:
                return "(No saved notes for this scope.)"
            lines = [f"- **{it.kind}** · {_clip(it.content, 320)}" for it in items[:24]]
            extra = f"\n\n… {len(items) - 24} more" if len(items) > 24 else ""
            return "Saved notes:\n" + "\n".join(lines) + extra
        if intent == "clear_session":
            n = clear_session(mem_conn, scope_key)
            return f"Cleared **{n}** session note(s). Long-term notes unchanged."

    return ""


def handle_slash(
    msg: str,
    *,
    lm: LoadedLM | None,
    mem_conn: sqlite3.Connection | None,
    scope_key: str,
    encoder: TinyModelRuntime | None,
    rag_chunks: list[str] | None,
    rag_top_k: int,
    task_max_new_tokens: int,
    seed: int,
    meta_mid: str,
    meta_encoder: str,
    meta_mem_db: str | None,
    meta_rag_path: str | None,
) -> str | None:
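    """Handle slash commands; return None for non-commands and unknown commands (they fall through to routing/chat)."""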
    if not msg.startswith("/"):
        return None
    parts = msg.split(maxsplit=1)
    cmd = parts[0].lower()
    rest = parts[1].strip() if len(parts) > 1 else ""

    if cmd == "/help":
        return HELP_TEXT

    if cmd == "/status":
        return _format_status(
            meta_mid=meta_mid,
            meta_encoder=meta_encoder,
            meta_rag_path=meta_rag_path,
            rag_chunks=rag_chunks,
            meta_mem_db=meta_mem_db,
            scope_key=scope_key,
        )

    if cmd == "/classify":
        if not encoder:
            return "Classifier off. Drop `--lm-only` / `--no-encoder` or pass `--encoder`."
        if not rest:
            return "Usage: `/classify <text>`"
        return _classifier_result_markdown(encoder.classify([rest])[0])

    if cmd == "/retrieve":
        if not encoder or not rag_chunks:
            return "Retrieve needs encoder + FAQ corpus (default on unless `--lm-only` / `--no-rag` / `--no-encoder`)."
        if not rest:
            return "Usage: `/retrieve <query>`"
        hr = hybrid_retrieve(encoder, rest, rag_chunks, top_k=rag_top_k)
        if not hr:
            return "(No chunks.)"
        out = ["### Retrieve (hybrid)\n"]
        for i, (sc, _idx, txt) in enumerate(hr, 1):
            out.append(f"**#{i}** score={sc:.4f}\n{_clip(txt, 700)}\n")
        return "\n".join(out)

    if cmd in ("/summarize", "/reformulate", "/grounded"):
        if lm is None:
            return "Generative model not loaded."
        if cmd == "/grounded":
            if "|||" not in rest:
                return "Usage: `/grounded <question> ||| <context>`"
            qpart, _, ctxpart = rest.partition("|||")
            question, context = qpart.strip(), ctxpart.strip()
            if not question or not context:
                return "Both question and context required (use `|||`)."
            try:
                up = build_user_prompt("grounded", question, context=context)
            except ValueError as e:
                return str(e)
        else:
            if not rest:
                return f"Usage: `{cmd} <text>`"
            task = "summarize" if cmd == "/summarize" else "reformulate"
            up = build_user_prompt(task, rest)
        prompt = format_for_model(lm.tokenizer, up)
        out, _np, _nn, sec = generate_completion(
            lm,
            prompt,
            max_new_tokens=task_max_new_tokens,
            seed=seed,
            do_sample=True,
        )
        tag = cmd.lstrip("/")
        return f"**/{tag}** ({sec:.2f}s)\n\n{out or '(empty)'}"

    mem_cmds = {"/remember", "/session", "/memories", "/clear-session"}
    if cmd in mem_cmds and mem_conn is None:
        return "Memory off. Drop `--no-memory` or pass `--memory-db` (default DB is used when memory is on)."

    if cmd == "/remember":
        if not rest:
            return "Usage: `/remember <text>`"
        put(mem_conn, scope_key=scope_key, kind="long_term", content=rest)  # type: ignore[arg-type]
        return "Saved to **long-term** memory for this scope."
    if cmd == "/session":
        if not rest:
            return "Usage: `/session <text>`"
        put(mem_conn, scope_key=scope_key, kind="session", content=rest)  # type: ignore[arg-type]
        return "Saved to **session** memory for this scope."
    if cmd == "/memories":
        items = list_for_scope(mem_conn, scope_key)  # type: ignore[arg-type]
        if not items:
            return "(No memory items for this scope.)"
        lines = [f"- **{it.kind}** · {_clip(it.content, 320)}" for it in items[:24]]
        extra = f"\n\n… {len(items) - 24} more" if len(items) > 24 else ""
        return "Stored notes:\n" + "\n".join(lines) + extra
    if cmd == "/clear-session":
        n = clear_session(mem_conn, scope_key)  # type: ignore[arg-type]
        return f"Cleared **{n}** session item(s). Long-term notes are unchanged."

    return None


def _resolve_rag_path(arg: str | None, no_rag: bool) -> Path | None:
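    """Locate the FAQ corpus: explicit path (absolute, then repo-relative), else the bundled default."""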
    if no_rag:
        return None
    if arg:
        p = Path(arg)
        if not p.is_file():
            p = _REPO / arg
        return p if p.is_file() else None
    default = _REPO / "texts" / "rag_faq_corpus.md"
    return default if default.is_file() else None


def _encoder_device(lm_device: str, explicit: str) -> str:
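    """Pick the encoder device; "auto" prefers CPU when the generative model holds CUDA, avoiding VRAM contention."""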
    if explicit != "auto":
        return explicit
    return "cpu" if lm_device == "cuda" else lm_device


def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--model", type=str, default=None, help="HF generative model id.")
    p.add_argument("--smoke", action="store_true", help=f"Tiny generative model {SMOKE_MODEL_ID!r}.")
    p.add_argument("--device", default="auto", help="auto | cpu | cuda | mps")
    p.add_argument("--host", type=str, default="127.0.0.1")
    p.add_argument("--port", type=int, default=7860)
    p.add_argument("--share", action="store_true", help="Gradio share=True (tunnel).")
    p.add_argument("--max-new-tokens", type=int, default=512)
    p.add_argument(
        "--task-max-new-tokens",
        type=int,
        default=256,
        help="Max new tokens for /summarize, /reformulate, /grounded.",
    )
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--system-prompt", type=str, default="", help="Override system prompt.")

    p.add_argument("--lm-only", action="store_true", help="Chat-only: no encoder, RAG, or SQLite memory.")
    p.add_argument(
        "--no-encoder",
        action="store_true",
        help="Disable TinyModel classifier and FAQ retrieval.",
    )
    p.add_argument("--no-memory", action="store_true", help="Disable Horizon 3 SQLite memory.")
    p.add_argument(
        "--brain",
        action="store_true",
        help="(Optional) Log which default encoder path was resolved; on by default unless --lm-only.",
    )
    p.add_argument(
        "--encoder",
        type=str,
        default=None,
        help="Classifier checkpoint dir or Hub id (overrides --brain default when both set).",
    )
    p.add_argument(
        "--encoder-device",
        type=str,
        default="auto",
        choices=("auto", "cpu", "cuda", "mps"),
        help="Device for TinyModelRuntime (default auto: cpu if generative model is on CUDA).",
    )
    p.add_argument("--no-rag", action="store_true", help="Disable FAQ retrieval even with an encoder.")
    p.add_argument("--rag-corpus", type=str, default=None, help="FAQ markdown path; default texts/rag_faq_corpus.md.")
    p.add_argument("--rag-top-k", type=int, default=2)

    p.add_argument(
        "--memory-db",
        type=str,
        default=None,
        help=f"SQLite path (default when memory on: {DEFAULT_MEMORY_DB}).",
    )
    p.add_argument(
        "--memory-scope",
        type=str,
        default="ub-chat-default",
        help="scope_key for stored memory (tenant/session id).",
    )
    p.add_argument("--no-trace", action="store_true", help="Do not append Brain trace line to assistant replies.")
    p.add_argument(
        "--no-smart-route",
        action="store_true",
        help="Disable NL intent routing (plain chat only; slash commands still work).",
    )
    p.add_argument(
        "--router-max-new-tokens",
        type=int,
        default=192,
        help="Max new tokens for the routing JSON completion.",
    )

    return p.parse_args()


def main() -> None:
    args = parse_args()
    _ensure_gradio_can_reach_localhost()
    try:
        import gradio as gr
    except ImportError as e:
        print("Install Gradio: pip install 'gradio>=5.49,<6'", file=sys.stderr)
        raise SystemExit(1) from e

    _patch_gradio_localhost_probe()

    # Gradio 5.x warns whenever allow_tags is not True (including explicit False); noise only.
    warnings.filterwarnings(
        "ignore",
        message=r".*allow_tags.*gr\.Chatbot.*",
        category=DeprecationWarning,
    )

    if args.smoke:
        mid = SMOKE_MODEL_ID
    elif args.model:
        mid = args.model
    else:
        mid = os.environ.get("HORIZON2_MODEL", DEFAULT_INSTRUCTION_MODEL)
    dev = pick_device(args.device)
    system_text = (args.system_prompt or "").strip() or DEFAULT_CHAT_SYSTEM

    encoder: TinyModelRuntime | None = None
    rag_chunks: list[str] | None = None
    encoder_id: str | None = None

    if args.lm_only or args.no_encoder:
        if args.encoder:
            print("Note: --encoder ignored with --lm-only or --no-encoder.", file=sys.stderr)
        encoder_id = None
    elif args.encoder:
        encoder_id = _pick_model(args.encoder)
    else:
        encoder_id = _pick_model(None)
        if args.brain:
            print(f"--brain: encoder {encoder_id!r}", flush=True)
        else:
            print(f"Encoder (default): {encoder_id!r}", flush=True)

    rag_path = _resolve_rag_path(args.rag_corpus, args.no_rag or args.lm_only)
    if encoder_id:
        enc_dev = _encoder_device(dev, args.encoder_device)
        print(f"Loading encoder {encoder_id!r} on {enc_dev!r} ...", flush=True)
        encoder = TinyModelRuntime(encoder_id, device=enc_dev, max_length=128)
    if encoder and rag_path:
        rag_chunks = load_chunks(rag_path)
        print(f"RAG: {len(rag_chunks)} chunks from {rag_path}", flush=True)
    elif rag_path and not encoder:
        print("Note: FAQ corpus not loaded without encoder.", file=sys.stderr)

    mem_path: str | None = None
    if not args.lm_only and not args.no_memory:
        mem_path = args.memory_db or DEFAULT_MEMORY_DB

    mem_conn: sqlite3.Connection | None = None
    if mem_path:
        mem_conn = connect(mem_path, check_same_thread=False)
        init_schema(mem_conn)
        print(f"Memory: scope={args.memory_scope!r} db={mem_path!r}", flush=True)

    meta_encoder = encoder_id or "off"
    meta_rag = str(rag_path.resolve()) if rag_path else None
    meta_mem = mem_path

    print(f"Loading generative model {mid!r} on {dev!r} ...", flush=True)
    lm = load_causal_lm(mid, dev)
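    # Mutable counter captured by the respond() closure; drives the per-turn seed.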
    turn_counter = {"n": 0}
    show_trace = not args.no_trace and (
        encoder is not None or mem_conn is not None or (rag_chunks is not None)
    )

    def respond(
        message: str,
        history: list[dict],
    ) -> tuple[str, list[dict]]:
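        """Gradio handler: slash commands first, then NL intent routing, then plain chat with soft context."""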
        msg = (message or "").strip()
        hist = list(history or [])
        if not msg:
            return "", hist

        turn_counter["n"] += 1
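        # Fresh per-turn seed: replies vary across turns while a fixed --seed keeps a run reproducible.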
        seed = (args.seed + turn_counter["n"]) % (2**31)

        slash_out = handle_slash(
            msg,
            lm=lm,
            mem_conn=mem_conn,
            scope_key=args.memory_scope,
            encoder=encoder,
            rag_chunks=rag_chunks,
            rag_top_k=args.rag_top_k,
            task_max_new_tokens=args.task_max_new_tokens,
            seed=seed,
            meta_mid=mid,
            meta_encoder=meta_encoder,
            meta_mem_db=meta_mem,
            meta_rag_path=meta_rag,
        )
        if slash_out is not None:
            hist.append({"role": "user", "content": msg})
            hist.append({"role": "assistant", "content": slash_out})
            return "", hist

        chat_line = msg
        if not args.no_smart_route:
            try:
                route = infer_route(
                    lm,
                    msg,
                    seed=seed,
                    max_new_tokens=args.router_max_new_tokens,
                )
            except Exception:
                # Routing must never break the conversation: any failure degrades to plain chat.
                route = {"intent": "chat", "text": msg, "question": "", "context": ""}

            if route["intent"] != "chat":
                tool_reply = run_routed_tool(
                    route,
                    msg=msg,
                    lm=lm,
                    mem_conn=mem_conn,
                    scope_key=args.memory_scope,
                    encoder=encoder,
                    rag_chunks=rag_chunks,
                    rag_top_k=args.rag_top_k,
                    task_max_new_tokens=args.task_max_new_tokens,
                    seed=(seed + 11) % (2**31),
                    meta_mid=mid,
                    meta_encoder=meta_encoder,
                    meta_mem_db=meta_mem,
                    meta_rag_path=meta_rag,
                ).strip()
                if tool_reply:
                    foot = f"\n\n---\n*Routed intent:* `{route['intent']}`"
                    hist.append({"role": "user", "content": msg})
                    hist.append({"role": "assistant", "content": tool_reply + foot})
                    return "", hist

            chat_line = route["text"] or msg

        trace: list[str] = []
        extras: list[str] = []

        if encoder:
            probs = encoder.classify([chat_line])[0]
            top_lab = max(probs, key=probs.get)
            top_p = probs[top_lab]
            trace.append(f"classify:{top_lab}({top_p:.2f})")
            extras.append(
                f"Encoder routing hint: the line most resembles label {top_lab!r} "
                f"(winner probability {top_p:.2f}). Use as soft context only."
            )

        rag_block = ""
        if encoder and rag_chunks:
            hr = hybrid_retrieve(encoder, chat_line, rag_chunks, top_k=args.rag_top_k)
            if hr:
                trace.append(f"RAG:{len(hr)}chunk(s)")
                pieces = []
                for i, (_sc, _idx, txt) in enumerate(hr):
                    pieces.append(f"[FAQ excerpt {i + 1}]\n{_clip(txt, 900)}")
                rag_block = "\n\n".join(pieces)
                extras.append(
                    "Relevant FAQ excerpts (may be incomplete). "
                    "Ground factual claims in them when they apply; do not invent policy."
                    f"\n\n{rag_block}"
                )

        if mem_conn:
            items = list_for_scope(mem_conn, args.memory_scope)
            if items:
                trace.append(f"mem:{len(items)}item(s)")
                mem_lines = []
                for it in items[:10]:
                    mem_lines.append(f"- ({it.kind}) {_clip(it.content, 240)}")
                extras.append(
                    "User-visible stored notes for this chat scope (from /remember and /session):\n"
                    + "\n".join(mem_lines)
                )

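        # Fold the routing hint, FAQ excerpts, and stored notes into the system prompt as soft context.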
        extra_system = "\n\n".join(extras) if extras else ""
        if extra_system:
            extra_system = "\n\n---\n" + extra_system

        eff_system = system_text + extra_system
        messages: list[dict[str, str]] = [{"role": "system", "content": eff_system}]
        messages.extend(hist)
        messages.append({"role": "user", "content": chat_line})

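        # Separate seed offset keeps chat sampling decorrelated from the router's stream.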
        seed_chat = (seed + 97) % (2**31)
        reply, _, _, _ = generate_chat_reply(
            lm,
            messages,
            max_new_tokens=args.max_new_tokens,
            seed=seed_chat,
            do_sample=True,
        )
        out = reply or "(empty generation)"
        if show_trace and trace:
            out += "\n\n---\n*Brain trace:* " + " · ".join(trace)

        hist.append({"role": "user", "content": msg})
        hist.append({"role": "assistant", "content": out})
        return "", hist

    brain_bits = []
    if encoder:
        brain_bits.append("encoder")
    if rag_chunks:
        brain_bits.append("RAG")
    if mem_conn:
        brain_bits.append("memory")
    brain_label = "+".join(brain_bits) if brain_bits else "LM only"

    with gr.Blocks(title="Universal Brain (chat prototype)") as demo:
        gr.Markdown(
            "### Universal Brain — chat prototype\n"
            f"**Generative:** `{mid}` ({lm.device}) · **Brain layers:** {brain_label}\n\n"
            "**NL routing:** the model infers what you want (summarize, FAQ search, save note, …). "
            "Use **`--no-smart-route`** for plain chat-only + slash shortcuts. "
            "`/help` lists slash commands.\n\n"
            "Encoder topics (Hub TinyModel1 ≈ AG News) still feed context and an optional *Brain trace* line; "
            "use `/classify` or ask naturally to see the full probability table in chat."
        )
        chat = gr.Chatbot(type="messages", height=520, label="Conversation", allow_tags=False)
        with gr.Row():
            inp = gr.Textbox(
                lines=1,
                max_lines=1,
                show_label=False,
                placeholder="Ask in plain language, or use /help …",
                scale=9,
            )
            go = gr.Button("Send", variant="primary", scale=1)
        gr.ClearButton([chat, inp])

        def _submit(m: str, h: list[dict]) -> tuple[str, list[dict]]:
            return respond(m, h)

        go.click(_submit, [inp, chat], [inp, chat])
        inp.submit(_submit, [inp, chat], [inp, chat])

    demo.queue(default_concurrency_limit=2)
    share = args.share
    if share is False and os.environ.get("GRADIO_SHARE", "").lower() == "true":
        share = True
    try:
        demo.launch(
            server_name=args.host,
            server_port=args.port,
            share=share,
            ssr_mode=False,
        )
    except ValueError as e:
        err = str(e)
        if "localhost is not accessible" in err:
            print(
                "\nGradio could not verify localhost (often HTTP_PROXY / corporate VPN).\n"
                "Try one of:\n"
                "  python scripts/universal_brain_chat.py --share\n"
                "  set GRADIO_SHARE=True   (Windows cmd)\n"
                "  $env:GRADIO_SHARE='true'   (PowerShell)\n",
                file=sys.stderr,
            )
        raise


if __name__ == "__main__":
    main()