Spaces:

HyperlinksSpace
/

TinyModel1Space

Sleeping

App Files Files Community

anriltine committed 17 days ago

Commit

4339a77

verified ·

1 Parent(s): 61ff229

Deploy TinyModel1Space from GitHub Actions

Browse files

Files changed (5) hide show

scripts/eval_report_routing.py +78 -0
scripts/horizon2_core.py +52 -8
scripts/nl_controls.py +652 -0
scripts/rag_faq_smoke.py +64 -5
scripts/universal_brain_chat.py +1121 -26

scripts/eval_report_routing.py ADDED Viewed

	@@ -0,0 +1,78 @@

+#!/usr/bin/env python3
+"""Read the Phase 2 **`routing`** object from a classifier checkpoint's **`eval_report.json`**.
+Used by Horizon 1 glue, **rag_faq_smoke**, **embeddings_smoke_test**, **routing_policy** (**`--from-checkpoint`**), **horizon1_route_then_retrieve**, and training/report CLIs so training notes and runtime gates stay aligned."""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+def load_routing_from_eval_report(model_path: str | Path) -> dict | None:
+    """Return the top-level ``routing`` dict if ``model_path`` is a dir with a valid report."""
+    p = Path(model_path)
+    if not p.is_dir():
+        return None
+    er = p / "eval_report.json"
+    if not er.is_file():
+        return None
+    try:
+        data = json.loads(er.read_text(encoding="utf-8"))
+    except json.JSONDecodeError:
+        return None
+    r = data.get("routing")
+    return r if isinstance(r, dict) else None
+def format_checkpoint_tip_path(
+    output_dir: str | Path,
+    *,
+    cwd: Path | None = None,
+) -> str:
+    """Return a repo-relative checkpoint path when ``output_dir`` is under ``cwd``."""
+    p = Path(output_dir).resolve()
+    base = (cwd if cwd is not None else Path.cwd()).resolve()
+    try:
+        return p.relative_to(base).as_posix()
+    except ValueError:
+        return p.as_posix()
+def format_routing_policy_from_checkpoint_command(
+    output_dir: str | Path,
+    *,
+    cwd: Path | None = None,
+) -> str:
+    """Full ``python scripts/routing_policy.py --from-checkpoint …`` line (no shell quoting)."""
+    tip = format_checkpoint_tip_path(output_dir, cwd=cwd)
+    return f"python scripts/routing_policy.py --from-checkpoint {tip}"
+def print_routing_policy_from_checkpoint_tip(
+    output_dir: str | Path,
+    *,
+    headline: str = "Tip: dump Phase 2 `routing` JSON (no model load):",
+    cwd: Path | None = None,
+) -> None:
+    """Print a copy-paste **Tip:** for ``routing_policy`` (shared by train/compare/verify scripts)."""
+    cmd = format_routing_policy_from_checkpoint_command(output_dir, cwd=cwd)
+    print(f"{headline}\n  {cmd}", flush=True)
+def maybe_print_routing_section(model_path: str, *, enabled: bool, prog: str) -> None:
+    """If ``enabled``, print ``routing`` JSON or a stderr hint (``prog`` labels the caller)."""
+    if not enabled:
+        return
+    notes = load_routing_from_eval_report(model_path)
+    if notes is None:
+        print(
+            f"{prog}: no eval_report.json with top-level `routing` "
+            "(Hub id or missing artifact).",
+            file=sys.stderr,
+        )
+        return
+    print("=== eval_report.json routing (Phase 2 training notes) ===\n")
+    print(json.dumps(notes, indent=2))
+    print()

scripts/horizon2_core.py CHANGED Viewed

@@ -173,9 +173,26 @@ def load_causal_lm(
     model_id: str,
     device: str,
 ) -> LoadedLM:
     import torch
     from transformers import AutoModelForCausalLM, AutoTokenizer
     d = device if device in ("cpu", "cuda", "mps") else "cpu"
     tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     if tok.pad_token is None and tok.eos_token is not None:
@@ -187,15 +204,42 @@ def load_causal_lm(
         )
     else:
         dt = torch.float32
-    # Prefer `dtype` (newer Transformers); fall back to `torch_dtype` (older).
-    try:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, dtype=dt
-        )
-    except TypeError:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=dt
         )
     model.eval()
     model = model.to(d)
     return LoadedLM(model=model, tokenizer=tok, device=d)

     model_id: str,
     device: str,
 ) -> LoadedLM:
+    import os
+    import sys
+    # Must run before `import torch` on first use (e.g. horizon2_server on Windows).
+    if sys.platform == "win32":
+        os.environ.setdefault("OMP_NUM_THREADS", "1")
+        os.environ.setdefault("MKL_NUM_THREADS", "1")
+        os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
+        os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
     import torch
     from transformers import AutoModelForCausalLM, AutoTokenizer
+    if sys.platform == "win32":
+        torch.set_num_threads(1)
+        try:
+            torch.set_num_interop_threads(1)
+        except RuntimeError:
+            pass
     d = device if device in ("cpu", "cuda", "mps") else "cpu"
     tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     if tok.pad_token is None and tok.eos_token is not None:
         )
     else:
         dt = torch.float32
+    def _from_pretrained(extra: dict[str, Any]) -> Any:
+        # Prefer `dtype` (newer Transformers); fall back to `torch_dtype` (older).
+        try:
+            return AutoModelForCausalLM.from_pretrained(
+                model_id, trust_remote_code=True, dtype=dt, **extra
+            )
+        except TypeError:
+            return AutoModelForCausalLM.from_pretrained(
+                model_id, trust_remote_code=True, torch_dtype=dt, **extra
+            )
+    # Retry with progressively fewer options (compat + stability on Windows CPU).
+    if d == "cpu":
+        extras: tuple[dict[str, Any], ...] = (
+            {"low_cpu_mem_usage": True, "attn_implementation": "eager"},
+            {"low_cpu_mem_usage": True},
+            {},
         )
+    else:
+        extras = ({"low_cpu_mem_usage": True}, {})
+    model = None
+    last_err: BaseException | None = None
+    for extra in extras:
+        try:
+            model = _from_pretrained(extra)
+            break
+        except (TypeError, ValueError, OSError) as e:
+            last_err = e
+            continue
+    if model is None:
+        raise RuntimeError(
+            f"Failed to load causal LM {model_id!r}; last error: {last_err!r}"
+        ) from last_err
     model.eval()
     model = model.to(d)
     return LoadedLM(model=model, tokenizer=tok, device=d)

scripts/nl_controls.py ADDED Viewed

	@@ -0,0 +1,652 @@

+"""Natural-language control phrases for Universal Brain chat.
+This is a lightweight, deterministic pre-router for actions that should not depend on
+LLM JSON routing (and should work without requiring users to remember slash commands).
+It is intentionally conservative: it only triggers on fairly explicit phrasing.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import re
+@dataclass(frozen=True)
+class ControlAction:
+    """Immutable control request recognized by ``parse_control_action``."""
+    # Action identifier, e.g. "show_session", "set_scope" or "set_trace".
+    name: str
+    # Optional argument for the action (e.g. "on"/"off", or the scope id for "set_scope");
+    # None for actions that take no argument.
+    value: str | None = None
+_WS = re.compile(r"\s+")
+def _norm(s: str) -> str:
+    return _WS.sub(" ", (s or "").strip().lower())
+def parse_control_action(message: str) -> ControlAction | None:
+    """Return a ControlAction if the message is a natural-language control request."""
+    m = _norm(message)
+    if not m:
+        return None
+    # "What mode is this? What session/scope am I in?"
+    if re.search(r"\b(what|show)\b.*\b(my )?(session|scope|settings|mode|status)\b", m) or re.search(
+        r"\bwhich\b.*\b(scope|session)\b", m
+    ):
+        return ControlAction("show_session")
+    # Start a fresh private session (new scope key).
+    if re.search(r"\b(new|fresh)\b.*\b(private )?(session|scope)\b", m) or re.search(
+        r"\b(start|begin)\b.*\b(private )?(session|scope)\b", m
+    ):
+        return ControlAction("new_private_session")
+    # Switch to a named scope in chat, e.g. "use scope abc-123" / "switch to session foo".
+    m2 = re.search(r"\b(use|switch to|set)\b.*\b(scope|session)\b\s*[:=]?\s*([a-z0-9][a-z0-9_.:-]{1,63})\b", m)
+    if m2:
+        return ControlAction("set_scope", m2.group(3))
+    # Memory controls (order matters: list/show before export/download)
+    if re.search(
+        r"\b(show|list)\b.*\b(my )?(data|memory|memories|notes)\b",
+        m,
+    ):
+        return ControlAction("list_memories")
+    if re.search(
+        r"\b(export|download)\b.*\b(my )?(data|memory|memories|notes)\b",
+        m,
+    ):
+        return ControlAction("export_memory")
+    if re.search(r"\b(clear|wipe|delete|forget)\b.*\b(session)\b.*\b(memory|memories|notes)?\b", m):
+        return ControlAction("clear_session")
+    if re.search(r"\b(forget|delete|erase|wipe)\b.*\b(all|everything)\b.*\b(memory|memories|notes|data)\b", m) or re.search(
+        r"\b(delete|erase)\b.*\b(my )?(data|account data|data for this chat)\b", m
+    ):
+        return ControlAction("forget_scope")
+    # Session toggles (chat UX)
+    if re.search(r"\b(turn on|enable|show)\b.*\b(trace|brain trace|debug)\b", m):
+        return ControlAction("set_trace", "on")
+    if re.search(r"\b(turn off|disable|hide)\b.*\b(trace|brain trace|debug)\b", m):
+        return ControlAction("set_trace", "off")
+    if re.search(r"\b(turn on|enable)\b.*\b(smart routing|auto routing|router)\b", m):
+        return ControlAction("set_smart_route", "on")
+    if re.search(r"\b(turn off|disable)\b.*\b(smart routing|auto routing|router)\b", m):
+        return ControlAction("set_smart_route", "off")
+    if re.search(r"\b(turn on|enable)\b.*\b(faq|rag|retrieval)\b", m):
+        return ControlAction("set_rag", "on")
+    if re.search(r"\b(turn off|disable)\b.*\b(faq|rag|retrieval)\b", m):
+        return ControlAction("set_rag", "off")
+    # Reply style for the generative model (short lines only to avoid hijacking real questions).
+    # Require "reply"/"answer" before style|format|length so phrases like "default quote style" / "reset tables"
+    # are handled by narrower matchers below.
+    if len(m) <= 140 and (
+        re.search(r"\breset\b.*\b(reply|answer)\s+(style|format|length)\b", m)
+        or re.search(r"\b(default|normal)\b.*\b(reply|answer)\s+(style|format|length)\b", m)
+    ):
+        return ControlAction("reset_reply_style")
+    if len(m) <= 96 and re.search(
+        r"\b(be brief|stay brief|keep it short|short answers|answer briefly|concise replies)\b",
+        m,
+    ):
+        return ControlAction("set_verbosity", "brief")
+    if len(m) <= 120 and re.search(
+        r"\b(more detail|go deeper|in greater detail|explain thoroughly|longer answers|detailed answers)\b",
+        m,
+    ):
+        return ControlAction("set_verbosity", "detailed")
+    if len(m) <= 100 and re.search(
+        r"\b(normal (answer )?length|default length|balanced length)\b",
+        m,
+    ):
+        return ControlAction("set_verbosity", "normal")
+    if len(m) <= 110 and re.search(r"\b(use|prefer)\b", m) and re.search(
+        r"\b(bullet points?|numbered lists?)\b",
+        m,
+    ):
+        return ControlAction("set_reply_format", "bullets")
+    if len(m) <= 100 and re.search(
+        r"\b(no bullets|plain paragraphs?|prose only|stop using lists)\b",
+        m,
+    ):
+        return ControlAction("set_reply_format", "prose")
+    # FAQ / RAG grounding hints for the assistant (short control lines).
+    if len(m) <= 100 and re.search(
+        r"\b(strict faq|faq only|stick to (the )?faq|only use (the )?faq|only trust (the )?faq)\b",
+        m,
+    ):
+        return ControlAction("set_faq_grounding", "strict")
+    if len(m) <= 115 and re.search(
+        r"\b(balanced faq|normal faq|default faq(\s+grounding)?|default faq mode)\b",
+        m,
+    ):
+        return ControlAction("set_faq_grounding", "normal")
+    if len(m) <= 130 and re.search(
+        r"\b(relaxed faq|faq plus general knowledge|general knowledge(\s+is)?\s+ok|mix faq and general knowledge)\b",
+        m,
+    ):
+        return ControlAction("set_faq_grounding", "relaxed")
+    # Explanation depth (who the answer is for) — short control lines only.
+    if (
+        (len(m) <= 40 and re.match(r"^(please\s+)?explain simply[\s.!?]*$", m))
+        or re.match(r"^(please\s+)?eli5\b[\s.!?]*$", m)
+        or (len(m) <= 56 and re.search(r"\b(i'?m\s+a\s+beginner|beginner\s+here)\b", m))
+        or re.match(r"^(please\s+)?assume i'?m\s+new\b[\s.!?]*$", m)
+        or (len(m) <= 56 and re.search(r"\bi\s+need\s+(the\s+)?basics\b", m))
+    ):
+        return ControlAction("set_audience", "simple")
+    if len(m) <= 72 and (
+        re.match(r"^(please\s+)?assume i'?m\s+technical[\s.!?]*$", m)
+        or re.match(r"^expert\s+mode[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?use jargon freely[\s.!?]*$", m)
+        or re.match(r"^technical audience[\s.!?]*$", m)
+        or re.match(r"^for experts[\s.!?]*$", m)
+    ):
+        return ControlAction("set_audience", "technical")
+    if len(m) <= 78 and (
+        re.match(r"^(please\s+)?(default explanation level|normal explanation level|general audience)[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?(reset|default)\s+audience[\s.!?]*$", m)
+    ):
+        return ControlAction("set_audience", "normal")
+    # Answer lead — whether to front-load a TL;DR line (orthogonal to verbosity).
+    if len(m) <= 88 and (
+        re.match(r"^(please\s+)?(tl;|tl)dr\s+first\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?(lead|start)\s+with\s+(a\s+)?(short\s+)?summary\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?summary\s+first\b[\s.!?]*$", m)
+    ):
+        return ControlAction("set_answer_lead", "tldr_first")
+    if len(m) <= 92 and (
+        re.match(r"^(please\s+)?no\s+tl;?dr\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?skip (the\s+)?summary\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?answer directly\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?direct answer\s+only\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?without\s+a\s+tldr\b[\s.!?]*$", m)
+    ):
+        return ControlAction("set_answer_lead", "direct")
+    if len(m) <= 64 and (
+        re.match(r"^(please\s+)?(default answer structure|normal answer opening|usual\s+opening)[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?reset\s+(answer\s+)?opening[\s.!?]*$", m)
+    ):
+        return ControlAction("set_answer_lead", "normal")
+    # Procedures: numbered steps vs continuous prose (orthogonal to bullets).
+    if len(m) <= 88 and (
+        re.match(r"^(please\s+)?(step by step|step-by-step)[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?use numbered steps[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?numbered steps\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?walk me through( the)? steps\b[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?break it into steps[\s.!?]*$", m)
+    ):
+        return ControlAction("set_step_style", "numbered")
+    if len(m) <= 92 and (
+        re.match(r"^(please\s+)?(no numbered steps|don'?t number steps|skip step numbers)[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?(continuous prose|prose without steps)[\s.!?]*$", m)
+    ):
+        return ControlAction("set_step_style", "continuous")
+    if len(m) <= 64 and re.match(r"^(please\s+)?(default step style|normal steps|reset steps)[\s.!?]*$", m):
+        return ControlAction("set_step_style", "normal")
+    # How hard to hedge / flag limits (orthogonal to FAQ strictness).
+    if len(m) <= 94 and (
+        re.match(r"^(please\s+)?flag your assumptions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?be explicit about uncertainty[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?say if you don'?t know[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?tell me when you(?:'?re|\s+are)\s+unsure[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?say when you(?:'?re|\s+are)\s+unsure[\s.!?]*$", m)
+    ):
+        return ControlAction("set_confidence_tone", "transparent")
+    if len(m) <= 72 and (
+        re.match(r"^(please\s+)?be decisive[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t hedge[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?give firm answers[\s.!?]*$", m)
+    ):
+        return ControlAction("set_confidence_tone", "assertive")
+    if len(m) <= 80 and re.match(
+        r"^(please\s+)?(default confidence tone|normal confidence|reset uncertainty)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_confidence_tone", "normal")
+    # Whether to offer follow-ups / next steps at the end of answers.
+    if len(m) <= 96 and (
+        re.match(r"^(please\s+)?suggest next steps[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?offer follow[- ]up questions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?end with (optional )?next steps[\s.!?]*$", m)
+    ):
+        return ControlAction("set_followup_close", "suggest")
+    if len(m) <= 100 and (
+        re.match(r"^(please\s+)?no follow[- ]up questions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t ask follow[- ]up questions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no questions at the end[\s.!?]*$", m)
+    ):
+        return ControlAction("set_followup_close", "minimal")
+    if len(m) <= 78 and (
+        re.match(r"^(please\s+)?(default follow[- ]ups?|reset follow[- ]ups?|normal follow[- ]ups?)[\s.!?]*$", m)
+    ):
+        return ControlAction("set_followup_close", "normal")
+    # Teach order: define terms vs motivate first (orthogonal to TL;DR / steps).
+    if len(m) <= 80 and (
+        re.match(r"^(please\s+)?definitions first[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?start with definitions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?define terms first[\s.!?]*$", m)
+    ):
+        return ControlAction("set_exposition_order", "definitions_first")
+    if len(m) <= 96 and (
+        re.match(r"^(please\s+)?intuition first[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?big picture first[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?start with the big picture[\s.!?]*$", m)
+    ):
+        return ControlAction("set_exposition_order", "intuition_first")
+    if len(m) <= 88 and re.match(
+        r"^(please\s+)?(default explanation order|reset explanation order|normal explanation order)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_exposition_order", "normal")
+    # Examples vs terse explanations when comparing or teaching.
+    if len(m) <= 76 and (
+        re.match(r"^(please\s+)?include examples[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?use concrete examples[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?illustrate with examples[\s.!?]*$", m)
+    ):
+        return ControlAction("set_example_density", "rich")
+    if len(m) <= 92 and (
+        re.match(r"^(please\s+)?skip examples[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t add examples[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no examples unless i ask[\s.!?]*$", m)
+    ):
+        return ControlAction("set_example_density", "sparse")
+    if len(m) <= 68 and re.match(
+        r"^(please\s+)?(default examples|normal examples|reset examples)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_example_density", "normal")
+    # Compare/contrast presentation.
+    if len(m) <= 96 and (
+        re.match(r"^(please\s+)?use pros and cons[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?pros and cons sections[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?compare with pros and cons[\s.!?]*$", m)
+    ):
+        return ControlAction("set_comparison_frame", "pros_cons")
+    if len(m) <= 100 and (
+        re.match(r"^(please\s+)?compare in flowing prose[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?prose comparison only[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no pros and cons sections[\s.!?]*$", m)
+    ):
+        return ControlAction("set_comparison_frame", "narrative")
+    if len(m) <= 82 and re.match(
+        r"^(please\s+)?(default comparison style|normal comparison|reset comparison)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_comparison_frame", "normal")
+    # Professional vs conversational wording (orthogonal to verbosity).
+    if len(m) <= 92 and (
+        re.match(r"^(please\s+)?formal tone[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?professional register[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?business writing style[\s.!?]*$", m)
+    ):
+        return ControlAction("set_register_tone", "formal")
+    if len(m) <= 96 and (
+        re.match(r"^(please\s+)?casual tone[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?friendly casual style[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?speak casually[\s.!?]*$", m)
+    ):
+        return ControlAction("set_register_tone", "casual")
+    if len(m) <= 76 and re.match(
+        r"^(please\s+)?(default tone|neutral tone|reset tone)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_register_tone", "normal")
+    # Markdown code snippet layout.
+    if len(m) <= 100 and (
+        re.match(r"^(please\s+)?use code fences[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?fenced code blocks[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?markdown code fences[\s.!?]*$", m)
+    ):
+        return ControlAction("set_code_block_style", "fenced")
+    if len(m) <= 104 and (
+        re.match(r"^(please\s+)?inline code only[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no triple backticks[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no fenced code blocks[\s.!?]*$", m)
+    ):
+        return ControlAction("set_code_block_style", "inline")
+    if len(m) <= 96 and re.match(
+        r"^(please\s+)?(default code formatting|reset code style|normal code blocks)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_code_block_style", "normal")
+    # Analogies / metaphors vs literal explanations only.
+    if len(m) <= 92 and (
+        re.match(r"^(please\s+)?use analogies[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?analogies when helpful[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?metaphors are ok[\s.!?]*$", m)
+    ):
+        return ControlAction("set_analogy_use", "prefer")
+    if len(m) <= 100 and (
+        re.match(r"^(please\s+)?no analogies[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?skip metaphors[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?literal explanations only[\s.!?]*$", m)
+    ):
+        return ControlAction("set_analogy_use", "avoid")
+    if len(m) <= 82 and re.match(
+        r"^(please\s+)?(default analogy style|reset analogies|normal analogies)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_analogy_use", "normal")
+    # Expand vs terse acronym handling on first introduce.
+    if len(m) <= 112 and (
+        re.match(r"^(please\s+)?spell out acronyms[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?expand acronyms on first use[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?define acronyms when you use them[\s.!?]*$", m)
+    ):
+        return ControlAction("set_acronym_style", "spell_out")
+    if len(m) <= 112 and (
+        re.match(r"^(please\s+)?assume i know acronyms[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t expand acronyms[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?keep acronyms as is[\s.!?]*$", m)
+    ):
+        return ControlAction("set_acronym_style", "terse")
+    if len(m) <= 92 and re.match(
+        r"^(please\s+)?(default acronym style|reset acronyms|normal acronyms)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_acronym_style", "normal")
+    # Clarify-first: ask brief questions before answering if key info is missing.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?ask clarifying questions first[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?clarify first[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?ask me questions before answering[\s.!?]*$", m)
+    ):
+        return ControlAction("set_clarify_first", "on")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no clarifying questions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?just answer without questions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?answer without asking questions[\s.!?]*$", m)
+    ):
+        return ControlAction("set_clarify_first", "off")
+    if len(m) <= 96 and re.match(
+        r"^(please\s+)?(default clarify mode|reset clarify mode|normal clarify mode)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_clarify_first", "normal")
+    # Speculation level: strict factual vs brainstorming.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no speculation[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?stick to high confidence only[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?avoid guessing[\s.!?]*$", m)
+    ):
+        return ControlAction("set_speculation", "strict")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?brainstorm freely[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?speculate freely[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?wild ideas ok[\s.!?]*$", m)
+    ):
+        return ControlAction("set_speculation", "creative")
+    if len(m) <= 100 and re.match(
+        r"^(please\s+)?(default speculation|normal speculation|reset speculation)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_speculation", "normal")
+    # Math/explanations: show work vs final-only.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?show your work[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?show the derivation[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?include steps in math[\s.!?]*$", m)
+    ):
+        return ControlAction("set_math_detail", "show_work")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?final answer only[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no derivation[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?skip the steps[\s.!?]*$", m)
+    ):
+        return ControlAction("set_math_detail", "final_only")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default math detail|normal math detail|reset math detail)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_math_detail", "normal")
+    # Output structure: JSON-shaped vs normal prose.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?answer in json[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?json output[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?structured json[\s.!?]*$", m)
+    ):
+        return ControlAction("set_output_format", "json")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?plain text only[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no json[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no structured output[\s.!?]*$", m)
+    ):
+        return ControlAction("set_output_format", "plain")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default output format|normal output format|reset output format)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_output_format", "normal")
+    # Safety/risk posture for recommendations.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?be risk averse[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?be conservative[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?err on the side of safety[\s.!?]*$", m)
+    ):
+        return ControlAction("set_risk_posture", "conservative")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?be pragmatic[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?optimize for speed[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?good enough is fine[\s.!?]*$", m)
+    ):
+        return ControlAction("set_risk_posture", "pragmatic")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default risk posture|normal risk posture|reset risk posture)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_risk_posture", "normal")
+    # Actionability: runnable steps vs conceptual explanation.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?give me runnable commands[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?include commands[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?make it actionable[\s.!?]*$", m)
+    ):
+        return ControlAction("set_actionability", "commands")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no commands[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?conceptual only[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?high level only[\s.!?]*$", m)
+    ):
+        return ControlAction("set_actionability", "conceptual")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default actionability|normal actionability|reset actionability)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_actionability", "normal")
+    # Quote/citation preference when using supplied excerpts.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?quote the faq excerpts[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?use direct quotes[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?cite with quotes[\s.!?]*$", m)
+    ):
+        return ControlAction("set_quote_style", "quote")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no quotes[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t quote excerpts[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?paraphrase only[\s.!?]*$", m)
+    ):
+        return ControlAction("set_quote_style", "paraphrase")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default quote style|normal quote style|reset quote style)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_quote_style", "normal")
+    # Tables: prefer markdown tables vs avoid.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?use tables[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?markdown tables[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?tabular format[\s.!?]*$", m)
+    ):
+        return ControlAction("set_table_style", "prefer")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no tables[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?avoid tables[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?no markdown tables[\s.!?]*$", m)
+    ):
+        return ControlAction("set_table_style", "avoid")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default table style|normal tables|reset tables)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_table_style", "normal")
+    # Emoji in assistant replies (short lines; conservative wording).
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?(use emoji|emoji ok|emoji welcome|include emoji)[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?add (a few )?emoji[\s.!?]*$", m)
+    ):
+        return ControlAction("set_emoji_style", "include")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no emojis?[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?avoid emoji[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t use emoji[\s.!?]*$", m)
+    ):
+        return ControlAction("set_emoji_style", "avoid")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default emoji style|normal emoji|reset emoji)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_emoji_style", "normal")
+    # Markdown section headings (## / ###) vs flat prose.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?use section headings[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?organize with headings[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?use markdown headings[\s.!?]*$", m)
+    ):
+        return ControlAction("set_section_headings", "prefer")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?no section headings[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?avoid markdown headings[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?flat (answer|prose)( please)?[\s.!?]*$", m)
+    ):
+        return ControlAction("set_section_headings", "avoid")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default section headings|normal headings|reset headings)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_section_headings", "normal")
+    # Inline emphasis: bold a few key terms vs keep markdown minimal.
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?bold key terms[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?highlight important terms[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?emphasize keywords[\s.!?]*$", m)
+    ):
+        return ControlAction("set_term_emphasis", "highlight")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?minimal bold[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?don'?t overuse bold[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?avoid excessive bold[\s.!?]*$", m)
+    ):
+        return ControlAction("set_term_emphasis", "minimal")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default emphasis|normal bold|reset emphasis)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_term_emphasis", "normal")
+    # Counterpoint tone: supportive vs challenge assumptions (short lines).
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?challenge my assumptions[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?play devils advocate[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?push back on weak points[\s.!?]*$", m)
+    ):
+        return ControlAction("set_counterpoint_tone", "challenge")
+    if len(m) <= 110 and (
+        re.match(r"^(please\s+)?be supportive[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?assume good intent[\s.!?]*$", m)
+        or re.match(r"^(please\s+)?encourage my ideas[\s.!?]*$", m)
+    ):
+        return ControlAction("set_counterpoint_tone", "supportive")
+    if len(m) <= 110 and re.match(
+        r"^(please\s+)?(default counterpoints|normal pushback|reset counterpoints)[\s.!?]*$",
+        m,
+    ):
+        return ControlAction("set_counterpoint_tone", "normal")
+    return None

scripts/rag_faq_smoke.py CHANGED Viewed

@@ -3,7 +3,9 @@
 Chunks a FAQ markdown corpus by `##` sections, embeds with TinyModelRuntime, retrieves top
 matches for a query, and reports **keyword overlap** in the top hit as a cheap faithfulness
-proxy (not neural entailment)."""
 from __future__ import annotations
@@ -11,12 +13,15 @@ import argparse
 import re
 import sys
 from pathlib import Path
 _scripts = Path(__file__).resolve().parent
 if str(_scripts) not in sys.path:
     sys.path.insert(0, str(_scripts))
-from tinymodel_runtime import TinyModelRuntime
 _STOP = frozenset(
     "a an the to of and or for in on at is are was be as it with from by not"
@@ -58,8 +63,22 @@ def _pick_model(explicit: str | None) -> str:
     return explicit  # Hub id, e.g. HyperlinksSpace/TinyModel1
-def parse_args() -> argparse.Namespace:
-    p = argparse.ArgumentParser(description=__doc__)
     p.add_argument(
         "--model",
         type=str,
@@ -82,7 +101,25 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Use only TinyModelRuntime.retrieve (stricter; tiny encoders may fail on short FAQ chunks).",
     )
-    return p.parse_args()
 def load_chunks(corpus: Path) -> list[str]:
@@ -164,7 +201,29 @@ def main() -> None:
         raise SystemExit(1)
     chunks = load_chunks(corpus)
     rt = TinyModelRuntime(model_id, device="cpu", max_length=128)
     print("=== RAG FAQ smoke (retrieval) ===\n")
     # (query, substring that must appear in top-1 chunk for a pass — citation-style check)
     samples: list[tuple[str, str]] = [

 Chunks a FAQ markdown corpus by `##` sections, embeds with TinyModelRuntime, retrieves top
 matches for a query, and reports **keyword overlap** in the top hit as a cheap faithfulness
+proxy (not neural entailment). Optional **--show-train-routing** prints Phase 2 **`routing`**
+notes from the checkpoint's **eval_report.json** (same helper as **embeddings_smoke_test** /
+**horizon1_route_then_retrieve**)."""
 from __future__ import annotations
 import re
 import sys
 from pathlib import Path
+from typing import Any
 _scripts = Path(__file__).resolve().parent
 if str(_scripts) not in sys.path:
     sys.path.insert(0, str(_scripts))
+from eval_report_routing import maybe_print_routing_section
+_PROG = "rag_faq_smoke"
 _STOP = frozenset(
     "a an the to of and or for in on at is are was be as it with from by not"
     return explicit  # Hub id, e.g. HyperlinksSpace/TinyModel1
+def build_parser() -> argparse.ArgumentParser:
+    epilog = (
+        "Examples:\n"
+        "  python scripts/rag_faq_smoke.py\n"
+        "  python scripts/rag_faq_smoke.py --query \"How do I get a refund?\" --top-k 3\n"
+        "  python scripts/rag_faq_smoke.py --model artifacts/phase1/runs/smoke/ag_news/scratch "
+        "--show-train-routing\n"
+        "If --model is omitted, the first default checkpoint dir with config.json is used, "
+        f"else {_DEFAULT_HUB!r} (see --model above)."
+    )
+    p = argparse.ArgumentParser(
+        prog=_PROG,
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=epilog,
+    )
     p.add_argument(
         "--model",
         type=str,
         action="store_true",
         help="Use only TinyModelRuntime.retrieve (stricter; tiny encoders may fail on short FAQ chunks).",
     )
+    p.add_argument(
+        "--query",
+        type=str,
+        default=None,
+        help=(
+            "If set, run a single retrieval for this query and print top-k chunks with scores "
+            "(citation-style index into the chunk list). Skips the built-in smoke assertions."
+        ),
+    )
+    p.add_argument(
+        "--show-train-routing",
+        action="store_true",
+        help="Print eval_report.json top-level routing (Phase 2 notes) before retrieval output.",
+    )
+    return p
+def parse_args() -> argparse.Namespace:
+    return build_parser().parse_args()
 def load_chunks(corpus: Path) -> list[str]:
         raise SystemExit(1)
     chunks = load_chunks(corpus)
+    maybe_print_routing_section(
+        model_id, enabled=args.show_train_routing, prog=_PROG,
+    )
+    from tinymodel_runtime import TinyModelRuntime
     rt = TinyModelRuntime(model_id, device="cpu", max_length=128)
+    if args.query:
+        q = args.query.strip()
+        print("=== RAG FAQ (single query) ===\n")
+        print(f"model={model_id!r}\ncorpus={corpus}\nquery={q!r}\n")
+        if args.semantic_only:
+            hits = rt.retrieve(q, chunks, top_k=args.top_k)
+            for rank, h in enumerate(hits, 1):
+                prev = h.text[:240].replace("\n", " ")
+                print(f"  #{rank}  idx={h.index}  score={h.score:.4f}  {prev!r}...")
+        else:
+            hr = hybrid_retrieve(rt, q, chunks, top_k=args.top_k)
+            for rank, (score, idx, text) in enumerate(hr, 1):
+                prev = text[:240].replace("\n", " ")
+                print(f"  #{rank}  idx={idx}  hybrid_score={score:.4f}  {prev!r}...")
+        return
     print("=== RAG FAQ smoke (retrieval) ===\n")
     # (query, substring that must appear in top-1 chunk for a pass — citation-style check)
     samples: list[tuple[str, str]] = [

scripts/universal_brain_chat.py CHANGED Viewed

@@ -25,8 +25,27 @@ import json
 import os
 import sqlite3
 import sys
 import warnings
 from pathlib import Path
 _scripts = Path(__file__).resolve().parent
 _REPO = _scripts.parent
@@ -46,13 +65,59 @@ from horizon2_core import (  # noqa: E402
     load_causal_lm,
     pick_device,
 )
-from horizon3_store import clear_session, connect, init_schema, list_for_scope, put  # noqa: E402
 from rag_faq_smoke import _pick_model, hybrid_retrieve, load_chunks  # noqa: E402
 from tinymodel_runtime import TinyModelRuntime  # noqa: E402
 HELP_TEXT = """**How to use**
 - **Normal language:** ask in plain English (or mixed); the app **infers** what you want (summarize, search FAQ, save a note, etc.).
-- **Shortcuts:** slash commands still work (`/help`, `/status`, …).
 **Intents the router understands** (examples, not exact wording):
 - Ordinary chat / questions
@@ -61,6 +126,9 @@ HELP_TEXT = """**How to use**
 - **Answer using only** these facts — include both facts and question
 - **Search** the FAQ / **find** in the knowledge base
 - **Classify** (topic model) this paragraph
 - **Remember** / note / store: **long-term** vs **this session only**
 - **Show** saved notes; **clear** session notes
 - **Status** of loaded models
@@ -81,6 +149,9 @@ intent must be one of:
 - grounded — answer only from given facts; put QUESTION in "question", FACTS in "context" (if user mixes both in one blob, split sensibly)
 - retrieve — search FAQ/knowledge; put search query in "text"
 - classify — show topic-classifier probabilities; put passage in "text"
 - remember — save a durable note; put note body in "text"
 - session_note — save a session-only note; put note in "text"
 - list_memories — user wants to see saved notes
@@ -101,6 +172,9 @@ VALID_INTENTS = frozenset(
         "grounded",
         "retrieve",
         "classify",
         "remember",
         "session_note",
         "list_memories",
@@ -117,9 +191,69 @@ _INTENT_ALIASES = {
     "search": "retrieve",
     "faq": "retrieve",
     "lookup": "retrieve",
 }
 def _classifier_result_markdown(probs: dict[str, float]) -> str:
     ranked = sorted(probs.items(), key=lambda x: -x[1])
     top_lab, top_p = ranked[0]
@@ -323,6 +457,45 @@ def run_routed_tool(
             out.append(f"**#{i}** score={sc:.4f}\n{_clip(txt, 700)}\n")
         return "\n".join(out)
     if intent in ("summarize", "reformulate", "grounded"):
         if intent == "grounded":
             qn = question or text
@@ -387,6 +560,798 @@ def run_routed_tool(
     return ""
 def handle_slash(
     msg: str,
     *,
@@ -442,6 +1407,39 @@ def handle_slash(
             out.append(f"**#{i}** score={sc:.4f}\n{_clip(txt, 700)}\n")
         return "\n".join(out)
     if cmd in ("/summarize", "/reformulate", "/grounded"):
         if lm is None:
             return "Generative model not loaded."
@@ -665,27 +1663,61 @@ def main() -> None:
     print(f"Loading generative model {mid!r} on {dev!r} ...", flush=True)
     lm = load_causal_lm(mid, dev)
     turn_counter = {"n": 0}
-    show_trace = not args.no_trace and (
-        encoder is not None or mem_conn is not None or (rag_chunks is not None)
-    )
     def respond(
         message: str,
         history: list[dict],
-    ) -> tuple[str, list[dict]]:
         msg = (message or "").strip()
         hist = list(history or [])
         if not msg:
-            return "", hist
         turn_counter["n"] += 1
         seed = (args.seed + turn_counter["n"]) % (2**31)
         slash_out = handle_slash(
             msg,
             lm=lm,
             mem_conn=mem_conn,
-            scope_key=args.memory_scope,
             encoder=encoder,
             rag_chunks=rag_chunks,
             rag_top_k=args.rag_top_k,
@@ -699,10 +1731,28 @@ def main() -> None:
         if slash_out is not None:
             hist.append({"role": "user", "content": msg})
             hist.append({"role": "assistant", "content": slash_out})
-            return "", hist
         chat_line = msg
-        if not args.no_smart_route:
             try:
                 route = infer_route(
                     lm,
@@ -719,9 +1769,9 @@ def main() -> None:
                     msg=msg,
                     lm=lm,
                     mem_conn=mem_conn,
-                    scope_key=args.memory_scope,
                     encoder=encoder,
-                    rag_chunks=rag_chunks,
                     rag_top_k=args.rag_top_k,
                     task_max_new_tokens=args.task_max_new_tokens,
                     seed=(seed + 11) % (2**31),
@@ -734,12 +1784,13 @@ def main() -> None:
                     foot = f"\n\n---\n*Routed intent:* `{route['intent']}`"
                     hist.append({"role": "user", "content": msg})
                     hist.append({"role": "assistant", "content": tool_reply + foot})
-                    return "", hist
             chat_line = route["text"] or msg
         trace: list[str] = []
         extras: list[str] = []
         if encoder:
             probs = encoder.classify([chat_line])[0]
@@ -752,8 +1803,8 @@ def main() -> None:
             )
         rag_block = ""
-        if encoder and rag_chunks:
-            hr = hybrid_retrieve(encoder, chat_line, rag_chunks, top_k=args.rag_top_k)
             if hr:
                 trace.append(f"RAG:{len(hr)}chunk(s)")
                 pieces = []
@@ -767,7 +1818,7 @@ def main() -> None:
                 )
         if mem_conn:
-            items = list_for_scope(mem_conn, args.memory_scope)
             if items:
                 trace.append(f"mem:{len(items)}item(s)")
                 mem_lines = []
@@ -796,12 +1847,21 @@ def main() -> None:
             do_sample=True,
         )
         out = reply or "(empty generation)"
-        if show_trace and trace:
             out += "\n\n---\n*Brain trace:* " + " · ".join(trace)
         hist.append({"role": "user", "content": msg})
         hist.append({"role": "assistant", "content": out})
-        return "", hist
     brain_bits = []
     if encoder:
@@ -812,33 +1872,67 @@ def main() -> None:
         brain_bits.append("memory")
     brain_label = "+".join(brain_bits) if brain_bits else "LM only"
-    with gr.Blocks(title="Universal Brain (chat prototype)") as demo:
         gr.Markdown(
             "### Universal Brain — chat prototype\n"
             f"**Generative:** `{mid}` ({lm.device}) · **Brain layers:** {brain_label}\n\n"
             "**NL routing:** the model infers what you want (summarize, FAQ search, save note, …). "
             "Use **`--no-smart-route`** for plain chat-only + slash shortcuts. "
             "`/help` lists slash commands.\n\n"
             "Encoder topics (Hub TinyModel1 ≈ AG News) still feed context and an optional *Brain trace* line; "
             "use `/classify` or ask naturally to see the full probability table in chat."
         )
         chat = gr.Chatbot(type="messages", height=520, label="Conversation", allow_tags=False)
         with gr.Row():
             inp = gr.Textbox(
-                lines=1,
-                max_lines=1,
                 show_label=False,
                 placeholder="Ask in plain language, or use /help …",
                 scale=9,
             )
             go = gr.Button("Send", variant="primary", scale=1)
         gr.ClearButton([chat, inp])
-        def _submit(m: str, h: list[dict]) -> tuple[str, list[dict]]:
-            return respond(m, h)
-        go.click(_submit, [inp, chat], [inp, chat])
-        inp.submit(_submit, [inp, chat], [inp, chat])
     demo.queue(default_concurrency_limit=2)
     share = args.share
@@ -850,6 +1944,7 @@ def main() -> None:
             server_port=args.port,
             share=share,
             ssr_mode=False,
         )
     except ValueError as e:
         err = str(e)

 import os
 import sqlite3
 import sys
+import uuid
 import warnings
 from pathlib import Path
+from typing import Any
+# Windows: avoid OpenMP/MKL oversubscription and duplicate CRT issues that can
+# segfault during large `from_pretrained` CPU loads (common with torch+transformers).
+if sys.platform == "win32":
+    os.environ.setdefault("OMP_NUM_THREADS", "1")
+    os.environ.setdefault("MKL_NUM_THREADS", "1")
+    os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
+    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+import torch
+if sys.platform == "win32":
+    torch.set_num_threads(1)
+    try:
+        torch.set_num_interop_threads(1)
+    except RuntimeError:
+        pass
 _scripts = Path(__file__).resolve().parent
 _REPO = _scripts.parent
     load_causal_lm,
     pick_device,
 )
+from horizon3_store import (  # noqa: E402
+    clear_session,
+    connect,
+    export_scope_json,
+    forget_scope,
+    init_schema,
+    list_for_scope,
+    put,
+)
+from nl_controls import parse_control_action  # noqa: E402
 from rag_faq_smoke import _pick_model, hybrid_retrieve, load_chunks  # noqa: E402
 from tinymodel_runtime import TinyModelRuntime  # noqa: E402
 HELP_TEXT = """**How to use**
 - **Normal language:** ask in plain English (or mixed); the app **infers** what you want (summarize, search FAQ, save a note, etc.).
+- **Session controls (say it in chat, no slash command):**
+  - *What is my current scope?*, *Show my session settings* -> prints scope + toggles (FAQ context, routing, trace)
+  - *Start a new private session*, *Begin a fresh scope* -> generates a **new memory scope key** so notes are isolated from the shared default demo scope
+  - *Switch to scope my-team-123* / *Use session demo-key* -> set the Horizon 3 **`scope_key`** from chat (ASCII id)
+  - *Be brief* / *More detail please* / *Use bullet points* / *No bullets, plain paragraphs* -> soft **reply-style** hints (injected into the assistant system context; short control lines only)
+  - *Strict FAQ* / *FAQ only* / *Stick to the FAQ* vs *Relaxed FAQ* / *FAQ plus general knowledge* vs *Balanced FAQ* / *Normal FAQ* -> **FAQ grounding** hints for how tightly to treat injected FAQ excerpts vs general knowledge
+  - *Explain simply* / *ELI5* / *I'm a beginner* vs *Expert mode* / *Assume I'm technical* vs *Normal explanation level* -> **audience depth** hints (simple vs technical vs default)
+  - *TLDR first* / *Lead with a summary* vs *No TLDR* / *Answer directly* vs *Default answer structure* -> **answer opening** style (short upfront summary vs dive straight in)
+  - *Step by step* / *Numbered steps* vs *No numbered steps* / *Continuous prose* vs *Default step style* -> **procedure layout** (numbered steps vs flowing paragraphs)
+  - *Flag your assumptions* / *Be explicit about uncertainty* vs *Be decisive* / *Don't hedge* vs *Reset uncertainty* -> **confidence tone** hints
+  - *Suggest next steps* / *Offer follow-up questions* vs *No follow-up questions* / *No questions at the end* vs *Default follow-ups* -> **closing** style at end of answers
+  - *Definitions first* / *Define terms first* vs *Intuition first* / *Big picture first* vs *Default explanation order* -> **concept order** in explanations
+  - *Include examples* / *Use concrete examples* vs *Skip examples* / *No examples unless I ask* vs *Default examples* -> **example density**
+  - *Use pros and cons* / *Pros and cons sections* vs *Compare in flowing prose* / *No pros and cons sections* vs *Default comparison style* -> **comparison layout** for trade-offs
+  - *Formal tone* / *Professional register* vs *Casual tone* / *Speak casually* vs *Default tone* -> **writing register**
+  - *Use code fences* / *Fenced code blocks* vs *Inline code only* / *No fenced code blocks* vs *Default code formatting* -> **markdown code layout**
+  - *Use analogies* / *Analogies when helpful* vs *No analogies* / *Literal explanations only* vs *Default analogy style* -> **analogy / metaphor** usage
+  - *Spell out acronyms* / *Expand acronyms on first use* vs *Assume I know acronyms* / *Don't expand acronyms* vs *Default acronym style* -> **acronym verbosity**
+  - *Ask clarifying questions first* / *Clarify first* vs *No clarifying questions* / *Just answer without questions* vs *Default clarify mode* -> whether the assistant should ask for missing info before answering
+  - *No speculation* / *Stick to high confidence only* vs *Brainstorm freely* / *Wild ideas ok* vs *Default speculation* -> how strictly to avoid guessing vs allow ideation
+  - *Show your work* / *Show the derivation* vs *Final answer only* / *No derivation* vs *Default math detail* -> how much intermediate reasoning to show for math-like answers
+  - *Answer in JSON* / *JSON output* vs *Plain text only* / *No JSON* vs *Default output format* -> structured output preference
+  - *Be risk averse* / *Err on the side of safety* vs *Be pragmatic* / *Optimize for speed* vs *Default risk posture* -> conservative vs practical recommendations
+  - *Give me runnable commands* / *Make it actionable* vs *No commands* / *Conceptual only* vs *Default actionability* -> how command-heavy responses should be
+  - *Quote the FAQ excerpts* / *Use direct quotes* vs *Paraphrase only* / *Don't quote excerpts* vs *Default quote style* -> quoting vs paraphrasing when relying on injected excerpts
+  - *Use tables* / *Tabular format* vs *No tables* / *Avoid tables* vs *Default table style* -> whether markdown tables are preferred
+  - *Use emoji* / *Emoji ok* vs *No emoji* / *Avoid emoji* vs *Default emoji style* -> light **emoji** usage in answers
+  - *Use section headings* / *Organize with headings* vs *No section headings* / *Flat answer* vs *Default section headings* -> **markdown headings** vs flat prose
+  - *Bold key terms* / *Highlight important terms* vs *Minimal bold* / *Don't overuse bold* vs *Default emphasis* -> **inline bold** for key phrases vs sparse formatting
+  - *Challenge my assumptions* / *Play devils advocate* vs *Be supportive* / *Assume good intent* vs *Default counterpoints* -> how much to **push back** vs stay encouraging
+  - *Reset reply style* -> back to defaults for length + prose + balanced FAQ grounding + audience + opening + steps + confidence tone + follow-ups + concept order + examples + comparisons + register + code layout + analogy + acronym style + clarify + speculation + math detail + output format + risk posture + actionability + quote style + table style + emoji + section headings + term emphasis + counterpoints
+  - *Export my memories*, *Download my notes as JSON* -> returns a Horizon 3 export blob for **this Space session scope**
+  - *Delete all my memories for this chat* / *Erase everything you stored about me here* -> **forget-scope** wipe for this scope (**long-term + session** rows)
+  - *Clear my session notes* -> wipes **session** notes only
+  - *Turn off the FAQ context*, *Disable RAG snippets*, *Turn FAQ back on* -> toggles whether FAQ excerpts are injected into the chat system context
+  - *Turn off smart routing*, *Go back to normal chat only* -> disables the JSON intent router (slash commands still work)
+  - *Show the brain trace*, *Hide debug trace* -> toggles the optional *Brain trace* footer on replies
+- **Shortcuts:** `/help`, `/status`, `/classify`, `/retrieve`, `/summarize`, `/reformulate`, `/grounded q ||| ctx`, `/remember`, `/session`, `/memories`, `/clear-session`, **`/similarity a ||| b`**, **`/embed` / `/embedding`**, **`/nearest q ||| c1 ||| c2`**.
 **Intents the router understands** (examples, not exact wording):
 - Ordinary chat / questions
 - **Answer using only** these facts — include both facts and question
 - **Search** the FAQ / **find** in the knowledge base
 - **Classify** (topic model) this paragraph
+- **Similarity:** are these two snippets close in meaning? (encoder cosine)
+- **Embedding** stats for a passage (dimension, norm, preview)
+- **Nearest** among several options: which candidate is closest to a query? (`query ||| opt1 ||| opt2 …`)
 - **Remember** / note / store: **long-term** vs **this session only**
 - **Show** saved notes; **clear** session notes
 - **Status** of loaded models
 - grounded — answer only from given facts; put QUESTION in "question", FACTS in "context" (if user mixes both in one blob, split sensibly)
 - retrieve — search FAQ/knowledge; put search query in "text"
 - classify — show topic-classifier probabilities; put passage in "text"
+- similarity — cosine similarity between two texts; put "text_a ||| text_b" in "text"
+- embedding — embedding vector summary for one passage; put passage in "text"
+- nearest — encoder top-k over candidates; put "query ||| candidate1 ||| candidate2 ||| …" in "text" (at least one candidate)
 - remember — save a durable note; put note body in "text"
 - session_note — save a session-only note; put note in "text"
 - list_memories — user wants to see saved notes
         "grounded",
         "retrieve",
         "classify",
+        "similarity",
+        "embedding",
+        "nearest",
         "remember",
         "session_note",
         "list_memories",
     "search": "retrieve",
     "faq": "retrieve",
     "lookup": "retrieve",
+    "similar": "similarity",
+    "cosine": "similarity",
+    "embed": "embedding",
+    "embeddings": "embedding",
+    "knn": "nearest",
+    "triage": "nearest",
+    "encoder_retrieve": "nearest",
 }
+def _parse_two_segments(blob: str) -> tuple[str, str]:
+    if "|||" not in blob:
+        raise ValueError("Need two segments separated by `|||` (e.g. `text A ||| text B`).")
+    a, _, b = blob.partition("|||")
+    a, b = a.strip(), b.strip()
+    if not a or not b:
+        raise ValueError("Both sides of `|||` must be non-empty.")
+    return a, b
+def _parse_nearest_blob(blob: str) -> tuple[str, list[str]]:
+    parts = [p.strip() for p in blob.split("|||") if p.strip()]
+    if len(parts) < 2:
+        raise ValueError(
+            "Need `query ||| candidate1 ||| candidate2` (at least one candidate after `|||`)."
+        )
+    return parts[0], parts[1:]
+def _embedding_summary_markdown(encoder: TinyModelRuntime, passage: str) -> str:
+    vec = encoder.embed([passage], normalize=False)[0]
+    dim = int(vec.shape[0])
+    norm = float(torch.linalg.vector_norm(vec))
+    k = min(8, dim)
+    head = ", ".join(f"{float(vec[i]):.4f}" for i in range(k))
+    return "\n".join(
+        [
+            "### Encoder embedding (raw [CLS], not L2-normalized)\n",
+            f"- **dim:** {dim}",
+            f"- **L2 norm:** {norm:.4f}",
+            f"- **first {k} values:** {head}",
+        ]
+    )
+def _nearest_markdown(
+    encoder: TinyModelRuntime,
+    query: str,
+    candidates: list[str],
+    *,
+    top_k: int,
+) -> str:
+    hits = encoder.retrieve(query, candidates, top_k=top_k)
+    if not hits:
+        return "(No candidates.)"
+    lines = ["### Encoder nearest neighbors (cosine on pooled embeddings)\n"]
+    for rank, h in enumerate(hits, 1):
+        lines.append(
+            f"**#{rank}** score={h.score:.4f} · index={h.index}\n{_clip(h.text, 700)}\n"
+        )
+    return "\n".join(lines)
 def _classifier_result_markdown(probs: dict[str, float]) -> str:
     ranked = sorted(probs.items(), key=lambda x: -x[1])
     top_lab, top_p = ranked[0]
             out.append(f"**#{i}** score={sc:.4f}\n{_clip(txt, 700)}\n")
         return "\n".join(out)
+    if intent == "similarity":
+        if not encoder:
+            return "Similarity needs the encoder (drop `--lm-only` / `--no-encoder`)."
+        blob = (text or msg).strip()
+        if not blob:
+            return "Provide two texts: `first ||| second`."
+        try:
+            ta, tb = _parse_two_segments(blob)
+        except ValueError as e:
+            return str(e)
+        score = encoder.similarity(ta, tb)
+        return (
+            "### Similarity (encoder cosine)\n"
+            f"**Score:** {score:.4f}\n\n"
+            f"**A:** {_clip(ta, 480)}\n\n"
+            f"**B:** {_clip(tb, 480)}"
+        )
+    if intent == "embedding":
+        if not encoder:
+            return "Embedding stats need the encoder (drop `--lm-only` / `--no-encoder`)."
+        passage = (text or msg).strip()
+        if not passage:
+            return "What text should I embed?"
+        return _embedding_summary_markdown(encoder, passage)
+    if intent == "nearest":
+        if not encoder:
+            return "Nearest-neighbor search needs the encoder (drop `--lm-only` / `--no-encoder`)."
+        blob = (text or msg).strip()
+        if not blob:
+            return "Usage: `query ||| option1 ||| option2 ...`"
+        try:
+            query, cands = _parse_nearest_blob(blob)
+        except ValueError as e:
+            return str(e)
+        k = max(1, min(rag_top_k, len(cands)))
+        return _nearest_markdown(encoder, query, cands, top_k=k)
     if intent in ("summarize", "reformulate", "grounded"):
         if intent == "grounded":
             qn = question or text
     return ""
+def handle_nl_control(
+    msg: str,
+    session: dict[str, Any],
+    *,
+    mem_conn: sqlite3.Connection | None,
+    scope_key: str,
+    rag_chunks_base: list[str] | None,
+    locked_no_smart_route: bool,
+) -> str | None:
+    act = parse_control_action(msg)
+    if act is None:
+        return None
+    if act.name == "show_session":
+        bits = [
+            f"- scope: `{scope_key}`",
+            f"- smart routing: **{'on' if session.get('smart_route') and not locked_no_smart_route else 'off'}**",
+            f"- FAQ context: **{'on' if session.get('rag') and rag_chunks_base is not None else 'off'}**",
+            f"- brain trace footer: **{'on' if session.get('trace') else 'off'}**",
+            f"- memory store: **{'on' if mem_conn is not None else 'off'}**",
+            f"- reply length: **{session.get('verbosity', 'normal')}**",
+            f"- lists: **{'bullets when helpful' if session.get('reply_format') == 'bullets' else 'prose'}**",
+            f"- FAQ grounding: **{session.get('faq_grounding', 'normal')}**",
+            f"- audience: **{session.get('audience', 'normal')}**",
+            f"- answer opening: **{session.get('answer_lead', 'normal')}**",
+            f"- procedure steps: **{session.get('step_style', 'normal')}**",
+            f"- confidence tone: **{session.get('confidence_tone', 'normal')}**",
+            f"- follow-up ending: **{session.get('followup_close', 'normal')}**",
+            f"- concept order: **{session.get('exposition_order', 'normal')}**",
+            f"- examples: **{session.get('example_density', 'normal')}**",
+            f"- comparisons: **{session.get('comparison_frame', 'normal')}**",
+            f"- register: **{session.get('register_tone', 'normal')}**",
+            f"- code blocks: **{session.get('code_block_style', 'normal')}**",
+            f"- analogies: **{session.get('analogy_use', 'normal')}**",
+            f"- acronyms: **{session.get('acronym_style', 'normal')}**",
+            f"- clarify-first: **{session.get('clarify_first', 'normal')}**",
+            f"- speculation: **{session.get('speculation', 'normal')}**",
+            f"- math detail: **{session.get('math_detail', 'normal')}**",
+            f"- output format: **{session.get('output_format', 'normal')}**",
+            f"- risk posture: **{session.get('risk_posture', 'normal')}**",
+            f"- actionability: **{session.get('actionability', 'normal')}**",
+            f"- quote style: **{session.get('quote_style', 'normal')}**",
+            f"- tables: **{session.get('table_style', 'normal')}**",
+            f"- emoji: **{session.get('emoji_style', 'normal')}**",
+            f"- section headings: **{session.get('section_headings', 'normal')}**",
+            f"- term emphasis: **{session.get('term_emphasis', 'normal')}**",
+            f"- counterpoints: **{session.get('counterpoint_tone', 'normal')}**",
+        ]
+        return "### Session settings\n" + "\n".join(bits)
+    if act.name == "new_private_session":
+        # Keep it readable and low-collision; not a secret, just a scope id.
+        new_scope = f"ub-{uuid.uuid4().hex[:8]}"
+        session["scope_key"] = new_scope
+        return (
+            f"**Started a new private session scope.**\n\n"
+            f"Current scope is now `{new_scope}`.\n"
+            "Memory operations (remember/export/forget) will apply to this new scope."
+        )
+    if act.name == "set_scope":
+        if not act.value:
+            return "Tell me the scope key, e.g. `Switch to scope demo-123`."
+        session["scope_key"] = act.value
+        return f"Switched session scope to `{act.value}`."
+    if act.name == "export_memory":
+        if mem_conn is None:
+            return "Memory is off for this Space (no SQLite store); nothing to export."
+        blob = export_scope_json(mem_conn, scope_key)
+        js = json.dumps(blob, indent=2, ensure_ascii=False)
+        max_chars = 48_000
+        if len(js) > max_chars:
+            js = js[:max_chars] + "\n…(truncated for chat; schema is horizon3_export/1.0)…"
+        return f"### Memory export (`{scope_key}`)\nPaste/save externally if needed.\n\n```json\n{js}\n```"
+    if act.name == "forget_scope":
+        if mem_conn is None:
+            return "Memory is off; nothing to delete."
+        n = forget_scope(mem_conn, scope_key)
+        return (
+            f"**Erased stored memory for this Space session.**\n\n"
+            f"Deleted **{n}** row(s) (**session + long-term**) for `{scope_key}`."
+        )
+    if act.name == "list_memories":
+        if mem_conn is None:
+            return "Memory is off."
+        items = list_for_scope(mem_conn, scope_key)
+        if not items:
+            return "(No saved notes for this scope.)"
+        lines = [f"- **{it.kind}** · {_clip(it.content, 320)}" for it in items[:24]]
+        extra = f"\n\n… {len(items) - 24} more" if len(items) > 24 else ""
+        return "**Saved notes:**\n" + "\n".join(lines) + extra
+    if act.name == "clear_session":
+        if mem_conn is None:
+            return "Memory is off."
+        n = clear_session(mem_conn, scope_key)
+        return f"Cleared **{n}** session note(s). Long-term notes unchanged."
+    if act.name == "set_trace":
+        session["trace"] = act.value == "on"
+        return f"**Brain trace** is now **{'on' if session['trace'] else 'off'}** (footer on assistant replies)."
+    if act.name == "set_smart_route":
+        if locked_no_smart_route:
+            return "Smart routing is **locked off** for this server (`--no-smart-route`)."
+        session["smart_route"] = act.value == "on"
+        return (
+            f"**Smart routing** is now **{'on' if session['smart_route'] else 'off'}** "
+            "(off = plain chat + FAQ context injection + slash shortcuts only)."
+        )
+    if act.name == "set_rag":
+        if rag_chunks_base is None:
+            return "FAQ/RAG corpus is **not loaded** on this deployment; nothing to toggle."
+        session["rag"] = act.value == "on"
+        return (
+            f"**FAQ/RAG excerpts in prompts** are now **{'on' if session['rag'] else 'off'}**."
+        )
+    if act.name == "reset_reply_style":
+        session["verbosity"] = "normal"
+        session["reply_format"] = "prose"
+        session["faq_grounding"] = "normal"
+        session["audience"] = "normal"
+        session["answer_lead"] = "normal"
+        session["step_style"] = "normal"
+        session["confidence_tone"] = "normal"
+        session["followup_close"] = "normal"
+        session["exposition_order"] = "normal"
+        session["example_density"] = "normal"
+        session["comparison_frame"] = "normal"
+        session["register_tone"] = "normal"
+        session["code_block_style"] = "normal"
+        session["analogy_use"] = "normal"
+        session["acronym_style"] = "normal"
+        session["clarify_first"] = "normal"
+        session["speculation"] = "normal"
+        session["math_detail"] = "normal"
+        session["output_format"] = "normal"
+        session["risk_posture"] = "normal"
+        session["actionability"] = "normal"
+        session["quote_style"] = "normal"
+        session["table_style"] = "normal"
+        session["emoji_style"] = "normal"
+        session["section_headings"] = "normal"
+        session["term_emphasis"] = "normal"
+        session["counterpoint_tone"] = "normal"
+        return (
+            "**Reply style reset:** normal length, prose, balanced FAQ grounding, general audience, "
+            "default opening, default steps, normal confidence tone, default follow-ups, default concept order, "
+            "default examples, default comparisons, default register, default code blocks, default analogies, "
+            "default acronyms, default clarify mode, default speculation, default math detail, default output format, "
+            "default risk posture, default actionability, default quote style, default tables, default emoji, "
+            "default section headings, default term emphasis, default counterpoints."
+        )
+    if act.name == "set_verbosity":
+        v = (act.value or "normal").lower()
+        if v not in ("brief", "normal", "detailed"):
+            v = "normal"
+        session["verbosity"] = v
+        return f"**Reply length** is now **{v}** (applies to assistant chat replies)."
+    if act.name == "set_reply_format":
+        f = (act.value or "prose").lower()
+        if f not in ("prose", "bullets"):
+            f = "prose"
+        session["reply_format"] = f
+        return f"**List formatting** is now **{f}** (how the assistant structures multi-point answers)."
+    if act.name == "set_faq_grounding":
+        mode = (act.value or "normal").lower()
+        if mode not in ("strict", "normal", "relaxed"):
+            mode = "normal"
+        session["faq_grounding"] = mode
+        extra = ""
+        if rag_chunks_base is None or not session.get("rag", True):
+            extra = (
+                "\n\n**Note:** FAQ excerpt injection is currently **off** in this chat session "
+                "(or no FAQ corpus loaded). Grounding hints apply whenever FAQ snippets are present."
+            )
+        return f"**FAQ grounding** is now **{mode}**.{extra}"
+    if act.name == "set_audience":
+        aud = (act.value or "normal").lower()
+        if aud not in ("simple", "normal", "technical"):
+            aud = "normal"
+        session["audience"] = aud
+        label = {"simple": "beginner-friendly", "normal": "general", "technical": "technical"}.get(aud, aud)
+        return f"**Audience** is now **{label}** (how deep or jargon-heavy explanations should feel)."
+    if act.name == "set_answer_lead":
+        lead = (act.value or "normal").lower()
+        if lead not in ("tldr_first", "direct", "normal"):
+            lead = "normal"
+        session["answer_lead"] = lead
+        human = {"tldr_first": "TL;DR first line", "direct": "straight in (no TL;DR line)", "normal": "default"}.get(
+            lead, lead
+        )
+        return f"**Answer opening** is now **{human}**."
+    if act.name == "set_step_style":
+        st = (act.value or "normal").lower()
+        if st not in ("numbered", "continuous", "normal"):
+            st = "normal"
+        session["step_style"] = st
+        human = {
+            "numbered": "numbered steps when explaining procedures",
+            "continuous": "continuous prose (avoid numbered step lists)",
+            "normal": "default",
+        }.get(st, st)
+        return f"**Procedure layout** is now **{human}**."
+    if act.name == "set_confidence_tone":
+        ct = (act.value or "normal").lower()
+        if ct not in ("transparent", "assertive", "normal"):
+            ct = "normal"
+        session["confidence_tone"] = ct
+        human = {
+            "transparent": "flag limits and assumptions",
+            "assertive": "decisive, minimal hedging",
+            "normal": "default",
+        }.get(ct, ct)
+        return f"**Confidence tone** is now **{human}**."
+    if act.name == "set_followup_close":
+        fu = (act.value or "normal").lower()
+        if fu not in ("suggest", "minimal", "normal"):
+            fu = "normal"
+        session["followup_close"] = fu
+        human = {
+            "suggest": "offer brief next steps / follow-ups when useful",
+            "minimal": "no rhetorical closing questions",
+            "normal": "default",
+        }.get(fu, fu)
+        return f"**Follow-up closing** is now **{human}**."
+    if act.name == "set_exposition_order":
+        eo = (act.value or "normal").lower()
+        if eo not in ("definitions_first", "intuition_first", "normal"):
+            eo = "normal"
+        session["exposition_order"] = eo
+        human = {
+            "definitions_first": "definitions and terms before intuition",
+            "intuition_first": "big-picture intuition before formal detail",
+            "normal": "default",
+        }.get(eo, eo)
+        return f"**Concept order** is now **{human}**."
+    if act.name == "set_example_density":
+        ed = (act.value or "normal").lower()
+        if ed not in ("rich", "sparse", "normal"):
+            ed = "normal"
+        session["example_density"] = ed
+        human = {
+            "rich": "include concrete examples when they help",
+            "sparse": "minimal examples unless asked",
+            "normal": "default",
+        }.get(ed, ed)
+        return f"**Examples** preference is now **{human}**."
+    if act.name == "set_comparison_frame":
+        cf = (act.value or "normal").lower()
+        if cf not in ("pros_cons", "narrative", "normal"):
+            cf = "normal"
+        session["comparison_frame"] = cf
+        human = {
+            "pros_cons": "explicit Pros / Cons sections for trade-offs",
+            "narrative": "flowing prose comparisons (no rigid Pros/Cons headings)",
+            "normal": "default",
+        }.get(cf, cf)
+        return f"**Comparison layout** is now **{human}**."
+    if act.name == "set_register_tone":
+        rt = (act.value or "normal").lower()
+        if rt not in ("formal", "casual", "normal"):
+            rt = "normal"
+        session["register_tone"] = rt
+        human = {
+            "formal": "professional / polished wording",
+            "casual": "friendly conversational wording",
+            "normal": "default",
+        }.get(rt, rt)
+        return f"**Register** is now **{human}**."
+    if act.name == "set_code_block_style":
+        cs = (act.value or "normal").lower()
+        if cs not in ("fenced", "inline", "normal"):
+            cs = "normal"
+        session["code_block_style"] = cs
+        human = {
+            "fenced": "use ``` fenced blocks for multi-line code",
+            "inline": "prefer inline `backticks`, avoid large fences",
+            "normal": "default",
+        }.get(cs, cs)
+        return f"**Code markdown** is now **{human}**."
+    if act.name == "set_analogy_use":
+        au = (act.value or "normal").lower()
+        if au not in ("prefer", "avoid", "normal"):
+            au = "normal"
+        session["analogy_use"] = au
+        human = {
+            "prefer": "use concise analogies when they clarify",
+            "avoid": "literal wording; skip analogies and metaphors",
+            "normal": "default",
+        }.get(au, au)
+        return f"**Analogy usage** is now **{human}**."
+    if act.name == "set_acronym_style":
+        ac = (act.value or "normal").lower()
+        if ac not in ("spell_out", "terse", "normal"):
+            ac = "normal"
+        session["acronym_style"] = ac
+        human = {
+            "spell_out": "expand unfamiliar acronyms on first mention",
+            "terse": "keep acronym forms without spelling them out first",
+            "normal": "default",
+        }.get(ac, ac)
+        return f"**Acronym style** is now **{human}**."
+    if act.name == "set_clarify_first":
+        cf = (act.value or "normal").lower()
+        if cf not in ("on", "off", "normal"):
+            cf = "normal"
+        session["clarify_first"] = cf
+        human = {
+            "on": "ask 1–3 targeted clarifying questions before answering when info is missing",
+            "off": "answer immediately; do not ask clarifying questions first",
+            "normal": "default",
+        }.get(cf, cf)
+        return f"**Clarify-first** is now **{human}**."
+    if act.name == "set_speculation":
+        sp = (act.value or "normal").lower()
+        if sp not in ("strict", "creative", "normal"):
+            sp = "normal"
+        session["speculation"] = sp
+        human = {
+            "strict": "avoid guessing; stick to high-confidence statements",
+            "creative": "brainstorm and speculate (label assumptions clearly)",
+            "normal": "default",
+        }.get(sp, sp)
+        return f"**Speculation level** is now **{human}**."
+    if act.name == "set_math_detail":
+        md = (act.value or "normal").lower()
+        if md not in ("show_work", "final_only", "normal"):
+            md = "normal"
+        session["math_detail"] = md
+        human = {
+            "show_work": "show intermediate steps/derivation when doing math-like reasoning",
+            "final_only": "final results only (no derivation/steps)",
+            "normal": "default",
+        }.get(md, md)
+        return f"**Math detail** is now **{human}**."
+    if act.name == "set_output_format":
+        of = (act.value or "normal").lower()
+        if of not in ("json", "plain", "normal"):
+            of = "normal"
+        session["output_format"] = of
+        human = {
+            "json": "reply in a JSON-shaped object when possible",
+            "plain": "plain text (no forced JSON structure)",
+            "normal": "default",
+        }.get(of, of)
+        return f"**Output format** is now **{human}**."
+    if act.name == "set_risk_posture":
+        rp = (act.value or "normal").lower()
+        if rp not in ("conservative", "pragmatic", "normal"):
+            rp = "normal"
+        session["risk_posture"] = rp
+        human = {
+            "conservative": "risk-averse / safety-first recommendations",
+            "pragmatic": "practical, speed-oriented recommendations",
+            "normal": "default",
+        }.get(rp, rp)
+        return f"**Risk posture** is now **{human}**."
+    if act.name == "set_actionability":
+        ac = (act.value or "normal").lower()
+        if ac not in ("commands", "conceptual", "normal"):
+            ac = "normal"
+        session["actionability"] = ac
+        human = {
+            "commands": "include runnable commands/snippets when possible",
+            "conceptual": "avoid commands; stay conceptual/high-level",
+            "normal": "default",
+        }.get(ac, ac)
+        return f"**Actionability** is now **{human}**."
+    if act.name == "set_quote_style":
+        qs = (act.value or "normal").lower()
+        if qs not in ("quote", "paraphrase", "normal"):
+            qs = "normal"
+        session["quote_style"] = qs
+        human = {
+            "quote": "prefer short direct quotes when relying on FAQ excerpts",
+            "paraphrase": "paraphrase excerpts; avoid quoting",
+            "normal": "default",
+        }.get(qs, qs)
+        return f"**Quote style** is now **{human}**."
+    if act.name == "set_table_style":
+        ts = (act.value or "normal").lower()
+        if ts not in ("prefer", "avoid", "normal"):
+            ts = "normal"
+        session["table_style"] = ts
+        human = {
+            "prefer": "use markdown tables when presenting structured comparisons",
+            "avoid": "avoid tables; use bullets/prose instead",
+            "normal": "default",
+        }.get(ts, ts)
+        return f"**Tables** preference is now **{human}**."
+    if act.name == "set_emoji_style":
+        es = (act.value or "normal").lower()
+        if es not in ("include", "avoid", "normal"):
+            es = "normal"
+        session["emoji_style"] = es
+        human = {
+            "include": "a few tasteful emoji are welcome when they aid scanning",
+            "avoid": "no emoji unless the user uses them first",
+            "normal": "default",
+        }.get(es, es)
+        return f"**Emoji style** is now **{human}**."
+    if act.name == "set_section_headings":
+        sh = (act.value or "normal").lower()
+        if sh not in ("prefer", "avoid", "normal"):
+            sh = "normal"
+        session["section_headings"] = sh
+        human = {
+            "prefer": "use markdown ##/### headings to structure longer answers",
+            "avoid": "avoid markdown heading lines; keep flowing paragraphs/lists",
+            "normal": "default",
+        }.get(sh, sh)
+        return f"**Section headings** preference is now **{human}**."
+    if act.name == "set_term_emphasis":
+        te = (act.value or "normal").lower()
+        if te not in ("highlight", "minimal", "normal"):
+            te = "normal"
+        session["term_emphasis"] = te
+        human = {
+            "highlight": "bold a few crucial terms/phrases for scanability",
+            "minimal": "avoid decorative bold; use it sparingly",
+            "normal": "default",
+        }.get(te, te)
+        return f"**Term emphasis** is now **{human}**."
+    if act.name == "set_counterpoint_tone":
+        cp = (act.value or "normal").lower()
+        if cp not in ("challenge", "supportive", "normal"):
+            cp = "normal"
+        session["counterpoint_tone"] = cp
+        human = {
+            "challenge": "look for gaps; name risks and counterarguments respectfully",
+            "supportive": "prioritize encouragement and constructive framing",
+            "normal": "default",
+        }.get(cp, cp)
+        return f"**Counterpoint tone** is now **{human}**."
+    return None
# Ordered (session key -> {option value -> prompt hint}) table. The order of the
# entries is the order in which hints are emitted. Option values that are absent
# from a mapping (including the neutral "normal"/"prose" defaults and anything
# unrecognised) deliberately produce no hint.
_REPLY_STYLE_HINT_TABLE: tuple[tuple[str, dict[str, str]], ...] = (
    (
        "verbosity",
        {
            "brief": (
                "Keep replies concise (about a short paragraph or less) unless the user explicitly asks for depth."
            ),
            "detailed": "Prefer fuller, well-structured explanations when they help the user.",
        },
    ),
    (
        "reply_format",
        {
            "bullets": "When listing multiple points, use markdown bullet or numbered lists.",
        },
    ),
    (
        "audience",
        {
            "simple": (
                "Assume the reader is new to the topic: define jargon when you use it, prefer plain language and small steps."
            ),
            "technical": (
                "Assume a technical reader: standard domain terms and shorthand are fine; prioritize precision over hand-holding."
            ),
        },
    ),
    (
        "answer_lead",
        {
            "tldr_first": (
                "Start substantive answers with one short **TL;DR:** line (one sentence), then elaborate."
            ),
            "direct": (
                "Do not add a standalone TL;DR/summary prelude; answer immediately in-flow (still use lists if configured)."
            ),
        },
    ),
    (
        "step_style",
        {
            "numbered": (
                "When explaining procedures or multi-part how-tos, structure the answer with clear **numbered steps** "
                "(1. 2. 3.) and one action per step when practical."
            ),
            "continuous": (
                "Avoid numbered step lists; explain procedures as **connected paragraphs** unless the user explicitly "
                "asks for steps."
            ),
        },
    ),
    (
        "confidence_tone",
        {
            "transparent": (
                "Be explicit about uncertainty: say when you are guessing, label key assumptions, and avoid overstating "
                "facts you cannot support from the prompt or supplied excerpts."
            ),
            "assertive": (
                "Answer in a direct, confident tone: minimize throat-clearing and hedging unless a short disclaimer is "
                "truly necessary for safety or policy."
            ),
        },
    ),
    (
        "followup_close",
        {
            "suggest": (
                "When helpful, end with concise **optional next steps** or a short **follow-up invitation** "
                '(e.g., one line like "Want me to drill into X?" — optional, not repetitive).'
            ),
            "minimal": (
                "Avoid stock closers such as prompting whether the user needs anything else unless they explicitly invite it; "
                "finish crisply after the core answer."
            ),
        },
    ),
    (
        "exposition_order",
        {
            "definitions_first": (
                "Prefer stating **definitions and key terms upfront**, then intuition, analogies, and examples."
            ),
            "intuition_first": (
                "Prefer a short **motivation / big-picture intuition** section first, then formal definitions and details."
            ),
        },
    ),
    (
        "example_density",
        {
            "rich": (
                "When it clarifies the answer, include at least one **short concrete example** or miniature scenario."
            ),
            "sparse": (
                "Unless the user explicitly requests an example, keep answers **example-free** (no illustrative stories)."
            ),
        },
    ),
    (
        "comparison_frame",
        {
            "pros_cons": (
                "For trade-offs or comparing options, use markdown subheadings **Pros** and **Cons** (short bullets under each)."
            ),
            "narrative": (
                "For trade-offs or comparing options, weave pros/cons into **continuous prose** rather than labeled sections."
            ),
        },
    ),
    (
        "register_tone",
        {
            "formal": (
                "Use a **polished professional register**: clear sentences, minimal slang/emoji unless the topic demands it."
            ),
            "casual": (
                "**Conversational register** is preferred: contractions and light phrasing are fine; sound like a helpful teammate."
            ),
        },
    ),
    (
        "code_block_style",
        {
            "fenced": (
                "For multi-line commands or code, use **markdown fenced code blocks** with a language hint when recognizable."
            ),
            "inline": (
                "Prefer **inline backticks** for short snippets; **avoid triple-backtick fences** unless the user pastes a block."
            ),
        },
    ),
    (
        "analogy_use",
        {
            "prefer": (
                "When stuck on an abstract concept, optionally add **one tight analogy/metaphor** (label it plainly; keep it respectful)."
            ),
            "avoid": (
                "Keep explanations **literal and direct**: do **not** use analogies, metaphors, or cute comparisons."
            ),
        },
    ),
    (
        "acronym_style",
        {
            "spell_out": (
                'On **first substantive mention** of a non-obvious acronym/title-case initialism (e.g. API, SLA), '
                'write the **expanded form once** (`Long Form (ACRONYM)`), then use the acronym afterwards.'
            ),
            "terse": (
                "Assume the reader is acronym-literate: **reuse acronyms** as written without mandatory expansion."
            ),
        },
    ),
    (
        "clarify_first",
        {
            "on": (
                "If the request is underspecified, ask **1–3 short clarifying questions first** (only the minimum needed), "
                "then wait for the user's answers before giving a full solution."
            ),
            "off": (
                "Do not pause to ask clarifying questions first; provide the best answer immediately and note assumptions briefly."
            ),
        },
    ),
    (
        "speculation",
        {
            "strict": (
                "Avoid speculation: prefer high-confidence statements, and say when something is unknown or not supported by the prompt."
            ),
            "creative": (
                "Brainstorming is allowed: you may propose speculative ideas, but label assumptions and uncertainty clearly."
            ),
        },
    ),
    (
        "math_detail",
        {
            "show_work": (
                "When the user asks for math/derivations, show concise intermediate steps and explain symbols briefly."
            ),
            "final_only": (
                "When the user asks for math/derivations, give the final result directly (no intermediate derivation)."
            ),
        },
    ),
    (
        "output_format",
        {
            "json": (
                "When appropriate, format the answer as a single JSON object with stable keys; avoid extra prose outside the JSON."
            ),
            "plain": "Do not force JSON or rigid schemas; answer in normal plain text.",
        },
    ),
    (
        "risk_posture",
        {
            "conservative": (
                "Prefer safer, low-risk recommendations; call out risks and choose options that minimize downside."
            ),
            "pragmatic": (
                "Prefer practical, time-efficient recommendations; avoid over-engineering unless clearly needed."
            ),
        },
    ),
    (
        "actionability",
        {
            "commands": (
                "When proposing a solution, include runnable commands/snippets/checklists where appropriate."
            ),
            "conceptual": "Avoid command dumps; focus on concepts, rationale, and decision points.",
        },
    ),
    (
        "quote_style",
        {
            "quote": (
                "When you rely on an injected **[FAQ excerpt N]**, include a short verbatim quote (a sentence or clause) "
                "before paraphrasing."
            ),
            "paraphrase": (
                "Prefer paraphrasing FAQ excerpts; avoid quoting unless the user asks for exact wording."
            ),
        },
    ),
    (
        "table_style",
        {
            "prefer": (
                "When comparing several options, prefer a **markdown table** if it makes the structure clearer."
            ),
            "avoid": "Avoid markdown tables; use bullets or short sections instead.",
        },
    ),
    (
        "emoji_style",
        {
            "include": (
                "You may use a few tasteful emoji in replies when they help readability (keep it sparse and professional)."
            ),
            "avoid": "Do not use emoji in replies unless the user explicitly uses emoji first.",
        },
    ),
    (
        "section_headings",
        {
            "prefer": (
                "For multi-part answers, organize with short **markdown headings** (## / ###) before each major block."
            ),
            "avoid": (
                "Avoid leading lines that look like markdown headings (no `#` / `##` title lines); use bold inline labels or paragraphs instead."
            ),
        },
    ),
    (
        "term_emphasis",
        {
            "highlight": (
                "Use **bold** on a handful of key terms or short phrases (not whole sentences) to help the reader scan."
            ),
            "minimal": (
                "Keep inline **bold** rare; prefer plain text unless emphasis is truly needed for clarity."
            ),
        },
    ),
    (
        "counterpoint_tone",
        {
            "challenge": (
                "Briefly stress-test the user's plan: note plausible failure modes, missing constraints, or stronger "
                "alternatives—stay respectful and specific."
            ),
            "supportive": (
                "Lean supportive: acknowledge effort, frame improvements as next steps, and avoid needless harsh critique."
            ),
        },
    ),
    (
        # "normal" grounding has no entry: the default behavior relies on the FAQ
        # block wording without duplicating instructions here.
        "faq_grounding",
        {
            "strict": (
                "FAQ grounding (strict): Treat product/process/policy claims as supported only when clearly stated in "
                "the FAQ excerpts provided in this turn. If not stated there, say you are unsure or that it is outside "
                "the provided FAQ. When you rely on an excerpt, cite it as **[FAQ excerpt N]** matching the numbered "
                "excerpt headings you were given."
            ),
            "relaxed": (
                "FAQ grounding (relaxed): Prefer the supplied FAQ excerpts for product/support specifics, but you may add "
                "brief general-knowledge context if you clearly separate it from anything implied by FAQ text."
            ),
        },
    ),
)


def _append_reply_style_hints(extras: list[str], session: dict[str, Any]) -> None:
    """Append one "preferred reply style" block to ``extras`` based on ``session``.

    Each known style key is read from ``session``, stringified and lower-cased.
    Missing or falsy values, the neutral defaults, and unrecognised values
    contribute nothing. Every other recognised value contributes one bullet,
    in table order.

    Args:
        extras: List of extra prompt fragments; mutated in place. At most one
            string is appended, and nothing is appended when no preference
            deviates from its default.
        session: Per-chat settings mapping; only read, never modified.
    """
    hints: list[str] = []
    for key, hint_by_option in _REPLY_STYLE_HINT_TABLE:
        # Falsy settings collapse to "normal", which is never a key of any hint mapping.
        option = str(session.get(key) or "normal").lower()
        hint = hint_by_option.get(option)
        if hint is not None:
            hints.append(hint)
    if not hints:
        return
    bullets = "\n".join(f"- {hint}" for hint in hints)
    extras.append("Preferred reply style for this chat session:\n" + bullets)
 def handle_slash(
     msg: str,
     *,
             out.append(f"**#{i}** score={sc:.4f}\n{_clip(txt, 700)}\n")
         return "\n".join(out)
+    if cmd == "/similarity":
+        if not encoder:
+            return "Encoder off. Drop `--lm-only` / `--no-encoder`."
+        if "|||" not in rest:
+            return "Usage: `/similarity text A ||| text B`"
+        try:
+            ta, tb = _parse_two_segments(rest)
+        except ValueError as e:
+            return str(e)
+        score = encoder.similarity(ta, tb)
+        return (
+            f"**Similarity:** {score:.4f}\n\n**A:** {_clip(ta, 480)}\n\n**B:** {_clip(tb, 480)}"
+        )
+    if cmd in ("/embedding", "/embed"):
+        if not encoder:
+            return "Encoder off. Drop `--lm-only` / `--no-encoder`."
+        if not rest:
+            return f"Usage: `{cmd} <text>`"
+        return _embedding_summary_markdown(encoder, rest)
+    if cmd == "/nearest":
+        if not encoder:
+            return "Encoder off. Drop `--lm-only` / `--no-encoder`."
+        if "|||" not in rest:
+            return "Usage: `/nearest query ||| cand1 ||| cand2 ...`"
+        try:
+            qn, cands = _parse_nearest_blob(rest)
+        except ValueError as e:
+            return str(e)
+        k = max(1, min(rag_top_k, len(cands)))
+        return _nearest_markdown(encoder, qn, cands, top_k=k)
     if cmd in ("/summarize", "/reformulate", "/grounded"):
         if lm is None:
             return "Generative model not loaded."
     print(f"Loading generative model {mid!r} on {dev!r} ...", flush=True)
     lm = load_causal_lm(mid, dev)
     turn_counter = {"n": 0}
+    # Default per-session control state; this dict is the initial value of the
+    # ``gr.State`` that is passed into and returned from every ``respond`` call.
+    #   trace       - on only when tracing is not disabled on the CLI and at
+    #                 least one of encoder / memory connection / RAG chunks exists.
+    #   smart_route - on unless ``args.no_smart_route`` is set.
+    #   rag         - on only when RAG chunks were loaded.
+    #   scope_key   - starts at ``args.memory_scope``.
+    # Every remaining key starts at "normal", except "reply_format" ("prose").
+    initial_ub_session = {
+        "trace": not args.no_trace
+        and (encoder is not None or mem_conn is not None or (rag_chunks is not None)),
+        "smart_route": not args.no_smart_route,
+        "rag": rag_chunks is not None,
+        "scope_key": args.memory_scope,
+        "verbosity": "normal",
+        "reply_format": "prose",
+        "faq_grounding": "normal",
+        "audience": "normal",
+        "answer_lead": "normal",
+        "step_style": "normal",
+        "confidence_tone": "normal",
+        "followup_close": "normal",
+        "exposition_order": "normal",
+        "example_density": "normal",
+        "comparison_frame": "normal",
+        "register_tone": "normal",
+        "code_block_style": "normal",
+        "analogy_use": "normal",
+        "acronym_style": "normal",
+        "clarify_first": "normal",
+        "speculation": "normal",
+        "math_detail": "normal",
+        "output_format": "normal",
+        "risk_posture": "normal",
+        "actionability": "normal",
+        "quote_style": "normal",
+        "table_style": "normal",
+        "emoji_style": "normal",
+        "section_headings": "normal",
+        "term_emphasis": "normal",
+        "counterpoint_tone": "normal",
+    }
     def respond(
         message: str,
         history: list[dict],
+        ub_session: dict[str, Any],
+    ) -> tuple[str, list[dict], dict[str, Any]]:
+        """Handle one chat turn and return ``(textbox_value, history, ub_session)``.
+
+        The first element is ``""`` on every visible return path. ``history`` is
+        copied into a new list before the user/assistant messages are appended.
+        A blank message returns immediately without appending anything.
+
+        Visible handling order: slash commands (``handle_slash``), natural-language
+        session controls (``handle_nl_control``), optional smart routing
+        (``infer_route``), then the regular generation path with encoder, RAG and
+        memory context.
+
+        NOTE(review): this view is a diff with context elided between hunks, so
+        some statements of the body are not shown here.
+        """
         msg = (message or "").strip()
         hist = list(history or [])
         if not msg:
+            return "", hist, ub_session
+        # Per-turn seed: base seed plus the shared turn counter, wrapped below 2**31.
         turn_counter["n"] += 1
         seed = (args.seed + turn_counter["n"]) % (2**31)
+        # The session's scope key takes precedence; fall back to the CLI default.
+        cur_scope = str(ub_session.get("scope_key") or args.memory_scope)
         slash_out = handle_slash(
             msg,
             lm=lm,
             mem_conn=mem_conn,
+            scope_key=cur_scope,
             encoder=encoder,
             rag_chunks=rag_chunks,
             rag_top_k=args.rag_top_k,
         if slash_out is not None:
             hist.append({"role": "user", "content": msg})
             hist.append({"role": "assistant", "content": slash_out})
+            return "", hist, ub_session
+        nl_out = handle_nl_control(
+            msg,
+            ub_session,
+            mem_conn=mem_conn,
+            scope_key=cur_scope,
+            rag_chunks_base=rag_chunks,
+            locked_no_smart_route=args.no_smart_route,
+        )
+        if nl_out is not None:
+            hist.append({"role": "user", "content": msg})
+            hist.append({"role": "assistant", "content": nl_out})
+            return "", hist, ub_session
+        # RAG chunks are used only when they were loaded AND the session has RAG on.
+        effective_rag = (
+            rag_chunks if rag_chunks is not None and ub_session.get("rag") else None
+        )
+        # ``args.no_smart_route`` always overrides the per-session toggle.
+        use_smart = bool(ub_session.get("smart_route")) and not args.no_smart_route
         chat_line = msg
+        if use_smart:
             try:
                 route = infer_route(
                     lm,
                     msg=msg,
                     lm=lm,
                     mem_conn=mem_conn,
+                    scope_key=cur_scope,
                     encoder=encoder,
+                    rag_chunks=effective_rag,
                     rag_top_k=args.rag_top_k,
                     task_max_new_tokens=args.task_max_new_tokens,
                     seed=(seed + 11) % (2**31),
                     foot = f"\n\n---\n*Routed intent:* `{route['intent']}`"
                     hist.append({"role": "user", "content": msg})
                     hist.append({"role": "assistant", "content": tool_reply + foot})
+                    return "", hist, ub_session
             chat_line = route["text"] or msg
         trace: list[str] = []
         extras: list[str] = []
+        _append_reply_style_hints(extras, ub_session)
         if encoder:
             probs = encoder.classify([chat_line])[0]
             )
         rag_block = ""
+        if encoder and effective_rag:
+            hr = hybrid_retrieve(encoder, chat_line, effective_rag, top_k=args.rag_top_k)
             if hr:
                 trace.append(f"RAG:{len(hr)}chunk(s)")
                 pieces = []
                 )
         if mem_conn:
+            items = list_for_scope(mem_conn, cur_scope)
             if items:
                 trace.append(f"mem:{len(items)}item(s)")
                 mem_lines = []
             do_sample=True,
         )
         out = reply or "(empty generation)"
+        # The trace footer needs all of: tracing not disabled on the CLI, the
+        # session's trace flag on, and at least one of encoder / memory / RAG active.
+        show_trace_footer = (
+            (not args.no_trace)
+            and bool(ub_session.get("trace"))
+            and (
+                encoder is not None
+                or mem_conn is not None
+                or effective_rag is not None
+            )
+        )
+        if show_trace_footer and trace:
             out += "\n\n---\n*Brain trace:* " + " · ".join(trace)
         hist.append({"role": "user", "content": msg})
         hist.append({"role": "assistant", "content": out})
+        return "", hist, ub_session
     brain_bits = []
     if encoder:
         brain_bits.append("memory")
     brain_label = "+".join(brain_bits) if brain_bits else "LM only"
+    _css = """
+    /* Space UX: keep the input compact and predictable. */
+    #ub_input textarea { height: 120px !important; }
+    """
+    with gr.Blocks(title="Universal Brain (chat prototype)", css=_css) as demo:
         gr.Markdown(
             "### Universal Brain — chat prototype\n"
             f"**Generative:** `{mid}` ({lm.device}) · **Brain layers:** {brain_label}\n\n"
             "**NL routing:** the model infers what you want (summarize, FAQ search, save note, …). "
             "Use **`--no-smart-route`** for plain chat-only + slash shortcuts. "
             "`/help` lists slash commands.\n\n"
+            "**NL session controls:** say things like "
+            "**`What is my current scope?`**, **`Start a new private session`**, **`Switch to scope my-key`**, "
+            "**`Be brief`**, **`More detail please`**, **`Use bullet points`**, **`Reset reply style`**, "
+            "**`Strict FAQ`** / **`Relaxed FAQ`** / **`Balanced FAQ`**, "
+            "**`ELI5`** / **`Expert mode`**, **`TLDR first`** / **`Answer directly`**, "
+            "**`Step by step`** / **`No numbered steps`**, **`Flag your assumptions`** / **`Be decisive`**, "
+            "**`Suggest next steps`** / **`No follow-up questions`**, **`Definitions first`** / **`Intuition first`**, "
+            "**`Include examples`** / **`Skip examples`**, **`Use pros and cons`** / **`Compare in flowing prose`**, **`Formal tone`** / **`Casual tone`**, **`Use code fences`** / **`Inline code only`**, "
+            "**`Use analogies`** / **`No analogies`**, **`Spell out acronyms`** / **`Don't expand acronyms`**, "
+            "**`Clarify first`** / **`No clarifying questions`**, **`No speculation`** / **`Brainstorm freely`**, "
+            "**`Show your work`** / **`Final answer only`**, **`Answer in JSON`** / **`Plain text only`**, "
+            "**`Be risk averse`** / **`Be pragmatic`**, **`Give me runnable commands`** / **`No commands`**, "
+            "**`Quote the FAQ excerpts`** / **`Paraphrase only`**, **`Use tables`** / **`No tables`**, "
+            "**`Use emoji`** / **`No emoji`**, **`Use section headings`** / **`Flat answer`**, "
+            "**`Bold key terms`** / **`Minimal bold`**, **`Challenge my assumptions`** / **`Be supportive`**, "
+            "**`Export my memories`**, **`Delete all my memories for this chat`**, **`Clear my session notes`**, "
+            "**`Turn off FAQ context`**, **`Turn off smart routing`**, **`Show the brain trace`** "
+            "(no slash command required). See the repo `README` for more example phrases.\n\n"
             "Encoder topics (Hub TinyModel1 ≈ AG News) still feed context and an optional *Brain trace* line; "
             "use `/classify` or ask naturally to see the full probability table in chat."
         )
         chat = gr.Chatbot(type="messages", height=520, label="Conversation", allow_tags=False)
+        ub_state = gr.State(initial_ub_session)
         with gr.Row():
             inp = gr.Textbox(
+                lines=4,
+                max_lines=8,
                 show_label=False,
                 placeholder="Ask in plain language, or use /help …",
                 scale=9,
+                elem_id="ub_input",
             )
             go = gr.Button("Send", variant="primary", scale=1)
         gr.ClearButton([chat, inp])
+        def _submit(
+            m: str,
+            h: list[dict],
+            s: dict[str, Any],
+        ) -> tuple[str, list[dict], dict[str, Any]]:
+            """Forward the textbox value, chat history and session state to ``respond``.
+
+            Shared callback for both the Send button click and the textbox submit
+            event; returns ``respond``'s result unchanged.
+            """
+            return respond(m, h, s)
+        go.click(
+            _submit,
+            [inp, chat, ub_state],
+            [inp, chat, ub_state],
+            api_name="chat",
+            api_description="Universal Brain chat endpoint (routing + optional RAG + memory + classifier context).",
+        )
+        inp.submit(_submit, [inp, chat, ub_state], [inp, chat, ub_state])
     demo.queue(default_concurrency_limit=2)
     share = args.share
             server_port=args.port,
             share=share,
             ssr_mode=False,
+            show_api=True,
         )
     except ValueError as e:
         err = str(e)