Spaces:

chyams
/

llm-explorer

Running on L4

chyams Claude Opus 4.6 committed on 25 days ago

Commit

beb8b02

1 Parent(s): f2b5e3f

System Prompt Explorer: dual model, multi-turn chat, configurable presets

- Dual model architecture: base (Llama-3.2-3B) + chat (Llama-3.2-3B-Instruct)
- Multi-turn chat with gr.State for clean history (Chatbot display-only)
- 11 configurable presets via admin panel or SYSTEM_PROMPT_PRESETS env var
- All config values overridable via env vars (Secrets vs Variables documented)
- No auto-reset on prompt changes; green terminal collapsed by default
- Educational note: no hidden system prompt, helpfulness from RLHF

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (3) hide show

app.py +316 -117
config.json +15 -1
models.py +204 -89

app.py CHANGED Viewed

@@ -17,6 +17,8 @@ from datetime import datetime, timezone, timedelta
 import gradio as gr
 from models import AVAILABLE_MODELS, manager, demo_tokenizer
 # ---------------------------------------------------------------------------
@@ -663,16 +665,12 @@ def tokenize_text(text):
 # Tab 3: System Prompt Explorer
 # ---------------------------------------------------------------------------
-SYSTEM_PROMPT_PRESETS = {
-    "(none)": "",
-    "Helpful Assistant": "You are a helpful, friendly assistant.",
-    "Pirate": "You are a pirate. Respond to everything in pirate speak, using nautical terms and saying 'arr' frequently.",
-    "Formal Academic": "You are a formal academic scholar. Use precise, scholarly language. Cite concepts carefully and avoid casual tone.",
-    "Five-Year-Old": "You are explaining things to a five-year-old. Use very simple words, short sentences, and fun comparisons.",
-    "Hostile / Rude": "You are rude and dismissive. You answer questions but with obvious annoyance and sarcasm.",
-    "Haiku Only": "You must respond only in haiku (5-7-5 syllable format). Never break this rule.",
-    "Spanish Tutor": "You are a Spanish language tutor. Respond in Spanish, then provide the English translation in parentheses.",
-}
 def _esc_terminal(text: str) -> str:
@@ -680,82 +678,139 @@ def _esc_terminal(text: str) -> str:
     return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-def run_system_prompt_explorer(system_prompt, user_message, max_tokens, temperature, seed):
-    """Generate a chat response and return formatted terminal + response HTML."""
-    if not manager.is_ready():
-        return (
-            f"<div class='green-terminal'><span class='sp-special'>Error: {manager.status_message()}</span></div>",
-            "<div class='response-card' style='color:red;'>No model loaded. Load an instruct model from the Admin tab.</div>",
-        )
-    if not manager.is_instruct():
-        return (
-            "<div class='green-terminal'><span class='sp-special'>⚠ Current model is not an instruct model.\n\nLoad an instruct model (e.g. Qwen2.5-3B-Instruct) from the Admin tab.</span></div>",
-            "<div class='response-card'>The System Prompt Explorer requires an instruct/chat model. Base models don't understand system prompts.</div>",
-        )
-    if not user_message.strip():
-        return (
-            "<div class='green-terminal'><span class='sp-special'>Enter a message below and click Generate.</span></div>",
-            "",
-        )
     result = manager.generate_chat(
-        system_prompt=system_prompt,
-        user_message=user_message,
         max_new_tokens=int(max_tokens),
         temperature=temperature,
         seed=int(seed),
     )
     if "error" in result:
-        return (
-            f"<div class='green-terminal'><span class='sp-special'>Error: {_esc_terminal(result['error'])}</span></div>",
-            "",
-        )
-    # Render the formatted prompt in the green terminal
-    formatted = result["formatted_prompt"]
-    # Color-code the special tokens and roles
-    terminal_html = _esc_terminal(formatted)
-    # Highlight Qwen-style chat template tokens
-    for tag in ["<|im_start|>", "<|im_end|>"]:
-        terminal_html = terminal_html.replace(
-            _esc_terminal(tag),
-            f"<span class='sp-special'>{_esc_terminal(tag)}</span>",
-        )
-    terminal_html = terminal_html.replace(
-        "system\n", "<span class='sp-label'>system</span>\n"
-    )
-    terminal_html = terminal_html.replace(
-        "user\n", "<span class='sp-label'>user</span>\n"
-    )
-    terminal_html = terminal_html.replace(
-        "assistant\n", "<span class='sp-label'>assistant</span>\n"
-    )
-    # Highlight the system prompt content
-    if system_prompt.strip():
-        terminal_html = terminal_html.replace(
-            _esc_terminal(system_prompt),
-            f"<span class='sp-system'>{_esc_terminal(system_prompt)}</span>",
-        )
-    # Highlight user message
-    terminal_html = terminal_html.replace(
-        _esc_terminal(user_message),
-        f"<span class='sp-user'>{_esc_terminal(user_message)}</span>",
-    )
-    terminal_out = f"<div class='green-terminal'>{terminal_html}</div>"
-    # Render the response in a clean card
-    response_html = f"<div class='response-card'>{_esc(result['response'])}</div>"
-    return terminal_out, response_html
 def on_preset_change(preset_name):
-    """Update system prompt textbox when a preset is selected."""
-    return SYSTEM_PROMPT_PRESETS.get(preset_name, "")
 # ---------------------------------------------------------------------------
@@ -770,13 +825,21 @@ def admin_login(password):
 def admin_load_model(model_name):
-    """Load a new model from admin panel."""
     status = manager.load_model(model_name)
     cfg = manager.get_config()
     header_status = f"**{manager.status_message()}**"
     return status, json.dumps(cfg, indent=2), header_status
 def admin_save_defaults(prompt, tokenizer_text, temperature, top_k, steps, seed):
     """Save default settings and return updated values for all outputs."""
     manager.update_config(
@@ -800,6 +863,41 @@ def admin_save_defaults(prompt, tokenizer_text, temperature, top_k, steps, seed)
     )
 # ---------------------------------------------------------------------------
 # Build the Gradio app
 # ---------------------------------------------------------------------------
@@ -978,44 +1076,36 @@ def create_app():
             gr.Markdown("### System Prompt Explorer")
             gr.Markdown(
                 "See how **system prompts** change an LLM's behavior. "
                 "The green terminal shows exactly what the model receives — "
-                "special tokens, role labels, and all. "
-                "Try different presets or write your own."
             )
-            # Instruct model notice
-            sp_model_notice = gr.Markdown(
-                "*Requires an instruct model (e.g. Qwen2.5-3B-Instruct). "
-                "Load one from the Admin tab.*",
-                elem_classes=["param-help"],
-            )
             sp_preset = gr.Dropdown(
                 label="Preset",
-                choices=list(SYSTEM_PROMPT_PRESETS.keys()),
-                value="Helpful Assistant",
                 interactive=True,
             )
             sp_system = gr.Textbox(
                 label="System Prompt",
-                value=SYSTEM_PROMPT_PRESETS["Helpful Assistant"],
                 lines=3,
                 placeholder="Enter a system prompt, or select a preset above...",
             )
-            sp_user = gr.Textbox(
-                label="User Message",
-                value="What is Huston-Tillotson University?",
-                lines=2,
-            )
             with gr.Accordion("Settings", open=False):
                 sp_max_tokens = gr.Slider(
                     label="Max tokens",
-                    minimum=32, maximum=512, step=16,
-                    value=256,
                 )
                 gr.Markdown(
-                    "Maximum number of tokens in the response.",
                     elem_classes=["param-help"],
                 )
                 sp_temperature = gr.Slider(
@@ -1029,33 +1119,69 @@ def create_app():
                     precision=0,
                 )
-            sp_generate_btn = gr.Button("Generate", variant="primary")
-            gr.Markdown("#### What the model sees")
             gr.Markdown(
-                "This is the actual text sent to the model, including special tokens "
-                "that mark where system instructions, user messages, and assistant "
-                "responses begin and end.",
                 elem_classes=["param-help"],
             )
-            sp_terminal = gr.HTML(
-                value="<div class='green-terminal'><span class='sp-special'>Select a preset and enter a message, then click Generate.</span></div>",
-            )
-            gr.Markdown("#### Model response")
-            sp_response = gr.HTML(value="")
-            # Wiring
             sp_preset.change(
                 fn=on_preset_change,
                 inputs=[sp_preset],
                 outputs=[sp_system],
             )
-            sp_generate_btn.click(
-                fn=run_system_prompt_explorer,
-                inputs=[sp_system, sp_user, sp_max_tokens, sp_temperature, sp_seed],
-                outputs=[sp_terminal, sp_response],
             )
           # ==================================================================
@@ -1076,16 +1202,26 @@ def create_app():
             # Admin controls (hidden until login)
             with gr.Group(visible=False) as admin_controls:
-                gr.Markdown("#### Model")
                 with gr.Row():
                     admin_model_dropdown = gr.Dropdown(
                         choices=list(AVAILABLE_MODELS.keys()),
-                        value=manager.current_model_name or cfg.get("model", "Qwen2.5-3B"),
                         label="Select model",
                     )
-                    admin_load_btn = gr.Button("Load Model", variant="primary")
                 admin_model_status = gr.Markdown("")
                 gr.Markdown("---")
                 gr.Markdown("#### Default Settings")
                 admin_prompt = gr.Textbox(
@@ -1120,13 +1256,48 @@ def create_app():
                 admin_save_msg = gr.Markdown("")
                 gr.Markdown("---")
-                gr.Markdown("#### Export Slides")
                 gr.Markdown(
-                    "*Uses current settings from Probability Explorer tab.*",
                     elem_classes=["param-help"],
                 )
-                admin_export_btn = gr.Button("Export Slides", variant="secondary")
-                admin_slides_file = gr.File(label="Slideshow", visible=False)
                 gr.Markdown("---")
                 gr.Markdown("#### Current Config")
@@ -1136,6 +1307,15 @@ def create_app():
                     interactive=False,
                 )
             # Login wiring
             admin_login_btn.click(
                 fn=admin_login,
@@ -1143,13 +1323,20 @@ def create_app():
                 outputs=[admin_controls, admin_login_group, admin_login_msg],
             )
-            # Model loading
             admin_load_btn.click(
                 fn=admin_load_model,
                 inputs=[admin_model_dropdown],
                 outputs=[admin_model_status, admin_config_display, status_display],
             )
             # Save defaults — updates config display + Probability Explorer + Tokenizer controls
             admin_save_btn.click(
                 fn=admin_save_defaults,
@@ -1166,6 +1353,13 @@ def create_app():
                 ],
             )
             # Export slides — uses current Probability Explorer settings
             admin_export_btn.click(
                 fn=generate_slideshow,
@@ -1327,12 +1521,17 @@ def create_app():
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
-    # Load default model on startup
     cfg = manager.get_config()
-    model_to_load = cfg.get("model", "Qwen2.5-3B")
-    print(f"Loading default model: {model_to_load}")
-    status = manager.load_model(model_to_load)
-    print(status)
     app = create_app()
     app.launch(

 import gradio as gr
+import re
 from models import AVAILABLE_MODELS, manager, demo_tokenizer
 # ---------------------------------------------------------------------------
 # Tab 3: System Prompt Explorer
 # ---------------------------------------------------------------------------
+MAX_CHAT_TURNS = 10  # max user messages before forcing reset
+def _get_presets() -> dict:
+    """Get current system prompt presets from config."""
+    return manager.get_config().get("system_prompt_presets", {})
 def _esc_terminal(text: str) -> str:
     return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+def _format_terminal(raw_text: str) -> str:
+    """Parse a chat template string and produce color-coded HTML for the green terminal.
+    Works with both Llama (<|start_header_id|>) and Qwen (<|im_start|>) templates.
+    """
+    # Split on special tokens, keeping them
+    parts = re.split(r'(<\|[^|]*\|>)', raw_text)
+    html_parts = []
+    current_role = None
+    expect_role = False  # true right after a token that precedes a role label
+    role_css = {
+        "system": "sp-system",
+        "user": "sp-user",
+        "assistant": "sp-assistant",
+    }
+    for part in parts:
+        if re.match(r'<\|[^|]*\|>', part):
+            # Special token — render in gray
+            html_parts.append(f"<span class='sp-special'>{_esc_terminal(part)}</span>")
+            # After im_start or start_header_id, next text chunk is a role label
+            expect_role = ("im_start" in part or "start_header_id" in part)
+        elif expect_role and part.strip() in role_css:
+            # Role label (system / user / assistant)
+            role = part.strip()
+            current_role = role
+            before = _esc_terminal(part[: len(part) - len(part.lstrip())])
+            after = _esc_terminal(part[len(part.rstrip()) :])
+            html_parts.append(f"{before}<span class='sp-label'>{role}</span>{after}")
+            expect_role = False
+        else:
+            expect_role = False
+            css = role_css.get(current_role, "")
+            if css and part.strip():
+                html_parts.append(f"<span class='{css}'>{_esc_terminal(part)}</span>")
+            else:
+                html_parts.append(_esc_terminal(part))
+    return "<div class='green-terminal'>" + "".join(html_parts) + "</div>"
+def _initial_terminal() -> str:
+    return "<div class='green-terminal'><span class='sp-special'>Send a message to see what the model receives.</span></div>"
+def send_chat_message(user_message, history, system_prompt, max_tokens, temperature, seed):
+    """Handle a user message: generate response, update state + chatbot + terminal.
+    `history` is the gr.State list of clean {"role": ..., "content": ...} dicts.
+    The Chatbot is derived from this — never read back from Chatbot (Gradio
+    mangles the dicts on round-trip).
+    """
+    if not user_message or not user_message.strip():
+        chatbot = [{"role": m["role"], "content": m["content"]} for m in history]
+        return "", history, chatbot, _format_terminal_from_history(history, system_prompt)
+    if not manager.chat_ready():
+        history = history + [
+            {"role": "user", "content": user_message},
+            {"role": "assistant", "content": "No chat model loaded. Load one from the Admin tab."},
+        ]
+        chatbot = [{"role": m["role"], "content": m["content"]} for m in history]
+        return "", history, chatbot, _initial_terminal()
+    # Check turn limit
+    user_turns = sum(1 for m in history if m["role"] == "user")
+    if user_turns >= MAX_CHAT_TURNS:
+        history = history + [
+            {"role": "user", "content": user_message},
+            {"role": "assistant", "content": f"Conversation limit reached ({MAX_CHAT_TURNS} exchanges). Click Reset to start a new conversation."},
+        ]
+        chatbot = [{"role": m["role"], "content": m["content"]} for m in history]
+        return "", history, chatbot, _format_terminal_from_history(history, system_prompt)
+    # Build full messages for the model
+    history = history + [{"role": "user", "content": user_message}]
+    messages = []
+    if system_prompt and system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt})
+    messages.extend(history)
     result = manager.generate_chat(
+        messages=messages,
         max_new_tokens=int(max_tokens),
         temperature=temperature,
         seed=int(seed),
     )
     if "error" in result:
+        history = history + [
+            {"role": "assistant", "content": f"Error: {result['error']}"},
+        ]
+        chatbot = [{"role": m["role"], "content": m["content"]} for m in history]
+        return "", history, chatbot, _format_terminal_from_history(history, system_prompt)
+    history = history + [{"role": "assistant", "content": result["response"]}]
+    chatbot = [{"role": m["role"], "content": m["content"]} for m in history]
+    terminal_html = _format_terminal(result["formatted_display"])
+    return "", history, chatbot, terminal_html
+def _format_terminal_from_history(chat_history, system_prompt):
+    """Build terminal display from chat history (without generating)."""
+    if not chat_history:
+        return _initial_terminal()
+    messages = []
+    if system_prompt and system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt})
+    messages.extend(chat_history)
+    formatted = manager.format_chat_template(messages)
+    if not formatted:
+        return _initial_terminal()
+    return _format_terminal(formatted)
+def reset_chat(system_prompt):
+    """Clear chat history, keep system prompt. Show initial terminal with just system prompt.
+    Returns (state, chatbot, terminal).
+    """
+    if system_prompt and system_prompt.strip() and manager.chat_ready():
+        messages = [{"role": "system", "content": system_prompt}]
+        formatted = manager.format_chat_template(messages)
+        return [], [], _format_terminal(formatted)
+    return [], [], _initial_terminal()
 def on_preset_change(preset_name):
+    """Update system prompt textbox when a preset is selected. No chat reset."""
+    return _get_presets().get(preset_name, "")
 # ---------------------------------------------------------------------------
 def admin_load_model(model_name):
+    """Load a new base model from admin panel."""
     status = manager.load_model(model_name)
     cfg = manager.get_config()
     header_status = f"**{manager.status_message()}**"
     return status, json.dumps(cfg, indent=2), header_status
+def admin_load_chat_model(model_name):
+    """Load a new chat model from admin panel."""
+    status = manager.load_chat_model(model_name)
+    cfg = manager.get_config()
+    header_status = f"**{manager.status_message()}**"
+    return status, json.dumps(cfg, indent=2), header_status
 def admin_save_defaults(prompt, tokenizer_text, temperature, top_k, steps, seed):
     """Save default settings and return updated values for all outputs."""
     manager.update_config(
     )
+def admin_save_presets(presets_json):
+    """Save system prompt presets from admin panel.
+    Returns (status_msg, config_json, dropdown_update, presets_json_display).
+    """
+    try:
+        presets = json.loads(presets_json)
+    except (json.JSONDecodeError, TypeError) as e:
+        cfg = manager.get_config()
+        return (
+            f"Invalid JSON: {e}",
+            json.dumps(cfg, indent=2),
+            gr.update(),
+            gr.update(),
+        )
+    if not isinstance(presets, dict):
+        cfg = manager.get_config()
+        return (
+            "Presets must be a JSON object `{\"Name\": \"prompt\", ...}`",
+            json.dumps(cfg, indent=2),
+            gr.update(),
+            gr.update(),
+        )
+    manager.update_config(system_prompt_presets=presets)
+    cfg = manager.get_config()
+    return (
+        f"Presets saved ({len(presets)} presets).",
+        json.dumps(cfg, indent=2),
+        gr.update(choices=list(presets.keys())),
+        json.dumps(presets, indent=2),
+    )
 # ---------------------------------------------------------------------------
 # Build the Gradio app
 # ---------------------------------------------------------------------------
             gr.Markdown("### System Prompt Explorer")
             gr.Markdown(
                 "See how **system prompts** change an LLM's behavior. "
+                "Pick a preset or write your own, then chat with the model. "
                 "The green terminal shows exactly what the model receives — "
+                "every special token, every role label, every turn."
             )
+            presets = _get_presets()
+            preset_names = list(presets.keys())
+            default_preset = "Helpful Assistant" if "Helpful Assistant" in presets else preset_names[0] if preset_names else ""
             sp_preset = gr.Dropdown(
                 label="Preset",
+                choices=preset_names,
+                value=default_preset,
                 interactive=True,
             )
             sp_system = gr.Textbox(
                 label="System Prompt",
+                value=presets.get(default_preset, ""),
                 lines=3,
                 placeholder="Enter a system prompt, or select a preset above...",
             )
             with gr.Accordion("Settings", open=False):
                 sp_max_tokens = gr.Slider(
                     label="Max tokens",
+                    minimum=32, maximum=1024, step=16,
+                    value=512,
                 )
                 gr.Markdown(
+                    "Maximum number of tokens per response.",
                     elem_classes=["param-help"],
                 )
                 sp_temperature = gr.Slider(
                     precision=0,
                 )
+            with gr.Accordion("What the model sees", open=False):
+                gr.Markdown(
+                    "The full text sent to the model on every turn — system prompt, "
+                    "all previous messages, and special tokens. Watch it grow with each exchange.",
+                    elem_classes=["param-help"],
+                )
+                sp_terminal = gr.HTML(value=_initial_terminal())
+            gr.Markdown("#### Chat")
             gr.Markdown(
+                "**No hidden system prompt.** This model's helpful behavior comes from "
+                "fine-tuning (RLHF), not a secret prompt. When you add a system prompt above, "
+                "it's the *only* instruction the model receives. Commercial APIs like ChatGPT "
+                "and Claude prepend their own system prompts before yours — you can't see or "
+                "remove them.",
                 elem_classes=["param-help"],
             )
+            sp_chat_state = gr.State([])
+            sp_chatbot = gr.Chatbot(height=700, feedback_options=None)
+            with gr.Row():
+                sp_user_input = gr.Textbox(
+                    label="Message",
+                    placeholder="Type a message...",
+                    lines=1,
+                    scale=4,
+                    show_label=False,
+                )
+                sp_send_btn = gr.Button("Send", variant="primary", scale=0, min_width=80)
+                sp_reset_btn = gr.Button("Reset", variant="secondary", scale=0, min_width=80)
+            # --- Wiring ---
+            # Preset dropdown → just fill in the textbox (no chat reset)
             sp_preset.change(
                 fn=on_preset_change,
                 inputs=[sp_preset],
                 outputs=[sp_system],
             )
+            # System prompt textbox edits take effect on the next message sent.
+            # No auto-reset — avoids losing conversation on accidental edits.
+            # Use Reset button or pick a new preset to start fresh.
+            # Send message (button or enter)
+            send_inputs = [sp_user_input, sp_chat_state, sp_system, sp_max_tokens, sp_temperature, sp_seed]
+            send_outputs = [sp_user_input, sp_chat_state, sp_chatbot, sp_terminal]
+            sp_send_btn.click(
+                fn=send_chat_message,
+                inputs=send_inputs,
+                outputs=send_outputs,
+            )
+            sp_user_input.submit(
+                fn=send_chat_message,
+                inputs=send_inputs,
+                outputs=send_outputs,
+            )
+            # Reset button
+            sp_reset_btn.click(
+                fn=reset_chat,
+                inputs=[sp_system],
+                outputs=[sp_chat_state, sp_chatbot, sp_terminal],
             )
           # ==================================================================
             # Admin controls (hidden until login)
             with gr.Group(visible=False) as admin_controls:
+                gr.Markdown("#### Base Model (Probability Explorer)")
                 with gr.Row():
                     admin_model_dropdown = gr.Dropdown(
                         choices=list(AVAILABLE_MODELS.keys()),
+                        value=manager.current_model_name or cfg.get("model", "Llama-3.2-3B"),
                         label="Select model",
                     )
+                    admin_load_btn = gr.Button("Load", variant="primary")
                 admin_model_status = gr.Markdown("")
+                gr.Markdown("#### Chat Model (System Prompt Explorer)")
+                with gr.Row():
+                    admin_chat_dropdown = gr.Dropdown(
+                        choices=list(AVAILABLE_MODELS.keys()),
+                        value=manager.chat_model_name or cfg.get("chat_model", "Llama-3.2-3B-Instruct"),
+                        label="Select chat model",
+                    )
+                    admin_chat_load_btn = gr.Button("Load", variant="primary")
+                admin_chat_status = gr.Markdown("")
                 gr.Markdown("---")
                 gr.Markdown("#### Default Settings")
                 admin_prompt = gr.Textbox(
                 admin_save_msg = gr.Markdown("")
                 gr.Markdown("---")
+                gr.Markdown("#### System Prompt Presets")
                 gr.Markdown(
+                    "Edit the presets available in the System Prompt Explorer dropdown. "
+                    "JSON object: `{\"Name\": \"prompt text\", ...}`",
                     elem_classes=["param-help"],
                 )
+                admin_presets = gr.Code(
+                    value=json.dumps(cfg.get("system_prompt_presets", {}), indent=2),
+                    language="json",
+                    interactive=True,
+                )
+                admin_presets_save_btn = gr.Button("Save Presets")
+                admin_presets_msg = gr.Markdown("")
+                gr.Markdown("---")
+                with gr.Accordion("Environment Variables Reference", open=False):
+                    _pw_status = "*(set)*" if os.environ.get("ADMIN_PASSWORD") else "*(default: admin)*"
+                    _rb_status = "*(set)*" if REBRANDLY_API_KEY else "*(not set)*"
+                    gr.Markdown(
+                        "Override settings via "
+                        "[HF Space Settings](https://huggingface.co/spaces/chyams/llm-explorer/settings). "
+                        "Use **Secrets** for sensitive values (encrypted, hidden after saving) "
+                        "and **Variables** for everything else (visible in settings).\n\n"
+                        "**Precedence:** env var > config.json > code defaults\n\n"
+                        "**Secrets** (sensitive — encrypted)\n\n"
+                        "| Variable | Description | Format | Current |\n"
+                        "|----------|-------------|--------|---------|\n"
+                        f"| `ADMIN_PASSWORD` | Admin panel password | Plain text | {_pw_status} |\n"
+                        f"| `REBRANDLY_API_KEY` | URL shortener API key | API key | {_rb_status} |\n"
+                        "\n**Variables** (non-sensitive — visible)\n\n"
+                        "| Variable | Description | Format | Current |\n"
+                        "|----------|-------------|--------|---------|\n"
+                        f"| `DEFAULT_MODEL` | Base model (Prob Explorer) | Model name | `{cfg.get('model', '')}` |\n"
+                        f"| `DEFAULT_CHAT_MODEL` | Chat model (Sys Prompt Explorer) | Model name | `{cfg.get('chat_model', '')}` |\n"
+                        f"| `DEFAULT_PROMPT` | Default prompt | Plain text | `{cfg.get('default_prompt', '')[:40]}...` |\n"
+                        f"| `DEFAULT_TEMPERATURE` | Default temperature | Number (0–2.5) | `{cfg.get('default_temperature', 0.8)}` |\n"
+                        f"| `DEFAULT_TOP_K` | Default top-k | Integer (5–100) | `{cfg.get('default_top_k', 10)}` |\n"
+                        f"| `DEFAULT_STEPS` | Default steps | Integer (1–100) | `{cfg.get('default_steps', 8)}` |\n"
+                        f"| `DEFAULT_SEED` | Default seed | Integer | `{cfg.get('default_seed', 42)}` |\n"
+                        f"| `DEFAULT_TOKENIZER_TEXT` | Default tokenizer text | Plain text | `{cfg.get('default_tokenizer_text', '')[:40]}...` |\n"
+                        f"| `SYSTEM_PROMPT_PRESETS` | System prompt presets | JSON object | *({len(cfg.get('system_prompt_presets', {}))} presets)* |"
+                    )
                 gr.Markdown("---")
                 gr.Markdown("#### Current Config")
                     interactive=False,
                 )
+                gr.Markdown("---")
+                gr.Markdown("#### Export Slides")
+                gr.Markdown(
+                    "*Uses current settings from Probability Explorer tab.*",
+                    elem_classes=["param-help"],
+                )
+                admin_export_btn = gr.Button("Export Slides", variant="secondary")
+                admin_slides_file = gr.File(label="Slideshow", visible=False)
             # Login wiring
             admin_login_btn.click(
                 fn=admin_login,
                 outputs=[admin_controls, admin_login_group, admin_login_msg],
             )
+            # Model loading — base
             admin_load_btn.click(
                 fn=admin_load_model,
                 inputs=[admin_model_dropdown],
                 outputs=[admin_model_status, admin_config_display, status_display],
             )
+            # Model loading — chat
+            admin_chat_load_btn.click(
+                fn=admin_load_chat_model,
+                inputs=[admin_chat_dropdown],
+                outputs=[admin_chat_status, admin_config_display, status_display],
+            )
             # Save defaults — updates config display + Probability Explorer + Tokenizer controls
             admin_save_btn.click(
                 fn=admin_save_defaults,
                 ],
             )
+            # Save presets — updates config, dropdown choices, and presets display
+            admin_presets_save_btn.click(
+                fn=admin_save_presets,
+                inputs=[admin_presets],
+                outputs=[admin_presets_msg, admin_config_display, sp_preset, admin_presets],
+            )
             # Export slides — uses current Probability Explorer settings
             admin_export_btn.click(
                 fn=generate_slideshow,
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     cfg = manager.get_config()
+    # Load base model (Probability Explorer)
+    base_model = cfg.get("model", "Llama-3.2-3B")
+    print(f"Loading base model: {base_model}")
+    print(manager.load_model(base_model))
+    # Load chat model (System Prompt Explorer)
+    chat_model = cfg.get("chat_model", "Llama-3.2-3B-Instruct")
+    print(f"Loading chat model: {chat_model}")
+    print(manager.load_chat_model(chat_model))
     app = create_app()
     app.launch(

config.json CHANGED Viewed

@@ -5,5 +5,19 @@
   "default_top_k": 10,
   "default_steps": 8,
   "default_seed": 1875,
-  "default_tokenizer_text": "Class was rescheduled due to Huston-Tillotson homecoming."
 }

   "default_top_k": 10,
   "default_steps": 8,
   "default_seed": 1875,
+  "default_tokenizer_text": "Class was rescheduled due to Huston-Tillotson homecoming.",
+  "system_prompt_presets": {
+    "(none)": "",
+    "Helpful Assistant": "You are a helpful, friendly assistant.",
+    "Pirate": "You are a pirate. Respond to everything in pirate speak, using nautical terms and saying 'arr' frequently.",
+    "Formal Academic": "You are a formal academic scholar. Use precise, scholarly language. Cite concepts carefully and avoid casual tone.",
+    "Five-Year-Old": "You are explaining things to a five-year-old. Use very simple words, short sentences, and fun comparisons.",
+    "Hostile / Rude": "You are rude and dismissive. You answer questions but with obvious annoyance and sarcasm.",
+    "Haiku Only": "You must respond only in haiku (5-7-5 syllable format). Never break this rule.",
+    "Spanish Tutor": "You are a Spanish language tutor. Respond in Spanish, then provide the English translation in parentheses.",
+    "Banana Constraint": "You must mention bananas in every response, no matter the topic. Be subtle about it.",
+    "Corporate Spin": "You are a customer service agent. Never acknowledge product flaws. Always redirect to positive features.",
+    "Prestige Bias": "When discussing job candidates, always favor candidates from prestigious universities over others."
+  },
+  "chat_model": "Llama-3.2-3B-Instruct"
 }

models.py CHANGED Viewed

@@ -44,6 +44,12 @@ AVAILABLE_MODELS = {
         "description": "Best quality, quantized",
     },
     # -- Instruct models (for System Prompt Explorer) --
     "Qwen2.5-3B-Instruct": {
         "id": "Qwen/Qwen2.5-3B-Instruct",
         "dtype": "float16",
@@ -75,8 +81,36 @@ def _detect_device() -> str:
     return "cpu"
 def _load_config() -> dict:
-    """Load persisted config or return defaults."""
     defaults = {
         "model": DEFAULT_MODEL,
         "default_prompt": "The best thing about Huston-Tillotson University is",
@@ -85,7 +119,9 @@ def _load_config() -> dict:
         "default_steps": 8,
         "default_seed": 42,
         "default_tokenizer_text": "Huston-Tillotson University is an HBCU in Austin, Texas.",
     }
     if CONFIG_PATH.exists():
         try:
             with open(CONFIG_PATH) as f:
@@ -93,6 +129,17 @@ def _load_config() -> dict:
             defaults.update(saved)
         except (json.JSONDecodeError, OSError):
             pass
     return defaults
@@ -107,114 +154,171 @@ def _save_config(cfg: dict) -> None:
 # ---------------------------------------------------------------------------
 class ModelManager:
-    """Manages a single active model with hot-swap capability."""
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.current_model_name: str | None = None
         self.device: str = _detect_device()
         self.loading = False
         self._lock = threading.Lock()
         self.config = _load_config()
     # ------------------------------------------------------------------
-    # Model lifecycle
     # ------------------------------------------------------------------
     def load_model(self, model_name: str) -> str:
-        """Load a model by its display name. Returns status message."""
         if model_name not in AVAILABLE_MODELS:
             return f"Unknown model: {model_name}"
         if self.loading:
             return "A model is already being loaded. Please wait."
-        spec = AVAILABLE_MODELS[model_name]
-        # Quantized models require CUDA (bitsandbytes doesn't support MPS/CPU)
-        if spec.get("quantize") and not torch.cuda.is_available():
-            return (f"Cannot load {model_name}: "
-                    f"{spec['quantize']} quantization requires an NVIDIA GPU (CUDA). "
-                    f"Try a non-quantized model for local development.")
         with self._lock:
             self.loading = True
             try:
-                # Unload current model
-                self._unload()
-                # Determine load kwargs
-                model_id = spec["id"]
-                load_kwargs: dict = {"device_map": "auto"}
-                if spec.get("quantize") == "4bit":
-                    from transformers import BitsAndBytesConfig
-                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
-                        load_in_4bit=True,
-                        bnb_4bit_compute_dtype=torch.float16,
-                    )
-                elif spec.get("quantize") == "8bit":
-                    from transformers import BitsAndBytesConfig
-                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
-                        load_in_8bit=True,
-                    )
-                else:
-                    dtype_str = spec.get("dtype", "float16")
-                    if dtype_str == "auto":
-                        load_kwargs["dtype"] = "auto"
-                    else:
-                        load_kwargs["dtype"] = getattr(torch, dtype_str)
-                # Load tokenizer + model
-                self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_id, **load_kwargs
-                )
-                self.model.eval()
                 self.current_model_name = model_name
-                # Persist choice
                 self.config["model"] = model_name
                 _save_config(self.config)
-                return f"Loaded {model_name} ({model_id})"
             except Exception as e:
-                self._unload()
                 return f"Failed to load {model_name}: {e}"
             finally:
                 self.loading = False
-    def _unload(self) -> None:
-        """Release current model and free memory."""
-        if self.model is not None:
-            del self.model
-            self.model = None
-        if self.tokenizer is not None:
-            del self.tokenizer
-            self.tokenizer = None
-        self.current_model_name = None
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
     def is_ready(self) -> bool:
         return self.model is not None and not self.loading
-    def is_instruct(self) -> bool:
-        """Check if the current model is an instruct/chat model."""
-        if self.current_model_name is None:
-            return False
-        spec = AVAILABLE_MODELS.get(self.current_model_name, {})
-        return spec.get("instruct", False)
     def status_message(self) -> str:
         if self.loading:
             return "Loading model..."
-        if self.model is None:
-            return "No model loaded"
-        return f"Model: {self.current_model_name}"
     # ------------------------------------------------------------------
     # Inference helpers
@@ -329,61 +433,72 @@ class ModelManager:
     def generate_chat(
         self,
-        system_prompt: str,
-        user_message: str,
         max_new_tokens: int = 256,
         temperature: float = 0.7,
         seed: int = 42,
     ) -> dict:
-        """Generate a chat response using the instruct model's chat template.
         Returns dict with:
-            - formatted_prompt: the full tokenized prompt with special tokens
             - response: the model's generated response text
         """
-        if not self.is_ready():
-            return {"error": "Model not loaded"}
-        messages = []
-        if system_prompt.strip():
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": user_message})
-        # Apply chat template
-        formatted = self.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True,
         )
         # Tokenize input
-        inputs = self.tokenizer(formatted, return_tensors="pt")
-        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
         input_len = inputs["input_ids"].shape[1]
         # Generate
         gen_kwargs = {
             "max_new_tokens": max_new_tokens,
             "do_sample": temperature > 0,
-            "pad_token_id": self.tokenizer.eos_token_id,
         }
         if temperature > 0:
             gen_kwargs["temperature"] = temperature
-            # Set seed for reproducibility
-            if self.model.device.type == "cuda":
                 torch.cuda.manual_seed(seed)
             torch.manual_seed(seed)
         with torch.no_grad():
-            output_ids = self.model.generate(**inputs, **gen_kwargs)
         # Decode only the new tokens
         new_ids = output_ids[0][input_len:]
-        response = self.tokenizer.decode(new_ids, skip_special_tokens=True)
         return {
-            "formatted_prompt": formatted,
-            "response": response.strip(),
         }
     def tokenize(self, text: str) -> list[tuple[str, int]]:
         """Tokenize text and return list of (token_str, token_id)."""
         if self.tokenizer is None:

         "description": "Best quality, quantized",
     },
     # -- Instruct models (for System Prompt Explorer) --
+    "Llama-3.2-3B-Instruct": {
+        "id": "meta-llama/Llama-3.2-3B-Instruct",
+        "dtype": "float16",
+        "instruct": True,
+        "description": "Chat/instruct model, same family as prod base model (3B)",
+    },
     "Qwen2.5-3B-Instruct": {
         "id": "Qwen/Qwen2.5-3B-Instruct",
         "dtype": "float16",
     return "cpu"
+# Built-in presets: display name → system prompt text. Copied into the code-level
+# defaults under the "system_prompt_presets" config key; "(none)" maps to an empty prompt.
+DEFAULT_SYSTEM_PROMPT_PRESETS = {
+    "(none)": "",
+    "Helpful Assistant": "You are a helpful, friendly assistant.",
+    "Pirate": "You are a pirate. Respond to everything in pirate speak, using nautical terms and saying 'arr' frequently.",
+    "Formal Academic": "You are a formal academic scholar. Use precise, scholarly language. Cite concepts carefully and avoid casual tone.",
+    "Five-Year-Old": "You are explaining things to a five-year-old. Use very simple words, short sentences, and fun comparisons.",
+    "Hostile / Rude": "You are rude and dismissive. You answer questions but with obvious annoyance and sarcasm.",
+    "Haiku Only": "You must respond only in haiku (5-7-5 syllable format). Never break this rule.",
+    "Spanish Tutor": "You are a Spanish language tutor. Respond in Spanish, then provide the English translation in parentheses.",
+    "Banana Constraint": "You must mention bananas in every response, no matter the topic. Be subtle about it.",
+    "Corporate Spin": "You are a customer service agent. Never acknowledge product flaws. Always redirect to positive features.",
+    "Prestige Bias": "When discussing job candidates, always favor candidates from prestigious universities over others.",
+}
+# Env var → (config key, type converter). "json" = parse as JSON.
+# Read by _load_config as the last config layer: a set env var replaces the
+# value for its config key, and a value that fails to convert is skipped.
+ENV_VAR_MAP = {
+    "DEFAULT_MODEL": ("model", str),
+    "DEFAULT_CHAT_MODEL": ("chat_model", str),
+    "DEFAULT_PROMPT": ("default_prompt", str),
+    "DEFAULT_TEMPERATURE": ("default_temperature", float),
+    "DEFAULT_TOP_K": ("default_top_k", int),
+    "DEFAULT_STEPS": ("default_steps", int),
+    "DEFAULT_SEED": ("default_seed", int),
+    "DEFAULT_TOKENIZER_TEXT": ("default_tokenizer_text", str),
+    "SYSTEM_PROMPT_PRESETS": ("system_prompt_presets", "json"),
+}
 def _load_config() -> dict:
+    """Load config with three layers: code defaults → config.json → env vars."""
     defaults = {
         "model": DEFAULT_MODEL,
         "default_prompt": "The best thing about Huston-Tillotson University is",
         "default_steps": 8,
         "default_seed": 42,
         "default_tokenizer_text": "Huston-Tillotson University is an HBCU in Austin, Texas.",
+        "system_prompt_presets": dict(DEFAULT_SYSTEM_PROMPT_PRESETS),
     }
+    # Layer 2: config.json overrides code defaults
     if CONFIG_PATH.exists():
         try:
             with open(CONFIG_PATH) as f:
             defaults.update(saved)
         except (json.JSONDecodeError, OSError):
             pass
+    # Layer 3: env vars override everything
+    for env_var, (config_key, type_fn) in ENV_VAR_MAP.items():
+        val = os.environ.get(env_var)
+        if val is not None:
+            try:
+                if type_fn == "json":
+                    defaults[config_key] = json.loads(val)
+                else:
+                    defaults[config_key] = type_fn(val)
+            except (json.JSONDecodeError, ValueError, TypeError):
+                pass  # bad env var value — skip
     return defaults
 # ---------------------------------------------------------------------------
 class ModelManager:
+    """Manages two model slots: base (Probability Explorer) and chat (System Prompt Explorer)."""
     def __init__(self):
+        """Start with both model slots empty and read the layered config."""
+        # Base model (Probability Explorer)
         self.model = None
         self.tokenizer = None
         self.current_model_name: str | None = None
+        # Chat model (System Prompt Explorer)
+        self.chat_model = None
+        self.chat_tokenizer = None
+        self.chat_model_name: str | None = None
         self.device: str = _detect_device()
+        # One flag and one lock are shared by both slots: load_model and
+        # load_chat_model set `loading` while holding `_lock`.
         self.loading = False
         self._lock = threading.Lock()
         self.config = _load_config()
     # ------------------------------------------------------------------
+    # Shared loading logic
+    # ------------------------------------------------------------------
+    def _do_load(self, model_name: str):
+        """Load model + tokenizer by name. Returns (model, tokenizer). Raises on failure.
+        Touches no instance state; the caller decides which slot receives the result.
+        Args:
+            model_name: Key into AVAILABLE_MODELS. The entry's "id" is passed to
+                from_pretrained; its optional "quantize" / "dtype" fields select the load kwargs.
+        Raises:
+            RuntimeError: If the entry requests quantization and CUDA is not available.
+            Other exceptions from the lookup or from from_pretrained propagate unchanged.
+        """
+        spec = AVAILABLE_MODELS[model_name]
+        # Fail early with a readable message instead of a low-level error during load.
+        if spec.get("quantize") and not torch.cuda.is_available():
+            raise RuntimeError(
+                f"Cannot load {model_name}: "
+                f"{spec['quantize']} quantization requires an NVIDIA GPU (CUDA). "
+                f"Try a non-quantized model for local development."
+            )
+        model_id = spec["id"]
+        load_kwargs: dict = {"device_map": "auto"}
+        # Quantized entries get a BitsAndBytesConfig; imported lazily so it is
+        # only needed when a quantized model is actually requested.
+        if spec.get("quantize") == "4bit":
+            from transformers import BitsAndBytesConfig
+            load_kwargs["quantization_config"] = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+            )
+        elif spec.get("quantize") == "8bit":
+            from transformers import BitsAndBytesConfig
+            load_kwargs["quantization_config"] = BitsAndBytesConfig(
+                load_in_8bit=True,
+            )
+        else:
+            # Non-quantized: pass "auto" through as-is, otherwise resolve the
+            # dtype name (default "float16") to the torch dtype object.
+            dtype_str = spec.get("dtype", "float16")
+            if dtype_str == "auto":
+                load_kwargs["dtype"] = "auto"
+            else:
+                load_kwargs["dtype"] = getattr(torch, dtype_str)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
+        # Put the module in evaluation mode before handing it back.
+        model.eval()
+        return model, tokenizer
+    # ------------------------------------------------------------------
+    # Base model lifecycle
     # ------------------------------------------------------------------
     def load_model(self, model_name: str) -> str:
+        """Load base model for Probability Explorer. Returns status message.
+        The current base model is released before the new one is loaded, so a
+        failed load leaves the base slot empty. On success the choice is stored
+        under the "model" config key and saved. Load failures are returned as
+        text rather than raised.
+        Args:
+            model_name: Display name; must be a key of AVAILABLE_MODELS.
+        """
         if model_name not in AVAILABLE_MODELS:
             return f"Unknown model: {model_name}"
         if self.loading:
             return "A model is already being loaded. Please wait."
         with self._lock:
             self.loading = True
             try:
+                # Unload current base model
+                if self.model is not None:
+                    del self.model
+                    self.model = None
+                if self.tokenizer is not None:
+                    del self.tokenizer
+                    self.tokenizer = None
+                self.current_model_name = None
+                gc.collect()
+                # Hand the freed blocks back to the GPU before loading the
+                # replacement, as the previous _unload() helper did.
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                model, tokenizer = self._do_load(model_name)
+                self.model = model
+                self.tokenizer = tokenizer
                 self.current_model_name = model_name
                 self.config["model"] = model_name
                 _save_config(self.config)
+                return f"Loaded base model: {model_name}"
             except Exception as e:
+                # Leave the slot consistently empty rather than half-populated.
+                self.model = None
+                self.tokenizer = None
+                self.current_model_name = None
                 return f"Failed to load {model_name}: {e}"
             finally:
                 self.loading = False
+    # ------------------------------------------------------------------
+    # Chat model lifecycle
+    # ------------------------------------------------------------------
+    def load_chat_model(self, model_name: str) -> str:
+        """Load chat/instruct model for System Prompt Explorer. Returns status message.
+        The current chat model is released before the new one is loaded, so a
+        failed load leaves the chat slot empty. On success the choice is stored
+        under the "chat_model" config key and saved. Load failures are returned
+        as text rather than raised.
+        Args:
+            model_name: Display name; must be a key of AVAILABLE_MODELS.
+        """
+        if model_name not in AVAILABLE_MODELS:
+            return f"Unknown model: {model_name}"
+        if self.loading:
+            return "A model is already being loaded. Please wait."
+        with self._lock:
+            self.loading = True
+            try:
+                # Unload current chat model
+                if self.chat_model is not None:
+                    del self.chat_model
+                    self.chat_model = None
+                if self.chat_tokenizer is not None:
+                    del self.chat_tokenizer
+                    self.chat_tokenizer = None
+                self.chat_model_name = None
+                gc.collect()
+                # Hand the freed blocks back to the GPU before loading the
+                # replacement, as the previous _unload() helper did.
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                model, tokenizer = self._do_load(model_name)
+                self.chat_model = model
+                self.chat_tokenizer = tokenizer
+                self.chat_model_name = model_name
+                self.config["chat_model"] = model_name
+                _save_config(self.config)
+                return f"Loaded chat model: {model_name}"
+            except Exception as e:
+                # Leave the slot consistently empty rather than half-populated.
+                self.chat_model = None
+                self.chat_tokenizer = None
+                self.chat_model_name = None
+                return f"Failed to load chat model {model_name}: {e}"
+            finally:
+                self.loading = False
+    # ------------------------------------------------------------------
+    # Status
+    # ------------------------------------------------------------------
     def is_ready(self) -> bool:
+        """Return True when a base model is loaded and no load is in progress."""
         return self.model is not None and not self.loading
+    def chat_ready(self) -> bool:
+        """Return True when a chat model is loaded and no load is in progress."""
+        # `loading` is shared by both slots, so this is also False while the base model loads.
+        return self.chat_model is not None and not self.loading
     def status_message(self) -> str:
+        """Return a one-line summary of the loader state.
+        "Loading model..." while a load is running; otherwise the loaded slots as
+        "Base: <name>" and/or "Chat: <name>" joined by " | ", or "No models loaded".
+        """
         if self.loading:
             return "Loading model..."
+        parts = []
+        if self.model:
+            parts.append(f"Base: {self.current_model_name}")
+        if self.chat_model:
+            parts.append(f"Chat: {self.chat_model_name}")
+        if not parts:
+            return "No models loaded"
+        return " | ".join(parts)
     # ------------------------------------------------------------------
     # Inference helpers
     def generate_chat(
         self,
+        messages: list[dict],
         max_new_tokens: int = 256,
         temperature: float = 0.7,
         seed: int = 42,
     ) -> dict:
+        """Generate a chat response using the dedicated chat model.
+        Args:
+            messages: Full conversation as list of {"role": ..., "content": ...} dicts,
+                      including system prompt and all previous turns.
+            max_new_tokens: Upper bound on the number of generated tokens.
+            temperature: Sampling temperature; 0 (or less) disables sampling.
+            seed: RNG seed applied before sampling; unused when sampling is off.
         Returns dict with:
+            - formatted_display: the full template including the response (for terminal)
             - response: the model's generated response text
+        or {"error": "Chat model not loaded"} when no chat model is ready.
         """
+        if not self.chat_ready():
+            return {"error": "Chat model not loaded"}
+        # Format input (everything up to and including the generation prompt)
+        formatted = self.chat_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True,
         )
         # Tokenize input
+        # The rendered template already contains its special tokens, so the
+        # tokenizer must not add them again (avoids a duplicated BOS).
+        inputs = self.chat_tokenizer(
+            formatted, return_tensors="pt", add_special_tokens=False,
+        )
+        inputs = {k: v.to(self.chat_model.device) for k, v in inputs.items()}
         input_len = inputs["input_ids"].shape[1]
         # Generate
         gen_kwargs = {
             "max_new_tokens": max_new_tokens,
             "do_sample": temperature > 0,
+            "pad_token_id": self.chat_tokenizer.eos_token_id,
         }
         if temperature > 0:
             gen_kwargs["temperature"] = temperature
+            # Seed so the same inputs reproduce the same sample
+            if self.chat_model.device.type == "cuda":
                 torch.cuda.manual_seed(seed)
             torch.manual_seed(seed)
         with torch.no_grad():
+            output_ids = self.chat_model.generate(**inputs, **gen_kwargs)
         # Decode only the new tokens
         new_ids = output_ids[0][input_len:]
+        response = self.chat_tokenizer.decode(new_ids, skip_special_tokens=True).strip()
+        # Build display template (includes the response) for green terminal
+        display_messages = messages + [{"role": "assistant", "content": response}]
+        formatted_display = self.chat_tokenizer.apply_chat_template(
+            display_messages, tokenize=False, add_generation_prompt=False,
+        )
         return {
+            "formatted_display": formatted_display,
+            "response": response,
         }
+    def format_chat_template(self, messages: list[dict]) -> str:
+        """Format messages using the chat model's template (for terminal display).
+        Returns the rendered text with the generation prompt appended, or an
+        empty string when no chat tokenizer is loaded. Does not run the model.
+        """
+        if not self.chat_tokenizer:
+            return ""
+        return self.chat_tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True,
+        )
     def tokenize(self, text: str) -> list[tuple[str, int]]:
         """Tokenize text and return list of (token_str, token_id)."""
         if self.tokenizer is None: