| |
| |
| """ |
| LULUV2 Pro local chat UI. |
| |
| A clean ChatGPT-style desktop UI for the fine-tuned LULUV2 checkpoint. |
| It keeps the important local features only: |
| - chat inference |
| - live token streaming |
| - new chat / save / load chats |
| - persistent memory notes |
| - live edge monitor: tok/s, RAM, VRAM, GPU, pass2 metrics |
| - 32K context controls and test prompt helper |
| |
| Run: |
| python ./app.py --ckpt ./LULUV2-bf16.pt --model-py ./luluv2_inference_runtime.py --inbrowser |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import re |
| from datetime import datetime |
| from pathlib import Path |
| from typing import Any, Dict, List, Tuple |
|
|
| import gradio as gr |
|
|
| from luluv2_live_inference import ( |
| GenerationConfig, |
| LULUV2LiveEngine, |
| clean_text, |
| normalize_history, |
| system_usage, |
| ) |
|
|
# Display name used for the Blocks title and stamped into every saved JSON file.
APP_NAME = "LuluV2"

# Storage locations; each one can be redirected through its environment variable.
CHAT_DIR = Path(os.environ.get("LULU_CHAT_DIR", "lulu_chats"))
MEMORY_FILE = Path(os.environ.get("LULU_MEMORY_FILE", "lulu_memory.json"))

# System prompt used whenever the "System prompt" box is left empty.
DEFAULT_SYSTEM_PROMPT = (
    "Your name is LuluV2.\n"
    "You are a local AI assistant made by Open Machine.\n"
    "You run offline from the LULUV2 VWM checkpoint.\n"
    "Answer directly and naturally.\n"
    "Use Markdown for structure.\n"
    "When writing code, use fenced code blocks with the correct language tag.\n"
    "Do not output role tags, hidden scratchpad text, JSON UI fragments, or {'type':'text'} blocks.\n"
)

# Sampling presets selectable from the "Preset" dropdown. Every preset defines
# the same seven keys so a preset can be applied to the sliders positionally.
PRESETS = {
    "Balanced": {
        "temperature": 0.65,
        "top_k": 40,
        "top_p": 0.90,
        "min_p": 0.03,
        "repetition_penalty": 1.10,
        "frequency_penalty": 0.02,
        "max_new_tokens": 768,
    },
    "Precise": {
        "temperature": 0.35,
        "top_k": 30,
        "top_p": 0.84,
        "min_p": 0.04,
        "repetition_penalty": 1.14,
        "frequency_penalty": 0.03,
        "max_new_tokens": 512,
    },
    "Code": {
        "temperature": 0.42,
        "top_k": 40,
        "top_p": 0.88,
        "min_p": 0.03,
        "repetition_penalty": 1.10,
        "frequency_penalty": 0.02,
        "max_new_tokens": 1200,
    },
    "Long 32K": {
        "temperature": 0.55,
        "top_k": 50,
        "top_p": 0.90,
        "min_p": 0.025,
        "repetition_penalty": 1.08,
        "frequency_penalty": 0.02,
        "max_new_tokens": 1200,
    },
}
|
|
|
|
def safe_int(value: Any, default: int, low: int | None = None, high: int | None = None) -> int:
    """Convert *value* to an int, substituting *default* when conversion fails.

    Args:
        value: Anything ``int()`` might accept (UI widgets may hand over
            floats, strings, or ``None``).
        default: Used in place of *value* when ``int(value)`` raises.
        low: Optional inclusive lower bound.
        high: Optional inclusive upper bound.

    Returns:
        The converted (or default) number after the bounds are applied. The
        lower bound is applied first, so ``high`` wins if the bounds cross.
    """
    try:
        number = int(value)
    except Exception:
        # Best-effort coercion: any failure falls back to the default, which is
        # still passed through the bounds below.
        number = default
    floored = number if low is None else max(low, number)
    return floored if high is None else min(high, floored)
|
|
|
|
def clamp(value: Any, low: float, high: float, default: float) -> float:
    """Convert *value* to a float and clamp it into ``[low, high]``.

    Args:
        value: Anything ``float()`` might accept.
        low: Inclusive lower bound.
        high: Inclusive upper bound.
        default: Returned as-is (not clamped) when *value* cannot be converted
            or converts to NaN.

    Returns:
        The clamped number, or *default* for unusable input.
    """
    try:
        number = float(value)
    except Exception:
        # Best-effort coercion: any conversion failure yields the default.
        return default
    # NaN compares false against everything, so max()/min() would silently
    # turn it into ``high``; treat it like any other unusable input instead.
    if number != number:
        return default
    return max(low, min(high, number))
|
|
|
|
def esc(text: Any) -> str:
    """Escape *text* for safe interpolation into HTML text or attribute values.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The string with ``&``, ``<``, ``>`` and ``"`` replaced by their HTML
        entities.
    """
    # The ampersand must be handled first so the entities produced by the
    # later replacements are not escaped a second time.
    return (
        str(text)
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )
|
|
|
|
def status_html(title: str, detail: str = "", tone: str = "neutral") -> str:
    """Render the status pill shown above the chat as an HTML snippet.

    Args:
        title: Bold headline; HTML-escaped before insertion.
        detail: Smaller secondary line; HTML-escaped before insertion.
        tone: One of ``neutral``, ``good``, ``warn``, ``bad`` or ``live``. Any
            other value is treated as ``neutral``. It becomes the
            ``status-<tone>`` CSS class.

    Returns:
        The HTML markup for the pill.
    """
    known_tones = {"neutral", "good", "warn", "bad", "live"}
    if tone not in known_tones:
        tone = "neutral"
    return (
        "\n"
        f'<div class="status-pill status-{tone}">\n'
        '<span class="pulse-dot"></span>\n'
        f"<div><b>{esc(title)}</b><small>{esc(detail)}</small></div>\n"
        "</div>\n"
    )
|
|
|
|
def read_memory() -> str:
    """Load the persisted memory notes from ``MEMORY_FILE``.

    Returns:
        The ``memory_notes`` value converted to ``str``, or an empty string
        when the file is missing, unreadable, not valid JSON, or not shaped as
        expected.
    """
    notes = ""
    if MEMORY_FILE.exists():
        try:
            raw = MEMORY_FILE.read_text(encoding="utf-8")
            notes = str(json.loads(raw).get("memory_notes", ""))
        except Exception:
            # Deliberately best-effort: a damaged memory file must not stop
            # the UI from starting.
            notes = ""
    return notes
|
|
|
|
def write_memory(memory_notes: str) -> Tuple[str, str]:
    """Persist the memory notes to ``MEMORY_FILE`` as UTF-8 JSON.

    Args:
        memory_notes: Text to store; a falsy value is stored as ``""``.

    Returns:
        ``(path, status_html)`` where *path* is ``MEMORY_FILE`` as a string
        and *status_html* is a "Memory saved" status pill.

    Raises:
        OSError: If the directory cannot be created or the file written.
    """
    payload = {
        "memory_notes": memory_notes or "",
        "saved_at": datetime.now().isoformat(timespec="seconds"),
        "app": APP_NAME,
    }
    # LULU_MEMORY_FILE may point into a directory that does not exist yet;
    # create it first, the same way the chat helpers create CHAT_DIR.
    MEMORY_FILE.parent.mkdir(parents=True, exist_ok=True)
    MEMORY_FILE.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    return str(MEMORY_FILE), status_html("Memory saved", str(MEMORY_FILE), "good")
|
|
|
|
def safe_chat_filename(chat_name: str, suffix: str) -> Path:
    """Build a timestamped, filesystem-safe path inside ``CHAT_DIR``.

    ``CHAT_DIR`` is created if it does not exist yet.

    Args:
        chat_name: Human-readable chat title; a falsy value becomes ``chat``.
        suffix: File extension without the leading dot (for example ``json``).

    Returns:
        ``CHAT_DIR / "<slug>_<YYYYmmdd_HHMMSS>.<suffix>"``.
    """
    CHAT_DIR.mkdir(parents=True, exist_ok=True)
    # Collapse every run of characters outside [a-zA-Z0-9_-] into a single
    # underscore, then trim underscores from both ends.
    slug = re.sub(r"[^a-zA-Z0-9_-]+", "_", chat_name or "chat").strip("_")
    if not slug:
        slug = "chat"
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return CHAT_DIR / f"{slug}_{stamp}.{suffix}"
|
|
|
|
def list_saved_chats() -> List[str]:
    """Return the saved chat files in ``CHAT_DIR``, most recently modified first.

    ``CHAT_DIR`` is created if it does not exist yet.

    Returns:
        Paths of every ``*.json`` file directly inside ``CHAT_DIR``, as
        strings.
    """
    CHAT_DIR.mkdir(parents=True, exist_ok=True)
    chat_files = list(CHAT_DIR.glob("*.json"))
    chat_files.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)
    return [str(entry) for entry in chat_files]
|
|
|
|
def save_chat(history: Any, chat_name: str, memory_notes: str) -> Tuple[str, str, List[str]]:
    """Write the conversation and memory notes to a new JSON file in ``CHAT_DIR``.

    Args:
        history: Chat history in whatever shape the UI holds it; it is passed
            through ``normalize_history`` before saving.
        chat_name: Title of the chat; a falsy value becomes ``Lulu chat``.
        memory_notes: Memory text stored alongside the chat.

    Returns:
        ``(path, status_html, saved_chats)``: the written file's path, a
        "Chat saved" status pill, and the refreshed list of saved chat paths.
    """
    title = chat_name or "Lulu chat"
    target = safe_chat_filename(title, "json")
    record = {
        "chat_name": title,
        "history": normalize_history(history),
        "memory_notes": memory_notes or "",
        "saved_at": datetime.now().isoformat(timespec="seconds"),
        "app": APP_NAME,
    }
    serialized = json.dumps(record, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
    return str(target), status_html("Chat saved", target.name, "good"), list_saved_chats()
|
|
|
|
def load_chat(path: str) -> Tuple[List[Dict[str, str]], str, str, str]:
    """Load a saved chat JSON file for display in the UI.

    Args:
        path: Path of the saved chat file; a falsy value means nothing is
            selected.

    Returns:
        ``(history, chat_name, memory_notes, status_html)``. On any failure
        the history is empty, the name is ``New chat``, the memory notes come
        from ``read_memory()`` and the status pill describes the problem.
    """
    if not path:
        return (
            [],
            "New chat",
            read_memory(),
            status_html("No saved chat selected", "Pick a JSON file from the sidebar.", "warn"),
        )
    try:
        data = json.loads(Path(path).read_text(encoding="utf-8"))
    except Exception as exc:
        return [], "New chat", read_memory(), status_html("Load failed", f"{type(exc).__name__}: {exc}", "bad")
    # A file can be valid JSON without being a chat record (for example a bare
    # list); report that instead of failing on ``.get`` below.
    if not isinstance(data, dict):
        return (
            [],
            "New chat",
            read_memory(),
            status_html("Load failed", f"Expected a JSON object, got {type(data).__name__}.", "bad"),
        )
    # Only fall back to the global memory file when the chat stored no notes.
    notes = data["memory_notes"] if "memory_notes" in data else read_memory()
    return (
        normalize_history(data.get("history", [])),
        str(data.get("chat_name") or Path(path).stem),
        str(notes),
        status_html("Chat loaded", Path(path).name, "good"),
    )
|
|
|
|
def chat_to_markdown(history: Any, chat_name: str) -> str:
    """Render the conversation as a Markdown document.

    Args:
        history: Chat history; passed through ``normalize_history``.
        chat_name: Used for the top-level heading after ``clean_text``; an
            empty result becomes ``LuluV2 chat``.

    Returns:
        Markdown with a ``#`` title and one ``## You`` / ``## LuluV2`` section
        per message, ending in exactly one newline.
    """
    heading = clean_text(chat_name) or "LuluV2 chat"
    parts = [f"# {heading}", ""]
    for turn in normalize_history(history):
        # Every role other than "user" is labelled as the assistant.
        speaker = "## You" if turn["role"] == "user" else "## LuluV2"
        parts.extend((speaker, turn["content"], ""))
    return "\n".join(parts).strip() + "\n"
|
|
|
|
def export_markdown(history: Any, chat_name: str) -> Tuple[str, str]:
    """Write the conversation to a timestamped ``.md`` file in ``CHAT_DIR``.

    Args:
        history: Chat history to export.
        chat_name: Chat title; a falsy value becomes ``Lulu chat`` for the
            file name.

    Returns:
        ``(path, status_html)``: the written file's path and a
        "Markdown exported" status pill.
    """
    target = safe_chat_filename(chat_name or "Lulu chat", "md")
    document = chat_to_markdown(history, chat_name)
    target.write_text(document, encoding="utf-8")
    return str(target), status_html("Markdown exported", target.name, "good")
|
|
|
|
def postprocess_answer(text: Any, final: bool = False) -> str:
    """Tidy model output before it is shown in the chat.

    Args:
        text: Raw (possibly partial) model output; passed through
            ``clean_text`` first.
        final: When true, an unbalanced code fence is closed so the last code
            block still renders.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    cleaned = clean_text(text)
    rewrites = (
        # Trailing bracketed fragment containing 'text' ... 'type': 'text'.
        (r"\n?\s*\[\s*\{\s*['\"]text['\"].*?['\"]type['\"]\s*:\s*['\"]text['\"]\s*\}\s*\]\s*$", "", re.S),
        # Trailing bare "type: 'text'" remnant (case-insensitive).
        (r"\n?\s*type\s*:\s*['\"]text['\"]\s*$", "", re.I),
        # Collapse runs of four or more newlines down to three.
        (r"\n{4,}", "\n\n\n", 0),
    )
    for pattern, replacement, flags in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    # An odd number of fence markers means the last code block was never closed.
    if final and cleaned.count("```") % 2 == 1:
        cleaned += "\n```"
    return cleaned.strip()
|
|
|
|
def metric_cards(engine: LULUV2LiveEngine, max_context: int) -> str:
    """Render the live monitor bar as HTML from the engine's current stats.

    Args:
        engine: Object whose ``stats_dict()`` returns the metrics mapping.
        max_context: Context size in tokens; displayed in units of 1024.

    Returns:
        HTML for a ``monitor-bar`` containing eight ``mon-card`` tiles.
    """
    stats = engine.stats_dict()
    system_info = stats.get("system", {})
    model_info = stats.get("model", {})

    # The pass1/pass2 comparison is only shown when both numbers are present.
    kl = stats.get("pass1_pass2_kl")
    cosine = stats.get("pass1_pass2_logit_cosine")
    if kl is None or cosine is None:
        pass_text = "base"
    else:
        pass_text = f"KL {kl:.3f} / cos {cosine:.3f}"

    util = system_info.get("gpu_util_percent")
    temp = system_info.get("gpu_temp_c")
    gpu_text = f"{util}%" if util is not None else "n/a"
    temp_text = f"{temp}°C" if temp is not None else "n/a"

    tok_rate = float(stats.get("tokens_per_sec", 0.0))
    tok_count = int(stats.get("generated_tokens", 0))
    python_ram = system_info.get("python_ram", "n/a")
    vram_used = system_info.get("vram_used", "n/a")
    vram_total = system_info.get("vram_total", "n/a")
    has_pass2 = model_info.get("has_pass2")

    rows = [
        '<div class="monitor-bar">',
        f'<div class="mon-card hot"><b>{tok_rate:.1f}</b><span>tok/s</span></div>',
        f'<div class="mon-card"><b>{tok_count}</b><span>tokens</span></div>',
        f'<div class="mon-card"><b>{python_ram}</b><span>Python RAM</span></div>',
        f'<div class="mon-card"><b>{vram_used}</b><span>VRAM / {vram_total}</span></div>',
        f'<div class="mon-card"><b>{gpu_text}</b><span>GPU · {temp_text}</span></div>',
        f'<div class="mon-card"><b>{max_context // 1024}K</b><span>context</span></div>',
        f'<div class="mon-card"><b>{has_pass2}</b><span>pass2</span></div>',
        f'<div class="mon-card wide"><b>{pass_text}</b><span>pass1 → pass2</span></div>',
        "</div>",
    ]
    return "\n" + "\n".join(rows) + "\n"
|
|
|
|
def make_32k_prompt(repeats: int = 520) -> str:
    """Build the synthetic long-context stress prompt.

    The prompt is a short preamble, one filler paragraph repeated *repeats*
    times, and a closing question about the filler.

    Args:
        repeats: Number of filler paragraphs. The default of 520 reproduces
            the original fixed-size prompt; smaller values give a lighter
            test. Zero or negative values produce no filler.

    Returns:
        The assembled prompt text.
    """
    seed = (
        "We are testing a 32K context window for LuluV2. "
        "Remember these constraints: answer directly, keep code formatted, and summarize the relevant details. "
        "The repeated context below is synthetic filler for a long-context stress test.\n\n"
    )
    block = (
        "Section: VWM reconstruction. A model can use A/B atoms and c-code recipes to reconstruct behavior online. "
        "Pass 1 builds a scaffold, pass 2 refines it, and the UI should keep live tokens/sec, RAM, VRAM, and pass metrics visible. "
        "When asked at the end, explain the three key ideas and provide a tiny Python example.\n"
    )
    question = (
        "\nFinal question: What are the three key ideas above, "
        "and can you show a tiny Python class for tracking tokens per second?"
    )
    return seed + block * repeats + question
|
|
|
|
def create_chatbot():
    """Create the chat component, degrading gracefully across Gradio versions.

    Keyword sets are tried from richest to plainest. A ``TypeError`` from the
    constructor is taken to mean "this Gradio build does not accept one of
    these keywords" and moves on to the next set.

    Returns:
        The first ``gr.Chatbot`` that constructs successfully.

    Raises:
        TypeError: The error from the last attempt, if every attempt fails.
    """
    core = dict(
        value=[],
        elem_id="chatbot",
        height=760,
        show_label=False,
    )
    # Cosmetic options that some Gradio releases may not accept.
    cosmetic = dict(avatar_images=(None, None), bubble_full_width=False)
    rendering = dict(render_markdown=True, sanitize_html=True)

    attempts = (
        # The original three fallbacks, in the original order.
        {"type": "messages", **rendering, **core, **cosmetic},
        {**rendering, **core, **cosmetic},
        {**core, **cosmetic},
        # Previously every fallback still passed the cosmetic options, so a
        # build rejecting one of them failed all three attempts. Retry
        # without them.
        {"type": "messages", **rendering, **core},
        {**rendering, **core},
        dict(core),
    )
    last_error = None
    for kwargs in attempts:
        try:
            return gr.Chatbot(**kwargs)
        except TypeError as exc:
            last_error = exc
    raise last_error
|
|
|
|
def build_app(engine: LULUV2LiveEngine, default_context: int):
    """Assemble the Gradio UI and wire every control to *engine*.

    Args:
        engine: Inference engine. This function uses its ``build_chat_prompt``,
            ``generate``, ``stats_dict``, ``token_trace_text``, ``last_stats``
            and ``model_info`` members.
        default_context: Initial value of the "Max context tokens" slider and
            the fallback when that slider's value cannot be parsed.

    Returns:
        The ``gr.Blocks`` application, not yet queued or launched.
    """
    # Bounds of the "Max context tokens" slider, reused when validating input.
    context_min, context_max = 128, 32768

    def ui_frame(box_text, hist, status, context_tokens):
        """Build one update for ``outputs``: message box, chat, status, monitor, token trace, raw metrics."""
        return (
            box_text,
            hist,
            status,
            metric_cards(engine, context_tokens),
            engine.token_trace_text(),
            engine.stats_dict(),
        )

    def progress_text():
        """Summarise the engine's latest token count and speed."""
        return f"{engine.last_stats.generated_tokens} tokens · {engine.last_stats.tokens_per_sec:.1f} tok/s"

    def respond(
        message,
        history,
        chat_name,
        system_prompt,
        memory_notes,
        preset,
        history_turns,
        max_context_tokens,
        max_new_tokens,
        temperature,
        top_k,
        top_p,
        min_p,
        repetition_penalty,
        frequency_penalty,
        greedy,
        no_repeat_ngram,
        stream_every,
        show_pass_metrics,
    ):
        """Stream one assistant reply, yielding a UI update per generated chunk.

        ``chat_name`` and ``preset`` are not used here; they are accepted so
        the parameters line up with the ``inputs`` list below.
        """
        hist = normalize_history(history)
        user_text = clean_text(message)
        max_context_tokens = safe_int(max_context_tokens, default_context, context_min, context_max)
        if not user_text:
            yield ui_frame("", hist, status_html("Empty message", "Type something first.", "warn"), max_context_tokens)
            return

        # Every sampling value is re-validated here because it arrives
        # straight from a UI widget.
        prompt = engine.build_chat_prompt(
            message=user_text,
            history=hist,
            system_prompt=system_prompt or DEFAULT_SYSTEM_PROMPT,
            memory_notes=memory_notes or "",
            history_turns=safe_int(history_turns, 4, 0, 32),
        )
        cfg = GenerationConfig(
            max_new_tokens=safe_int(max_new_tokens, 768, 1, 8192),
            temperature=clamp(temperature, 0.0, 2.0, 0.65),
            top_k=safe_int(top_k, 40, 0, 500),
            top_p=clamp(top_p, 0.01, 1.0, 0.90),
            min_p=clamp(min_p, 0.0, 0.5, 0.03),
            repetition_penalty=clamp(repetition_penalty, 1.0, 3.0, 1.10),
            frequency_penalty=clamp(frequency_penalty, 0.0, 3.0, 0.02),
            greedy=bool(greedy),
            no_repeat_ngram=safe_int(no_repeat_ngram, 4, 0, 16),
            stream_every=safe_int(stream_every, 1, 1, 64),
            max_context_tokens=max_context_tokens,
            return_pass_metrics=bool(show_pass_metrics),
        )

        # Show the user's message and a placeholder before generation starts.
        hist.append({"role": "user", "content": user_text})
        hist.append({"role": "assistant", "content": "Thinking..."})
        yield ui_frame("", hist, status_html("Generating", "LuluV2 is reconstructing tokens live.", "live"), max_context_tokens)

        final = ""
        try:
            for partial in engine.generate(prompt, cfg):
                final = postprocess_answer(partial, final=False)
                hist[-1] = {"role": "assistant", "content": final or "..."}
                yield ui_frame("", hist, status_html("Generating", progress_text(), "live"), max_context_tokens)
        except Exception as exc:
            failure = f"{type(exc).__name__}: {exc}"
            hist[-1] = {"role": "assistant", "content": f"Generation failed:\n\n```text\n{failure}\n```"}
            # Put the message back in the input box so it can be resent.
            yield ui_frame(user_text, hist, status_html("Generation failed", failure, "bad"), max_context_tokens)
            return

        final = postprocess_answer(final, final=True) or "I’m not sure how to answer that yet."
        hist[-1] = {"role": "assistant", "content": final}
        yield ui_frame("", hist, status_html("Done", progress_text(), "good"), max_context_tokens)

    def regenerate(
        history,
        chat_name,
        system_prompt,
        memory_notes,
        preset,
        history_turns,
        max_context_tokens,
        max_new_tokens,
        temperature,
        top_k,
        top_p,
        min_p,
        repetition_penalty,
        frequency_penalty,
        greedy,
        no_repeat_ngram,
        stream_every,
        show_pass_metrics,
    ):
        """Drop the last assistant reply (if any) and answer the last user message again."""
        hist = normalize_history(history)
        # Clamp exactly like respond() so the monitor shows the same context
        # size on every path.
        shown_context = safe_int(max_context_tokens, default_context, context_min, context_max)
        if not hist:
            yield ui_frame("", hist, status_html("Nothing to regenerate", "Send a message first.", "warn"), shown_context)
            return
        work = hist[:-1] if hist[-1]["role"] == "assistant" else hist[:]
        if not work or work[-1]["role"] != "user":
            yield ui_frame("", hist, status_html("Cannot regenerate", "Last turn is not a user message.", "warn"), shown_context)
            return
        last_msg = work[-1]["content"]
        earlier = work[:-1]
        yield from respond(
            last_msg,
            earlier,
            chat_name,
            system_prompt,
            memory_notes,
            preset,
            history_turns,
            max_context_tokens,
            max_new_tokens,
            temperature,
            top_k,
            top_p,
            min_p,
            repetition_penalty,
            frequency_penalty,
            greedy,
            no_repeat_ngram,
            stream_every,
            show_pass_metrics,
        )

    def new_chat():
        """Reset the conversation and chat name; memory notes are untouched."""
        return [], "New chat", status_html("New chat", "Fresh conversation. Memory notes are kept.", "good")

    def forget_last(history):
        """Remove the last two history entries (the latest exchange)."""
        hist = normalize_history(history)
        if len(hist) >= 2:
            return hist[:-2], status_html("Forgot last turn", "Removed the latest exchange.", "good")
        # Nothing is removed, so hand the history back unchanged rather than
        # clearing a lone message while reporting that nothing happened.
        return hist, status_html("Nothing to forget", "No full turn to remove.", "warn")

    def apply_preset(name):
        """Return slider values for preset *name*; unknown names use "Balanced"."""
        p = PRESETS.get(name, PRESETS["Balanced"])
        context = 32768 if name == "Long 32K" else default_context
        return (
            p["temperature"],
            p["top_k"],
            p["top_p"],
            p["min_p"],
            p["repetition_penalty"],
            p["frequency_penalty"],
            p["max_new_tokens"],
            context,
        )

    def save_and_list(history, chat_name, memory_notes):
        """Save the chat and refresh the "Saved chats" dropdown choices."""
        path, status, choices = save_chat(history, chat_name, memory_notes)
        # Returning a bare list to a Dropdown sets its *value*; wrap it in an
        # update so the list becomes the available choices instead.
        return path, status, gr.update(choices=choices)

    css = """
:root{
--bg:#05060d;--panel:#0b1020;--panel2:#101827;--line:rgba(148,163,184,.16);
--text:#edf2ff;--muted:#94a3b8;--accent:#8b5cf6;--accent2:#22d3ee;--good:#22c55e;--bad:#ef4444;
}
html, body, .gradio-container{
background: radial-gradient(circle at top left, rgba(139,92,246,.23), transparent 34%),
radial-gradient(circle at top right, rgba(34,211,238,.14), transparent 30%),
linear-gradient(180deg,#05060d,#070a12 62%,#02030a)!important;
color:var(--text)!important;
}
.gradio-container{max-width:1680px!important;margin:auto!important;font-family:Inter,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif!important;}
footer{display:none!important}.main-wrap{gap:18px!important}.sidebar{padding:16px;border:1px solid var(--line);border-radius:28px;background:rgba(9,14,28,.76);box-shadow:0 20px 70px rgba(0,0,0,.32)}
.brand{padding:10px 4px 18px}.brand h1{margin:0;font-size:32px;letter-spacing:-.06em;color:#fff}.brand p{margin:5px 0 0;color:var(--muted);font-size:13px}.brand .badge{display:inline-flex;margin-top:12px;padding:7px 10px;border-radius:999px;border:1px solid rgba(34,211,238,.28);background:rgba(8,145,178,.12);color:#cffafe;font-weight:800;font-size:12px}
.chat-shell{padding:16px;border:1px solid var(--line);border-radius:32px;background:rgba(5,8,18,.62);box-shadow:0 30px 110px rgba(0,0,0,.38)}
#chatbot{height:760px!important;border:0!important;background:transparent!important;overflow:hidden!important}.message{font-size:15.5px!important;line-height:1.62!important}.message-wrap{max-width:900px!important}.bot .message, .assistant .message{background:rgba(15,23,42,.72)!important;border:1px solid rgba(148,163,184,.13)!important;border-radius:22px!important}.user .message{background:linear-gradient(135deg,rgba(124,58,237,.70),rgba(59,130,246,.42))!important;border:1px solid rgba(167,139,250,.35)!important;border-radius:22px!important;color:white!important}
#chatbot pre{background:#101827!important;border:1px solid rgba(148,163,184,.22)!important;border-radius:18px!important;padding:16px!important;box-shadow:inset 0 1px 0 rgba(255,255,255,.04)!important}#chatbot code{font-family:'JetBrains Mono','Cascadia Code','SFMono-Regular',Consolas,monospace!important;font-size:14px!important}#chatbot p{margin:0 0 .7em!important}#chatbot ul,#chatbot ol{margin-top:.3em!important}
.composer-card{display:flex;gap:12px;align-items:end;padding:10px;border-radius:26px;border:1px solid rgba(139,92,246,.28);background:rgba(2,6,23,.80);box-shadow:0 20px 70px rgba(139,92,246,.12)}#composer textarea{min-height:72px!important;max-height:190px!important;background:transparent!important;border:0!important;color:#fff!important;font-size:16px!important;line-height:1.5!important;box-shadow:none!important}.input-container{border:0!important;background:transparent!important}.form{border:0!important;background:transparent!important}label{color:#cbd5e1!important;font-weight:700!important}
button{border-radius:16px!important;font-weight:850!important;border:1px solid rgba(148,163,184,.16)!important;box-shadow:0 10px 28px rgba(0,0,0,.22)!important}.send-btn{min-height:56px!important;background:linear-gradient(135deg,#8b5cf6,#06b6d4)!important;color:white!important}.side-btn button,.side-btn{width:100%!important}
.monitor-bar{display:grid;grid-template-columns:repeat(8,minmax(110px,1fr));gap:10px;margin:0 0 12px}.mon-card{padding:12px 13px;border:1px solid var(--line);border-radius:18px;background:rgba(15,23,42,.78);min-height:64px}.mon-card b{display:block;font-size:20px;color:#fff;white-space:nowrap}.mon-card span{display:block;color:var(--muted);font-size:11px;margin-top:3px}.mon-card.hot{background:linear-gradient(135deg,rgba(139,92,246,.30),rgba(34,211,238,.16));border-color:rgba(34,211,238,.30)}.mon-card.wide b{font-size:15px}.status-pill{display:flex;align-items:center;gap:10px;margin:0 0 12px;padding:10px 13px;border-radius:18px;border:1px solid var(--line);background:rgba(2,6,23,.72)}.status-pill b{display:block}.status-pill small{display:block;color:var(--muted);font-size:12px}.pulse-dot{width:10px;height:10px;border-radius:99px;background:var(--accent2);box-shadow:0 0 0 7px rgba(34,211,238,.10),0 0 25px rgba(34,211,238,.55)}.status-good .pulse-dot{background:var(--good);box-shadow:0 0 0 7px rgba(34,197,94,.12),0 0 25px rgba(34,197,94,.5)}.status-bad .pulse-dot{background:var(--bad)}.status-live .pulse-dot{animation:pulse 1.1s infinite}@keyframes pulse{0%{transform:scale(1)}50%{transform:scale(1.45)}100%{transform:scale(1)}}
.gr-box,.gr-panel,.block{background:transparent!important;border-color:var(--line)!important}.sidebar textarea,.sidebar input,.sidebar select,.sidebar .wrap{background:rgba(2,6,23,.62)!important;color:#e5e7eb!important;border-color:rgba(148,163,184,.16)!important;border-radius:14px!important}.small-note{color:#94a3b8;font-size:12px}.tokenbox textarea,.jsonbox textarea{font-family:'JetBrains Mono','Cascadia Code',Consolas,monospace!important;font-size:12px!important;background:#060914!important}
@media(max-width:1100px){.monitor-bar{grid-template-columns:repeat(2,1fr)}.sidebar{display:none}.chat-shell{padding:8px}}
"""

    theme = gr.themes.Base(primary_hue="violet", secondary_hue="cyan", neutral_hue="slate")

    # NOTE(review): the container nesting below is inferred from widget roles
    # (for example, "Export .md" placed outside the Refresh/Load row) — confirm
    # against the intended layout.
    with gr.Blocks(title=APP_NAME, css=css, theme=theme) as demo:
        with gr.Row(elem_classes=["main-wrap"]):
            # ---- Sidebar: chat management, memory, token trace, settings ----
            with gr.Column(scale=1, min_width=270, elem_classes=["sidebar"]):
                gr.HTML("""
<div class="brand">
<h1>LuluV2</h1>
<p>Offline VWM local assistant.</p>
<span class="badge">LOCAL EDGE MODE</span>
</div>
""")
                new_btn = gr.Button("+ New chat", variant="primary", elem_classes=["side-btn"])
                save_btn = gr.Button("Save chat", elem_classes=["side-btn"])
                # Hidden textboxes exist only to receive the path outputs.
                saved_path = gr.Textbox(label="Last saved path", interactive=False, visible=False)
                saved_chats = gr.Dropdown(choices=list_saved_chats(), label="Saved chats", value=None, interactive=True)
                with gr.Row():
                    refresh_chats = gr.Button("Refresh")
                    load_btn = gr.Button("Load")
                export_btn = gr.Button("Export .md", elem_classes=["side-btn"])
                export_path = gr.Textbox(label="Export path", interactive=False, visible=False)

                with gr.Accordion("Memory", open=True):
                    memory_notes = gr.Textbox(label="Persistent memory notes", value=read_memory(), lines=8, placeholder="Things Lulu should remember locally...")
                    memory_path = gr.Textbox(label="Memory path", interactive=False, visible=False)
                    save_mem_btn = gr.Button("Save memory")

                with gr.Accordion("Live tokens", open=False):
                    token_trace = gr.Textbox(label="Recent generated tokens", value="No tokens generated yet.", lines=14, elem_classes=["tokenbox"])

                with gr.Accordion("Advanced", open=False):
                    chat_name = gr.Textbox(label="Chat name", value="New chat")
                    preset = gr.Dropdown(label="Preset", choices=list(PRESETS.keys()), value="Balanced")
                    system_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=9)
                    history_turns = gr.Slider(0, 24, value=4, step=1, label="History turns sent")
                    max_context_tokens = gr.Slider(context_min, context_max, value=default_context, step=128, label="Max context tokens")
                    max_new_tokens = gr.Slider(16, 8192, value=768, step=16, label="Max new tokens")
                    temperature = gr.Slider(0.0, 2.0, value=0.65, step=0.01, label="Temperature")
                    top_k = gr.Slider(0, 500, value=40, step=1, label="Top-k")
                    top_p = gr.Slider(0.01, 1.0, value=0.90, step=0.01, label="Top-p")
                    min_p = gr.Slider(0.0, 0.5, value=0.03, step=0.005, label="Min-p")
                    repetition_penalty = gr.Slider(1.0, 3.0, value=1.10, step=0.01, label="Repetition penalty")
                    frequency_penalty = gr.Slider(0.0, 3.0, value=0.02, step=0.01, label="Frequency penalty")
                    greedy = gr.Checkbox(value=False, label="Greedy")
                    no_repeat_ngram = gr.Slider(0, 16, value=4, step=1, label="No-repeat ngram")
                    stream_every = gr.Slider(1, 64, value=1, step=1, label="Stream every N tokens")
                    show_pass_metrics = gr.Checkbox(value=True, label="Measure pass1/pass2 before generation")
                    insert_32k = gr.Button("Insert 32K stress prompt")

            # ---- Main column: monitor, status, chat, composer ----
            with gr.Column(scale=4, elem_classes=["chat-shell"]):
                monitor = gr.HTML(metric_cards(engine, default_context))
                status = gr.HTML(status_html("Ready", f"{engine.model_info.get('checkpoint_size')} checkpoint · {engine.model_info.get('device')}", "good"))
                chatbot = create_chatbot()
                with gr.Row(elem_classes=["composer-card"]):
                    msg = gr.Textbox(show_label=False, placeholder="Message LuluV2...", lines=3, elem_id="composer", scale=12)
                    send_btn = gr.Button("Send", variant="primary", elem_classes=["send-btn"], scale=2)
                with gr.Row():
                    stop_btn = gr.Button("Stop")
                    regen_btn = gr.Button("Regenerate")
                    forget_btn = gr.Button("Forget last turn")
                    prompt_32k_btn = gr.Button("Try 32K prompt")
                with gr.Accordion("Raw metrics", open=False):
                    raw_metrics = gr.JSON(label="Raw metrics")
                    usage_text = gr.Textbox(label="RAM / VRAM / model stats", value=system_usage(engine), lines=18, elem_classes=["jsonbox"])

        # ---- Event wiring ----
        # Settings shared by respond() and regenerate(); the order must match
        # both functions' parameter lists.
        settings = [
            chat_name, system_prompt, memory_notes, preset,
            history_turns, max_context_tokens, max_new_tokens, temperature, top_k, top_p,
            min_p, repetition_penalty, frequency_penalty, greedy, no_repeat_ngram,
            stream_every, show_pass_metrics,
        ]
        inputs = [msg, chatbot, *settings]
        regen_inputs = [chatbot, *settings]
        outputs = [msg, chatbot, status, monitor, token_trace, raw_metrics]

        send_event = send_btn.click(respond, inputs=inputs, outputs=outputs)
        enter_event = msg.submit(respond, inputs=inputs, outputs=outputs)
        regen_event = regen_btn.click(regenerate, inputs=regen_inputs, outputs=outputs)
        # A single Stop handler cancels whichever generation is running.
        stop_btn.click(fn=None, inputs=None, outputs=None, cancels=[send_event, enter_event, regen_event])

        new_btn.click(new_chat, outputs=[chatbot, chat_name, status])
        forget_btn.click(forget_last, inputs=[chatbot], outputs=[chatbot, status])
        save_btn.click(save_and_list, inputs=[chatbot, chat_name, memory_notes], outputs=[saved_path, status, saved_chats])
        refresh_chats.click(lambda: gr.update(choices=list_saved_chats()), outputs=[saved_chats])
        load_btn.click(load_chat, inputs=[saved_chats], outputs=[chatbot, chat_name, memory_notes, status])
        export_btn.click(export_markdown, inputs=[chatbot, chat_name], outputs=[export_path, status])
        save_mem_btn.click(write_memory, inputs=[memory_notes], outputs=[memory_path, status])
        preset.change(apply_preset, inputs=[preset], outputs=[temperature, top_k, top_p, min_p, repetition_penalty, frequency_penalty, max_new_tokens, max_context_tokens])
        insert_32k.click(make_32k_prompt, outputs=[msg])
        prompt_32k_btn.click(make_32k_prompt, outputs=[msg])

    return demo
|
|
|
|
def parse_args():
    """Parse the command-line options for the chat UI.

    Returns:
        The ``argparse.Namespace`` with ``ckpt``, ``model_py``,
        ``tokenizer_dir``, ``host``, ``port``, ``device``, ``dtype``,
        ``max_context``, ``share``, ``inbrowser`` and ``base_only``.
    """
    parser = argparse.ArgumentParser()
    # Declared as data so each flag and its settings sit on one row; the order
    # is the order shown by --help.
    option_specs = (
        ("--ckpt", {"default": "LULU2_instruct_ddp.pt"}),
        ("--model-py", {"default": "luluv2_inference_runtime.py"}),
        ("--tokenizer-dir", {"default": "tokenizer"}),
        ("--host", {"default": "127.0.0.1"}),
        ("--port", {"type": int, "default": 7862}),
        ("--device", {"default": "cuda"}),
        ("--dtype", {"default": "bf16"}),
        ("--max-context", {"type": int, "default": 32768}),
        ("--share", {"action": "store_true"}),
        ("--inbrowser", {"action": "store_true"}),
        ("--base-only", {"action": "store_true"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
|
|
|
def main():
    """Load the engine from the command-line options and launch the UI."""
    args = parse_args()

    # Set both offline flags to "1" unless the caller already set them.
    for flag in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE"):
        os.environ.setdefault(flag, "1")

    engine = LULUV2LiveEngine(
        ckpt_path=args.ckpt,
        model_py=args.model_py,
        tokenizer_dir=args.tokenizer_dir,
        device=args.device,
        dtype=args.dtype,
        local_files_only=True,
        no_config_download=True,
        force_base_only=bool(args.base_only),
    )

    # Keep the starting context inside the range the context slider supports.
    start_context = safe_int(args.max_context, 32768, 128, 32768)
    demo = build_app(engine, default_context=start_context)

    queued = demo.queue(default_concurrency_limit=1)
    queued.launch(
        server_name=args.host,
        server_port=int(args.port),
        share=bool(args.share),
        inbrowser=bool(args.inbrowser),
        show_error=True,
    )


if __name__ == "__main__":
    main()
|
|