"""Trillim Chat — Gradio 6 front-end for Trillim CPU inference.

Startup flow:
  1. Monkey-patch TOKEN_PROGRESS_TIMEOUT_SECONDS before the LLM starts, so
     slow CPU inference doesn't time out after only 5 s.
  2. Pull the model from the Trillim HF namespace (no-op if already cached).
  3. Start the Trillim LLM component via Runtime.
  4. Serve the Gradio chat UI on port 7860.
"""

import os
import shutil
import subprocess
import sys
import threading
from pathlib import Path

import gradio as gr

# ── Model to use ──────────────────────────────────────────────────────────────
MODEL_ID = "Trillim/BitNet-TRNQ"

# ── Patch the inference-engine progress timeout BEFORE importing LLM ──────────
# The stock value (5 s) is far too short for a slow CPU Space, so the constant
# on the public module is overwritten here, before anything imports LLM.
# NOTE(review): this only helps if LLM reads the module attribute at call time
# rather than capturing it at import — confirm against the Trillim source.
_PROGRESS_TIMEOUT = 120.0  # seconds to wait for the first / next token

try:
    import trillim.components.llm.public as _llm_pub

    _llm_pub.TOKEN_PROGRESS_TIMEOUT_SECONDS = _PROGRESS_TIMEOUT
    print(f"[trillim] progress_timeout patched → {_PROGRESS_TIMEOUT} s", flush=True)
except Exception as _patch_err:
    # Best-effort: the app still starts with the library's own timeout.
    print(f"[trillim] WARNING: could not patch timeout: {_patch_err}", flush=True)

# ── Global runtime handle ─────────────────────────────────────────────────────
_runtime = None  # started Runtime, published by the startup thread
_ready = threading.Event()  # set once startup has finished (success or failure)
_startup_error: str | None = None  # failure description, if startup failed

# Use all available CPUs; Trillim default (0) sometimes under-uses them.
_NUM_THREADS = os.cpu_count() or 2  # fall back to 2 if the count is unknown


def _pull_model() -> None:
    """Pull the model bundle into the Trillim managed store via the CLI binary.

    The ``trillim`` executable is looked up on PATH first, then next to the
    running Python interpreter.

    Raises:
        RuntimeError: if the CLI cannot be executed or exits non-zero.
    """
    trillim_bin = shutil.which("trillim") or str(
        Path(sys.executable).parent / "trillim"
    )
    print(f"[trillim] Pulling {MODEL_ID} using '{trillim_bin}' …", flush=True)
    try:
        # Output is deliberately not captured so pull progress reaches the logs.
        result = subprocess.run(
            [trillim_bin, "pull", MODEL_ID],
            capture_output=False,
            check=False,
        )
    except OSError as err:
        # Missing or non-executable binary: report which path was tried.
        raise RuntimeError(
            f"could not execute trillim CLI at '{trillim_bin}': {err}"
        ) from err
    if result.returncode != 0:
        raise RuntimeError(f"trillim pull exited with code {result.returncode}")
    print("[trillim] Pull complete.", flush=True)


def _start_runtime() -> None:
    """Background thread: pull the model then start the Trillim Runtime.

    On success the started runtime is published in ``_runtime``; on failure a
    non-empty description is stored in ``_startup_error``.  ``_ready`` is set
    in both cases so waiting requests are always released.
    """
    global _runtime, _startup_error
    try:
        _pull_model()
        from trillim import LLM, Runtime

        print(
            f"[trillim] Starting Runtime with {MODEL_ID} "
            f"(threads={_NUM_THREADS}, timeout={_PROGRESS_TIMEOUT}s) …",
            flush=True,
        )
        runtime = Runtime(LLM(MODEL_ID, num_threads=_NUM_THREADS))
        runtime.__enter__()
        # Publish only after __enter__ succeeded, so a half-started runtime is
        # never visible to chat handlers.
        # NOTE(review): __exit__ is never called; the runtime lives until the
        # process exits — confirm that is acceptable for Trillim.
        _runtime = runtime
        print("[trillim] Runtime ready.", flush=True)
    except Exception as exc:
        # str(exc) is empty for message-less exceptions; fall back to repr so
        # the failure is never mistaken for success.
        _startup_error = str(exc) or repr(exc)
        print(f"[trillim] Startup failed: {exc}", file=sys.stderr, flush=True)
    finally:
        _ready.set()


threading.Thread(target=_start_runtime, daemon=True).start()


# ── Chat logic ────────────────────────────────────────────────────────────────
def _wait_or_raise(timeout: float = 600.0) -> None:
    """Block until the runtime is ready, or surface a clear error.

    Args:
        timeout: Maximum number of seconds to wait for startup to finish.

    Raises:
        RuntimeError: if startup did not finish in time, failed, or finished
            without producing a runtime.
    """
    if not _ready.wait(timeout=timeout):
        raise RuntimeError(
            f"Trillim runtime did not become ready within {timeout / 60:g} minutes."
        )
    if _startup_error is not None:
        raise RuntimeError(f"Trillim startup error: {_startup_error}")
    if _runtime is None:
        # Startup thread ended without an error message and without a runtime
        # (e.g. it was interrupted by a non-Exception exit).
        raise RuntimeError("Trillim runtime is not available.")


def _content_to_text(content) -> str:
    """Flatten a chat ``content`` value (str, part dict, or list of parts) to text."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # Structured part such as {"type": "text", "text": "..."}; parts
        # without a string "text" field (files, etc.) contribute nothing.
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        pieces = (_content_to_text(part) for part in content)
        return "\n".join(piece for piece in pieces if piece)
    return str(content)


def _build_messages(message, history, system_prompt) -> list[dict]:
    """Assemble plain ``{"role", "content"}`` messages for the model."""
    messages: list[dict] = []
    prompt = (system_prompt or "").strip()
    if prompt:
        messages.append({"role": "system", "content": prompt})
    for turn in history or []:
        if not isinstance(turn, dict) or not turn.get("role"):
            continue  # not a role/content message; nothing usable to forward
        # Keep only role + text so UI-only keys (metadata, options, …) are not
        # passed on to the inference engine.
        messages.append(
            {"role": turn["role"], "content": _content_to_text(turn.get("content"))}
        )
    messages.append({"role": "user", "content": _content_to_text(message)})
    return messages


def chat_fn(
    message: str,
    history: list[dict],  # Gradio 6: always [{"role":…, "content":…}, …]
    system_prompt: str,
    temperature: float,
    max_new_tokens: int,
):
    """Streaming chat handler — yields partial assistant strings.

    Args:
        message: The new user message.
        history: Previous turns as role/content dicts supplied by Gradio.
        system_prompt: Optional system prompt; ignored when blank.
        temperature: Sampling temperature forwarded to the model.
        max_new_tokens: Generation cap, forwarded as ``max_tokens``.

    Yields:
        The assistant reply accumulated so far, once per received token event.

    Raises:
        RuntimeError: if the Trillim runtime failed to start or is not ready.
    """
    _wait_or_raise()
    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent

    messages = _build_messages(message, history, system_prompt)
    stream = _runtime.llm.stream_chat(
        messages,
        temperature=float(temperature),
        max_tokens=int(max_new_tokens),  # sliders may deliver floats
    )
    partial = ""
    try:
        for event in stream:
            if isinstance(event, ChatTokenEvent):
                partial += event.text
                yield partial
            elif isinstance(event, ChatDoneEvent):
                break
    finally:
        # Runs on normal completion and when Gradio stops the generator, so the
        # underlying stream is released promptly if it supports closing.
        close = getattr(stream, "close", None)
        if callable(close):
            close()


# ── Gradio 6 UI ───────────────────────────────────────────────────────────────
DESCRIPTION = f"""
## 🧠 Trillim Chat
Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI.

Model: **{MODEL_ID}** · Threads: **{_NUM_THREADS}** · Token timeout: **{int(_PROGRESS_TIMEOUT)} s**

> ⏳ The model loads in the background. If you send a message while it's still loading,
> the request will wait automatically — no need to refresh.
"""

with gr.Blocks(title="Trillim Chat") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=chat_fn,
        chatbot=gr.Chatbot(
            elem_id="chatbot",
            show_label=False,
            render_markdown=True,
        ),
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
        # Order must match chat_fn's extra parameters:
        # system_prompt, temperature, max_new_tokens.
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful, concise assistant.",
                label="System prompt",
                lines=2,
            ),
            gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature"),
            gr.Slider(64, 2048, value=256, step=64, label="Max new tokens"),
        ],
        title=None,
        submit_btn="Send",
        stop_btn="Stop",
    )
    gr.Markdown(
        "---\n"
        "Built with [Trillim](https://github.com/Trillim/Trillim) · "
        "[Gradio](https://gradio.app) · Runs 100 % on CPU."
    )

if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        theme=gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="purple",
            neutral_hue="slate",
        ),
        css="#chatbot { height: 520px; } footer { display: none !important; }",
    )