Trillim / app.py
wop's picture
Update app.py
a20d278 verified
"""
Trillim Chat — Gradio 6 front-end for Trillim CPU inference.
Startup flow:
1. Monkey-patch TOKEN_PROGRESS_TIMEOUT_SECONDS before the LLM starts,
   so slow CPU inference doesn't time out after only 5 s.
2. Pull the model from the Trillim HF namespace (no-op if already cached).
3. Start the Trillim LLM component via Runtime.
4. Serve the Gradio chat UI on port 7860.
"""
import os
import shutil
import subprocess
import sys
import threading
from pathlib import Path
import gradio as gr
# ── Model to use ──────────────────────────────────────────────────────────────
MODEL_ID = "Trillim/BitNet-TRNQ"

# ── Patch the inference-engine progress timeout BEFORE importing LLM ──────────
# The default is 5 s — far too short for a slow CPU Space.
# We patch the constant on the public module so LLM.__init__ picks it up.
# NOTE(review): this only takes effect if Trillim reads the constant at call
# time rather than binding it as a default argument at import — confirm.
_PROGRESS_TIMEOUT = 120.0  # seconds to wait for the first / next token
try:
    import trillim.components.llm.public as _llm_pub

    _llm_pub.TOKEN_PROGRESS_TIMEOUT_SECONDS = _PROGRESS_TIMEOUT
    print(
        f"[trillim] progress_timeout patched → {_PROGRESS_TIMEOUT} s",
        flush=True,
    )
except Exception as _patch_err:
    # Deliberate best effort: a failed patch must not stop the app from
    # starting, so the problem is only reported on stdout.
    print(f"[trillim] WARNING: could not patch timeout: {_patch_err}", flush=True)
# ── Global runtime handle ─────────────────────────────────────────────────────
_runtime = None
_ready = threading.Event()
_startup_error: str | None = None
# Use all available CPUs; Trillim default (0) sometimes under-uses them.
_NUM_THREADS = os.cpu_count() or 2
def _pull_model() -> None:
    """Run ``trillim pull MODEL_ID`` through the Trillim command-line binary.

    The binary is looked up on ``PATH`` first; if it is not found there, the
    ``trillim`` entry next to the running Python interpreter is used instead.
    The child process inherits this process's stdout/stderr.

    Raises:
        RuntimeError: if the CLI exits with a non-zero status.
    """
    cli_path = shutil.which("trillim")
    if not cli_path:
        # Not on PATH: fall back to the interpreter's own directory.
        cli_path = str(Path(sys.executable).parent / "trillim")

    print(f"[trillim] Pulling {MODEL_ID} using '{cli_path}' …", flush=True)

    command = [cli_path, "pull", MODEL_ID]
    exit_code = subprocess.run(command, capture_output=False).returncode
    if exit_code != 0:
        raise RuntimeError(f"trillim pull exited with code {exit_code}")

    print("[trillim] Pull complete.", flush=True)
def _start_runtime() -> None:
    """Pull the model, start the Trillim Runtime, and signal readiness.

    Meant to run on a background thread. On success the entered runtime is
    stored in the module-level ``_runtime``. On failure a non-empty
    description is stored in ``_startup_error``. In both cases ``_ready`` is
    set afterwards so that waiters are released.

    NOTE(review): the runtime is entered via ``__enter__`` and no matching
    ``__exit__`` call is visible in this file — confirm whether explicit
    shutdown is needed.
    """
    global _runtime, _startup_error
    try:
        _pull_model()

        # Imported inside the try so an import failure is reported as a
        # start-up error instead of escaping from the thread.
        from trillim import LLM, Runtime

        print(
            f"[trillim] Starting Runtime with {MODEL_ID} "
            f"(threads={_NUM_THREADS}, timeout={_PROGRESS_TIMEOUT}s) …",
            flush=True,
        )
        runtime = Runtime(LLM(MODEL_ID, num_threads=_NUM_THREADS))
        runtime.__enter__()
        # Publish the handle only after it has started successfully, so the
        # global never refers to a runtime that failed to start.
        _runtime = runtime
        print("[trillim] Runtime ready.", flush=True)
    except Exception as exc:
        # str(exc) is empty for message-less exceptions (e.g. RuntimeError());
        # fall back to repr so the recorded error is never an empty string
        # that a truthiness check would mistake for "no error".
        _startup_error = str(exc) or repr(exc)
        print(
            f"[trillim] Startup failed: {_startup_error}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        # Always release waiters, whether start-up succeeded or failed.
        _ready.set()
# Run model pull + runtime start-up off the main thread so the rest of the
# module can continue; daemon=True keeps this thread from blocking exit.
threading.Thread(
    target=_start_runtime,
    daemon=True,
).start()
# ── Chat logic ────────────────────────────────────────────────────────────────
def _wait_or_raise(timeout: float = 600.0) -> None:
    """Block until background start-up has finished, or raise a clear error.

    Args:
        timeout: Maximum number of seconds to wait for the ``_ready`` event.

    Raises:
        RuntimeError: if start-up did not finish within ``timeout`` seconds,
            or if it finished with a recorded start-up error.
    """
    if not _ready.wait(timeout=timeout):
        # Report the timeout actually used; the default of 600 s still
        # renders as "10 minutes".
        raise RuntimeError(
            f"Trillim runtime did not become ready within {timeout / 60:g} minutes."
        )
    # Compare against None rather than relying on truthiness, so a failure
    # recorded with an empty message is still reported.
    if _startup_error is not None:
        raise RuntimeError(f"Trillim startup error: {_startup_error}")
def _message_text(content) -> str:
    """Flatten one chat message's ``content`` value into plain text.

    A plain string is returned unchanged. A dict, or a list/tuple of parts, is
    reduced to its string parts and the ``"text"`` values of dict parts,
    joined with newlines; parts without text are dropped. ``None`` becomes an
    empty string and anything else is passed through ``str()``.
    """
    if isinstance(content, str):
        return content
    if content is None:
        return ""
    if isinstance(content, dict):
        content = [content]
    if isinstance(content, (list, tuple)):
        pieces: list[str] = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "\n".join(pieces)
    return str(content)


def chat_fn(
    message: str,
    history: list[dict],  # Gradio 6: always [{"role":…, "content":…}, …]
    system_prompt: str,
    temperature: float,
    max_new_tokens: int,
):
    """Streaming chat handler — yields the growing assistant reply.

    Args:
        message: The new user message.
        history: Earlier turns as dicts with ``role`` and ``content`` keys.
        system_prompt: Optional system prompt; ignored when blank.
        temperature: Sampling temperature forwarded to the LLM.
        max_new_tokens: Forwarded to the LLM as ``max_tokens``.

    Yields:
        The accumulated assistant text after each token event.

    Raises:
        RuntimeError: if the runtime is not ready or failed to start.
    """
    _wait_or_raise()
    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent

    messages: list[dict] = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    # Forward only role + plain-text content. Gradio history entries may
    # carry extra keys or list-of-blocks content, depending on the Gradio
    # version; presumably the LLM expects plain chat messages — TODO confirm.
    for turn in history or []:
        messages.append(
            {
                "role": turn["role"],
                "content": _message_text(turn.get("content")),
            }
        )
    messages.append({"role": "user", "content": message})

    partial = ""
    for event in _runtime.llm.stream_chat(
        messages,
        temperature=temperature,
        # The slider value may arrive as a float; coerce it to int.
        max_tokens=int(max_new_tokens),
    ):
        if isinstance(event, ChatTokenEvent):
            partial += event.text
            yield partial
        elif isinstance(event, ChatDoneEvent):
            break
# ── Gradio 6 UI ───────────────────────────────────────────────────────────────
# Markdown header shown above the chat; interpolates the active model, thread
# count, and token timeout.
DESCRIPTION = f"""
## 🧠 Trillim Chat
Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI.
Model: **{MODEL_ID}** · Threads: **{_NUM_THREADS}** · Token timeout: **{int(_PROGRESS_TIMEOUT)} s**
> ⏳ The model loads in the background. If you send a message while it's still loading,
> the request will wait automatically — no need to refresh.
"""

with gr.Blocks(title="Trillim Chat") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=chat_fn,
        chatbot=gr.Chatbot(
            elem_id="chatbot",  # targeted by the CSS passed to launch()
            show_label=False,
            render_markdown=True,
        ),
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
        # Order must match chat_fn's extra parameters:
        # system_prompt, temperature, max_new_tokens.
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful, concise assistant.",
                label="System prompt",
                lines=2,
            ),
            gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature"),
            gr.Slider(64, 2048, value=256, step=64, label="Max new tokens"),
        ],
        title=None,
        submit_btn="Send",
        stop_btn="Stop",
    )
    gr.Markdown(
        "---\n"
        "Built with [Trillim](https://github.com/Trillim/Trillim) · "
        "[Gradio](https://gradio.app) · Runs 100 % on CPU."
    )
if __name__ == "__main__":
    # Enable request queuing, then serve on all interfaces at port 7860.
    queued_demo = demo.queue()
    soft_theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
    )
    # Fix the chat panel height and hide the footer.
    page_css = "#chatbot { height: 520px; } footer { display: none !important; }"
    queued_demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        theme=soft_theme,
        css=page_css,
    )