| """ |
Trillim Chat — Gradio 6 front-end for Trillim CPU inference.

Startup flow:
  1. Monkey-patch TOKEN_PROGRESS_TIMEOUT_SECONDS before the LLM starts,
     so slow CPU inference doesn't time out after only 5 s.
  2. Pull the model from the Trillim HF namespace (no-op if already cached).
  3. Start the Trillim LLM component via Runtime.
  4. Serve the Gradio chat UI on port 7860.
| """ |
|
|
| import os |
| import shutil |
| import subprocess |
| import sys |
| import threading |
| from pathlib import Path |
|
|
| import gradio as gr |
|
|
| |
# Identifier of the model bundle that is pulled and then served.
MODEL_ID = "Trillim/BitNet-TRNQ"

# Seconds allowed between tokens before a generation is considered stalled.
# Raised well above Trillim's stock value so slow CPU inference is not cut off.
_PROGRESS_TIMEOUT = 120.0

# Best-effort monkey-patch, applied at import time so it is in place before the
# LLM component is started. A failure here must never prevent the UI from
# coming up, hence the deliberately broad ``except`` that only logs a warning.
try:
    import trillim.components.llm.public as _llm_pub

    _llm_pub.TOKEN_PROGRESS_TIMEOUT_SECONDS = _PROGRESS_TIMEOUT
except Exception as _patch_err:
    print(f"[trillim] WARNING: could not patch timeout: {_patch_err}", flush=True)
else:
    # Report success only once the override has actually been applied.
    print(
        f"[trillim] progress_timeout patched → {_PROGRESS_TIMEOUT} s",
        flush=True,
    )
|
|
| |
| _runtime = None |
| _ready = threading.Event() |
| _startup_error: str | None = None |
|
|
| |
| _NUM_THREADS = os.cpu_count() or 2 |
|
|
|
|
def _pull_model() -> None:
    """Pull the model bundle into the Trillim managed store via the CLI binary.

    The ``trillim`` executable is looked up on ``PATH``; if it is not found
    there, the directory containing the running Python interpreter is tried
    instead. The command's output is not captured, so it streams straight to
    this process's stdout/stderr.

    Raises:
        RuntimeError: If ``trillim pull`` exits with a non-zero status.
        OSError: If the executable cannot be launched at all.
    """
    fallback_bin = Path(sys.executable).parent / "trillim"
    trillim_bin = shutil.which("trillim") or str(fallback_bin)

    print(f"[trillim] Pulling {MODEL_ID} using '{trillim_bin}' …", flush=True)
    # Argument list (no shell) so the model id is never shell-interpreted.
    result = subprocess.run([trillim_bin, "pull", MODEL_ID], check=False)
    if result.returncode != 0:
        raise RuntimeError(f"trillim pull exited with code {result.returncode}")
    print("[trillim] Pull complete.", flush=True)
|
|
|
|
def _start_runtime() -> None:
    """Background thread: pull the model then start the Trillim Runtime.

    On success the entered runtime is published in the module-level
    ``_runtime``. On any failure a non-empty description is stored in
    ``_startup_error`` and the error is logged to stderr. In both cases
    ``_ready`` is set at the end so that waiting request handlers stop
    blocking.
    """
    import atexit

    global _runtime, _startup_error
    try:
        _pull_model()

        # Imported here rather than at module import time, so an import
        # failure is recorded as a startup error instead of crashing the app.
        from trillim import LLM, Runtime

        print(
            f"[trillim] Starting Runtime with {MODEL_ID} "
            f"(threads={_NUM_THREADS}, timeout={_PROGRESS_TIMEOUT}s) …",
            flush=True,
        )
        runtime = Runtime(LLM(MODEL_ID, num_threads=_NUM_THREADS))
        # The runtime must outlive this function, so a ``with`` block cannot
        # be used; enter it manually and pair it with an exit hook below.
        runtime.__enter__()
        atexit.register(runtime.__exit__, None, None, None)
        # Publish only a runtime that was entered successfully.
        _runtime = runtime
        print("[trillim] Runtime ready.", flush=True)
    except Exception as exc:
        # Include the exception type: ``str(exc)`` alone can be empty, and an
        # empty error string must not be mistaken for "no error".
        _startup_error = f"{type(exc).__name__}: {exc}"
        print(f"[trillim] Startup failed: {exc}", file=sys.stderr, flush=True)
    finally:
        _ready.set()
|
|
|
|
# Kick off the model pull and runtime start-up in the background as soon as
# the module is imported. The thread is a daemon so it never keeps the
# interpreter alive on shutdown.
_startup_thread = threading.Thread(target=_start_runtime, daemon=True)
_startup_thread.start()
|
|
|
|
| |
|
|
def _wait_or_raise(timeout: float = 600.0) -> None:
    """Block until the runtime is ready, or surface a clear error.

    Args:
        timeout: Maximum number of seconds to wait for startup to finish.

    Raises:
        RuntimeError: If startup has not finished within ``timeout`` seconds,
            or if it finished but recorded a startup error.
    """
    if not _ready.wait(timeout=timeout):
        # Report the timeout that was actually used, not a fixed figure.
        raise RuntimeError(
            f"Trillim runtime did not become ready within {timeout:g} seconds."
        )
    # Compare against None: an empty error string still means startup failed.
    if _startup_error is not None:
        raise RuntimeError(f"Trillim startup error: {_startup_error}")
|
|
|
|
def _message_text(content) -> str:
    """Flatten a chat message ``content`` value into plain text.

    Accepts a plain string, a single content-part dict carrying a ``"text"``
    entry, or a list/tuple of such values. Anything else (including parts
    without text, such as files) contributes an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        return "".join(_message_text(part) for part in content)
    return ""


def chat_fn(
    message: str,
    history: list[dict],
    system_prompt: str,
    temperature: float,
    max_new_tokens: int,
):
    """Streaming chat handler — yields partial assistant strings.

    Args:
        message: The new user message.
        history: Previous turns as dicts with at least ``role`` and
            ``content`` keys.
        system_prompt: Optional system prompt; ignored when blank.
        temperature: Sampling temperature forwarded to the LLM.
        max_new_tokens: Upper bound on generated tokens, forwarded to the LLM
            as ``max_tokens``.

    Yields:
        The assistant reply accumulated so far, once per received token event.

    Raises:
        RuntimeError: If the runtime failed to start or did not become ready
            in time.
    """
    _wait_or_raise()

    # Imported lazily so the UI module can load before Trillim is usable.
    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent

    messages: list[dict] = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    # Forward only ``role`` and plain-text ``content``. UI history entries may
    # carry extra keys or structured (list-of-parts) content that the LLM
    # backend is not guaranteed to accept.
    for turn in history or []:
        role = turn.get("role")
        if role is None:
            continue
        messages.append(
            {"role": role, "content": _message_text(turn.get("content"))}
        )
    messages.append({"role": "user", "content": message})

    partial = ""
    for event in _runtime.llm.stream_chat(
        messages,
        temperature=temperature,
        # Slider values may arrive as floats; token budgets must be integers.
        max_tokens=int(max_new_tokens),
    ):
        if isinstance(event, ChatTokenEvent):
            partial += event.text
            yield partial
        elif isinstance(event, ChatDoneEvent):
            break
|
|
|
|
| |
|
|
# Markdown banner shown at the top of the page; interpolates the active model,
# thread count and token timeout so users can see the running configuration.
DESCRIPTION = f"""
## 🧠 Trillim Chat

Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI.
Model: **{MODEL_ID}** · Threads: **{_NUM_THREADS}** · Token timeout: **{int(_PROGRESS_TIMEOUT)} s**

> ⏳ The model loads in the background. If you send a message while it's still loading,
> the request will wait automatically — no need to refresh.
"""
|
|
# Page layout: banner, chat interface with a collapsible parameter panel, and
# a footer with credits.
with gr.Blocks(title="Trillim Chat") as demo:
    gr.Markdown(DESCRIPTION)

    gr.ChatInterface(
        fn=chat_fn,
        # elem_id is the hook used by the custom CSS passed to launch().
        chatbot=gr.Chatbot(
            elem_id="chatbot",
            show_label=False,
            render_markdown=True,
        ),
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
        # Order matters: these map positionally onto chat_fn's
        # (system_prompt, temperature, max_new_tokens) parameters.
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful, concise assistant.",
                label="System prompt",
                lines=2,
            ),
            gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature"),
            gr.Slider(64, 2048, value=256, step=64, label="Max new tokens"),
        ],
        title=None,
        submit_btn="Send",
        stop_btn="Stop",
    )

    gr.Markdown(
        "---\n"
        "Built with [Trillim](https://github.com/Trillim/Trillim) · "
        "[Gradio](https://gradio.app) · Runs 100 % on CPU."
    )
|
|
|
|
if __name__ == "__main__":
    # Enable request queuing first, then assemble the launch options.
    app = demo.queue()

    soft_theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
    )
    # Fix the chat pane height and hide the default footer.
    custom_css = "#chatbot { height: 520px; } footer { display: none !important; }"

    # Listen on all interfaces on port 7860 and show handler errors in the UI.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        theme=soft_theme,
        css=custom_css,
    )