File size: 6,496 Bytes
387b4b6 599a2b7 387b4b6 a20d278 387b4b6 a20d278 599a2b7 387b4b6 599a2b7 387b4b6 a20d278 387b4b6 a20d278 387b4b6 599a2b7 abbee5f 387b4b6 599a2b7 387b4b6 599a2b7 387b4b6 a20d278 abbee5f a20d278 abbee5f 387b4b6 abbee5f 387b4b6 599a2b7 387b4b6 a20d278 387b4b6 a20d278 387b4b6 abbee5f 387b4b6 599a2b7 abbee5f 387b4b6 abbee5f 387b4b6 abbee5f 387b4b6 599a2b7 387b4b6 599a2b7 a20d278 599a2b7 387b4b6 599a2b7 abbee5f a20d278 599a2b7 387b4b6 599a2b7 387b4b6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | """
Trillim Chat — Gradio 6 front-end for Trillim CPU inference.
Startup flow:
1. Monkey-patch TOKEN_PROGRESS_TIMEOUT_SECONDS before the LLM starts,
so slow CPU inference doesn't time-out after only 5 s.
2. Pull the model from the Trillim HF namespace (no-op if already cached).
3. Start the Trillim LLM component via Runtime.
4. Serve the Gradio chat UI on port 7860.
"""
import os
import shutil
import subprocess
import sys
import threading
from pathlib import Path
import gradio as gr
# ββ Model to use ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
MODEL_ID = "Trillim/BitNet-TRNQ"
# ββ Patch the inference-engine progress timeout BEFORE importing LLM ββββββββββ
# The default is 5 s β far too short for a slow CPU Space.
# We patch the constant on the public module so LLM.__init__ picks it up.
_PROGRESS_TIMEOUT = 120.0 # seconds to wait for the first / next token
try:
import trillim.components.llm.public as _llm_pub
_llm_pub.TOKEN_PROGRESS_TIMEOUT_SECONDS = _PROGRESS_TIMEOUT
print(
f"[trillim] progress_timeout patched β {_PROGRESS_TIMEOUT} s",
flush=True,
)
except Exception as _patch_err:
print(f"[trillim] WARNING: could not patch timeout: {_patch_err}", flush=True)
# ββ Global runtime handle βββββββββββββββββββββββββββββββββββββββββββββββββββββ
_runtime = None
_ready = threading.Event()
_startup_error: str | None = None
# Use all available CPUs; Trillim default (0) sometimes under-uses them.
_NUM_THREADS = os.cpu_count() or 2
def _pull_model() -> None:
    """Pull the model bundle into the Trillim managed store via the CLI binary.

    The ``trillim`` executable is looked up on ``PATH`` first; when it is not
    found there, the path next to the running Python interpreter is used.
    The child process's output is not captured, so it is written directly to
    this process's stdout/stderr.

    Raises:
        RuntimeError: If ``trillim pull`` exits with a non-zero status.
    """
    trillim_bin = shutil.which("trillim") or str(
        Path(sys.executable).parent / "trillim"
    )
    print(f"[trillim] Pulling {MODEL_ID} using '{trillim_bin}' …", flush=True)
    # check=False: the exit status is inspected below and converted into a
    # RuntimeError with a message specific to this step.
    result = subprocess.run([trillim_bin, "pull", MODEL_ID], check=False)
    if result.returncode != 0:
        raise RuntimeError(f"trillim pull exited with code {result.returncode}")
    print("[trillim] Pull complete.", flush=True)
def _start_runtime() -> None:
    """Background thread: pull the model, then start the Trillim Runtime.

    On success the entered runtime is published in the module-level
    ``_runtime``. On failure a non-empty description of the error is stored
    in ``_startup_error`` and printed to stderr. In both cases ``_ready`` is
    set at the end so that anything waiting on it is released.
    """
    global _runtime, _startup_error
    try:
        _pull_model()
        # Imported here, not at module top, so the import happens only after
        # the module-level timeout patch has had its chance to run.
        from trillim import LLM, Runtime

        print(
            f"[trillim] Starting Runtime with {MODEL_ID} "
            f"(threads={_NUM_THREADS}, timeout={_PROGRESS_TIMEOUT}s) …",
            flush=True,
        )
        runtime = Runtime(LLM(MODEL_ID, num_threads=_NUM_THREADS))
        # Entered manually (no ``with`` block) so the runtime stays open after
        # this function returns.
        runtime.__enter__()
        # Publish only after __enter__ succeeded, so the global never refers
        # to a runtime that failed to start.
        _runtime = runtime
        print("[trillim] Runtime ready.", flush=True)
    except Exception as exc:
        # str(exc) is "" for exceptions raised without a message; fall back to
        # repr() so a failure can never look like "no error recorded".
        _startup_error = str(exc) or repr(exc)
        print(
            f"[trillim] Startup failed: {_startup_error}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        _ready.set()
# Run the model pull and runtime start-up off the main thread; the thread is a
# daemon so it does not keep the process alive on exit.
_startup_thread = threading.Thread(target=_start_runtime, daemon=True)
_startup_thread.start()
# ── Chat logic ────────────────────────────────────────────────────────────────
def _wait_or_raise(timeout: float = 600.0) -> None:
    """Block until the runtime is ready, or surface a clear error.

    Args:
        timeout: Maximum number of seconds to wait for ``_ready`` to be set.

    Raises:
        RuntimeError: If ``_ready`` is not set within ``timeout`` seconds, or
            if startup finished and recorded an error in ``_startup_error``.
    """
    if not _ready.wait(timeout=timeout):
        # Report the timeout that was actually used (600 s renders as
        # "10 minutes") instead of a fixed figure.
        raise RuntimeError(
            f"Trillim runtime did not become ready within {timeout / 60:g} minutes."
        )
    # Compare against None rather than relying on truthiness: an empty error
    # string still means that startup failed.
    if _startup_error is not None:
        raise RuntimeError(
            f"Trillim startup error: {_startup_error or 'unknown error'}"
        )
def _message_text(content) -> str:
    """Flatten a chat message ``content`` value into a plain string.

    Accepts a plain string, a single content block (a dict with a ``"text"``
    entry), or a list/tuple of either. Blocks without text contribute nothing.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        parts = (_message_text(part) for part in content)
        return "\n".join(part for part in parts if part)
    return str(content)


def chat_fn(
    message: str,
    history: list[dict],  # Gradio 6: always [{"role":…, "content":…}, …]
    system_prompt: str,
    temperature: float,
    max_new_tokens: int,
):
    """Streaming chat handler — yields partial assistant strings.

    Args:
        message: The new user message.
        history: Earlier turns as dicts with ``"role"`` and ``"content"``.
        system_prompt: Optional system prompt; ignored when blank.
        temperature: Sampling temperature forwarded to the model.
        max_new_tokens: Token limit forwarded to the model as ``max_tokens``.

    Yields:
        The assistant reply accumulated so far, once per received token event.

    Raises:
        RuntimeError: If the runtime is not ready in time or failed to start.
    """
    _wait_or_raise()
    # Imported lazily so that merely importing this module does not require
    # the trillim package to be importable yet.
    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent

    messages: list[dict] = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    # Forward only role + plain-text content. History entries coming from the
    # UI may carry extra keys or structured content blocks, which are reduced
    # to a simple string here.
    for turn in history or []:
        if not isinstance(turn, dict) or not turn.get("role"):
            continue
        messages.append(
            {"role": turn["role"], "content": _message_text(turn.get("content"))}
        )
    messages.append({"role": "user", "content": _message_text(message)})

    partial = ""
    for event in _runtime.llm.stream_chat(
        messages,
        temperature=float(temperature),
        # Sliders may deliver floats; a token count has to be an integer.
        max_tokens=int(max_new_tokens),
    ):
        if isinstance(event, ChatTokenEvent):
            partial += event.text
            yield partial
        elif isinstance(event, ChatDoneEvent):
            break
# ── Gradio 6 UI ───────────────────────────────────────────────────────────────
# Markdown header shown above the chat; interpolates the model id, thread
# count and token timeout configured earlier in this module.
DESCRIPTION = f"""
## 🧠 Trillim Chat
Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI.
Model: **{MODEL_ID}** · Threads: **{_NUM_THREADS}** · Token timeout: **{int(_PROGRESS_TIMEOUT)} s**
> ⏳ The model loads in the background. If you send a message while it's still loading,
> the request will wait automatically — no need to refresh.
"""
# Page layout: description header, the chat interface itself, and a footer.
with gr.Blocks(title="Trillim Chat") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=chat_fn,
        chatbot=gr.Chatbot(
            elem_id="chatbot",  # targeted by the CSS passed to launch()
            show_label=False,
            render_markdown=True,
        ),
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
        # Order matters: these values are passed to chat_fn positionally after
        # (message, history) as system_prompt, temperature, max_new_tokens.
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful, concise assistant.",
                label="System prompt",
                lines=2,
            ),
            gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature"),
            gr.Slider(64, 2048, value=256, step=64, label="Max new tokens"),
        ],
        title=None,
        submit_btn="Send",
        stop_btn="Stop",
    )
    gr.Markdown(
        "---\n"
        "Built with [Trillim](https://github.com/Trillim/Trillim) · "
        "[Gradio](https://gradio.app) · Runs 100 % on CPU."
    )
if __name__ == "__main__":
    # Enable the request queue first, then serve on all interfaces, port 7860.
    queued_app = demo.queue()
    soft_theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
    )
    # Fixes the chatbot panel height and hides the page footer.
    custom_css = "#chatbot { height: 520px; } footer { display: none !important; }"
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        theme=soft_theme,
        css=custom_css,
    )