Spaces:

wop
/

Trillim

Paused

App Files Files Community

Trillim / app.py

wop

Update app.py

a20d278 verified 24 days ago

raw

history blame contribute delete

6.5 kB

	"""
	Trillim Chat — Gradio 6 front-end for Trillim CPU inference.

	Startup flow:
	1. Monkey-patch TOKEN_PROGRESS_TIMEOUT_SECONDS before the LLM starts,
	   so slow CPU inference doesn't time out after only 5 s.
	2. Pull the model from the Trillim HF namespace (no-op if already cached).
	3. Start the Trillim LLM component via Runtime.
	4. Serve the Gradio chat UI on port 7860.
	"""

	import os
	import shutil
	import subprocess
	import sys
	import threading
	from pathlib import Path

	import gradio as gr

	# ── Model to use ──────────────────────────────────────────────────────────────
	MODEL_ID = "Trillim/BitNet-TRNQ"

	# ── Patch the inference-engine progress timeout BEFORE importing LLM ──────────
	# The default is 5 s — far too short for a slow CPU Space.
	# We patch the constant on the public module so LLM.__init__ picks it up.
	_PROGRESS_TIMEOUT = 120.0 # seconds to wait for the first / next token

	try:
	import trillim.components.llm.public as _llm_pub
	_llm_pub.TOKEN_PROGRESS_TIMEOUT_SECONDS = _PROGRESS_TIMEOUT
	print(
	f"[trillim] progress_timeout patched → {_PROGRESS_TIMEOUT} s",
	flush=True,
	)
	except Exception as _patch_err:
	print(f"[trillim] WARNING: could not patch timeout: {_patch_err}", flush=True)

	# ── Global runtime handle ─────────────────────────────────────────────────────
	_runtime = None
	_ready = threading.Event()
	_startup_error: str \| None = None

	# Use all available CPUs; Trillim default (0) sometimes under-uses them.
	_NUM_THREADS = os.cpu_count() or 2


	def _pull_model() -> None:
	    """Fetch the model bundle by running the ``trillim pull`` CLI command.

	    The executable is looked up on PATH first; if it is not found there, a
	    ``trillim`` entry next to the running Python interpreter is used.

	    Raises:
	        RuntimeError: if the pull command exits with a non-zero status.
	    """
	    cli_path = shutil.which("trillim")
	    if not cli_path:
	        cli_path = str(Path(sys.executable).parent / "trillim")
	    print(f"[trillim] Pulling {MODEL_ID} using '{cli_path}' …", flush=True)

	    # Output is deliberately not captured, so the CLI writes straight to this
	    # process's own stdout/stderr.
	    exit_code = subprocess.run(
	        [cli_path, "pull", MODEL_ID],
	        capture_output=False,
	    ).returncode
	    if exit_code:
	        raise RuntimeError(f"trillim pull exited with code {exit_code}")

	    print("[trillim] Pull complete.", flush=True)


	def _start_runtime() -> None:
	    """Background-thread entry point: pull the model, then start the Runtime.

	    On success the entered Runtime is published in the module-level
	    ``_runtime``. On failure a non-empty description of the exception is
	    stored in ``_startup_error``. ``_ready`` is set in both cases so that
	    waiters are released as soon as startup has finished either way.
	    """
	    global _runtime, _startup_error
	    try:
	        _pull_model()

	        # Imported here, after the pull has finished, rather than at module
	        # top level.
	        from trillim import LLM, Runtime

	        print(
	            f"[trillim] Starting Runtime with {MODEL_ID} "
	            f"(threads={_NUM_THREADS}, timeout={_PROGRESS_TIMEOUT}s) …",
	            flush=True,
	        )
	        runtime = Runtime(LLM(MODEL_ID, num_threads=_NUM_THREADS))
	        runtime.__enter__()
	        # Publish the handle only once __enter__ has succeeded, so the global
	        # never refers to a Runtime that failed to start.
	        _runtime = runtime
	        print("[trillim] Runtime ready.", flush=True)
	    except Exception as exc:
	        # str(exc) is empty for exceptions raised without a message; fall back
	        # to repr() so the recorded error is never an empty (falsy) string.
	        _startup_error = str(exc) or repr(exc)
	        print(
	            f"[trillim] Startup failed: {_startup_error}",
	            file=sys.stderr,
	            flush=True,
	        )
	    finally:
	        _ready.set()


	# Kick off model download and runtime start in a daemon thread at import time.
	_startup_thread = threading.Thread(target=_start_runtime, daemon=True)
	_startup_thread.start()


	# ── Chat logic ────────────────────────────────────────────────────────────────

	def _wait_or_raise(timeout: float = 600.0) -> None:
	    """Block until startup has finished, then raise if it did not succeed.

	    Args:
	        timeout: Maximum number of seconds to wait for ``_ready`` to be set.

	    Raises:
	        RuntimeError: if startup does not finish within ``timeout`` seconds,
	            if startup recorded an error, or if startup finished without
	            producing a runtime.
	    """
	    if not _ready.wait(timeout=timeout):
	        # Report the timeout that was actually used, not a fixed figure.
	        minutes = timeout / 60
	        unit = "minute" if minutes == 1 else "minutes"
	        raise RuntimeError(
	            f"Trillim runtime did not become ready within {minutes:g} {unit}."
	        )
	    # Compare against None so that an empty error string still counts as a
	    # failure.
	    if _startup_error is not None:
	        raise RuntimeError(f"Trillim startup error: {_startup_error}")
	    # Startup can finish without recording an error and without a runtime
	    # (for example when the startup thread is ended by a non-Exception
	    # BaseException); fail clearly instead of with an AttributeError later.
	    if _runtime is None:
	        raise RuntimeError("Trillim startup finished without creating a runtime.")


	def _plain_text(content) -> str:
	    """Flatten a chat message's content into one plain string.

	    Strings are returned unchanged. A dict contributes its ``"text"`` entry
	    when that entry is a string, and nothing otherwise. Lists and tuples are
	    flattened part by part and concatenated. ``None`` becomes an empty string.
	    """
	    if isinstance(content, str):
	        return content
	    if isinstance(content, dict):
	        text = content.get("text")
	        return text if isinstance(text, str) else ""
	    if isinstance(content, (list, tuple)):
	        return "".join(_plain_text(part) for part in content)
	    return "" if content is None else str(content)


	def chat_fn(
	    message: str,
	    history: list[dict],  # Gradio 6: always [{"role":…, "content":…}, …]
	    system_prompt: str,
	    temperature: float,
	    max_new_tokens: int,
	):
	    """Streaming chat handler: yield the growing assistant reply.

	    Args:
	        message: The new user message.
	        history: Earlier turns as role/content dicts.
	        system_prompt: Optional system prompt; ignored when blank.
	        temperature: Sampling temperature forwarded to ``stream_chat``.
	        max_new_tokens: Forwarded to ``stream_chat`` as ``max_tokens``.

	    Yields:
	        The accumulated assistant text after each token event.

	    Raises:
	        RuntimeError: propagated from ``_wait_or_raise`` when the runtime is
	            not available.
	    """
	    _wait_or_raise()

	    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent

	    messages: list[dict] = []
	    system_text = (system_prompt or "").strip()
	    if system_text:
	        messages.append({"role": "system", "content": system_text})

	    # Forward only role and plain-text content. Gradio history entries may
	    # carry extra keys or list-structured content.
	    # NOTE(review): assumes the backend expects string content — confirm
	    # against the Trillim API.
	    for turn in history or []:
	        messages.append(
	            {"role": turn["role"], "content": _plain_text(turn.get("content"))}
	        )
	    messages.append({"role": "user", "content": _plain_text(message)})

	    partial = ""
	    for event in _runtime.llm.stream_chat(
	        messages,
	        temperature=temperature,
	        # Cast defensively in case the slider delivers a float.
	        max_tokens=int(max_new_tokens),
	    ):
	        if isinstance(event, ChatTokenEvent):
	            partial += event.text
	            yield partial
	        elif isinstance(event, ChatDoneEvent):
	            break


	# ── Gradio 6 UI ───────────────────────────────────────────────────────────────

	# Markdown banner shown above the chat; the values are interpolated once at
	# import time.
	DESCRIPTION = f"""
	## 🧠 Trillim Chat

	Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI.
	Model: {MODEL_ID} · Threads: {_NUM_THREADS} · Token timeout: {int(_PROGRESS_TIMEOUT)} s

	> ⏳ The model loads in the background. If you send a message while it's still loading,
	> the request will wait automatically — no need to refresh.
	"""

	# Markdown footer shown below the chat.
	_FOOTER_MD = (
	    "---\n"
	    "Built with [Trillim](https://github.com/Trillim/Trillim) · "
	    "[Gradio](https://gradio.app) · Runs 100 % on CPU."
	)

	with gr.Blocks(title="Trillim Chat") as demo:
	    gr.Markdown(DESCRIPTION)

	    # Components are created in the same order as the chat interface's
	    # arguments: chat window, parameters accordion, then the extra inputs.
	    chat_window = gr.Chatbot(
	        elem_id="chatbot",
	        show_label=False,
	        render_markdown=True,
	    )
	    params_panel = gr.Accordion(label="⚙️ Parameters", open=False)
	    # The extra inputs are passed to chat_fn after (message, history), in
	    # this order: system_prompt, temperature, max_new_tokens.
	    extra_inputs = [
	        gr.Textbox(
	            value="You are a helpful, concise assistant.",
	            label="System prompt",
	            lines=2,
	        ),
	        gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature"),
	        gr.Slider(minimum=64, maximum=2048, value=256, step=64, label="Max new tokens"),
	    ]

	    gr.ChatInterface(
	        fn=chat_fn,
	        chatbot=chat_window,
	        additional_inputs_accordion=params_panel,
	        additional_inputs=extra_inputs,
	        title=None,
	        submit_btn="Send",
	        stop_btn="Stop",
	    )

	    gr.Markdown(_FOOTER_MD)


	if __name__ == "__main__":
	    # Enable the queue first, then build the launch options and serve on all
	    # interfaces at port 7860.
	    queued_app = demo.queue()
	    soft_theme = gr.themes.Soft(
	        primary_hue="indigo",
	        secondary_hue="purple",
	        neutral_hue="slate",
	    )
	    # Fix the chat window's height and hide the page footer.
	    page_css = "#chatbot { height: 520px; } footer { display: none !important; }"
	    queued_app.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        show_error=True,
	        theme=soft_theme,
	        css=page_css,
	    )