Spaces:

wop
/

Trillim

Paused

App Files Files Community

wop committed on Apr 25

Commit

387b4b6

verified ·

1 Parent(s): 467d263

Create app.py

Browse files

Files changed (1) hide show

app.py +185 -0

app.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""
+Trillim Chat — Gradio front-end for Trillim CPU inference.
+Startup flow:
+  1. Pull the model from the Trillim HF namespace (no-op if already cached).
+  2. Start the Trillim LLM component via Runtime.
+  3. Serve the Gradio chat UI on port 7860.
+"""
+import subprocess
+import sys
+import threading
+import time
+from collections.abc import Iterator
+import gradio as gr
+# ── Model to use ──────────────────────────────────────────────────────────────
+# Identifier handed both to `trillim pull` and to `LLM(...)` during startup.
+MODEL_ID = "Trillim/BitNet-TRNQ"
+# Change to e.g. "Trillim/BitNet-GenZ-TRNQ" if you want a different bundle.
+# ── Global runtime handle ─────────────────────────────────────────────────────
+# Written by the startup thread (_start_runtime); read by the chat handler.
+_runtime = None
+# Set once startup has finished, whether it succeeded or failed.
+_ready = threading.Event()
+# Description of the exception that aborted startup, or None if no failure was recorded.
+_startup_error: str | None = None
+def _pull_model() -> None:
+    """Pull the model bundle into the Trillim managed store."""
+    print(f"[trillim] Pulling {MODEL_ID} …", flush=True)
+    result = subprocess.run(
+        [sys.executable, "-m", "trillim", "pull", MODEL_ID],
+        capture_output=False,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"trillim pull failed with exit code {result.returncode}")
+    print("[trillim] Pull complete.", flush=True)
+def _start_runtime() -> None:
+    """Background thread: pull model then start the Trillim Runtime."""
+    global _runtime, _startup_error
+    try:
+        _pull_model()
+        from trillim import LLM, Runtime  # noqa: PLC0415
+        print(f"[trillim] Starting Runtime with {MODEL_ID} …", flush=True)
+        _runtime = Runtime(LLM(MODEL_ID))
+        _runtime.__enter__()          # equivalent to `with Runtime(...) as r:`
+        print("[trillim] Runtime ready.", flush=True)
+    except Exception as exc:  # noqa: BLE001
+        _startup_error = str(exc)
+        print(f"[trillim] Startup failed: {exc}", file=sys.stderr, flush=True)
+    finally:
+        _ready.set()
+# Kick off the background startup immediately (before Gradio blocks).
+# This runs at import time. daemon=True means the thread does not keep the
+# interpreter alive at shutdown.
+_thread = threading.Thread(target=_start_runtime, daemon=True)
+_thread.start()
+# ── Chat logic ────────────────────────────────────────────────────────────────
+def _wait_or_raise(timeout: float = 300.0) -> None:
+    """Block until the runtime is ready or raise if startup failed."""
+    if not _ready.wait(timeout=timeout):
+        raise RuntimeError("Trillim runtime did not become ready in time.")
+    if _startup_error:
+        raise RuntimeError(f"Trillim startup error: {_startup_error}")
+def chat_fn(
+    message: str,
+    history: list[dict],
+    system_prompt: str,
+    temperature: float,
+    max_new_tokens: int,
+) -> gr.ChatMessage:
+    """
+    Called by Gradio for every user message.
+    `history` is a list of {"role": ..., "content": ...} dicts (messages format).
+    We stream tokens back via generator so the UI updates in real time.
+    """
+    _wait_or_raise()
+    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent  # noqa: PLC0415
+    # Build the message list for this turn.
+    messages: list[dict] = []
+    if system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt.strip()})
+    messages.extend(history)
+    messages.append({"role": "user", "content": message})
+    partial = ""
+    for event in _runtime.llm.stream_chat(
+        messages,
+        temperature=temperature,
+        max_tokens=max_new_tokens,
+    ):
+        if isinstance(event, ChatTokenEvent):
+            partial += event.text
+            yield partial
+        elif isinstance(event, ChatDoneEvent):
+            break
+# ── Gradio UI ─────────────────────────────────────────────────────────────────
+DESCRIPTION = """
+## 🧠 Trillim Chat
+Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI inference.
+Model: **{model}**
+""".format(model=MODEL_ID)
+# Page layout: header markdown, the chat interface, and a footer line.
+with gr.Blocks(
+    title="Trillim Chat",
+    theme=gr.themes.Soft(
+        primary_hue="indigo",
+        secondary_hue="purple",
+        neutral_hue="slate",
+    ),
+    # Custom CSS: fix the height of the element with id "chatbot" and hide the footer element.
+    css="""
+    #chatbot { height: 520px; }
+    footer { display: none !important; }
+    """,
+) as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        # The only column in this row.
+        with gr.Column(scale=3):
+            chatbot = gr.ChatInterface(
+                fn=chat_fn,
+                type="messages",
+                # elem_id matches the "#chatbot" selector in the CSS above.
+                chatbot=gr.Chatbot(
+                    elem_id="chatbot",
+                    show_label=False,
+                    bubble_full_width=False,
+                    render_markdown=True,
+                ),
+                additional_inputs_accordion=gr.Accordion(
+                    label="⚙️ Parameters", open=False
+                ),
+                # Listed in the same order as chat_fn's extra parameters:
+                # system_prompt, temperature, max_new_tokens.
+                additional_inputs=[
+                    gr.Textbox(
+                        value="You are a helpful, concise assistant.",
+                        label="System prompt",
+                        lines=2,
+                    ),
+                    gr.Slider(
+                        minimum=0.0,
+                        maximum=2.0,
+                        value=0.7,
+                        step=0.05,
+                        label="Temperature",
+                    ),
+                    gr.Slider(
+                        minimum=64,
+                        maximum=8192,
+                        value=512,
+                        step=64,
+                        label="Max new tokens",
+                    ),
+                ],
+                title=None,
+                submit_btn="Send",
+                stop_btn="Stop",
+            )
+    # Footer line with project links.
+    gr.Markdown(
+        "---\n"
+        "Built with [Trillim](https://github.com/Trillim/Trillim) · "
+        "[Gradio](https://gradio.app) · Runs 100 % on CPU."
+    )
+if __name__ == "__main__":
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+    )