wop committed on
Commit
387b4b6
·
verified ·
1 Parent(s): 467d263

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -0
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Trillim Chat — Gradio front-end for Trillim CPU inference.
3
+
4
+ Startup flow:
5
+ 1. Pull the model from the Trillim HF namespace (no-op if already cached).
6
+ 2. Start the Trillim LLM component via Runtime.
7
+ 3. Serve the Gradio chat UI on port 7860.
8
+ """
9
+
10
import subprocess
import sys
import threading
import time
from collections.abc import Iterator

import gradio as gr
16
+
17
+ # ── Model to use ──────────────────────────────────────────────────────────────
18
+ MODEL_ID = "Trillim/BitNet-TRNQ"
19
+ # Change to e.g. "Trillim/BitNet-GenZ-TRNQ" if you want a different bundle.
20
+
21
+ # ── Global runtime handle ─────────────────────────────────────────────────────
22
+ _runtime = None
23
+ _ready = threading.Event()
24
+ _startup_error: str | None = None
25
+
26
+
27
def _pull_model() -> None:
    """Fetch the ``MODEL_ID`` bundle by running ``python -m trillim pull``.

    The child process is started with the current interpreter and its output
    is not captured, so it goes straight to this process's stdout/stderr.

    Raises:
        RuntimeError: If the pull command exits with a non-zero status.
    """
    print(f"[trillim] Pulling {MODEL_ID} …", flush=True)
    pull_cmd = [sys.executable, "-m", "trillim", "pull", MODEL_ID]
    exit_code = subprocess.run(pull_cmd, capture_output=False).returncode
    if exit_code:
        raise RuntimeError(f"trillim pull failed with exit code {exit_code}")
    print("[trillim] Pull complete.", flush=True)
37
+
38
+
39
def _start_runtime() -> None:
    """Pull the model, then start the Trillim Runtime (background-thread target).

    On success the entered runtime is published in the module-level
    ``_runtime``. On failure a non-empty description of the exception is
    stored in ``_startup_error`` and echoed to stderr. ``_ready`` is set in
    both cases so that callers blocked on it are released.
    """
    global _runtime, _startup_error
    try:
        _pull_model()

        # Imported only after the pull step, keeping the original startup order.
        from trillim import LLM, Runtime  # noqa: PLC0415

        print(f"[trillim] Starting Runtime with {MODEL_ID} …", flush=True)
        runtime = Runtime(LLM(MODEL_ID))
        # Entered by hand (equivalent to `with Runtime(...) as r:`) because the
        # runtime has to outlive this function; nothing in the visible code
        # calls __exit__, so it stays open for the life of the process.
        runtime.__enter__()
        # Publish the handle only once entering has succeeded, so the global
        # never refers to a runtime that failed to start.
        _runtime = runtime
        print("[trillim] Runtime ready.", flush=True)
    except Exception as exc:  # noqa: BLE001
        # str(exc) is "" for exceptions raised without a message; fall back to
        # repr() so the recorded error is never an empty, falsy string that a
        # truthiness check would mistake for "no error".
        _startup_error = str(exc) or repr(exc)
        print(
            f"[trillim] Startup failed: {_startup_error}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        _ready.set()
56
+
57
+
58
# Begin model startup in a daemon thread at import time, before Gradio blocks,
# so pulling and loading the model proceed in the background.
_thread = threading.Thread(daemon=True, target=_start_runtime)
_thread.start()
61
+
62
+
63
+ # ── Chat logic ────────────────────────────────────────────────────────────────
64
+
65
def _wait_or_raise(timeout: float = 300.0) -> None:
    """Block until startup has finished, raising if it failed or timed out.

    Args:
        timeout: Maximum number of seconds to wait for ``_ready`` to be set.

    Raises:
        RuntimeError: If ``_ready`` is not set within ``timeout`` seconds, or
            if startup recorded an error in ``_startup_error``.
    """
    if not _ready.wait(timeout=timeout):
        raise RuntimeError("Trillim runtime did not become ready in time.")
    # Compare against None rather than testing truthiness: an exception with
    # no message is recorded as "", which must still count as a failure.
    if _startup_error is not None:
        raise RuntimeError(f"Trillim startup error: {_startup_error}")
71
+
72
+
73
def chat_fn(
    message: str,
    history: list[dict],
    system_prompt: str,
    temperature: float,
    max_new_tokens: int,
) -> Iterator[str]:
    """Stream the assistant's reply to one user message.

    Called by Gradio for every user message. This is a generator: it yields
    the reply accumulated so far after each token event, so the UI updates in
    real time.

    Args:
        message: The new user message.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts
            (Gradio "messages" format).
        system_prompt: Optional system prompt; skipped when blank.
        temperature: Sampling temperature forwarded to ``stream_chat``.
        max_new_tokens: Forwarded to ``stream_chat`` as ``max_tokens``.

    Yields:
        The partial reply text, growing by one token event at a time.

    Raises:
        RuntimeError: If startup failed or timed out, or if no runtime handle
            is available.
    """
    _wait_or_raise()
    # Fail with a clear message instead of an AttributeError on None if the
    # runtime handle was never published.
    if _runtime is None:
        raise RuntimeError("Trillim runtime is not available.")

    from trillim.components.llm import ChatDoneEvent, ChatTokenEvent  # noqa: PLC0415

    # Build the message list for this turn.
    messages: list[dict] = []
    system_text = system_prompt.strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    # Forward only role/content. NOTE(review): Gradio's messages-format history
    # entries may carry extra keys (e.g. metadata) — confirm whether
    # stream_chat would tolerate them; dropping them is harmless either way.
    messages.extend(
        {"role": turn["role"], "content": turn["content"]} for turn in history
    )
    messages.append({"role": "user", "content": message})

    partial = ""
    for event in _runtime.llm.stream_chat(
        messages,
        temperature=temperature,
        # The value comes from a UI slider and may arrive as a float.
        max_tokens=int(max_new_tokens),
    ):
        if isinstance(event, ChatTokenEvent):
            partial += event.text
            yield partial
        elif isinstance(event, ChatDoneEvent):
            break
108
+
109
+
110
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
111
+
112
# Markdown header shown at the top of the page; names the model being served.
DESCRIPTION = f"""
## 🧠 Trillim Chat

Powered by [Trillim](https://trillim.com) — privacy-first, CPU-native local AI inference.
Model: **{MODEL_ID}**
"""
118
+
119
# Page layout: Markdown header, the chat interface with its parameter inputs,
# and a footer line.
with gr.Blocks(
    title="Trillim Chat",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
    ),
    css="""
    #chatbot { height: 520px; }
    footer { display: none !important; }
    """,
) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=3):
            # Components built inline here are created inside the Blocks
            # context and would be rendered on the spot, outside the chat
            # interface. render=False defers rendering to ChatInterface, so
            # the chatbot is placed in its layout and the extra inputs end up
            # inside the parameters accordion.
            chatbot = gr.ChatInterface(
                fn=chat_fn,
                type="messages",
                chatbot=gr.Chatbot(
                    elem_id="chatbot",
                    show_label=False,
                    bubble_full_width=False,
                    render_markdown=True,
                    render=False,
                ),
                additional_inputs_accordion=gr.Accordion(
                    label="⚙️ Parameters",
                    open=False,
                    render=False,
                ),
                # Order must match chat_fn's extra parameters:
                # system_prompt, temperature, max_new_tokens.
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful, concise assistant.",
                        label="System prompt",
                        lines=2,
                        render=False,
                    ),
                    gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=0.7,
                        step=0.05,
                        label="Temperature",
                        render=False,
                    ),
                    gr.Slider(
                        minimum=64,
                        maximum=8192,
                        value=512,
                        step=64,
                        label="Max new tokens",
                        render=False,
                    ),
                ],
                title=None,
                submit_btn="Send",
                stop_btn="Stop",
            )

    gr.Markdown(
        "---\n"
        "Built with [Trillim](https://github.com/Trillim/Trillim) · "
        "[Gradio](https://gradio.app) · Runs 100 % on CPU."
    )
178
+
179
+
180
# Script entry point: enable request queuing, then serve on all interfaces at
# port 7860 with errors surfaced in the UI.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.queue().launch(**launch_options)