Maxwell committed on
Commit
35472a1
·
verified ·
1 Parent(s): 930ac2c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +426 -0
app.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import uuid
5
+ import asyncio
6
+ import threading
7
+ import queue as queue_mod
8
+ import gradio as gr
9
+ from fastapi import FastAPI, Request
10
+ from fastapi.responses import StreamingResponse, JSONResponse
11
+ from huggingface_hub import hf_hub_download
12
+ from llama_cpp import Llama
13
+
14
# Hugging Face access token taken from the environment; None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Local paths for the GGUF weights and the persisted memory list.
MODEL_PATH = "/tmp/lumen-dpo.gguf"
MEMORY_FILE = "/tmp/memories.json"

# Base system prompt used for every conversation.
SYSTEM_PROMPT = "You are Lumen, a helpful AI assistant made by Axion Labs."

# Model handle; stays None until _load_model() has assigned it.
llm = None

# Lock taken around every call into the model.
infer_lock = threading.Lock()
24
+
25
+
26
+ # ── Memory ────────────────────────────────────────────────────────────────────
27
+
28
def _load_memories():
    """Read the memory list from ``MEMORY_FILE``.

    Returns:
        A list of memory dicts. An empty list is returned when the file is
        missing, unreadable, not valid JSON, or does not contain a JSON list.
        Entries that are not dicts with a string ``"text"`` value are dropped
        so callers can index ``entry["text"]`` safely.
    """
    try:
        with open(MEMORY_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Best effort: a missing or corrupt file is treated as "no memories".
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return []

    if not isinstance(data, list):
        return []
    return [
        entry for entry in data
        if isinstance(entry, dict) and isinstance(entry.get("text"), str)
    ]
36
+
37
+
38
def _save_memories(memories):
    """Persist ``memories`` to ``MEMORY_FILE`` as indented JSON.

    The data is written to a uniquely named temporary file next to the target
    and then moved into place with ``os.replace``, so a failure part-way
    through never leaves a truncated memory file behind.

    Saving stays best effort: failures are reported on stdout instead of being
    raised, but they are no longer silently discarded.

    Args:
        memories: JSON-serialisable list of memory entries.
    """
    tmp_path = f"{MEMORY_FILE}.{uuid.uuid4().hex}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(memories, f, indent=2)
        os.replace(tmp_path, MEMORY_FILE)
    except (OSError, TypeError, ValueError) as exc:
        print(f"Could not save memories to {MEMORY_FILE}: {exc}")
        # Remove the partial temporary file if it was created.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
44
+
45
+
46
def get_memories():
    """Return the currently stored memory entries."""
    stored = _load_memories()
    return stored
48
+
49
+
50
def add_memory(text):
    """Append a memory entry for ``text`` and persist the list.

    The entry stores the stripped text plus a UTC timestamp under
    ``"addedAt"``. Returns the updated list of entries.
    """
    stored = _load_memories()
    entry = {
        "text": text.strip(),
        "addedAt": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    stored.append(entry)
    _save_memories(stored)
    return stored
55
+
56
+
57
def remove_all_memories():
    """Discard every stored memory by saving an empty list."""
    nothing = []
    _save_memories(nothing)
59
+
60
+
61
def remove_memory_by_index(index):
    """Delete the memory at zero-based ``index``.

    Returns True when an entry was removed and saved, False when ``index`` is
    outside the stored list.
    """
    stored = _load_memories()
    if index < 0 or index >= len(stored):
        return False
    del stored[index]
    _save_memories(stored)
    return True
68
+
69
+
70
def build_system_prompt():
    """Return ``SYSTEM_PROMPT``, followed by the stored memories as bullet notes.

    When no memories are stored the base prompt is returned unchanged.
    """
    stored = get_memories()
    if not stored:
        return SYSTEM_PROMPT
    bullet_lines = [f"- {entry['text']}" for entry in stored]
    notes = "\n".join(bullet_lines)
    return SYSTEM_PROMPT + f"\n\nPersistent notes (always keep in mind):\n{notes}"
77
+
78
+
79
def memories_display_text():
    """Render the stored memories as a 1-based numbered list for the UI."""
    stored = get_memories()
    if stored:
        numbered = [
            f"{position}. {entry['text']}"
            for position, entry in enumerate(stored, start=1)
        ]
        return "\n".join(numbered)
    return "No memories saved."
84
+
85
+
86
+ # ── Model loading ─────────────────────────────────────────────────────────────
87
+
88
def _load_model():
    """Download the GGUF weights if they are not on disk, then load them.

    On success the module-level ``llm`` is set to the loaded ``Llama``
    instance; progress is reported with ``print``.
    """
    global llm

    weights_present = os.path.exists(MODEL_PATH)
    if not weights_present:
        print("Downloading Lumen DPO model…")
        hf_hub_download(
            repo_id="RavikxxBGamin/Lumen",
            filename="lumen-dpo.gguf",
            token=HF_TOKEN,
            local_dir="/tmp",
        )

    print("Loading model…")
    loaded = Llama(model_path=MODEL_PATH, n_ctx=8192, n_threads=2, verbose=False)
    # Publish the handle only once construction has succeeded.
    llm = loaded
    print("Model ready.")
107
+
108
+
109
+ # ── FastAPI ───────────────────────────────────────────────────────────────────
110
+
111
# ASGI application that carries the REST routes declared below.
fastapi_app = FastAPI()
112
+
113
+
114
@fastapi_app.on_event("startup")
async def startup():
    """Start loading the model in the default executor without blocking startup.

    The server begins accepting requests immediately. If loading fails, the
    exception is printed rather than being lost with the discarded future.
    """
    # get_running_loop() is the supported way to get the loop inside a coroutine.
    loop = asyncio.get_running_loop()
    load_future = loop.run_in_executor(None, _load_model)

    def _report_failure(done):
        # Nobody awaits this future, so surface its exception explicitly.
        if done.cancelled():
            return
        error = done.exception()
        if error is not None:
            print(f"Model loading failed: {error!r}")

    load_future.add_done_callback(_report_failure)
118
+
119
+
120
@fastapi_app.get("/health")
def health():
    """Report ``"ready"`` once the model is loaded, ``"loading"`` before that."""
    state = "loading" if llm is None else "ready"
    return {"status": state}
123
+
124
+
125
@fastapi_app.get("/v1/memories")
def api_list_memories():
    """Return all stored memories under the ``"memories"`` key."""
    stored = get_memories()
    return {"memories": stored}
128
+
129
+
130
@fastapi_app.post("/v1/memories")
async def api_add_memory(request: Request):
    """Store a new memory from a JSON body of the form ``{"text": "..."}``.

    Returns:
        ``{"memories": [...]}`` with the updated list, or a 400 JSON error
        when the body is not valid JSON, is not an object, or has no
        non-blank string ``text``.
    """
    try:
        body = await request.json()
    except ValueError:
        # Covers malformed JSON and undecodable bytes.
        return JSONResponse({"error": "request body must be valid JSON"}, status_code=400)

    raw_text = body.get("text") if isinstance(body, dict) else None
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    if not text:
        return JSONResponse({"error": "text is required"}, status_code=400)

    updated = add_memory(text)
    return {"memories": updated}
138
+
139
+
140
@fastapi_app.delete("/v1/memories/{index}")
def api_delete_memory(index: int):
    """Delete the memory at zero-based ``index``; respond 404 if out of range."""
    removed = remove_memory_by_index(index)
    if not removed:
        return JSONResponse({"error": "index out of range"}, status_code=404)
    return {"memories": get_memories()}
145
+
146
+
147
@fastapi_app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-style chat completion endpoint, streaming or non-streaming.

    Request body fields read here: ``messages``, ``max_tokens`` (default 512),
    ``temperature`` (default 0.7), ``stream``, ``model`` (echoed back in stream
    chunks) and ``use_memories``. When no message has role ``"system"``, a
    system message is prepended; it includes the stored memories only if
    ``use_memories`` is truthy.

    Returns:
        503 while the model is loading, 400 for a malformed request, a
        server-sent-event stream when ``stream`` is truthy, otherwise the
        completion as JSON.
    """
    if llm is None:
        return JSONResponse({"error": "Model is still loading, try again in a moment."}, status_code=503)

    try:
        body = await request.json()
    except ValueError:
        return JSONResponse({"error": "request body must be valid JSON"}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({"error": "request body must be a JSON object"}, status_code=400)

    messages = body.get("messages", [])
    if not isinstance(messages, list) or not all(isinstance(m, dict) for m in messages):
        return JSONResponse({"error": "messages must be a list of objects"}, status_code=400)

    # Clients may send explicit nulls; treat them like omitted fields.
    raw_max_tokens = body.get("max_tokens")
    raw_temperature = body.get("temperature")
    try:
        max_tokens = 512 if raw_max_tokens is None else int(raw_max_tokens)
        temperature = 0.7 if raw_temperature is None else float(raw_temperature)
    except (TypeError, ValueError):
        return JSONResponse({"error": "max_tokens and temperature must be numbers"}, status_code=400)

    stream = body.get("stream", False)
    model_id = body.get("model", "lumen")
    use_memories = body.get("use_memories", False)

    sys_prompt = build_system_prompt() if use_memories else SYSTEM_PROMPT
    if not any(m.get("role") == "system" for m in messages):
        messages = [{"role": "system", "content": sys_prompt}] + messages

    if stream:
        async def event_stream():
            resp_id = "chatcmpl-" + uuid.uuid4().hex
            created = int(time.time())
            q = queue_mod.Queue(maxsize=64)
            DONE = object()
            # Set when the consumer stops reading (finished or client gone).
            cancelled = threading.Event()

            def offer(item):
                """Put ``item`` on the queue; return False once cancelled."""
                while not cancelled.is_set():
                    try:
                        q.put(item, timeout=0.5)
                        return True
                    except queue_mod.Full:
                        continue
                return False

            def produce():
                try:
                    with infer_lock:
                        for chunk in llm.create_chat_completion(
                            messages=messages,
                            max_tokens=max_tokens,
                            temperature=temperature,
                            stream=True,
                        ):
                            # Stop generating, and release the lock, as soon
                            # as nobody is reading; never block on a full queue.
                            if not offer(chunk):
                                break
                except Exception as e:
                    offer(e)
                finally:
                    if not offer(DONE):
                        # Wake a reader thread still waiting in q.get().
                        try:
                            q.put_nowait(DONE)
                        except queue_mod.Full:
                            pass

            threading.Thread(target=produce, daemon=True).start()
            try:
                while True:
                    chunk = await asyncio.to_thread(q.get)
                    if chunk is DONE:
                        break
                    if isinstance(chunk, Exception):
                        yield f"data: {json.dumps({'error': str(chunk)})}\n\n"
                        break
                    choice = chunk["choices"][0]
                    data = {
                        "id": resp_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": model_id,
                        "choices": [{
                            "index": 0,
                            "delta": choice["delta"],
                            "finish_reason": choice.get("finish_reason"),
                        }],
                    }
                    yield f"data: {json.dumps(data)}\n\n"
                yield "data: [DONE]\n\n"
            finally:
                # Also runs when the client disconnects and the generator is closed.
                cancelled.set()

        return StreamingResponse(event_stream(), media_type="text/event-stream")

    def generate():
        with infer_lock:
            return llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=False,
            )

    # Run the blocking inference off the event loop.
    result = await asyncio.to_thread(generate)
    return JSONResponse(result)
219
+
220
+
221
+ # ── Gradio chat helpers ───────────────────────────────────────────────────────
222
+
223
def user_submit(message, history):
    """Clear the input box and append the user's message to the chat history.

    Blank messages leave the history untouched. The history passed in is
    never mutated; a new list is returned when a message is added.
    """
    if message.strip():
        history = history + [{"role": "user", "content": message}]
    return "", history
227
+
228
+
229
def bot_respond(history, temperature, max_tokens):
    """Stream the assistant's reply for the Gradio chat.

    Args:
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            ``None`` is treated as empty.
        temperature: Sampling temperature, converted with ``float``.
        max_tokens: Generation limit, converted with ``int``.

    Yields:
        The history with a trailing assistant message whose content grows as
        chunks arrive. While the model is still loading, a single notice is
        yielded instead.
    """
    history = history or []
    if llm is None:
        yield history + [{"role": "assistant", "content": "Model is still loading — please wait a moment and try again."}]
        return

    messages = [{"role": "system", "content": build_system_prompt()}]
    for item in history:
        # Skip entries that cannot be turned into a chat message.
        if not isinstance(item, dict) or "role" not in item:
            continue
        content = item.get("content") or ""
        if isinstance(content, list):
            # Multi-part content: keep only the text of each dict part.
            content = " ".join(str(p.get("text") or "") for p in content if isinstance(p, dict))
        messages.append({"role": item["role"], "content": content})

    response = ""
    working_history = history + [{"role": "assistant", "content": ""}]
    with infer_lock:
        for chunk in llm.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=True,
        ):
            # A delta may lack "content" or carry None; both add nothing.
            delta = chunk["choices"][0]["delta"].get("content") or ""
            response += delta
            working_history[-1]["content"] = response
            yield working_history
256
+
257
+
258
def model_status():
    """Return the HTML status line reflecting whether the model is loaded."""
    if llm is None:
        return "<p class='status loading'>● Loading model… (first boot takes a few minutes)</p>"
    return "<p class='status ready'>● Model ready</p>"
262
+
263
+
264
def do_add_memory(text):
    """Save ``text`` as a memory unless it is blank.

    Returns an empty string (to clear the input box) and the refreshed
    memory list text.
    """
    cleaned = text.strip()
    if cleaned:
        add_memory(cleaned)
    return "", memories_display_text()
269
+
270
+
271
def do_clear_memories():
    """Remove every stored memory and return the refreshed list text."""
    remove_all_memories()
    refreshed = memories_display_text()
    return refreshed
274
+
275
+
276
+ # ── Theme & CSS ───────────────────────────────────────────────────────────────
277
+
278
# Colour palette reused across the theme overrides below.
_PAGE_BG = "#110d08"
_BLOCK_BG = "#1c1510"
_INPUT_BG = "#150f0a"
_BORDER = "#2e2218"
_ACCENT = "#cc785c"
_ACCENT_HOVER = "#b8664a"
_SECONDARY_HOVER = "#3a2c1e"
_BODY_TEXT = "#e8ddd0"
_LABEL_TEXT = "#a08060"

_BASE_THEME = gr.themes.Base(
    primary_hue=gr.themes.colors.orange,
    secondary_hue=gr.themes.colors.stone,
    neutral_hue=gr.themes.colors.stone,
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "monospace"],
)

# Light and dark variants get the same values, so the UI looks the same in both modes.
THEME = _BASE_THEME.set(
    body_background_fill=_PAGE_BG,
    body_background_fill_dark=_PAGE_BG,
    block_background_fill=_BLOCK_BG,
    block_background_fill_dark=_BLOCK_BG,
    block_border_color=_BORDER,
    block_border_color_dark=_BORDER,
    block_label_background_fill=_BLOCK_BG,
    block_label_background_fill_dark=_BLOCK_BG,
    input_background_fill=_INPUT_BG,
    input_background_fill_dark=_INPUT_BG,
    input_border_color=_BORDER,
    input_border_color_dark=_BORDER,
    button_primary_background_fill=_ACCENT,
    button_primary_background_fill_hover=_ACCENT_HOVER,
    button_primary_background_fill_dark=_ACCENT,
    button_primary_text_color="#fff",
    button_secondary_background_fill=_BORDER,
    button_secondary_background_fill_hover=_SECONDARY_HOVER,
    button_secondary_background_fill_dark=_BORDER,
    button_secondary_text_color="#d4b896",
    body_text_color=_BODY_TEXT,
    body_text_color_dark=_BODY_TEXT,
    block_label_text_color=_LABEL_TEXT,
    block_label_text_color_dark=_LABEL_TEXT,
)
310
+
311
# Custom stylesheet passed to gr.Blocks: page layout, status line, chat bubbles,
# input row, settings sliders, memory panel and footer.
CSS = """
.gradio-container { max-width: 820px !important; margin: 0 auto !important; padding: 0 12px !important; }
footer { display: none !important; }
#lumen-header { padding: 24px 0 8px; border-bottom: 1px solid #2e2218; margin-bottom: 16px; }
#lumen-header h1 { font-size: 1.6em; font-weight: 700; margin: 0 0 2px; color: #e8ddd0; letter-spacing: -0.01em; }
#lumen-header h1 span { color: #cc785c; }
#lumen-header p { color: #7a6050; margin: 0; font-size: 0.85em; }
.status { margin: 0 0 10px; font-size: 0.8em; font-weight: 500; }
.status.ready { color: #6aa87a; }
.status.loading { color: #c9994a; }
.chatbot-wrap .message.user { background: #2a1e14 !important; border: 1px solid #3a2c1e !important; }
.chatbot-wrap .message.bot { background: #1c1510 !important; border: 1px solid #2e2218 !important; }
.chatbot-wrap .message { border-radius: 8px !important; }
.input-row textarea {
background: #150f0a !important; border: 1px solid #3a2c1e !important;
border-radius: 8px !important; color: #e8ddd0 !important; resize: none !important;
}
.input-row textarea:focus { border-color: #cc785c !important; outline: none !important; }
.send-btn {
background: #cc785c !important; border: none !important;
border-radius: 8px !important; color: #fff !important;
font-size: 1.1em !important; min-width: 48px !important;
}
.send-btn:hover { background: #b8664a !important; }
.settings-row { margin: 10px 0 4px; gap: 16px; }
.settings-row label { color: #a08060 !important; font-size: 0.8em !important; }
.memory-panel { margin-top: 8px; border-top: 1px solid #2e2218; padding-top: 10px; }
.memory-panel .gr-accordion-header { color: #a08060 !important; font-size: 0.82em !important; }
.memory-list textarea {
font-size: 0.82em !important; color: #a08060 !important;
background: #110d08 !important; border: 1px solid #2e2218 !important; border-radius: 6px !important;
}
#lumen-footer { color: #4a3828; font-size: 0.75em; text-align: center; padding: 14px 0; border-top: 1px solid #2e2218; margin-top: 12px; }
#lumen-footer code { background: #1c1510; padding: 1px 5px; border-radius: 4px; color: #7a6050; }
"""
346
+
347
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
348
+
349
# Gradio front end: header, model status, chat window, input row, sampling
# settings, memory panel and footer, followed by the event wiring.
# Mis-encoded characters in the title, header and footer have been restored.
with gr.Blocks(theme=THEME, css=CSS, title="Lumen — Axion Labs") as demo:

    gr.HTML("""
<div id="lumen-header">
<h1>⚛ <span>Lumen</span></h1>
<p>Fine-tuned Llama 3.1 8B · by Axion Labs · free, no key needed</p>
</div>
""")

    # Passing the function makes Gradio compute the value when the page loads.
    status_html = gr.HTML(model_status)

    chatbot = gr.Chatbot(
        type="messages",
        height=440,
        show_copy_button=True,
        elem_classes=["chatbot-wrap"],
        label="",
        show_label=False,
        bubble_full_width=False,
    )

    with gr.Row(elem_classes=["input-row"]):
        msg_box = gr.Textbox(
            placeholder="Message Lumen…",
            show_label=False,
            scale=5,
            container=False,
            autofocus=True,
            lines=1,
            max_lines=6,
        )
        send_btn = gr.Button("↑", scale=1, variant="primary", elem_classes=["send-btn"], min_width=48)

    with gr.Row(elem_classes=["settings-row"]):
        temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature", scale=1)
        max_tokens = gr.Slider(64, 1024, value=512, step=64, label="Max tokens", scale=1)

    with gr.Accordion("Memory", open=False, elem_classes=["memory-panel"]):
        mem_display = gr.Textbox(
            value=memories_display_text,
            label="",
            lines=4,
            interactive=False,
            show_copy_button=False,
            elem_classes=["memory-list"],
            every=10,
        )
        with gr.Row():
            mem_input = gr.Textbox(placeholder="Add a memory…", show_label=False, scale=3, container=False)
            mem_add_btn = gr.Button("Save", scale=1, size="sm")
            mem_clr_btn = gr.Button("Clear all", scale=1, size="sm", variant="stop")

    gr.HTML("""
<div id="lumen-footer">
OpenAI-compatible API: <code>POST /v1/chat/completions</code>
&nbsp;·&nbsp; use with Axion CLI via <code>/model lumen</code>
</div>
""")

    # Enter in the textbox and the send button share one pipeline: append the
    # user message first, then stream the reply into the chat window.
    for send_trigger in (msg_box.submit, send_btn.click):
        send_trigger(
            user_submit, [msg_box, chatbot], [msg_box, chatbot], queue=False
        ).then(
            bot_respond, [chatbot, temperature, max_tokens], chatbot
        )

    # The save button and Enter in the memory input both add a memory.
    for save_trigger in (mem_add_btn.click, mem_input.submit):
        save_trigger(do_add_memory, [mem_input], [mem_input, mem_display])
    mem_clr_btn.click(do_clear_memories, [], [mem_display])

    demo.load(model_status, outputs=status_html)
424
+
425
+
426
# Combined ASGI app: the Gradio UI is mounted at "/" on the FastAPI application.
app = gr.mount_gradio_app(fastapi_app, demo, path="/")