Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,10 +4,8 @@ from pydantic import BaseModel
|
|
| 4 |
from llama_cpp import Llama
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from supabase import create_client
|
| 7 |
-
import os, uvicorn, threading
|
| 8 |
from contextlib import asynccontextmanager
|
| 9 |
-
from queue import Queue
|
| 10 |
-
from threading import Thread
|
| 11 |
|
| 12 |
# =========================
|
| 13 |
# CONFIG
|
|
@@ -119,17 +117,28 @@ def save_message(role, content, request_id):
|
|
| 119 |
}).execute()
|
| 120 |
|
| 121 |
# =========================
|
| 122 |
-
#
|
| 123 |
# =========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
|
|
|
|
|
|
|
|
|
| 125 |
def generate_and_stream(req: ChatRequest):
|
| 126 |
|
| 127 |
prompt = build_prompt(req.message)
|
| 128 |
|
| 129 |
-
# save user
|
| 130 |
save_message("user", req.message, req.request_id)
|
| 131 |
|
| 132 |
-
# create assistant row
|
| 133 |
res = supabase.table("messages").insert({
|
| 134 |
"role": "assistant",
|
| 135 |
"content": "",
|
|
@@ -140,36 +149,8 @@ def generate_and_stream(req: ChatRequest):
|
|
| 140 |
msg_id = res.data[0]["id"]
|
| 141 |
|
| 142 |
full_text = ""
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
# =========================
|
| 146 |
-
# 🔥 DB WRITER THREAD
|
| 147 |
-
# =========================
|
| 148 |
-
def db_writer():
|
| 149 |
-
last_sent = ""
|
| 150 |
-
|
| 151 |
-
while True:
|
| 152 |
-
chunk = q.get()
|
| 153 |
-
|
| 154 |
-
if chunk is None:
|
| 155 |
-
break
|
| 156 |
|
| 157 |
-
last_sent += chunk
|
| 158 |
-
|
| 159 |
-
try:
|
| 160 |
-
supabase.table("messages").update({
|
| 161 |
-
"content": last_sent
|
| 162 |
-
}).eq("id", msg_id).execute()
|
| 163 |
-
except:
|
| 164 |
-
pass
|
| 165 |
-
|
| 166 |
-
time.sleep(0.05) # smooth rate
|
| 167 |
-
|
| 168 |
-
Thread(target=db_writer, daemon=True).start()
|
| 169 |
-
|
| 170 |
-
# =========================
|
| 171 |
-
# 🔥 MODEL STREAM
|
| 172 |
-
# =========================
|
| 173 |
stream = model(
|
| 174 |
prompt,
|
| 175 |
max_tokens=2048,
|
|
@@ -182,29 +163,25 @@ def generate_and_stream(req: ChatRequest):
|
|
| 182 |
|
| 183 |
for chunk in stream:
|
| 184 |
token = chunk["choices"][0]["text"]
|
| 185 |
-
|
| 186 |
full_text += token
|
| 187 |
|
| 188 |
-
#
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
|
|
|
|
| 194 |
final = clean_output(full_text)
|
|
|
|
| 195 |
|
| 196 |
-
supabase.table("messages").update({
|
| 197 |
-
"content": final,
|
| 198 |
-
"status": "done"
|
| 199 |
-
}).eq("id", msg_id).execute()
|
| 200 |
-
|
| 201 |
# =========================
|
| 202 |
# CHAT
|
| 203 |
# =========================
|
| 204 |
@app.post("/v1/chat")
async def chat(req: ChatRequest):
    """Handle POST /v1/chat by starting generation on a separate thread.

    A new thread is started with ``generate_and_stream(req)`` as its target.
    The handler does not join that thread; it returns straight away.

    Args:
        req: The parsed chat request, passed unchanged to the worker.

    Returns:
        The dict ``{"status": "streaming_started"}``.
    """
    worker = threading.Thread(target=generate_and_stream, args=(req,))
    worker.start()
    return {"status": "streaming_started"}
|
|
|
|
| 4 |
from llama_cpp import Llama
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from supabase import create_client
|
| 7 |
+
import os, uvicorn, threading
|
| 8 |
from contextlib import asynccontextmanager
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# =========================
|
| 11 |
# CONFIG
|
|
|
|
| 117 |
}).execute()
|
| 118 |
|
| 119 |
# =========================
|
| 120 |
+
# SUPABASE UPDATE HELPER
|
| 121 |
# =========================
|
| 122 |
+
def update_message(msg_id, content, status=None):
    """Update one row of the "messages" table with new content.

    Args:
        msg_id: Value compared against the row's "id" column.
        content: Text written to the "content" column.
        status: When truthy, also written to the "status" column; when
            falsy (the default ``None``), "status" is left out of the update.

    An exception raised by the Supabase call is caught and printed rather
    than propagated, so a failed write does not stop the caller.
    """
    payload = (
        {"content": content, "status": status}
        if status
        else {"content": content}
    )
    try:
        supabase.table("messages").update(payload).eq("id", msg_id).execute()
    except Exception as exc:
        print(f"Supabase update failed: {exc}")
|
| 130 |
|
| 131 |
+
# =========================
|
| 132 |
+
# STREAMING GENERATION
|
| 133 |
+
# =========================
|
| 134 |
def generate_and_stream(req: ChatRequest):
|
| 135 |
|
| 136 |
prompt = build_prompt(req.message)
|
| 137 |
|
| 138 |
+
# save user message
|
| 139 |
save_message("user", req.message, req.request_id)
|
| 140 |
|
| 141 |
+
# create empty assistant row
|
| 142 |
res = supabase.table("messages").insert({
|
| 143 |
"role": "assistant",
|
| 144 |
"content": "",
|
|
|
|
| 149 |
msg_id = res.data[0]["id"]
|
| 150 |
|
| 151 |
full_text = ""
|
| 152 |
+
word_count = 0 # ✅ track completed words
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
stream = model(
|
| 155 |
prompt,
|
| 156 |
max_tokens=2048,
|
|
|
|
| 163 |
|
| 164 |
for chunk in stream:
|
| 165 |
token = chunk["choices"][0]["text"]
|
|
|
|
| 166 |
full_text += token
|
| 167 |
|
| 168 |
+
# ✅ only update Supabase when a full word is completed (space or newline found)
|
| 169 |
+
# this reduces DB calls from ~60 per response to ~10
|
| 170 |
+
if " " in token or "\n" in token:
|
| 171 |
+
word_count += 1
|
| 172 |
+
update_message(msg_id, full_text.strip())
|
| 173 |
|
| 174 |
+
# ✅ final clean + mark done
|
| 175 |
final = clean_output(full_text)
|
| 176 |
+
update_message(msg_id, final, status="done")
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
# =========================
|
| 179 |
# CHAT
|
| 180 |
# =========================
|
| 181 |
@app.post("/v1/chat")
async def chat(req: ChatRequest):
    """Handle POST /v1/chat by starting generation on a separate thread.

    A new thread is started with ``generate_and_stream(req)`` as its target.
    The handler does not join that thread, so the response goes back to the
    caller (the original comment names an ESP32) without waiting for
    generation to finish.

    Args:
        req: The parsed chat request, passed unchanged to the worker.

    Returns:
        The dict ``{"status": "streaming_started"}``.
    """
    worker = threading.Thread(target=generate_and_stream, args=(req,))
    worker.start()
    return {"status": "streaming_started"}
|