Spaces:

Valtry
/

AI-Machine

Sleeping

App Files Files Community

Valtry committed 28 days ago

Commit

926595a

verified ·

1 Parent(s): c7f5553

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -47

app.py CHANGED Viewed

@@ -4,10 +4,8 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from supabase import create_client
-import os, uvicorn, threading, time
 from contextlib import asynccontextmanager
-from queue import Queue
-from threading import Thread
 # =========================
 # CONFIG
@@ -119,17 +117,28 @@ def save_message(role, content, request_id):
     }).execute()
 # =========================
-# 🔥 STREAMING GENERATION
 # =========================
 def generate_and_stream(req: ChatRequest):
     prompt = build_prompt(req.message)
-    # save user
     save_message("user", req.message, req.request_id)
-    # create assistant row
     res = supabase.table("messages").insert({
         "role": "assistant",
         "content": "",
@@ -140,36 +149,8 @@ def generate_and_stream(req: ChatRequest):
     msg_id = res.data[0]["id"]
     full_text = ""
-    q = Queue()
-    # =========================
-    # 🔥 DB WRITER THREAD
-    # =========================
-    def db_writer():
-        last_sent = ""
-        while True:
-            chunk = q.get()
-            if chunk is None:
-                break
-            last_sent += chunk
-            try:
-                supabase.table("messages").update({
-                    "content": last_sent
-                }).eq("id", msg_id).execute()
-            except:
-                pass
-            time.sleep(0.05)  # smooth rate
-    Thread(target=db_writer, daemon=True).start()
-    # =========================
-    # 🔥 MODEL STREAM
-    # =========================
     stream = model(
         prompt,
         max_tokens=2048,
@@ -182,29 +163,25 @@ def generate_and_stream(req: ChatRequest):
     for chunk in stream:
         token = chunk["choices"][0]["text"]
         full_text += token
-        # 🔥 send to queue instead of direct DB write
-        q.put(token)
-    # stop writer
-    q.put(None)
     final = clean_output(full_text)
-    supabase.table("messages").update({
-        "content": final,
-        "status": "done"
-    }).eq("id", msg_id).execute()
 # =========================
 # CHAT
 # =========================
 @app.post("/v1/chat")
 async def chat(req: ChatRequest):  # POST /v1/chat handler: starts generation in a thread and replies right away
-    # 🔥 run in background
     threading.Thread(target=generate_and_stream, args=(req,)).start()  # the thread is started and not joined here
     return {"status": "streaming_started"}  # sent back as soon as the thread has been started

 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from supabase import create_client
+import os, uvicorn, threading
 from contextlib import asynccontextmanager
 # =========================
 # CONFIG
     }).execute()
 # =========================
+# SUPABASE UPDATE HELPER
 # =========================
+def update_message(msg_id, content, status=None):  # write new content (and optionally a status) to one "messages" row
+    data = {"content": content}
+    if status:  # "status" is only included when a truthy value is passed
+        data["status"] = status
+    try:
+        supabase.table("messages").update(data).eq("id", msg_id).execute()  # targets the row whose "id" equals msg_id
+    except Exception as e:
+        print(f"Supabase update failed: {e}")  # best-effort: the failure is printed and not re-raised
+# =========================
+# STREAMING GENERATION
+# =========================
 def generate_and_stream(req: ChatRequest):
     prompt = build_prompt(req.message)
+    # save user message
     save_message("user", req.message, req.request_id)
+    # create empty assistant row
     res = supabase.table("messages").insert({
         "role": "assistant",
         "content": "",
     msg_id = res.data[0]["id"]
     full_text = ""
+    word_count = 0          # ✅ track completed words
     stream = model(
         prompt,
         max_tokens=2048,
     for chunk in stream:
         token = chunk["choices"][0]["text"]
         full_text += token
+        # ✅ only update Supabase when a full word is completed (space found)
+        # this reduces DB calls from ~60 per response to ~10
+        if " " in token or "\n" in token:
+            word_count += 1
+            update_message(msg_id, full_text.strip())
+    # ✅ final clean + mark done
     final = clean_output(full_text)
+    update_message(msg_id, final, status="done")
 # =========================
 # CHAT
 # =========================
 @app.post("/v1/chat")
 async def chat(req: ChatRequest):  # POST /v1/chat handler: starts generation in a thread and replies right away
+    # run generation in background — return instantly to ESP32
     threading.Thread(target=generate_and_stream, args=(req,)).start()  # the thread is started and not joined here
     return {"status": "streaming_started"}  # sent back as soon as the thread has been started