Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from supabase import create_client
-import os, uvicorn
+import os, uvicorn, threading, time
 from contextlib import asynccontextmanager
 
 # =========================
@@ -53,13 +53,13 @@ Your name is Llama and you are a cheerful friendly AI buddy made for voice conve
 Rules:
 - Always refer to yourself as Llama
 - Speak naturally like a real voice conversation with a friend
-- Use casual spoken language like
+- Use casual spoken language like hey sure yep got it
 - Answer in 1 to 2 sentences only
 - Keep answer under 30 words
--
--
--
--
+- Do not use symbols
+- Do not use abbreviations
+- Use digits instead of words
+- No new lines
 - Output plain text only
 <|eot_id|>
 <|start_header_id|>user<|end_header_id|>
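The hunk shows raw Llama 3 chat-template markers (<|eot_id|>, <|start_header_id|>), so the prompt is assembled by hand rather than through a chat API. build_prompt itself is outside the visible hunks; below is a minimal sketch of what it plausibly looks like, assuming the rules above live in a SYSTEM_PROMPT string (that name is an assumption, not from the diff):

def build_prompt(user_message: str) -> str:
    # Sketch only: reconstructs the Llama 3 instruct template implied by
    # the markers in this hunk. SYSTEM_PROMPT is assumed to hold the
    # rules text shown above. Ending at the assistant header leaves the
    # model to generate the reply.
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        + SYSTEM_PROMPT + "<|eot_id|>"
        + "<|start_header_id|>user<|end_header_id|>\n\n"
        + user_message + "<|eot_id|>"
        + "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )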
@@ -117,17 +117,16 @@ def save_message(role, content, request_id):
     }).execute()
 
 # =========================
-#
+# 🔥 STREAMING GENERATION
 # =========================
-
-async def chat(req: ChatRequest):
+def generate_and_stream(req: ChatRequest):
 
     prompt = build_prompt(req.message)
 
-    #
+    # save user
     save_message("user", req.message, req.request_id)
 
-    #
+    # create assistant row
     res = supabase.table("messages").insert({
         "role": "assistant",
         "content": "",
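save_message(role, content, request_id) appears only in the hunk header; given the assistant-row insert right below it, a plausible sketch follows. Column names mirror the diff; the status value for user messages is an assumption:

def save_message(role, content, request_id):
    # Sketch: same messages table and columns as the insert in this
    # hunk; marking user messages "done" immediately is an assumed
    # convention, since only "done" and "waiting" appear in this diff.
    supabase.table("messages").insert({
        "role": role,
        "content": content,
        "request_id": request_id,
        "status": "done"
    }).execute()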
@@ -137,9 +136,7 @@ async def chat(req: ChatRequest):
 
     msg_id = res.data[0]["id"]
 
-    # ✅ Stream tokens and update same row every 3 tokens
     full_text = ""
-    buffer_count = 0
 
     stream = model(
         prompt,
@@ -153,25 +150,34 @@ async def chat(req: ChatRequest):
 
     for chunk in stream:
         token = chunk["choices"][0]["text"]
+
         full_text += token
-        buffer_count += 1
 
-        #
-
-
-
-
-
+        # 🔥 REAL STREAMING UPDATE
+        supabase.table("messages").update({
+            "content": full_text
+        }).eq("id", msg_id).execute()
+
+        time.sleep(0.05)  # 🔥 critical for visible streaming
 
-    #
-
+    # final clean
+    final = clean_output(full_text)
 
     supabase.table("messages").update({
-        "content":
+        "content": final,
         "status": "done"
     }).eq("id", msg_id).execute()
 
-
+# =========================
+# CHAT
+# =========================
+@app.post("/v1/chat")
+async def chat(req: ChatRequest):
+
+    # 🔥 run in background
+    threading.Thread(target=generate_and_stream, args=(req,)).start()
+
+    return {"status": "streaming_started"}
 
 # =========================
 # GET RESPONSE
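Note the cost of the new loop: one Supabase round trip per generated token, plus a 50 ms sleep. The removed code apparently batched instead (its deleted comment read "Stream tokens and update same row every 3 tokens" and it kept a buffer_count). A sketch of that batched variant, reconstructed from those hints only, trading smoothness for far fewer HTTP calls:

    buffer_count = 0
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        full_text += token
        buffer_count += 1

        # Flush to Supabase every 3 tokens instead of every token
        if buffer_count >= 3:
            supabase.table("messages").update({
                "content": full_text
            }).eq("id", msg_id).execute()
            buffer_count = 0

    # Trailing tokens still reach the row via the existing
    # "status": "done" update after the loop.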
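A bare threading.Thread works on a single-worker Space, but FastAPI's built-in BackgroundTasks is a common alternative: it schedules the work after the response is sent and runs sync callables in the worker thread pool, so generate_and_stream would need no changes. A minimal sketch:

from fastapi import BackgroundTasks

@app.post("/v1/chat")
async def chat(req: ChatRequest, background_tasks: BackgroundTasks):
    # FastAPI runs the task once the response has gone out; sync
    # functions are executed in the thread pool automatically.
    background_tasks.add_task(generate_and_stream, req)
    return {"status": "streaming_started"}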
@@ -192,7 +198,7 @@ def get_response(request_id: str):
     if data:
         return {
             "response": data[0]["content"],
-            "status": data[0]["status"]
+            "status": data[0]["status"]
         }
     else:
         return {"response": None, "status": "waiting"}
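Because /v1/chat returns immediately, a client "streams" by polling get_response until status flips to done. The route decorator for get_response is outside the visible hunks, so the URL below is an assumption; a hypothetical client loop:

import time, requests

def poll_response(base_url: str, request_id: str) -> str:
    # Hypothetical endpoint path: the @app.get route for get_response
    # is not shown in this diff. Prints only the newly grown suffix of
    # the content on each poll.
    shown = 0
    while True:
        r = requests.get(f"{base_url}/v1/response/{request_id}").json()
        text = r["response"] or ""
        print(text[shown:], end="", flush=True)
        shown = len(text)
        if r["status"] == "done":
            return text
        time.sleep(0.2)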
@@ -205,7 +211,7 @@ def get_response(request_id: str):
 # =========================
 @app.get("/")
 def root():
-    return {"status": "LLaMA API running
+    return {"status": "LLaMA API running"}
 
 # =========================
 # RUN