Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from supabase import create_client
-import os, uvicorn
+import os, uvicorn, threading, time
 from contextlib import asynccontextmanager
 
 # =========================
@@ -53,13 +53,13 @@ Your name is Llama and you are a cheerful friendly AI buddy made for voice conve
 Rules:
 - Always refer to yourself as Llama
 - Speak naturally like a real voice conversation with a friend
-- Use casual spoken language like
+- Use casual spoken language like hey sure yep got it
 - Answer in 1 to 2 sentences only
 - Keep answer under 30 words
--
--
--
--
+- Do not use symbols
+- Do not use abbreviations
+- Use digits instead of words
+- No new lines
 - Output plain text only
 <|eot_id|>
 <|start_header_id|>user<|end_header_id|>
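The hunk shows raw Llama 3 chat-template markers (<|eot_id|>, <|start_header_id|>), so the prompt is assembled by hand rather than through a chat API. build_prompt itself is outside the visible hunks; below is a minimal sketch of what it plausibly looks like, assuming the rules above live in a SYSTEM_PROMPT string (that name is an assumption, not from the diff):

def build_prompt(user_message: str) -> str:
    # Sketch only: reconstructs the Llama 3 instruct template implied by
    # the markers in this hunk. SYSTEM_PROMPT is assumed to hold the
    # rules text shown above. Ending at the assistant header leaves the
    # model to generate the reply.
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        + SYSTEM_PROMPT + "<|eot_id|>"
        + "<|start_header_id|>user<|end_header_id|>\n\n"
        + user_message + "<|eot_id|>"
        + "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )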
@@ -117,17 +117,16 @@ def save_message(role, content, request_id):
     }).execute()
 
 # =========================
-#
+# 🔥 STREAMING GENERATION
 # =========================
-
-async def chat(req: ChatRequest):
+def generate_and_stream(req: ChatRequest):
 
     prompt = build_prompt(req.message)
 
-    #
+    # save user
     save_message("user", req.message, req.request_id)
 
-    #
+    # create assistant row
     res = supabase.table("messages").insert({
         "role": "assistant",
         "content": "",
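save_message(role, content, request_id) appears only in the hunk header; given the assistant-row insert right below it, a plausible sketch follows. Column names mirror the diff; the status value for user messages is an assumption:

def save_message(role, content, request_id):
    # Sketch: same messages table and columns as the insert in this
    # hunk; marking user messages "done" immediately is an assumed
    # convention, since only "done" and "waiting" appear in this diff.
    supabase.table("messages").insert({
        "role": role,
        "content": content,
        "request_id": request_id,
        "status": "done"
    }).execute()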
@@ -137,9 +136,7 @@ async def chat(req: ChatRequest):
 
     msg_id = res.data[0]["id"]
 
-    # ✅ Stream tokens and update same row every 3 tokens
     full_text = ""
-    buffer_count = 0
 
     stream = model(
         prompt,
@@ -153,25 +150,34 @@ async def chat(req: ChatRequest):
 
     for chunk in stream:
         token = chunk["choices"][0]["text"]
+
         full_text += token
-        buffer_count += 1
 
-        #
-
-
-
-
-
+        # 🔥 REAL STREAMING UPDATE
+        supabase.table("messages").update({
+            "content": full_text
+        }).eq("id", msg_id).execute()
+
+        time.sleep(0.05)  # 🔥 critical for visible streaming
 
-    #
-
+    # final clean
+    final = clean_output(full_text)
 
     supabase.table("messages").update({
-        "content":
+        "content": final,
         "status": "done"
     }).eq("id", msg_id).execute()
 
-
+# =========================
+# CHAT
+# =========================
+@app.post("/v1/chat")
+async def chat(req: ChatRequest):
+
+    # 🔥 run in background
+    threading.Thread(target=generate_and_stream, args=(req,)).start()
+
+    return {"status": "streaming_started"}
 
 # =========================
 # GET RESPONSE
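Note the cost of the new loop: one Supabase round trip per generated token, plus a 50 ms sleep. The removed code apparently batched instead (its deleted comment read "Stream tokens and update same row every 3 tokens" and it kept a buffer_count). A sketch of that batched variant, reconstructed from those hints only, trading smoothness for far fewer HTTP calls:

    buffer_count = 0
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        full_text += token
        buffer_count += 1

        # Flush to Supabase every 3 tokens instead of every token
        if buffer_count >= 3:
            supabase.table("messages").update({
                "content": full_text
            }).eq("id", msg_id).execute()
            buffer_count = 0

    # Trailing tokens still reach the row via the existing
    # "status": "done" update after the loop.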
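A bare threading.Thread works on a single-worker Space, but FastAPI's built-in BackgroundTasks is a common alternative: it schedules the work after the response is sent and runs sync callables in the worker thread pool, so generate_and_stream would need no changes. A minimal sketch:

from fastapi import BackgroundTasks

@app.post("/v1/chat")
async def chat(req: ChatRequest, background_tasks: BackgroundTasks):
    # FastAPI runs the task once the response has gone out; sync
    # functions are executed in the thread pool automatically.
    background_tasks.add_task(generate_and_stream, req)
    return {"status": "streaming_started"}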
@@ -192,7 +198,7 @@ def get_response(request_id: str):
     if data:
         return {
             "response": data[0]["content"],
-            "status": data[0]["status"]
+            "status": data[0]["status"]
         }
     else:
         return {"response": None, "status": "waiting"}
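Because /v1/chat returns immediately, a client "streams" by polling get_response until status flips to done. The route decorator for get_response is outside the visible hunks, so the URL below is an assumption; a hypothetical client loop:

import time, requests

def poll_response(base_url: str, request_id: str) -> str:
    # Hypothetical endpoint path: the @app.get route for get_response
    # is not shown in this diff. Prints only the newly grown suffix of
    # the content on each poll.
    shown = 0
    while True:
        r = requests.get(f"{base_url}/v1/response/{request_id}").json()
        text = r["response"] or ""
        print(text[shown:], end="", flush=True)
        shown = len(text)
        if r["status"] == "done":
            return text
        time.sleep(0.2)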
@@ -205,7 +211,7 @@ def get_response(request_id: str):
 # =========================
 @app.get("/")
 def root():
-    return {"status": "LLaMA API running
+    return {"status": "LLaMA API running"}
 
 # =========================
 # RUN