Valtry committed on
Commit
df77e05
·
verified ·
1 Parent(s): 49b3087

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -26
app.py CHANGED
@@ -4,7 +4,7 @@ from pydantic import BaseModel
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
  from supabase import create_client
7
- import os, uvicorn
8
  from contextlib import asynccontextmanager
9
 
10
  # =========================
@@ -53,13 +53,13 @@ Your name is Llama and you are a cheerful friendly AI buddy made for voice conve
53
  Rules:
54
  - Always refer to yourself as Llama
55
  - Speak naturally like a real voice conversation with a friend
56
- - Use casual spoken language like "hey" "sure" "yep" "got it"
57
  - Answer in 1 to 2 sentences only
58
  - Keep answer under 30 words
59
- - Never use symbols like * - : ! or bullet points
60
- - Never use abbreviations like etc or eg
61
- - Never spell out numbers use digits like 3 not three
62
- - Do NOT use new lines or formatting
63
  - Output plain text only
64
  <|eot_id|>
65
  <|start_header_id|>user<|end_header_id|>
@@ -117,17 +117,16 @@ def save_message(role, content, request_id):
117
  }).execute()
118
 
119
  # =========================
120
- # CHAT
121
  # =========================
122
- @app.post("/v1/chat")
123
- async def chat(req: ChatRequest):
124
 
125
  prompt = build_prompt(req.message)
126
 
127
- # ✅ Save user message
128
  save_message("user", req.message, req.request_id)
129
 
130
- # ✅ Create empty assistant row with status "streaming"
131
  res = supabase.table("messages").insert({
132
  "role": "assistant",
133
  "content": "",
@@ -137,9 +136,7 @@ async def chat(req: ChatRequest):
137
 
138
  msg_id = res.data[0]["id"]
139
 
140
- # ✅ Stream tokens and update same row every 3 tokens
141
  full_text = ""
142
- buffer_count = 0
143
 
144
  stream = model(
145
  prompt,
@@ -153,25 +150,34 @@ async def chat(req: ChatRequest):
153
 
154
  for chunk in stream:
155
  token = chunk["choices"][0]["text"]
 
156
  full_text += token
157
- buffer_count += 1
158
 
159
- # ✅ Update Supabase every 3 tokens
160
- if buffer_count >= 3:
161
- supabase.table("messages").update({
162
- "content": full_text
163
- }).eq("id", msg_id).execute()
164
- buffer_count = 0
165
 
166
- # ✅ Final clean + mark as done
167
- text = clean_output(full_text)
168
 
169
  supabase.table("messages").update({
170
- "content": text,
171
  "status": "done"
172
  }).eq("id", msg_id).execute()
173
 
174
- return {"status": "saved"}
 
 
 
 
 
 
 
 
 
175
 
176
  # =========================
177
  # GET RESPONSE
@@ -192,7 +198,7 @@ def get_response(request_id: str):
192
  if data:
193
  return {
194
  "response": data[0]["content"],
195
- "status": data[0]["status"] # "streaming" or "done"
196
  }
197
  else:
198
  return {"response": None, "status": "waiting"}
@@ -205,7 +211,7 @@ def get_response(request_id: str):
205
  # =========================
206
  @app.get("/")
207
  def root():
208
- return {"status": "LLaMA API running 🚀"}
209
 
210
  # =========================
211
  # RUN
 
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
  from supabase import create_client
7
+ import os, uvicorn, threading, time
8
  from contextlib import asynccontextmanager
9
 
10
  # =========================
 
53
  Rules:
54
  - Always refer to yourself as Llama
55
  - Speak naturally like a real voice conversation with a friend
56
+ - Use casual spoken language like hey sure yep got it
57
  - Answer in 1 to 2 sentences only
58
  - Keep answer under 30 words
59
+ - Do not use symbols
60
+ - Do not use abbreviations
61
+ - Use digits instead of words
62
+ - No new lines
63
  - Output plain text only
64
  <|eot_id|>
65
  <|start_header_id|>user<|end_header_id|>
 
117
  }).execute()
118
 
119
  # =========================
120
+ # 🔥 STREAMING GENERATION
121
  # =========================
122
+ def generate_and_stream(req: ChatRequest):
 
123
 
124
  prompt = build_prompt(req.message)
125
 
126
+ # save user
127
  save_message("user", req.message, req.request_id)
128
 
129
+ # create assistant row
130
  res = supabase.table("messages").insert({
131
  "role": "assistant",
132
  "content": "",
 
136
 
137
  msg_id = res.data[0]["id"]
138
 
 
139
  full_text = ""
 
140
 
141
  stream = model(
142
  prompt,
 
150
 
151
  for chunk in stream:
152
  token = chunk["choices"][0]["text"]
153
+
154
  full_text += token
 
155
 
156
+ # 🔥 REAL STREAMING UPDATE
157
+ supabase.table("messages").update({
158
+ "content": full_text
159
+ }).eq("id", msg_id).execute()
160
+
161
+ time.sleep(0.05) # 🔥 critical for visible streaming
162
 
163
+ # final clean
164
+ final = clean_output(full_text)
165
 
166
  supabase.table("messages").update({
167
+ "content": final,
168
  "status": "done"
169
  }).eq("id", msg_id).execute()
170
 
171
+ # =========================
172
+ # CHAT
173
+ # =========================
174
+ @app.post("/v1/chat")
175
+ async def chat(req: ChatRequest):
176
+
177
+ # 🔥 run in background
178
+ threading.Thread(target=generate_and_stream, args=(req,)).start()
179
+
180
+ return {"status": "streaming_started"}
181
 
182
  # =========================
183
  # GET RESPONSE
 
198
  if data:
199
  return {
200
  "response": data[0]["content"],
201
+ "status": data[0]["status"]
202
  }
203
  else:
204
  return {"response": None, "status": "waiting"}
 
211
  # =========================
212
  @app.get("/")
213
  def root():
214
+ return {"status": "LLaMA API running"}
215
 
216
  # =========================
217
  # RUN