Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,7 +4,8 @@ from pydantic import BaseModel
|
|
| 4 |
from llama_cpp import Llama
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from supabase import create_client
|
| 7 |
-
import os, uvicorn
|
|
|
|
| 8 |
from contextlib import asynccontextmanager
|
| 9 |
|
| 10 |
# =========================
|
|
@@ -49,16 +50,17 @@ def clean_output(text):
|
|
| 49 |
def build_prompt(user_msg):
|
| 50 |
return f"""<|begin_of_text|>
|
| 51 |
<|start_header_id|>system<|end_header_id|>
|
| 52 |
-
Your name is Llama and you are a cheerful friendly AI buddy
|
| 53 |
Rules:
|
| 54 |
- Always refer to yourself as Llama
|
| 55 |
-
- Speak
|
| 56 |
-
-
|
|
|
|
| 57 |
- Keep answer under 30 words
|
| 58 |
-
-
|
| 59 |
-
-
|
| 60 |
-
-
|
| 61 |
-
- Do NOT use new lines
|
| 62 |
- Output plain text only
|
| 63 |
<|eot_id|>
|
| 64 |
<|start_header_id|>user<|end_header_id|>
|
|
@@ -78,11 +80,11 @@ def load_model():
|
|
| 78 |
token=HF_TOKEN,
|
| 79 |
cache_dir="/data"
|
| 80 |
),
|
| 81 |
-
n_ctx=
|
| 82 |
-
n_threads=
|
| 83 |
-
n_batch=
|
| 84 |
use_mmap=True,
|
| 85 |
-
use_mlock=
|
| 86 |
f16_kv=True,
|
| 87 |
verbose=False
|
| 88 |
)
|
|
@@ -123,13 +125,20 @@ async def chat(req: ChatRequest):
|
|
| 123 |
|
| 124 |
prompt = build_prompt(req.message)
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
)
|
| 134 |
|
| 135 |
text = clean_output(output["choices"][0]["text"])
|
|
|
|
| 4 |
from llama_cpp import Llama
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from supabase import create_client
|
| 7 |
+
import os, uvicorn, asyncio
|
| 8 |
+
from functools import partial
|
| 9 |
from contextlib import asynccontextmanager
|
| 10 |
|
| 11 |
# =========================
|
|
|
|
| 50 |
def build_prompt(user_msg):
|
| 51 |
return f"""<|begin_of_text|>
|
| 52 |
<|start_header_id|>system<|end_header_id|>
|
| 53 |
+
Your name is Llama and you are a cheerful friendly AI buddy made for voice conversation.
|
| 54 |
Rules:
|
| 55 |
- Always refer to yourself as Llama
|
| 56 |
+
- Speak naturally like a real voice conversation with a friend
|
| 57 |
+
- Use casual spoken language like "hey" "sure" "yep" "got it"
|
| 58 |
+
- Answer in 1 to 2 sentences only
|
| 59 |
- Keep answer under 30 words
|
| 60 |
+
- Never use symbols like * - : ! or bullet points
|
| 61 |
+
- Never use abbreviations like etc or eg
|
| 62 |
+
- Never spell out numbers use digits like 3 not three
|
| 63 |
+
- Do NOT use new lines or formatting
|
| 64 |
- Output plain text only
|
| 65 |
<|eot_id|>
|
| 66 |
<|start_header_id|>user<|end_header_id|>
|
|
|
|
| 80 |
token=HF_TOKEN,
|
| 81 |
cache_dir="/data"
|
| 82 |
),
|
| 83 |
+
n_ctx=512, # ✅ reduced from 2048 — fits prompt + 80 token reply
|
| 84 |
+
n_threads=2, # ✅ matches HF free tier vCPU count (was 4)
|
| 85 |
+
n_batch=128, # ✅ smaller batch = faster for single requests (was 512)
|
| 86 |
use_mmap=True,
|
| 87 |
+
use_mlock=False, # ✅ set False — mlock can fail on HF shared infra (was True)
|
| 88 |
f16_kv=True,
|
| 89 |
verbose=False
|
| 90 |
)
|
|
|
|
| 125 |
|
| 126 |
prompt = build_prompt(req.message)
|
| 127 |
|
| 128 |
+
# ✅ Run model in thread pool so FastAPI event loop stays non-blocking
|
| 129 |
+
loop = asyncio.get_event_loop()
|
| 130 |
+
output = await loop.run_in_executor(
|
| 131 |
+
None,
|
| 132 |
+
partial(
|
| 133 |
+
model,
|
| 134 |
+
prompt,
|
| 135 |
+
max_tokens=100, # ✅ reduced from 2048 — 30 words ≈ 60-80 tokens
|
| 136 |
+
temperature=req.temperature,
|
| 137 |
+
top_p=0.9,
|
| 138 |
+
top_k=40, # ✅ added — limits candidates, speeds up sampling
|
| 139 |
+
repeat_penalty=1.15,
|
| 140 |
+
stop=["<|eot_id|>", "<|end_of_text|>", "<|eof|>"]
|
| 141 |
+
)
|
| 142 |
)
|
| 143 |
|
| 144 |
text = clean_output(output["choices"][0]["text"])
|