Spaces:

Valtry
/

AI-Machine

Sleeping

App Files Files Community

Valtry committed on Apr 26

Commit

8147e6f

verified ·

1 Parent(s): 69e3535

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -19

app.py CHANGED Viewed

@@ -4,7 +4,8 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from supabase import create_client
-import os, uvicorn
 from contextlib import asynccontextmanager
 # =========================
@@ -49,16 +50,17 @@ def clean_output(text):
 def build_prompt(user_msg):
     return f"""<|begin_of_text|>
 <|start_header_id|>system<|end_header_id|>
-Your name is Llama and you are a cheerful friendly AI buddy on a tiny device.
 Rules:
 - Always refer to yourself as Llama
-- Speak in a warm casual and conversational tone like talking to a friend
-- Answer in 1 to 2 short sentences only
 - Keep answer under 30 words
-- Use very simple everyday words
-- Do NOT use symbols like * - : or bullet points
-- Do NOT use headings or formatting
-- Do NOT use new lines
 - Output plain text only
 <|eot_id|>
 <|start_header_id|>user<|end_header_id|>
@@ -78,11 +80,11 @@ def load_model():
             token=HF_TOKEN,
             cache_dir="/data"
         ),
-        n_ctx=2048,
-        n_threads=4,
-        n_batch=512,
         use_mmap=True,
-        use_mlock=True,
         f16_kv=True,
         verbose=False
     )
@@ -123,13 +125,20 @@ async def chat(req: ChatRequest):
     prompt = build_prompt(req.message)
-    output = model(
-        prompt,
-        max_tokens=2048,
-        temperature=req.temperature,
-        top_p=0.9,
-        repeat_penalty=1.15,
-        stop=["<|eot_id|>", "<|end_of_text|>", "<|eof|>"]
     )
     text = clean_output(output["choices"][0]["text"])

 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from supabase import create_client
+import os, uvicorn, asyncio
+from functools import partial
 from contextlib import asynccontextmanager
 # =========================
 def build_prompt(user_msg):
     return f"""<|begin_of_text|>
 <|start_header_id|>system<|end_header_id|>
+Your name is Llama and you are a cheerful friendly AI buddy made for voice conversation.
 Rules:
 - Always refer to yourself as Llama
+- Speak naturally like a real voice conversation with a friend
+- Use casual spoken language like "hey" "sure" "yep" "got it"
+- Answer in 1 to 2 sentences only
 - Keep answer under 30 words
+- Never use symbols like * - : ! or bullet points
+- Never use abbreviations like etc or eg
+- Never spell out numbers use digits like 3 not three
+- Do NOT use new lines or formatting
 - Output plain text only
 <|eot_id|>
 <|start_header_id|>user<|end_header_id|>
             token=HF_TOKEN,
             cache_dir="/data"
         ),
+        n_ctx=512,        # ✅ reduced from 2048 — fits prompt + 80 token reply
+        n_threads=2,      # ✅ matches HF free tier vCPU count (was 4)
+        n_batch=128,      # ✅ smaller batch = faster for single requests (was 512)
         use_mmap=True,
+        use_mlock=False,  # ✅ set False — mlock can fail on HF shared infra (was True)
         f16_kv=True,
         verbose=False
     )
     prompt = build_prompt(req.message)
+    # ✅ Run model in thread pool so FastAPI event loop stays non-blocking
+    loop = asyncio.get_event_loop()
+    output = await loop.run_in_executor(
+        None,
+        partial(
+            model,
+            prompt,
+            max_tokens=100,          # ✅ reduced from 2048 — 30 words ≈ 60-80 tokens
+            temperature=req.temperature,
+            top_p=0.9,
+            top_k=40,               # ✅ added — limits candidates, speeds up sampling
+            repeat_penalty=1.15,
+            stop=["<|eot_id|>", "<|end_of_text|>", "<|eof|>"]
+        )
     )
     text = clean_output(output["choices"][0]["text"])