Valtry committed on
Commit
8147e6f
·
verified ·
1 Parent(s): 69e3535

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -19
app.py CHANGED
@@ -4,7 +4,8 @@ from pydantic import BaseModel
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
  from supabase import create_client
7
- import os, uvicorn
 
8
  from contextlib import asynccontextmanager
9
 
10
  # =========================
@@ -49,16 +50,17 @@ def clean_output(text):
49
  def build_prompt(user_msg):
50
  return f"""<|begin_of_text|>
51
  <|start_header_id|>system<|end_header_id|>
52
- Your name is Llama and you are a cheerful friendly AI buddy on a tiny device.
53
  Rules:
54
  - Always refer to yourself as Llama
55
- - Speak in a warm casual and conversational tone like talking to a friend
56
- - Answer in 1 to 2 short sentences only
 
57
  - Keep answer under 30 words
58
- - Use very simple everyday words
59
- - Do NOT use symbols like * - : or bullet points
60
- - Do NOT use headings or formatting
61
- - Do NOT use new lines
62
  - Output plain text only
63
  <|eot_id|>
64
  <|start_header_id|>user<|end_header_id|>
@@ -78,11 +80,11 @@ def load_model():
78
  token=HF_TOKEN,
79
  cache_dir="/data"
80
  ),
81
- n_ctx=2048,
82
- n_threads=4,
83
- n_batch=512,
84
  use_mmap=True,
85
- use_mlock=True,
86
  f16_kv=True,
87
  verbose=False
88
  )
@@ -123,13 +125,20 @@ async def chat(req: ChatRequest):
123
 
124
  prompt = build_prompt(req.message)
125
 
126
- output = model(
127
- prompt,
128
- max_tokens=2048,
129
- temperature=req.temperature,
130
- top_p=0.9,
131
- repeat_penalty=1.15,
132
- stop=["<|eot_id|>", "<|end_of_text|>", "<|eof|>"]
 
 
 
 
 
 
 
133
  )
134
 
135
  text = clean_output(output["choices"][0]["text"])
 
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
  from supabase import create_client
7
+ import os, uvicorn, asyncio
8
+ from functools import partial
9
  from contextlib import asynccontextmanager
10
 
11
  # =========================
 
50
  def build_prompt(user_msg):
51
  return f"""<|begin_of_text|>
52
  <|start_header_id|>system<|end_header_id|>
53
+ Your name is Llama and you are a cheerful friendly AI buddy made for voice conversation.
54
  Rules:
55
  - Always refer to yourself as Llama
56
+ - Speak naturally like a real voice conversation with a friend
57
+ - Use casual spoken language like "hey" "sure" "yep" "got it"
58
+ - Answer in 1 to 2 sentences only
59
  - Keep answer under 30 words
60
+ - Never use symbols like * - : ! or bullet points
61
+ - Never use abbreviations like etc or eg
62
+ - Never spell out numbers use digits like 3 not three
63
+ - Do NOT use new lines or formatting
64
  - Output plain text only
65
  <|eot_id|>
66
  <|start_header_id|>user<|end_header_id|>
 
80
  token=HF_TOKEN,
81
  cache_dir="/data"
82
  ),
83
+ n_ctx=512, # ✅ reduced from 2048 — fits prompt + 80 token reply
84
+ n_threads=2, # ✅ matches HF free tier vCPU count (was 4)
85
+ n_batch=128, # ✅ smaller batch = faster for single requests (was 512)
86
  use_mmap=True,
87
+ use_mlock=False, # ✅ set False — mlock can fail on HF shared infra (was True)
88
  f16_kv=True,
89
  verbose=False
90
  )
 
125
 
126
  prompt = build_prompt(req.message)
127
 
128
+ # ✅ Run model in thread pool so FastAPI event loop stays non-blocking
129
+ loop = asyncio.get_event_loop()
130
+ output = await loop.run_in_executor(
131
+ None,
132
+ partial(
133
+ model,
134
+ prompt,
135
+ max_tokens=100, # ✅ reduced from 2048 — 30 words ≈ 60-80 tokens
136
+ temperature=req.temperature,
137
+ top_p=0.9,
138
+ top_k=40, # ✅ added — limits candidates, speeds up sampling
139
+ repeat_penalty=1.15,
140
+ stop=["<|eot_id|>", "<|end_of_text|>", "<|eof|>"]
141
+ )
142
  )
143
 
144
  text = clean_output(output["choices"][0]["text"])