fugthchat committed on
Commit
6715353
·
1 Parent(s): 44b6d82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -10
app.py CHANGED
@@ -9,6 +9,7 @@ from llama_cpp import Llama
9
 
10
  app = FastAPI()
11
 
 
12
  app.add_middleware(
13
  CORSMiddleware,
14
  allow_origins=["*"],
@@ -18,7 +19,7 @@ app.add_middleware(
18
  )
19
 
20
  # --- Configuration ---
21
- # Map the real filenames to your preferred names
22
  MODEL_MAP = {
23
  "qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
24
  "qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy"
@@ -39,12 +40,12 @@ def get_model(model_name):
39
  print(f"Loading {model_name}...")
40
  if current_model is not None: del current_model
41
 
42
- # Speed Optimization for 0.5B
43
  current_model = Llama(
44
  model_path=model_name,
45
- n_ctx=4096,
46
- n_threads=2,
47
- n_batch=1024,
48
  verbose=False
49
  )
50
  current_model_name = model_name
@@ -53,9 +54,9 @@ def get_model(model_name):
53
  @app.get("/api/models")
54
  async def list_models():
55
  models_info = []
56
- # Only look for the files you uploaded
57
  for f in glob.glob("*.gguf"):
58
- display_name = MODEL_MAP.get(f, f) # Use custom name if available, else filename
59
  size_mb = os.path.getsize(f) / (1024 * 1024)
60
  models_info.append({
61
  "filename": f,
@@ -91,9 +92,21 @@ async def chat(request: Request):
91
  llm = get_model(model_file)
92
 
93
  def iter_response():
94
- # Standard ChatML Prompt
95
- prompt = f"<|im_start|>system\nYou are Hannah 1.0, an intelligent pilot assistant.<|im_end|>\n<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"
96
- stream = llm(prompt, max_tokens=2048, stop=["<|im_end|>"], stream=True)
 
 
 
 
 
 
 
 
 
 
 
 
97
  for output in stream:
98
  yield json.dumps({"text": output['choices'][0]['text']}) + "\n"
99
 
 
9
 
10
  app = FastAPI()
11
 
12
+ # --- CORS Permissions ---
13
  app.add_middleware(
14
  CORSMiddleware,
15
  allow_origins=["*"],
 
19
  )
20
 
21
  # --- Configuration ---
22
+ # Map filenames to "Hannah" names
23
  MODEL_MAP = {
24
  "qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
25
  "qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy"
 
40
  print(f"Loading {model_name}...")
41
  if current_model is not None: del current_model
42
 
43
+ # --- PERFORMANCE TUNING ---
44
  current_model = Llama(
45
  model_path=model_name,
46
+ n_ctx=4096, # Large memory for conversation history
47
+ n_threads=2, # MAX for Hugging Face Free Tier (Crucial for speed)
48
+ n_batch=512, # Process tokens in chunks
49
  verbose=False
50
  )
51
  current_model_name = model_name
 
54
  @app.get("/api/models")
55
  async def list_models():
56
  models_info = []
57
+ # Scan for .gguf files
58
  for f in glob.glob("*.gguf"):
59
+ display_name = MODEL_MAP.get(f, f)
60
  size_mb = os.path.getsize(f) / (1024 * 1024)
61
  models_info.append({
62
  "filename": f,
 
92
  llm = get_model(model_file)
93
 
94
  def iter_response():
95
+ # --- PROMPT ENGINEERING FOR ACCURACY ---
96
+ # Qwen 2.5 specific format for best results
97
+ prompt = f"""<|im_start|>system
98
+ You are Hannah 1.0, an intelligent, fast, and helpful pilot assistant. Answer efficiently.<|im_end|>
99
+ <|im_start|>user
100
+ {user_input}<|im_end|>
101
+ <|im_start|>assistant
102
+ """
103
+ # Stream response
104
+ stream = llm(
105
+ prompt,
106
+ max_tokens=2048,
107
+ stop=["<|im_end|>", "User:", "System:"], # Stop exactly when done
108
+ stream=True
109
+ )
110
  for output in stream:
111
  yield json.dumps({"text": output['choices'][0]['text']}) + "\n"
112