fugthchat committed on
Commit
1bde1ff
·
1 Parent(s): e477640

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -46
app.py CHANGED
@@ -2,12 +2,14 @@ import os
2
  import glob
3
  import json
4
  import psutil
 
 
5
  from fastapi import FastAPI, Request, HTTPException
6
  from fastapi.responses import StreamingResponse
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from llama_cpp import Llama
9
 
10
- app = FastAPI()
11
 
12
  # --- CORS Permissions ---
13
  app.add_middleware(
@@ -20,94 +22,138 @@ app.add_middleware(
20
 
21
  # --- Configuration ---
22
  # Map filenames to "Hannah" names
23
- MODEL_MAP = {
24
  "qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
25
- "qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy"
26
  }
27
 
28
- current_model = None
29
- current_model_name = ""
 
30
 
31
- def get_model(model_name):
32
  global current_model, current_model_name
33
-
34
- if not model_name: raise HTTPException(status_code=400, detail="No model selected")
35
- if not os.path.exists(model_name): raise HTTPException(status_code=404, detail="Model file not found")
 
 
36
 
37
  if current_model_name == model_name and current_model is not None:
38
  return current_model
39
 
40
  print(f"Loading {model_name}...")
41
- if current_model is not None: del current_model
42
-
43
- # --- PERFORMANCE TUNING ---
 
44
  current_model = Llama(
45
  model_path=model_name,
46
- n_ctx=4096, # Large memory for conversation history
47
- n_threads=2, # MAX for Hugging Face Free Tier (Crucial for speed)
48
- n_batch=512, # Process tokens in chunks
49
- verbose=False
50
  )
51
  current_model_name = model_name
52
  return current_model
53
 
 
 
 
 
 
 
54
  @app.get("/api/models")
55
  async def list_models():
56
- models_info = []
57
- # Scan for .gguf files
58
  for f in glob.glob("*.gguf"):
59
- display_name = MODEL_MAP.get(f, f)
60
  size_mb = os.path.getsize(f) / (1024 * 1024)
61
- models_info.append({
62
- "filename": f,
63
- "display_name": display_name,
64
- "size": f"{size_mb:.1f} MB"
65
- })
 
 
 
 
 
 
 
 
 
 
66
  return {"models": models_info}
67
 
 
68
  @app.get("/api/status")
69
  async def system_status():
70
  ram = psutil.virtual_memory()
71
  return {
72
- "ram_used": f"{ram.used / (1024*1024):.0f} MB",
73
- "cpu": f"{psutil.cpu_percent()}%"
74
  }
75
 
 
76
  @app.post("/api/gen_title")
77
  async def gen_title(request: Request):
78
  try:
79
  data = await request.json()
80
- message = data.get("message", "")
81
  words = message.split()[:4]
82
- title = " ".join(words).capitalize() + "..."
83
- return {"title": title}
84
- except: return {"title": "New Chat"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  @app.post("/api/chat")
87
  async def chat(request: Request):
88
  data = await request.json()
89
- user_input = data.get("message")
90
  model_file = data.get("model")
 
 
 
 
91
 
92
  llm = get_model(model_file)
93
 
94
  def iter_response():
95
- # --- PROMPT ENGINEERING FOR ACCURACY ---
96
- # Qwen 2.5 specific format for best results
97
- prompt = f"""<|im_start|>system
98
- You are Hannah 1.0, an intelligent, fast, and helpful pilot assistant. Answer efficiently.<|im_end|>
99
- <|im_start|>user
100
- {user_input}<|im_end|>
101
- <|im_start|>assistant
102
- """
103
- # Stream response
104
  stream = llm(
105
- prompt,
106
- max_tokens=2048,
107
- stop=["<|im_end|>", "User:", "System:"], # Stop exactly when done
108
- stream=True
109
  )
 
110
  for output in stream:
111
- yield json.dumps({"text": output['choices'][0]['text']}) + "\n"
 
112
 
113
- return StreamingResponse(iter_response(), media_type="application/x-ndjson")
 
 
2
  import glob
3
  import json
4
  import psutil
5
+ from typing import Any, Dict, List, Optional
6
+
7
  from fastapi import FastAPI, Request, HTTPException
8
  from fastapi.responses import StreamingResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from llama_cpp import Llama
11
 
12
+ app = FastAPI(title="Hannah Pilot Interface")
13
 
14
  # --- CORS Permissions ---
15
  app.add_middleware(
 
22
 
23
  # --- Configuration ---
24
  # Map filenames to "Hannah" names
25
+ MODEL_MAP: Dict[str, str] = {
26
  "qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
27
+ "qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy",
28
  }
29
 
30
+ current_model: Optional[Llama] = None
31
+ current_model_name: str = ""
32
+
33
 
34
+ def get_model(model_name: str) -> Llama:
35
  global current_model, current_model_name
36
+
37
+ if not model_name:
38
+ raise HTTPException(status_code=400, detail="No model selected")
39
+ if not os.path.exists(model_name):
40
+ raise HTTPException(status_code=404, detail="Model file not found")
41
 
42
  if current_model_name == model_name and current_model is not None:
43
  return current_model
44
 
45
  print(f"Loading {model_name}...")
46
+ if current_model is not None:
47
+ del current_model
48
+
49
+ # --- PERFORMANCE TUNING (HF Free CPU) ---
50
  current_model = Llama(
51
  model_path=model_name,
52
+ n_ctx=4096,
53
+ n_threads=2,
54
+ n_batch=512,
55
+ verbose=False,
56
  )
57
  current_model_name = model_name
58
  return current_model
59
 
60
+
61
+ @app.get("/")
62
+ async def root():
63
+ return {"status": "ok", "name": "Hannah-1.0"}
64
+
65
+
66
  @app.get("/api/models")
67
  async def list_models():
68
+ models_info: List[Dict[str, Any]] = []
 
69
  for f in glob.glob("*.gguf"):
70
+ display_name = MODEL_MAP.get(f, f)
71
  size_mb = os.path.getsize(f) / (1024 * 1024)
72
+ models_info.append(
73
+ {
74
+ "filename": f,
75
+ "display_name": display_name,
76
+ "size": f"{size_mb:.1f} MB",
77
+ }
78
+ )
79
+
80
+ # Stable ordering (Heavy first if present)
81
+ models_info.sort(
82
+ key=lambda x: (
83
+ "Heavy" not in x.get("display_name", ""),
84
+ x.get("display_name", ""),
85
+ )
86
+ )
87
  return {"models": models_info}
88
 
89
+
90
  @app.get("/api/status")
91
  async def system_status():
92
  ram = psutil.virtual_memory()
93
  return {
94
+ "ram_used": f"{ram.used / (1024 * 1024):.0f} MB",
95
+ "cpu": f"{psutil.cpu_percent()}%",
96
  }
97
 
98
+
99
  @app.post("/api/gen_title")
100
  async def gen_title(request: Request):
101
  try:
102
  data = await request.json()
103
+ message = (data.get("message") or "").strip()
104
  words = message.split()[:4]
105
+ title = " ".join(words).capitalize() + ("..." if words else "")
106
+ return {"title": title or "New Chat"}
107
+ except Exception:
108
+ return {"title": "New Chat"}
109
+
110
+
111
+ def build_prompt(user_input: str, history: List[Dict[str, str]]) -> str:
112
+ # Qwen 2.5 chat format
113
+ system = (
114
+ "You are Hannah 1.0, an intelligent, fast, and helpful pilot assistant. "
115
+ "Answer efficiently and clearly."
116
+ )
117
+
118
+ parts: List[str] = ["<|im_start|>system\n" + system + "<|im_end|>\n"]
119
+
120
+ # Keep a small window of history for speed
121
+ for msg in history[-12:]:
122
+ role = msg.get("role")
123
+ content = msg.get("content") or ""
124
+ if role not in ("user", "assistant"):
125
+ continue
126
+ parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
127
+
128
+ parts.append(f"<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n")
129
+ return "".join(parts)
130
+
131
 
132
  @app.post("/api/chat")
133
  async def chat(request: Request):
134
  data = await request.json()
135
+ user_input = (data.get("message") or "").strip()
136
  model_file = data.get("model")
137
+ history = data.get("history") or []
138
+
139
+ if not user_input:
140
+ raise HTTPException(status_code=400, detail="Empty message")
141
 
142
  llm = get_model(model_file)
143
 
144
  def iter_response():
145
+ prompt = build_prompt(user_input, history)
146
+
 
 
 
 
 
 
 
147
  stream = llm(
148
+ prompt,
149
+ max_tokens=2048,
150
+ stop=["<|im_end|>", "User:", "System:"],
151
+ stream=True,
152
  )
153
+
154
  for output in stream:
155
+ token_text = output["choices"][0]["text"]
156
+ yield json.dumps({"text": token_text}) + "\n"
157
 
158
+ # NDJSON stream (frontend splits by newlines)
159
+ return StreamingResponse(iter_response(), media_type="application/x-ndjson")