fugthchat committed on
Commit
2c87c83
·
verified ·
1 Parent(s): e068f6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -34,8 +34,8 @@ app.add_middleware(
34
  # --- Configuration ---
35
  # Map filenames to "Hannah" names
36
  MODEL_MAP: Dict[str, str] = {
37
- "qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
38
- "qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy",
39
  }
40
 
41
  current_model: Optional[Llama] = None
@@ -116,21 +116,22 @@ def get_model(model_name: str) -> Llama:
116
  del current_model
117
 
118
  # --- PERFORMANCE TUNING (HF Free CPU) ---
119
- # 4096 ctx can be too memory heavy on small Spaces; start safer, then tune up later.
 
120
  threads = int(os.getenv("N_THREADS", "2"))
121
- n_ctx = int(os.getenv("N_CTX", "2048"))
122
- n_batch = int(os.getenv("N_BATCH", "256"))
123
 
124
  try:
125
  current_model = _try_load_model(
126
  model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
127
  )
128
  except Exception as e:
129
- # Retry with very conservative settings in case this is memory pressure.
130
  print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
131
  try:
132
  current_model = _try_load_model(
133
- model_path, n_ctx=1024, n_threads=threads, n_batch=64
134
  )
135
  except Exception as e2:
136
  print(f"Model load retry failed: {e2}")
@@ -151,7 +152,7 @@ def get_model(model_name: str) -> Llama:
151
 
152
  @app.get("/")
153
  async def root():
154
- return {"status": "ok", "name": "Hannah-1.0"}
155
 
156
 
157
  @app.get("/api/models")
@@ -420,7 +421,7 @@ async def chat(request: Request):
420
 
421
  stream = llm(
422
  prompt,
423
- max_tokens=2048,
424
  stop=["<|im_end|>", "User:", "System:"],
425
  stream=True,
426
  )
 
34
  # --- Configuration ---
35
  # Map filenames to "Hannah" names
36
  MODEL_MAP: Dict[str, str] = {
37
+ "qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.1 Light",
38
+ "qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.1 Heavy",
39
  }
40
 
41
  current_model: Optional[Llama] = None
 
116
  del current_model
117
 
118
  # --- PERFORMANCE TUNING (HF Free CPU) ---
119
+ # Increased context for Hannah 1.1 with better memory management
120
+ # 4096 ctx provides more context awareness; fallback to 2048 if needed
121
  threads = int(os.getenv("N_THREADS", "2"))
122
+ n_ctx = int(os.getenv("N_CTX", "4096")) # Increased from 2048
123
+ n_batch = int(os.getenv("N_BATCH", "512")) # Increased from 256
124
 
125
  try:
126
  current_model = _try_load_model(
127
  model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
128
  )
129
  except Exception as e:
130
+ # Retry with conservative settings in case of memory pressure
131
  print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
132
  try:
133
  current_model = _try_load_model(
134
+ model_path, n_ctx=2048, n_threads=threads, n_batch=256
135
  )
136
  except Exception as e2:
137
  print(f"Model load retry failed: {e2}")
 
152
 
153
  @app.get("/")
154
  async def root():
155
+ return {"status": "ok", "name": "Hannah-1.1"}
156
 
157
 
158
  @app.get("/api/models")
 
421
 
422
  stream = llm(
423
  prompt,
424
+ max_tokens=4096, # Increased from 2048 for better responses
425
  stop=["<|im_end|>", "User:", "System:"],
426
  stream=True,
427
  )