Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,8 +34,8 @@ app.add_middleware(
|
|
| 34 |
# --- Configuration ---
|
| 35 |
# Map filenames to "Hannah" names
|
| 36 |
MODEL_MAP: Dict[str, str] = {
|
| 37 |
-
"qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
|
| 38 |
-
"qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy",
|
| 39 |
}
|
| 40 |
|
| 41 |
current_model: Optional[Llama] = None
|
|
@@ -116,21 +116,22 @@ def get_model(model_name: str) -> Llama:
|
|
| 116 |
del current_model
|
| 117 |
|
| 118 |
# --- PERFORMANCE TUNING (HF Free CPU) ---
|
| 119 |
-
#
|
|
|
|
| 120 |
threads = int(os.getenv("N_THREADS", "2"))
|
| 121 |
-
n_ctx = int(os.getenv("N_CTX", "2048"))
|
| 122 |
-
n_batch = int(os.getenv("N_BATCH", "256"))
|
| 123 |
|
| 124 |
try:
|
| 125 |
current_model = _try_load_model(
|
| 126 |
model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
|
| 127 |
)
|
| 128 |
except Exception as e:
|
| 129 |
-
# Retry with
|
| 130 |
print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
|
| 131 |
try:
|
| 132 |
current_model = _try_load_model(
|
| 133 |
-
model_path, n_ctx=
|
| 134 |
)
|
| 135 |
except Exception as e2:
|
| 136 |
print(f"Model load retry failed: {e2}")
|
|
@@ -151,7 +152,7 @@ def get_model(model_name: str) -> Llama:
|
|
| 151 |
|
| 152 |
@app.get("/")
|
| 153 |
async def root():
|
| 154 |
-
return {"status": "ok", "name": "Hannah-1.0"}
|
| 155 |
|
| 156 |
|
| 157 |
@app.get("/api/models")
|
|
@@ -420,7 +421,7 @@ async def chat(request: Request):
|
|
| 420 |
|
| 421 |
stream = llm(
|
| 422 |
prompt,
|
| 423 |
-
max_tokens=2048,
|
| 424 |
stop=["<|im_end|>", "User:", "System:"],
|
| 425 |
stream=True,
|
| 426 |
)
|
|
|
|
| 34 |
# --- Configuration ---
|
| 35 |
# Map filenames to "Hannah" names
|
| 36 |
MODEL_MAP: Dict[str, str] = {
|
| 37 |
+
"qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.1 Light",
|
| 38 |
+
"qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.1 Heavy",
|
| 39 |
}
|
| 40 |
|
| 41 |
current_model: Optional[Llama] = None
|
|
|
|
| 116 |
del current_model
|
| 117 |
|
| 118 |
# --- PERFORMANCE TUNING (HF Free CPU) ---
|
| 119 |
+
# Increased context for Hannah 1.1 with better memory management
|
| 120 |
+
# 4096 ctx provides more context awareness; fallback to 2048 if needed
|
| 121 |
threads = int(os.getenv("N_THREADS", "2"))
|
| 122 |
+
n_ctx = int(os.getenv("N_CTX", "4096")) # Increased from 2048
|
| 123 |
+
n_batch = int(os.getenv("N_BATCH", "512")) # Increased from 256
|
| 124 |
|
| 125 |
try:
|
| 126 |
current_model = _try_load_model(
|
| 127 |
model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
|
| 128 |
)
|
| 129 |
except Exception as e:
|
| 130 |
+
# Retry with conservative settings in case of memory pressure
|
| 131 |
print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
|
| 132 |
try:
|
| 133 |
current_model = _try_load_model(
|
| 134 |
+
model_path, n_ctx=2048, n_threads=threads, n_batch=256
|
| 135 |
)
|
| 136 |
except Exception as e2:
|
| 137 |
print(f"Model load retry failed: {e2}")
|
|
|
|
| 152 |
|
| 153 |
@app.get("/")
|
| 154 |
async def root():
|
| 155 |
+
return {"status": "ok", "name": "Hannah-1.1"}
|
| 156 |
|
| 157 |
|
| 158 |
@app.get("/api/models")
|
|
|
|
| 421 |
|
| 422 |
stream = llm(
|
| 423 |
prompt,
|
| 424 |
+
max_tokens=4096, # Increased from 2048 for better responses
|
| 425 |
stop=["<|im_end|>", "User:", "System:"],
|
| 426 |
stream=True,
|
| 427 |
)
|