Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,8 +34,8 @@ app.add_middleware(
|
|
| 34 |
# --- Configuration ---
|
| 35 |
# Map filenames to "Hannah" names
|
| 36 |
MODEL_MAP: Dict[str, str] = {
|
| 37 |
-
"qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.0 Light",
|
| 38 |
-
"qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.0 Heavy",
|
| 39 |
}
|
| 40 |
|
| 41 |
current_model: Optional[Llama] = None
|
|
@@ -116,21 +116,22 @@ def get_model(model_name: str) -> Llama:
|
|
| 116 |
del current_model
|
| 117 |
|
| 118 |
# --- PERFORMANCE TUNING (HF Free CPU) ---
|
| 119 |
-
#
|
|
|
|
| 120 |
threads = int(os.getenv("N_THREADS", "2"))
|
| 121 |
-
n_ctx = int(os.getenv("N_CTX", "2048"))
|
| 122 |
-
n_batch = int(os.getenv("N_BATCH", "256"))
|
| 123 |
|
| 124 |
try:
|
| 125 |
current_model = _try_load_model(
|
| 126 |
model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
|
| 127 |
)
|
| 128 |
except Exception as e:
|
| 129 |
-
# Retry with
|
| 130 |
print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
|
| 131 |
try:
|
| 132 |
current_model = _try_load_model(
|
| 133 |
-
model_path, n_ctx=
|
| 134 |
)
|
| 135 |
except Exception as e2:
|
| 136 |
print(f"Model load retry failed: {e2}")
|
|
@@ -151,7 +152,7 @@ def get_model(model_name: str) -> Llama:
|
|
| 151 |
|
| 152 |
@app.get("/")
|
| 153 |
async def root():
|
| 154 |
-
return {"status": "ok", "name": "Hannah-1.0"}
|
| 155 |
|
| 156 |
|
| 157 |
@app.get("/api/models")
|
|
@@ -420,7 +421,7 @@ async def chat(request: Request):
|
|
| 420 |
|
| 421 |
stream = llm(
|
| 422 |
prompt,
|
| 423 |
-
max_tokens=2048,
|
| 424 |
stop=["<|im_end|>", "User:", "System:"],
|
| 425 |
stream=True,
|
| 426 |
)
|
|
|
|
| 34 |
# --- Configuration ---
|
| 35 |
# Map filenames to "Hannah" names
|
| 36 |
MODEL_MAP: Dict[str, str] = {
|
| 37 |
+
"qwen2.5-0.5b-instruct-q2_k.gguf": "Hannah-1.1 Light",
|
| 38 |
+
"qwen2.5-0.5b-instruct-q4_k_m.gguf": "Hannah-1.1 Heavy",
|
| 39 |
}
|
| 40 |
|
| 41 |
current_model: Optional[Llama] = None
|
|
|
|
| 116 |
del current_model
|
| 117 |
|
| 118 |
# --- PERFORMANCE TUNING (HF Free CPU) ---
|
| 119 |
+
# Increased context for Hannah 1.1 with better memory management
|
| 120 |
+
# 4096 ctx provides more context awareness; fallback to 2048 if needed
|
| 121 |
threads = int(os.getenv("N_THREADS", "2"))
|
| 122 |
+
n_ctx = int(os.getenv("N_CTX", "4096")) # Increased from 2048
|
| 123 |
+
n_batch = int(os.getenv("N_BATCH", "512")) # Increased from 256
|
| 124 |
|
| 125 |
try:
|
| 126 |
current_model = _try_load_model(
|
| 127 |
model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
|
| 128 |
)
|
| 129 |
except Exception as e:
|
| 130 |
+
# Retry with conservative settings in case of memory pressure
|
| 131 |
print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
|
| 132 |
try:
|
| 133 |
current_model = _try_load_model(
|
| 134 |
+
model_path, n_ctx=2048, n_threads=threads, n_batch=256
|
| 135 |
)
|
| 136 |
except Exception as e2:
|
| 137 |
print(f"Model load retry failed: {e2}")
|
|
|
|
| 152 |
|
| 153 |
@app.get("/")
|
| 154 |
async def root():
|
| 155 |
+
return {"status": "ok", "name": "Hannah-1.1"}
|
| 156 |
|
| 157 |
|
| 158 |
@app.get("/api/models")
|
|
|
|
| 421 |
|
| 422 |
stream = llm(
|
| 423 |
prompt,
|
| 424 |
+
max_tokens=4096, # Increased from 2048 for better responses
|
| 425 |
stop=["<|im_end|>", "User:", "System:"],
|
| 426 |
stream=True,
|
| 427 |
)
|