Spaces:

metanthropic
/

metanthropic-api-phi3

Sleeping

App Files Files Community

ekjotsingh committed on Feb 15

Commit

7433133

verified ·

1 Parent(s): b5f44cc

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -22

app.py CHANGED Viewed

@@ -22,7 +22,6 @@ logger = logging.getLogger("MetanthropicNode")
 # --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
 def set_memory_limit():
     # We set a hard ceiling to prevent OOM kills.
-    # The Engine is tuned to use ~12GB, leaving 2GB headroom.
     limit_bytes = 14 * 1024 * 1024 * 1024
     resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
     logger.info(f"🧱 MEMORY HARD LIMIT SET: 14.0 GB")
@@ -41,8 +40,7 @@ API_KEY_NAME = "x-metanthropic-key"
 api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
 # --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
-# Unlike a Lock() which allows 1, a Semaphore(N) allows N users at once.
-# If User #5 arrives, they wait for one of the 4 slots to free up.
 MAX_CONCURRENT_USERS = 4
 BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
@@ -94,31 +92,26 @@ llm = None
 if initialize_engine():
     logger.info("🚀 STARTING ENGINE: PARALLEL BATCH MODE")
-    # 🔥 THE KARPATHY CONFIGURATION 🔥
-    # We are slicing the RAM into 4 distinct working lanes.
-    TOTAL_CONTEXT = 16384  # Total RAM pool for Context (High Usage)
     llm = Llama(
         model_path=TEMP_DECRYPTED,
-        # 1. PARALLEL SLOTS (The "Batch of 4")
-        # This tells the C++ backend to maintain 4 separate conversation states.
         n_parallel=MAX_CONCURRENT_USERS,
         # 2. TOTAL CONTEXT POOL
-        # 16k Total / 4 Users = 4096 Tokens per User.
-        # This fits perfectly in 14GB RAM with f16_kv=True.
         n_ctx=TOTAL_CONTEXT,
         # 3. COMPUTE DENSITY
-        n_batch=512,        # Process input prompts in chunks
-        f16_kv=True,        # High precision memory
         # 4. CPU STRATEGY
-        # We are on a shared vCPU environment.
-        # Setting threads too high causes "context switching" lag.
-        # 2 threads is the sweet spot for the Free Tier.
         n_threads=2,
         use_mlock=True,     # Pin to RAM
@@ -129,8 +122,6 @@ if initialize_engine():
 # --- 🏥 HEALTH CHECK ---
 @app.get("/")
 def health_check():
-    # Returns how many slots are currently free
-    # _value is internal, but useful for debugging
     free_slots = BATCH_SEMAPHORE._value
     return {"status": "active", "free_slots": free_slots}
@@ -154,12 +145,8 @@ async def chat_completion(request: Request, api_key: str = Security(get_api_key)
         raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
     # ⚡ THE BATCH GATE ⚡
-    # Instead of blocking everyone, we let 4 people in.
-    # The 5th person waits here until one of the 4 finishes.
     async with BATCH_SEMAPHORE:
         try:
-            # We offload to threadpool so the asyncio loop stays alive
-            # to accept the *next* request while this one generates.
             output = await run_in_threadpool(
                 llm,
                 prompt,

 # --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
 def set_memory_limit():
     """Cap the process address space (RLIMIT_AS) at 14 GiB and log the limit."""
     # We set a hard ceiling to prevent OOM kills.
     limit_bytes = 14 * 1024 * 1024 * 1024
     # The same value is applied as both the soft and the hard limit.
     resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
     # Derive the logged figure from limit_bytes so the message cannot drift
     # from the value actually applied; for 14 GiB this still prints "14.0 GB".
     logger.info("🧱 MEMORY HARD LIMIT SET: %.1f GB", limit_bytes / 1024**3)
 api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
 # --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
+# Allows 4 concurrent users. User #5 waits.
 MAX_CONCURRENT_USERS = 4
 BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
 if initialize_engine():
     logger.info("🚀 STARTING ENGINE: PARALLEL BATCH MODE")
+    # 🔥 THE STABLE CONFIGURATION 🔥
+    # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
+    # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
+    TOTAL_CONTEXT = 4096
     llm = Llama(
         model_path=TEMP_DECRYPTED,
+        # 1. PARALLEL SLOTS
         n_parallel=MAX_CONCURRENT_USERS,
         # 2. TOTAL CONTEXT POOL
         n_ctx=TOTAL_CONTEXT,
         # 3. COMPUTE DENSITY
+        n_batch=512,        # Process chunks
+        f16_kv=True,        # Memory precision
         # 4. CPU STRATEGY
+        # 2 threads is optimal for Hugging Face Free Tier vCPU
         n_threads=2,
         use_mlock=True,     # Pin to RAM
 # --- 🏥 HEALTH CHECK ---
 @app.get("/")
 def health_check():
     # Liveness probe: reports a fixed "active" status plus the semaphore's
     # current counter as the number of free slots.
     # NOTE: `_value` is a private attribute of asyncio.Semaphore; it is read
     # here for debugging only and may change between Python versions.
     return {"status": "active", "free_slots": BATCH_SEMAPHORE._value}
         raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
     # ⚡ THE BATCH GATE ⚡
     async with BATCH_SEMAPHORE:
         try:
             output = await run_in_threadpool(
                 llm,
                 prompt,