ekjotsingh committed on
Commit
7433133
·
verified ·
1 Parent(s): b5f44cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -22
app.py CHANGED
@@ -22,7 +22,6 @@ logger = logging.getLogger("MetanthropicNode")
22
  # --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
23
  def set_memory_limit():
24
  # We set a hard ceiling to prevent OOM kills.
25
- # The Engine is tuned to use ~12GB, leaving 2GB headroom.
26
  limit_bytes = 14 * 1024 * 1024 * 1024
27
  resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
28
  logger.info(f"🧱 MEMORY HARD LIMIT SET: 14.0 GB")
@@ -41,8 +40,7 @@ API_KEY_NAME = "x-metanthropic-key"
41
  api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
42
 
43
  # --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
44
- # Unlike a Lock() which allows 1, a Semaphore(N) allows N users at once.
45
- # If User #5 arrives, they wait for one of the 4 slots to free up.
46
  MAX_CONCURRENT_USERS = 4
47
  BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
48
 
@@ -94,31 +92,26 @@ llm = None
94
  if initialize_engine():
95
  logger.info("🚀 STARTING ENGINE: PARALLEL BATCH MODE")
96
 
97
- # 🔥 THE KARPATHY CONFIGURATION 🔥
98
- # We are slicing the RAM into 4 distinct working lanes.
99
-
100
- TOTAL_CONTEXT = 16384 # Total RAM pool for Context (High Usage)
101
 
102
  llm = Llama(
103
  model_path=TEMP_DECRYPTED,
104
 
105
- # 1. PARALLEL SLOTS (The "Batch of 4")
106
- # This tells the C++ backend to maintain 4 separate conversation states.
107
  n_parallel=MAX_CONCURRENT_USERS,
108
 
109
  # 2. TOTAL CONTEXT POOL
110
- # 16k Total / 4 Users = 4096 Tokens per User.
111
- # This fits perfectly in 14GB RAM with f16_kv=True.
112
  n_ctx=TOTAL_CONTEXT,
113
 
114
  # 3. COMPUTE DENSITY
115
- n_batch=512, # Process input prompts in chunks
116
- f16_kv=True, # High precision memory
117
 
118
  # 4. CPU STRATEGY
119
- # We are on a shared vCPU environment.
120
- # Setting threads too high causes "context switching" lag.
121
- # 2 threads is the sweet spot for the Free Tier.
122
  n_threads=2,
123
 
124
  use_mlock=True, # Pin to RAM
@@ -129,8 +122,6 @@ if initialize_engine():
129
  # --- πŸ₯ HEALTH CHECK ---
130
  @app.get("/")
131
  def health_check():
132
- # Returns how many slots are currently free
133
- # _value is internal, but useful for debugging
134
  free_slots = BATCH_SEMAPHORE._value
135
  return {"status": "active", "free_slots": free_slots}
136
 
@@ -154,12 +145,8 @@ async def chat_completion(request: Request, api_key: str = Security(get_api_key)
154
  raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
155
 
156
  # ⚡ THE BATCH GATE ⚡
157
- # Instead of blocking everyone, we let 4 people in.
158
- # The 5th person waits here until one of the 4 finishes.
159
  async with BATCH_SEMAPHORE:
160
  try:
161
- # We offload to threadpool so the asyncio loop stays alive
162
- # to accept the *next* request while this one generates.
163
  output = await run_in_threadpool(
164
  llm,
165
  prompt,
 
22
  # --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
23
  def set_memory_limit():
24
  # We set a hard ceiling to prevent OOM kills.
 
25
  limit_bytes = 14 * 1024 * 1024 * 1024
26
  resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
27
  logger.info(f"🧱 MEMORY HARD LIMIT SET: 14.0 GB")
 
40
  api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
41
 
42
  # --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
43
+ # Allows 4 concurrent users. User #5 waits.
 
44
  MAX_CONCURRENT_USERS = 4
45
  BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
46
 
 
92
  if initialize_engine():
93
  logger.info("🚀 STARTING ENGINE: PARALLEL BATCH MODE")
94
 
95
+ # 🔥 THE STABLE CONFIGURATION 🔥
96
+ # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
97
+ # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
98
+ TOTAL_CONTEXT = 4096
99
 
100
  llm = Llama(
101
  model_path=TEMP_DECRYPTED,
102
 
103
+ # 1. PARALLEL SLOTS
 
104
  n_parallel=MAX_CONCURRENT_USERS,
105
 
106
  # 2. TOTAL CONTEXT POOL
 
 
107
  n_ctx=TOTAL_CONTEXT,
108
 
109
  # 3. COMPUTE DENSITY
110
+ n_batch=512, # Process chunks
111
+ f16_kv=True, # Memory precision
112
 
113
  # 4. CPU STRATEGY
114
+ # 2 threads is optimal for Hugging Face Free Tier vCPU
 
 
115
  n_threads=2,
116
 
117
  use_mlock=True, # Pin to RAM
 
122
  # --- πŸ₯ HEALTH CHECK ---
123
  @app.get("/")
124
  def health_check():
 
 
125
  free_slots = BATCH_SEMAPHORE._value
126
  return {"status": "active", "free_slots": free_slots}
127
 
 
145
  raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
146
 
147
  # ⚡ THE BATCH GATE ⚡
 
 
148
  async with BATCH_SEMAPHORE:
149
  try:
 
 
150
  output = await run_in_threadpool(
151
  llm,
152
  prompt,