# app.py — Metanthropic Sovereign Node (revision 7433133, author: ekjotsingh)
import asyncio
import hmac
import logging
import os
import resource
import struct

import uvicorn
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, HTTPException, Security, Request, status
from fastapi.security import APIKeyHeader
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from starlette.concurrency import run_in_threadpool
# --- βš™οΈ LOGGING CONFIGURATION ---
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%H:%M:%S"
)
logger = logging.getLogger("MetanthropicNode")
# --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
def set_memory_limit():
    """Cap this process's virtual address space at 14 GB.

    With the hard ceiling in place, over-allocation surfaces as a Python
    MemoryError instead of the host's OOM killer terminating the process.
    Logs a warning (rather than crashing the node at import time) when the
    limit cannot be applied.
    """
    limit_bytes = 14 * 1024 * 1024 * 1024
    try:
        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
        logger.info("🧱 MEMORY HARD LIMIT SET: 14.0 GB")
    except (ValueError, OSError) as e:
        # FIX: setrlimit raises ValueError when the requested pair exceeds the
        # current hard limit (unprivileged processes cannot raise it) and can
        # raise OSError on platforms where RLIMIT_AS is not honoured. A failed
        # cap should degrade gracefully, not abort the whole server at import.
        logger.warning("⚠️ Could not set memory limit: %s", e)


set_memory_limit()
# --- πŸ”§ ENGINE CONFIGURATION ---
# Encrypted model artifact on the Hugging Face Hub (custom ".mguf" container).
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted GGUF is written to /dev/shm so it lives on the RAM disk and
# survives process restarts within the same container (see initialize_engine).
TEMP_DECRYPTED = "/dev/shm/metanthropic_v3_16.gguf"
# --- πŸ”‘ SECURITY SECRETS ---
# Both secrets come from the environment; missing values are handled at boot.
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face access token for the private repo
DECRYPTION_KEY = os.environ.get("DECRYPTION_KEY")  # hex AES-GCM key; also used as the API key
API_KEY_NAME = "x-metanthropic-key"
# auto_error=False: a missing header yields None so get_api_key controls the response.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
# --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
# Allows 4 concurrent users. User #5 waits.
MAX_CONCURRENT_USERS = 4
BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
app = FastAPI(title="Metanthropic Sovereign Node")
# --- πŸ”’ SECURITY HANDSHAKE ---
async def get_api_key(api_key: str = Security(api_key_header)):
    """Validate the `x-metanthropic-key` request header.

    Returns the key on success. Raises 500 when the node has no
    DECRYPTION_KEY configured and 403 when the header is missing or wrong.
    """
    if not DECRYPTION_KEY:
        raise HTTPException(status_code=500, detail="Node Config Error")
    # FIX: `==` on a secret leaks timing information; compare_digest is
    # constant-time. api_key is None when the header is absent
    # (auto_error=False), and compare_digest rejects None, so guard first.
    if api_key is not None and hmac.compare_digest(api_key, DECRYPTION_KEY):
        return api_key
    raise HTTPException(status_code=403, detail="Access Denied")
# --- πŸ› οΈ BOOTLOADER (DECRYPTION) ---
def initialize_engine():
if os.path.exists(TEMP_DECRYPTED):
logger.info("⚑ RAM-Disk Hit: Model resident.")
return True
if not HF_TOKEN or not DECRYPTION_KEY:
logger.error("❌ CRITICAL: Missing Secrets")
return False
try:
login(token=HF_TOKEN)
logger.info(f"⏳ Downloading & Decrypting...")
path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
key = bytes.fromhex(DECRYPTION_KEY)
aes = AESGCM(key)
with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
nonce = f_in.read(12)
h_len = struct.unpack("<I", f_in.read(4))[0]
header = f_in.read(h_len)
f_out.write(aes.decrypt(nonce, header, None))
while chunk := f_in.read(128 * 1024 * 1024):
f_out.write(chunk)
if os.path.exists(path): os.remove(path)
logger.info("βœ… Decryption Complete.")
return True
except Exception as e:
logger.error(f"❌ BOOT FAILURE: {e}")
return False
# --- πŸš€ INITIALIZE ENGINE GLOBAL ---
# Module-level singleton: stays None unless initialize_engine() succeeds, and
# the chat endpoint answers 503 while it is None.
llm = None
if initialize_engine():
    logger.info("πŸš€ STARTING ENGINE: PARALLEL BATCH MODE")
    # πŸ”₯ THE STABLE CONFIGURATION πŸ”₯
    # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
    # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
    TOTAL_CONTEXT = 4096
    llm = Llama(
        model_path=TEMP_DECRYPTED,
        # 1. PARALLEL SLOTS
        n_parallel=MAX_CONCURRENT_USERS,
        # 2. TOTAL CONTEXT POOL
        n_ctx=TOTAL_CONTEXT,
        # 3. COMPUTE DENSITY
        n_batch=512,  # Process chunks
        f16_kv=True,  # Memory precision
        # 4. CPU STRATEGY
        # 2 threads is optimal for Hugging Face Free Tier vCPU
        n_threads=2,
        use_mlock=True,  # Pin to RAM
        verbose=False  # Keep logs clean
    )
    logger.info(f"βœ… ENGINE READY. Capacity: {MAX_CONCURRENT_USERS} Simultaneous Streams.")
# --- πŸ₯ HEALTH CHECK ---
@app.get("/")
def health_check():
free_slots = BATCH_SEMAPHORE._value
return {"status": "active", "free_slots": free_slots}
# --- πŸ“‘ API ENDPOINT ---
@app.post("/v1/chat/completions")
async def chat_completion(request: Request, api_key: str = Security(get_api_key)):
    """OpenAI-style chat completion over the shared llama.cpp engine.

    Body fields: `messages` (list of {role, content} dicts), optional
    `max_tokens` (default 1024) and `temperature` (default 0.7).
    Raises 503 while booting, 400 on a malformed body, 500 on engine errors.
    """
    if not llm:
        raise HTTPException(status_code=503, detail="System Booting...")
    try:
        data = await request.json()
        messages = data.get("messages", [])
        # FIX: coerce numerics so a string like "512" becomes a usable value
        # (garbage now fails here as a 400 instead of inside the engine as a
        # 500), and clamp max_tokens to the per-user share of the context
        # pool (4096 total / 4 slots = 1024 — see the engine config).
        max_tokens = int(data.get("max_tokens", 1024))
        max_tokens = max(1, min(max_tokens, 1024))
        temperature = float(data.get("temperature", 0.7))
        # Phi-3 chat template: one <|role|> ... <|end|> segment per turn,
        # then the assistant tag to cue generation. join() avoids the
        # quadratic += string build.
        turns = [f"<|{msg['role']}|>\n{msg['content']}<|end|>\n" for msg in messages]
        prompt = "".join(turns) + "<|assistant|>\n"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
    # ⚑ THE BATCH GATE ⚑ — at most MAX_CONCURRENT_USERS inferences at once;
    # extra requests queue on the semaphore.
    async with BATCH_SEMAPHORE:
        try:
            # llm(...) blocks, so run it in the threadpool to keep the event
            # loop responsive for the other slots.
            output = await run_in_threadpool(
                llm,
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stop=["<|end|>"],
                echo=False
            )
            # Reshape llama.cpp's completion dict into the OpenAI chat schema.
            return {
                "id": output["id"],
                "object": "chat.completion",
                "created": output["created"],
                "model": "metanthropic-phi3-parallel",
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": output['choices'][0]['text'].strip()
                        },
                        "finish_reason": output['choices'][0]['finish_reason']
                    }
                ],
                "usage": output["usage"]
            }
        except Exception as e:
            logger.error(f"Inference Error: {e}")
            raise HTTPException(status_code=500, detail="Neural Core Error")
if __name__ == "__main__":
    # Direct-run entrypoint; port 7860 is the Hugging Face Spaces convention.
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")