# app.py — Metanthropic Sovereign Node (revision 7433133, author: ekjotsingh)
import asyncio
import hmac
import logging
import os
import resource
import struct

import uvicorn
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, HTTPException, Security, Request, status
from fastapi.security import APIKeyHeader
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from starlette.concurrency import run_in_threadpool
# --- βš™οΈ LOGGING CONFIGURATION ---
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%H:%M:%S"
)
logger = logging.getLogger("MetanthropicNode")
# --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
def set_memory_limit():
    """Cap this process's virtual address space at 14 GB.

    With the hard ceiling in place, over-allocation surfaces as a Python
    MemoryError instead of the host's OOM killer terminating the process.
    Logs a warning (rather than crashing the node at import time) when the
    limit cannot be applied.
    """
    limit_bytes = 14 * 1024 * 1024 * 1024
    try:
        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
        logger.info("🧱 MEMORY HARD LIMIT SET: 14.0 GB")
    except (ValueError, OSError) as e:
        # FIX: setrlimit raises ValueError when the requested pair exceeds the
        # current hard limit (unprivileged processes cannot raise it) and can
        # raise OSError on platforms where RLIMIT_AS is not honoured. A failed
        # cap should degrade gracefully, not abort the whole server at import.
        logger.warning("⚠️ Could not set memory limit: %s", e)


set_memory_limit()
# --- πŸ”§ ENGINE CONFIGURATION ---
# Encrypted model artifact on the Hugging Face Hub (custom ".mguf" container).
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted GGUF is written to /dev/shm so it lives on the RAM disk and
# survives process restarts within the same container (see initialize_engine).
TEMP_DECRYPTED = "/dev/shm/metanthropic_v3_16.gguf"
# --- πŸ”‘ SECURITY SECRETS ---
# Both secrets come from the environment; missing values are handled at boot.
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face access token for the private repo
DECRYPTION_KEY = os.environ.get("DECRYPTION_KEY")  # hex AES-GCM key; also used as the API key
API_KEY_NAME = "x-metanthropic-key"
# auto_error=False: a missing header yields None so get_api_key controls the response.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
# --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
# Allows 4 concurrent users. User #5 waits.
MAX_CONCURRENT_USERS = 4
BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
app = FastAPI(title="Metanthropic Sovereign Node")
# --- πŸ”’ SECURITY HANDSHAKE ---
async def get_api_key(api_key: str = Security(api_key_header)):
    """Validate the `x-metanthropic-key` request header.

    Returns the key on success. Raises 500 when the node has no
    DECRYPTION_KEY configured and 403 when the header is missing or wrong.
    """
    if not DECRYPTION_KEY:
        raise HTTPException(status_code=500, detail="Node Config Error")
    # FIX: `==` on a secret leaks timing information; compare_digest is
    # constant-time. api_key is None when the header is absent
    # (auto_error=False), and compare_digest rejects None, so guard first.
    if api_key is not None and hmac.compare_digest(api_key, DECRYPTION_KEY):
        return api_key
    raise HTTPException(status_code=403, detail="Access Denied")
# --- πŸ› οΈ BOOTLOADER (DECRYPTION) ---
def initialize_engine():
if os.path.exists(TEMP_DECRYPTED):
logger.info("⚑ RAM-Disk Hit: Model resident.")
return True
if not HF_TOKEN or not DECRYPTION_KEY:
logger.error("❌ CRITICAL: Missing Secrets")
return False
try:
login(token=HF_TOKEN)
logger.info(f"⏳ Downloading & Decrypting...")
path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
key = bytes.fromhex(DECRYPTION_KEY)
aes = AESGCM(key)
with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
nonce = f_in.read(12)
h_len = struct.unpack("<I", f_in.read(4))[0]
header = f_in.read(h_len)
f_out.write(aes.decrypt(nonce, header, None))
while chunk := f_in.read(128 * 1024 * 1024):
f_out.write(chunk)
if os.path.exists(path): os.remove(path)
logger.info("βœ… Decryption Complete.")
return True
except Exception as e:
logger.error(f"❌ BOOT FAILURE: {e}")
return False
# --- πŸš€ INITIALIZE ENGINE GLOBAL ---
# Module-level singleton: stays None unless initialize_engine() succeeds, and
# the chat endpoint answers 503 while it is None.
llm = None
if initialize_engine():
    logger.info("πŸš€ STARTING ENGINE: PARALLEL BATCH MODE")
    # πŸ”₯ THE STABLE CONFIGURATION πŸ”₯
    # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
    # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
    TOTAL_CONTEXT = 4096
    llm = Llama(
        model_path=TEMP_DECRYPTED,
        # 1. PARALLEL SLOTS
        n_parallel=MAX_CONCURRENT_USERS,
        # 2. TOTAL CONTEXT POOL
        n_ctx=TOTAL_CONTEXT,
        # 3. COMPUTE DENSITY
        n_batch=512,  # Process chunks
        f16_kv=True,  # Memory precision
        # 4. CPU STRATEGY
        # 2 threads is optimal for Hugging Face Free Tier vCPU
        n_threads=2,
        use_mlock=True,  # Pin to RAM
        verbose=False  # Keep logs clean
    )
    logger.info(f"βœ… ENGINE READY. Capacity: {MAX_CONCURRENT_USERS} Simultaneous Streams.")
# --- πŸ₯ HEALTH CHECK ---
@app.get("/")
def health_check():
free_slots = BATCH_SEMAPHORE._value
return {"status": "active", "free_slots": free_slots}
# --- πŸ“‘ API ENDPOINT ---
@app.post("/v1/chat/completions")
async def chat_completion(request: Request, api_key: str = Security(get_api_key)):
    """OpenAI-style chat completion over the shared llama.cpp engine.

    Body fields: `messages` (list of {role, content} dicts), optional
    `max_tokens` (default 1024) and `temperature` (default 0.7).
    Raises 503 while booting, 400 on a malformed body, 500 on engine errors.
    """
    if not llm:
        raise HTTPException(status_code=503, detail="System Booting...")
    try:
        data = await request.json()
        messages = data.get("messages", [])
        # FIX: coerce numerics so a string like "512" becomes a usable value
        # (garbage now fails here as a 400 instead of inside the engine as a
        # 500), and clamp max_tokens to the per-user share of the context
        # pool (4096 total / 4 slots = 1024 — see the engine config).
        max_tokens = int(data.get("max_tokens", 1024))
        max_tokens = max(1, min(max_tokens, 1024))
        temperature = float(data.get("temperature", 0.7))
        # Phi-3 chat template: one <|role|> ... <|end|> segment per turn,
        # then the assistant tag to cue generation. join() avoids the
        # quadratic += string build.
        turns = [f"<|{msg['role']}|>\n{msg['content']}<|end|>\n" for msg in messages]
        prompt = "".join(turns) + "<|assistant|>\n"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
    # ⚑ THE BATCH GATE ⚑ — at most MAX_CONCURRENT_USERS inferences at once;
    # extra requests queue on the semaphore.
    async with BATCH_SEMAPHORE:
        try:
            # llm(...) blocks, so run it in the threadpool to keep the event
            # loop responsive for the other slots.
            output = await run_in_threadpool(
                llm,
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stop=["<|end|>"],
                echo=False
            )
            # Reshape llama.cpp's completion dict into the OpenAI chat schema.
            return {
                "id": output["id"],
                "object": "chat.completion",
                "created": output["created"],
                "model": "metanthropic-phi3-parallel",
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": output['choices'][0]['text'].strip()
                        },
                        "finish_reason": output['choices'][0]['finish_reason']
                    }
                ],
                "usage": output["usage"]
            }
        except Exception as e:
            logger.error(f"Inference Error: {e}")
            raise HTTPException(status_code=500, detail="Neural Core Error")
if __name__ == "__main__":
    # Direct-run entrypoint; port 7860 is the Hugging Face Spaces convention.
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")