Spaces:
Sleeping
Sleeping
import asyncio
import logging
import os
import resource
import secrets
import struct

import uvicorn
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, HTTPException, Security, Request, status
from fastapi.security import APIKeyHeader
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from starlette.concurrency import run_in_threadpool
# --- βοΈ LOGGING CONFIGURATION ---
# Timestamped, level-tagged lines; one shared logger for the whole node.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger("MetanthropicNode")
# --- π§± HARDWARE LIMITS (14GB CEILING) ---
def set_memory_limit(gb: float = 14.0) -> int:
    """Cap this process's address space to *gb* gigabytes via RLIMIT_AS.

    A hard ceiling keeps the host from OOM-killing the process outright.
    Generalized from the fixed 14 GB so callers may pass a different budget.

    Returns the requested limit in bytes. A platform that refuses the
    request (e.g. a sandbox with a lower hard cap) is logged, not fatal —
    the original version crashed at import in that case.
    """
    limit_bytes = int(gb * 1024 * 1024 * 1024)
    log = logging.getLogger("MetanthropicNode")
    try:
        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
        # FIXED: was an f-string with no placeholder; now reports the real value.
        log.info("π§± MEMORY HARD LIMIT SET: %.1f GB", gb)
    except (ValueError, OSError) as e:
        log.warning("Could not set RLIMIT_AS: %s", e)
    return limit_bytes

set_memory_limit()
# --- π§ ENGINE CONFIGURATION ---
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted copy is staged on /dev/shm so the plaintext model lives on a
# RAM disk rather than persistent storage.
TEMP_DECRYPTED = "/dev/shm/metanthropic_v3_16.gguf"

# --- π SECURITY SECRETS ---
HF_TOKEN = os.getenv("HF_TOKEN")
DECRYPTION_KEY = os.getenv("DECRYPTION_KEY")
API_KEY_NAME = "x-metanthropic-key"
# Header extractor for the API key; auto_error=False means a missing header
# arrives as None (get_api_key decides the response) instead of an automatic 403.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
# --- π¦ BATCH CONTROLLER (SEMAPHORE) --- | |
# Allows 4 concurrent users. User #5 waits.
MAX_CONCURRENT_USERS = 4
BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
app = FastAPI(title="Metanthropic Sovereign Node")
# --- π SECURITY HANDSHAKE ---
async def get_api_key(api_key: str = Security(api_key_header)):
    """Validate the x-metanthropic-key header.

    Raises 500 when the node has no key configured, 403 on a missing or
    mismatched key; returns the key on success.
    NOTE(review): the API key doubles as the model decryption key here —
    consider separating those two secrets.
    """
    if not DECRYPTION_KEY:
        raise HTTPException(status_code=500, detail="Node Config Error")
    # FIXED: `==` on a secret leaks timing information; compare in constant
    # time. Encoding to bytes keeps compare_digest happy with any header text.
    if api_key is not None and secrets.compare_digest(
        api_key.encode("utf-8"), DECRYPTION_KEY.encode("utf-8")
    ):
        return api_key
    raise HTTPException(status_code=403, detail="Access Denied")
# --- π οΈ BOOTLOADER (DECRYPTION) ---
def initialize_engine():
    """Fetch the encrypted model and stage a decrypted copy on the RAM disk.

    Container layout, per the reads below: 12-byte AES-GCM nonce, a
    little-endian uint32 header length, the encrypted GGUF header, then the
    remainder of the model stored as plaintext. Only the header is
    decrypted; the tail is streamed through unchanged.

    Returns True when TEMP_DECRYPTED is ready, False on any failure
    (missing secrets, download error, bad key, ...).
    """
    if os.path.exists(TEMP_DECRYPTED):
        logger.info("β‘ RAM-Disk Hit: Model resident.")
        return True
    if not HF_TOKEN or not DECRYPTION_KEY:
        logger.error("β CRITICAL: Missing Secrets")
        return False
    try:
        login(token=HF_TOKEN)
        # FIXED: dropped pointless f-prefix on a placeholder-free string.
        logger.info("β³ Downloading & Decrypting...")
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
        aes = AESGCM(bytes.fromhex(DECRYPTION_KEY))
        with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
            nonce = f_in.read(12)
            (h_len,) = struct.unpack("<I", f_in.read(4))
            # AES-GCM authenticates as well as decrypts: a wrong key raises here.
            f_out.write(aes.decrypt(nonce, f_in.read(h_len), None))
            # Stream the plaintext tail in 128 MB chunks to bound memory use.
            while chunk := f_in.read(128 * 1024 * 1024):
                f_out.write(chunk)
        if os.path.exists(path):
            os.remove(path)  # drop the encrypted download once staged
        logger.info("β Decryption Complete.")
        return True
    except Exception:
        # Boundary catch: any boot failure leaves the node in "llm is None"
        # mode. FIXED: logger.exception records the full traceback instead
        # of just str(e).
        logger.exception("β BOOT FAILURE")
        return False
# --- π INITIALIZE ENGINE GLOBAL ---
# Loaded once at import; endpoints check `llm is None` to report boot state.
llm = None
if initialize_engine():
    logger.info("π STARTING ENGINE: PARALLEL BATCH MODE")
    # π₯ THE STABLE CONFIGURATION π₯
    # 4096 matches Phi-3 Mini's hard context limit; split across the four
    # slots it gives each user a 1024-token window.
    TOTAL_CONTEXT = 4096
    engine_config = dict(
        model_path=TEMP_DECRYPTED,
        n_parallel=MAX_CONCURRENT_USERS,  # parallel decode slots
        n_ctx=TOTAL_CONTEXT,              # total shared context pool
        n_batch=512,                      # prompt-processing chunk size
        f16_kv=True,                      # half-precision KV cache
        n_threads=2,                      # sized for the free-tier vCPU
        use_mlock=True,                   # pin weights in RAM
        verbose=False,                    # keep logs clean
    )
    llm = Llama(**engine_config)
    logger.info(f"β ENGINE READY. Capacity: {MAX_CONCURRENT_USERS} Simultaneous Streams.")
# --- π₯ HEALTH CHECK ---
def health_check():
    """Liveness probe: report how many batch slots are currently unoccupied."""
    # NOTE(review): asyncio.Semaphore exposes no public counter, so this peeks
    # at the private _value attribute — fragile across Python versions.
    return {"status": "active", "free_slots": BATCH_SEMAPHORE._value}
# --- π‘ API ENDPOINT ---
async def chat_completion(request: Request, api_key: str = Security(get_api_key)):
    """OpenAI-style chat completion served by the shared Llama engine.

    Parses the JSON body, renders the Phi-3 chat template, then runs
    inference on a worker thread while holding one of the batch slots.
    Raises 503 while booting, 400 on a malformed body, 500 on engine errors.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="System Booting...")
    try:
        body = await request.json()
        max_tokens = body.get("max_tokens", 1024)
        temperature = body.get("temperature", 0.7)
        # Phi-3 template: <|role|>\ncontent<|end|> per turn, then cue the model.
        turns = [
            f"<|{m['role']}|>\n{m['content']}<|end|>\n"
            for m in body.get("messages", [])
        ]
        prompt = "".join(turns) + "<|assistant|>\n"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
    # β‘ THE BATCH GATE β‘ — at most MAX_CONCURRENT_USERS inferences at once;
    # extra callers queue here until a slot frees up.
    async with BATCH_SEMAPHORE:
        try:
            output = await run_in_threadpool(
                llm,
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stop=["<|end|>"],
                echo=False,
            )
            first = output["choices"][0]
            return {
                "id": output["id"],
                "object": "chat.completion",
                "created": output["created"],
                "model": "metanthropic-phi3-parallel",
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": first["text"].strip(),
                        },
                        "finish_reason": first["finish_reason"],
                    }
                ],
                "usage": output["usage"],
            }
        except Exception as e:
            logger.error(f"Inference Error: {e}")
            raise HTTPException(status_code=500, detail="Neural Core Error")
# Entry point: serve on all interfaces at 7860 (the Hugging Face Spaces port).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")