Spaces:
Sleeping
Sleeping
File size: 6,119 Bytes
import asyncio
import logging
import os
import resource
import secrets
import struct

import uvicorn
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, HTTPException, Request, Security, status
from fastapi.security import APIKeyHeader
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from starlette.concurrency import run_in_threadpool
# --- βοΈ LOGGING CONFIGURATION ---
# Single-line pipe-delimited log format; time only (the host adds the date).
LOG_FORMAT = "%(asctime)s | %(levelname)s | %(message)s"
LOG_DATEFMT = "%H:%M:%S"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATEFMT)
logger = logging.getLogger("MetanthropicNode")
# --- π§± HARDWARE LIMITS (14GB CEILING) ---
def set_memory_limit():
    """Cap this process's address space at 14 GB to avoid host OOM kills.

    Best-effort: some platforms/containers refuse RLIMIT_AS changes, and
    that must not abort the whole node at import time.
    """
    limit_bytes = 14 * 1024 * 1024 * 1024
    try:
        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
        logger.info("π§± MEMORY HARD LIMIT SET: 14.0 GB")
    except (ValueError, OSError) as e:
        # setrlimit raises if the limit exceeds the hard limit or the
        # platform disallows it; log and continue unconstrained.
        logger.error(f"Could not set memory limit: {e}")
set_memory_limit()
# --- π§ ENGINE CONFIGURATION ---
# Encrypted model artifact hosted on the Hugging Face Hub.
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted GGUF is staged on /dev/shm (tmpfs RAM disk), not persistent disk.
TEMP_DECRYPTED = "/dev/shm/metanthropic_v3_16.gguf"
# --- π SECURITY SECRETS ---
# Both come from the environment and may be absent; callers check before use.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hex-encoded AES-GCM key; get_api_key also compares the client API key
# against this same value, so it doubles as the access credential.
DECRYPTION_KEY = os.environ.get("DECRYPTION_KEY")
API_KEY_NAME = "x-metanthropic-key"
# auto_error=False: a missing header reaches get_api_key as None rather
# than being rejected with 403 before our handler runs.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
# --- π¦ BATCH CONTROLLER (SEMAPHORE) ---
# Allows 4 concurrent users. User #5 waits.
MAX_CONCURRENT_USERS = 4
BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
app = FastAPI(title="Metanthropic Sovereign Node")
# --- π SECURITY HANDSHAKE ---
async def get_api_key(api_key: str = Security(api_key_header)):
    """Validate the x-metanthropic-key header.

    Returns the key on success; raises 500 if the node has no configured
    secret, 403 if the client key is missing or wrong.
    """
    if not DECRYPTION_KEY:
        raise HTTPException(status_code=500, detail="Node Config Error")
    # Constant-time comparison: the header is attacker-controlled and is
    # checked against a secret, so `==` would leak timing information.
    # api_key may be None because APIKeyHeader uses auto_error=False.
    if api_key and secrets.compare_digest(api_key, DECRYPTION_KEY):
        return api_key
    raise HTTPException(status_code=403, detail="Access Denied")
# --- π οΈ BOOTLOADER (DECRYPTION) ---
def initialize_engine():
    """Download the encrypted model and stage a decrypted copy on the RAM disk.

    Container layout as consumed here:
        [12-byte AES-GCM nonce][4-byte LE header length][encrypted header][raw body]
    Only the header section is decrypted; the remainder of the file is
    copied through verbatim.

    Returns:
        True when TEMP_DECRYPTED is ready for loading, False on any failure
        (missing secrets, download error, decryption/auth failure).
    """
    if os.path.exists(TEMP_DECRYPTED):
        # /dev/shm copy survives app restarts within the same container.
        logger.info("β‘ RAM-Disk Hit: Model resident.")
        return True
    if not HF_TOKEN or not DECRYPTION_KEY:
        logger.error("β CRITICAL: Missing Secrets")
        return False
    try:
        login(token=HF_TOKEN)
        logger.info("β³ Downloading & Decrypting...")
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
        key = bytes.fromhex(DECRYPTION_KEY)
        aes = AESGCM(key)
        with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
            nonce = f_in.read(12)
            h_len = struct.unpack("<I", f_in.read(4))[0]
            header = f_in.read(h_len)
            # AESGCM.decrypt also verifies the auth tag bundled with the
            # header ciphertext; tampering raises and lands in the except.
            f_out.write(aes.decrypt(nonce, header, None))
            # Body is not encrypted: stream it through in 128 MiB chunks.
            while chunk := f_in.read(128 * 1024 * 1024):
                f_out.write(chunk)
        # Drop the encrypted download; only the RAM-disk copy is needed.
        if os.path.exists(path):
            os.remove(path)
        logger.info("Decryption Complete.")
        return True
    except Exception as e:
        logger.error(f"β BOOT FAILURE: {e}")
        return False
# --- π INITIALIZE ENGINE GLOBAL ---
# llm stays None when boot fails; the endpoint answers 503 in that case.
llm = None
if initialize_engine():
    logger.info("π STARTING ENGINE: PARALLEL BATCH MODE")
    # π₯ THE STABLE CONFIGURATION π₯
    # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
    # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
    TOTAL_CONTEXT = 4096
    llm = Llama(
        model_path=TEMP_DECRYPTED,
        # 1. PARALLEL SLOTS
        n_parallel=MAX_CONCURRENT_USERS,
        # 2. TOTAL CONTEXT POOL
        n_ctx=TOTAL_CONTEXT,
        # 3. COMPUTE DENSITY
        n_batch=512,  # Process chunks
        f16_kv=True,  # Memory precision
        # 4. CPU STRATEGY
        # 2 threads is optimal for Hugging Face Free Tier vCPU
        n_threads=2,
        use_mlock=True,  # Pin to RAM
        verbose=False,  # Keep logs clean
    )
    logger.info(f"ENGINE READY. Capacity: {MAX_CONCURRENT_USERS} Simultaneous Streams.")
# --- π₯ HEALTH CHECK ---
@app.get("/")
def health_check():
    """Report liveness and how many inference slots are currently free."""
    # NOTE(review): asyncio.Semaphore exposes no public counter; _value is
    # its internal count of un-acquired permits.
    available = BATCH_SEMAPHORE._value
    return {"status": "active", "free_slots": available}
# --- π‘ API ENDPOINT ---
@app.post("/v1/chat/completions")
async def chat_completion(request: Request, api_key: str = Security(get_api_key)):
    """OpenAI-style chat completion, gated by the batch semaphore.

    Accepts {"messages": [...], "max_tokens": int, "temperature": float},
    renders the Phi-3 chat template, and runs blocking inference in the
    thread pool so the event loop stays responsive.
    """
    if not llm:
        raise HTTPException(status_code=503, detail="System Booting...")
    try:
        data = await request.json()
        messages = data.get("messages", [])
        max_tokens = data.get("max_tokens", 1024)
        temperature = data.get("temperature", 0.7)
        # Phi-3 template: <|role|>\ncontent<|end|> per turn, then cue the model.
        turns = [f"<|{m['role']}|>\n{m['content']}<|end|>\n" for m in messages]
        prompt = "".join(turns) + "<|assistant|>\n"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
    # β‘ THE BATCH GATE β‘
    async with BATCH_SEMAPHORE:
        try:
            output = await run_in_threadpool(
                llm,
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stop=["<|end|>"],
                echo=False,
            )
            top = output["choices"][0]
            return {
                "id": output["id"],
                "object": "chat.completion",
                "created": output["created"],
                "model": "metanthropic-phi3-parallel",
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": top["text"].strip(),
                        },
                        "finish_reason": top["finish_reason"],
                    }
                ],
                "usage": output["usage"],
            }
        except Exception as e:
            logger.error(f"Inference Error: {e}")
            raise HTTPException(status_code=500, detail="Neural Core Error")
# Local entry point: bind all interfaces on 7860 (presumably the port the
# hosting platform expects — confirm against the Space config).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")