Spaces:
Sleeping
Sleeping
File size: 6,119 Bytes
import asyncio
import logging
import os
import resource
import secrets
import struct

import uvicorn
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, HTTPException, Request, Security, status
from fastapi.security import APIKeyHeader
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from starlette.concurrency import run_in_threadpool
# --- βοΈ LOGGING CONFIGURATION ---
# Single-line pipe-delimited log format; time only (the host adds the date).
LOG_FORMAT = "%(asctime)s | %(levelname)s | %(message)s"
LOG_DATEFMT = "%H:%M:%S"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATEFMT)
logger = logging.getLogger("MetanthropicNode")
# --- π§± HARDWARE LIMITS (14GB CEILING) ---
def set_memory_limit():
    """Cap this process's address space at 14 GB to avoid host OOM kills.

    Best-effort: some platforms/containers refuse RLIMIT_AS changes, and
    that must not abort the whole node at import time.
    """
    limit_bytes = 14 * 1024 * 1024 * 1024
    try:
        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
        logger.info("π§± MEMORY HARD LIMIT SET: 14.0 GB")
    except (ValueError, OSError) as e:
        # setrlimit raises if the limit exceeds the hard limit or the
        # platform disallows it; log and continue unconstrained.
        logger.error(f"Could not set memory limit: {e}")
set_memory_limit()
# --- π§ ENGINE CONFIGURATION ---
# Encrypted model artifact hosted on the Hugging Face Hub.
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted GGUF is staged on /dev/shm (tmpfs RAM disk), not persistent disk.
TEMP_DECRYPTED = "/dev/shm/metanthropic_v3_16.gguf"
# --- π SECURITY SECRETS ---
# Both come from the environment and may be absent; callers check before use.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hex-encoded AES-GCM key; get_api_key also compares the client API key
# against this same value, so it doubles as the access credential.
DECRYPTION_KEY = os.environ.get("DECRYPTION_KEY")
API_KEY_NAME = "x-metanthropic-key"
# auto_error=False: a missing header reaches get_api_key as None rather
# than being rejected with 403 before our handler runs.
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
# --- π¦ BATCH CONTROLLER (SEMAPHORE) ---
# Allows 4 concurrent users. User #5 waits.
MAX_CONCURRENT_USERS = 4
BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)
app = FastAPI(title="Metanthropic Sovereign Node")
# --- π SECURITY HANDSHAKE ---
async def get_api_key(api_key: str = Security(api_key_header)):
    """Validate the x-metanthropic-key header.

    Returns the key on success; raises 500 if the node has no configured
    secret, 403 if the client key is missing or wrong.
    """
    if not DECRYPTION_KEY:
        raise HTTPException(status_code=500, detail="Node Config Error")
    # Constant-time comparison: the header is attacker-controlled and is
    # checked against a secret, so `==` would leak timing information.
    # api_key may be None because APIKeyHeader uses auto_error=False.
    if api_key and secrets.compare_digest(api_key, DECRYPTION_KEY):
        return api_key
    raise HTTPException(status_code=403, detail="Access Denied")
# --- π οΈ BOOTLOADER (DECRYPTION) ---
def initialize_engine():
    """Download the encrypted model and stage a decrypted copy on the RAM disk.

    Container layout as consumed here:
        [12-byte AES-GCM nonce][4-byte LE header length][encrypted header][raw body]
    Only the header section is decrypted; the remainder of the file is
    copied through verbatim.

    Returns:
        True when TEMP_DECRYPTED is ready for loading, False on any failure
        (missing secrets, download error, decryption/auth failure).
    """
    if os.path.exists(TEMP_DECRYPTED):
        # /dev/shm copy survives app restarts within the same container.
        logger.info("β‘ RAM-Disk Hit: Model resident.")
        return True
    if not HF_TOKEN or not DECRYPTION_KEY:
        logger.error("β CRITICAL: Missing Secrets")
        return False
    try:
        login(token=HF_TOKEN)
        logger.info("β³ Downloading & Decrypting...")
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
        key = bytes.fromhex(DECRYPTION_KEY)
        aes = AESGCM(key)
        with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
            nonce = f_in.read(12)
            h_len = struct.unpack("<I", f_in.read(4))[0]
            header = f_in.read(h_len)
            # AESGCM.decrypt also verifies the auth tag bundled with the
            # header ciphertext; tampering raises and lands in the except.
            f_out.write(aes.decrypt(nonce, header, None))
            # Body is not encrypted: stream it through in 128 MiB chunks.
            while chunk := f_in.read(128 * 1024 * 1024):
                f_out.write(chunk)
        # Drop the encrypted download; only the RAM-disk copy is needed.
        if os.path.exists(path):
            os.remove(path)
        logger.info("Decryption Complete.")
        return True
    except Exception as e:
        logger.error(f"β BOOT FAILURE: {e}")
        return False
# --- π INITIALIZE ENGINE GLOBAL ---
# llm stays None when boot fails; the endpoint answers 503 in that case.
llm = None
if initialize_engine():
    logger.info("π STARTING ENGINE: PARALLEL BATCH MODE")
    # π₯ THE STABLE CONFIGURATION π₯
    # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
    # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
    TOTAL_CONTEXT = 4096
    llm = Llama(
        model_path=TEMP_DECRYPTED,
        # 1. PARALLEL SLOTS
        n_parallel=MAX_CONCURRENT_USERS,
        # 2. TOTAL CONTEXT POOL
        n_ctx=TOTAL_CONTEXT,
        # 3. COMPUTE DENSITY
        n_batch=512,  # Process chunks
        f16_kv=True,  # Memory precision
        # 4. CPU STRATEGY
        # 2 threads is optimal for Hugging Face Free Tier vCPU
        n_threads=2,
        use_mlock=True,  # Pin to RAM
        verbose=False,  # Keep logs clean
    )
    logger.info(f"ENGINE READY. Capacity: {MAX_CONCURRENT_USERS} Simultaneous Streams.")
# --- π₯ HEALTH CHECK ---
@app.get("/")
def health_check():
    """Report liveness and how many inference slots are currently free."""
    # NOTE(review): asyncio.Semaphore exposes no public counter; _value is
    # its internal count of un-acquired permits.
    available = BATCH_SEMAPHORE._value
    return {"status": "active", "free_slots": available}
# --- π‘ API ENDPOINT ---
@app.post("/v1/chat/completions")
async def chat_completion(request: Request, api_key: str = Security(get_api_key)):
    """OpenAI-style chat completion, gated by the batch semaphore.

    Accepts {"messages": [...], "max_tokens": int, "temperature": float},
    renders the Phi-3 chat template, and runs blocking inference in the
    thread pool so the event loop stays responsive.
    """
    if not llm:
        raise HTTPException(status_code=503, detail="System Booting...")
    try:
        data = await request.json()
        messages = data.get("messages", [])
        max_tokens = data.get("max_tokens", 1024)
        temperature = data.get("temperature", 0.7)
        # Phi-3 template: <|role|>\ncontent<|end|> per turn, then cue the model.
        turns = [f"<|{m['role']}|>\n{m['content']}<|end|>\n" for m in messages]
        prompt = "".join(turns) + "<|assistant|>\n"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")
    # β‘ THE BATCH GATE β‘
    async with BATCH_SEMAPHORE:
        try:
            output = await run_in_threadpool(
                llm,
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stop=["<|end|>"],
                echo=False,
            )
            top = output["choices"][0]
            return {
                "id": output["id"],
                "object": "chat.completion",
                "created": output["created"],
                "model": "metanthropic-phi3-parallel",
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": top["text"].strip(),
                        },
                        "finish_reason": top["finish_reason"],
                    }
                ],
                "usage": output["usage"],
            }
        except Exception as e:
            logger.error(f"Inference Error: {e}")
            raise HTTPException(status_code=500, detail="Neural Core Error")
# Local entry point: bind all interfaces on 7860 (presumably the port the
# hosting platform expects — confirm against the Space config).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")