File size: 6,119 Bytes
88858f1
 
 
5a2372a
9c948a7
b5f44cc
88858f1
 
9c948a7
88858f1
 
 
 
5a2372a
 
 
 
 
 
 
 
b5f44cc
b9a2df5
b5f44cc
b9a2df5
 
 
 
 
 
5a2372a
88858f1
 
5a2372a
 
 
88858f1
 
5a2372a
88858f1
 
b5f44cc
7433133
b5f44cc
 
9c948a7
5a2372a
88858f1
5a2372a
4107b58
5a2372a
b5f44cc
5a2372a
 
b5f44cc
88858f1
5a2372a
88858f1
5a2372a
b5f44cc
5a2372a
 
 
b5f44cc
fb3b67e
5a2372a
88858f1
5a2372a
b5f44cc
88858f1
5a2372a
88858f1
 
d6f65e1
88858f1
 
b5f44cc
 
 
 
5a2372a
d6f65e1
b5f44cc
 
88858f1
 
5a2372a
88858f1
 
5a2372a
88858f1
 
b5f44cc
 
7433133
 
 
 
5a2372a
d6f65e1
 
b9a2df5
7433133
b5f44cc
b9a2df5
b5f44cc
 
b9a2df5
b5f44cc
7433133
 
b9a2df5
b5f44cc
7433133
b9a2df5
 
b5f44cc
 
d6f65e1
b5f44cc
88858f1
9c948a7
 
 
b5f44cc
 
9c948a7
5a2372a
88858f1
 
5a2372a
b9a2df5
d6f65e1
5a2372a
 
 
 
 
 
 
 
 
 
 
9c948a7
 
b5f44cc
 
9c948a7
 
 
 
 
 
 
 
 
 
 
 
 
 
b5f44cc
9c948a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88858f1
 
9c948a7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import asyncio
import logging
import os
import resource
import secrets
import struct

import uvicorn
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, HTTPException, Security, Request, status
from fastapi.security import APIKeyHeader
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from starlette.concurrency import run_in_threadpool

# --- βš™οΈ LOGGING CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S"
)
logger = logging.getLogger("MetanthropicNode")

# --- 🧱 HARDWARE LIMITS (14GB CEILING) ---
MEMORY_LIMIT_GB = 14

def set_memory_limit(limit_gb: int = MEMORY_LIMIT_GB) -> None:
    """Cap this process's virtual address space to *limit_gb* gigabytes.

    A hard RLIMIT_AS ceiling makes allocations fail inside Python (MemoryError)
    instead of letting the host's OOM killer terminate the process.

    Raises:
        ValueError / OSError: propagated from ``resource.setrlimit`` if the
        platform refuses the limit (e.g. above a pre-existing hard cap).
    """
    limit_bytes = limit_gb * 1024 ** 3
    resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
    # FIX: derive the logged figure from the applied limit instead of a
    # hard-coded "14.0 GB" literal, so the message can never drift out of
    # sync with the configured value. Lazy %-args avoid eager formatting.
    logger.info("🧱 MEMORY HARD LIMIT SET: %.1f GB", limit_bytes / 1024 ** 3)

set_memory_limit()

# --- πŸ”§ ENGINE CONFIGURATION ---
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
TEMP_DECRYPTED = "/dev/shm/metanthropic_v3_16.gguf"

# --- πŸ”‘ SECURITY SECRETS ---
HF_TOKEN = os.environ.get("HF_TOKEN")
DECRYPTION_KEY = os.environ.get("DECRYPTION_KEY")
API_KEY_NAME = "x-metanthropic-key"
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)

# --- 🚦 BATCH CONTROLLER (SEMAPHORE) ---
# Allows 4 concurrent users. User #5 waits.
MAX_CONCURRENT_USERS = 4
BATCH_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)

app = FastAPI(title="Metanthropic Sovereign Node")

# --- πŸ”’ SECURITY HANDSHAKE ---
async def get_api_key(api_key: str = Security(api_key_header)):
    if not DECRYPTION_KEY:
        raise HTTPException(status_code=500, detail="Node Config Error")
    if api_key == DECRYPTION_KEY:
        return api_key
    raise HTTPException(status_code=403, detail="Access Denied")

# --- πŸ› οΈ BOOTLOADER (DECRYPTION) ---
def initialize_engine():
    if os.path.exists(TEMP_DECRYPTED):
        logger.info("⚑ RAM-Disk Hit: Model resident.")
        return True

    if not HF_TOKEN or not DECRYPTION_KEY:
        logger.error("❌ CRITICAL: Missing Secrets")
        return False

    try:
        login(token=HF_TOKEN)
        logger.info(f"⏳ Downloading & Decrypting...")
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
        
        key = bytes.fromhex(DECRYPTION_KEY)
        aes = AESGCM(key)
        
        with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
            nonce = f_in.read(12)
            h_len = struct.unpack("<I", f_in.read(4))[0]
            header = f_in.read(h_len)
            f_out.write(aes.decrypt(nonce, header, None))
            while chunk := f_in.read(128 * 1024 * 1024):
                f_out.write(chunk)
        
        if os.path.exists(path): os.remove(path)
        logger.info("βœ… Decryption Complete.")
        return True
    except Exception as e:
        logger.error(f"❌ BOOT FAILURE: {e}")
        return False

# --- πŸš€ INITIALIZE ENGINE GLOBAL ---
llm = None
if initialize_engine():
    logger.info("πŸš€ STARTING ENGINE: PARALLEL BATCH MODE")
    
    # πŸ”₯ THE STABLE CONFIGURATION πŸ”₯
    # FIXED: Reduced to 4096 to match Phi-3 Mini's hard limit.
    # 4096 Total / 4 Users = 1024 Tokens per User (Context Window).
    TOTAL_CONTEXT = 4096  
    
    llm = Llama(
        model_path=TEMP_DECRYPTED,
        
        # 1. PARALLEL SLOTS
        n_parallel=MAX_CONCURRENT_USERS, 
        
        # 2. TOTAL CONTEXT POOL
        n_ctx=TOTAL_CONTEXT, 
        
        # 3. COMPUTE DENSITY
        n_batch=512,        # Process chunks
        f16_kv=True,        # Memory precision
        
        # 4. CPU STRATEGY
        # 2 threads is optimal for Hugging Face Free Tier vCPU
        n_threads=2,
        
        use_mlock=True,     # Pin to RAM
        verbose=False       # Keep logs clean
    )
    logger.info(f"βœ… ENGINE READY. Capacity: {MAX_CONCURRENT_USERS} Simultaneous Streams.")

# --- πŸ₯ HEALTH CHECK ---
@app.get("/")
def health_check():
    free_slots = BATCH_SEMAPHORE._value 
    return {"status": "active", "free_slots": free_slots}

# --- πŸ“‘ API ENDPOINT ---
@app.post("/v1/chat/completions")
async def chat_completion(request: Request, api_key: str = Security(get_api_key)):
    if not llm:
        raise HTTPException(status_code=503, detail="System Booting...")
    
    try:
        data = await request.json()
        messages = data.get("messages", [])
        max_tokens = data.get("max_tokens", 1024)
        temperature = data.get("temperature", 0.7)
        
        prompt = ""
        for msg in messages:
            prompt += f"<|{msg['role']}|>\n{msg['content']}<|end|>\n"
        prompt += "<|assistant|>\n"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Bad Request: {str(e)}")

    # ⚑ THE BATCH GATE ⚑
    async with BATCH_SEMAPHORE:
        try:
            output = await run_in_threadpool(
                llm,
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stop=["<|end|>"],
                echo=False
            )
            
            return {
                "id": output["id"],
                "object": "chat.completion",
                "created": output["created"],
                "model": "metanthropic-phi3-parallel",
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": output['choices'][0]['text'].strip()
                        },
                        "finish_reason": output['choices'][0]['finish_reason']
                    }
                ],
                "usage": output["usage"]
            }
        except Exception as e:
            logger.error(f"Inference Error: {e}")
            raise HTTPException(status_code=500, detail="Neural Core Error")

if __name__ == "__main__":
    # Script entry point: serve the FastAPI app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")