File size: 4,001 Bytes
4b90eb9
e2d080d
4b90eb9
e2d080d
4b90eb9
85346fb
e2d080d
85346fb
ecd24a9
0822b66
85346fb
e2d080d
 
 
 
 
85346fb
0acd62e
 
f92aefd
e2d080d
 
 
85346fb
 
e2d080d
 
 
 
 
f41cf72
e2d080d
 
 
 
 
f92aefd
e2d080d
 
85346fb
e2d080d
 
 
 
 
 
 
 
f92aefd
e2d080d
 
f41cf72
e2d080d
 
f92aefd
4b90eb9
85346fb
e2d080d
f41cf72
85346fb
 
fec01e0
f92aefd
f41cf72
f92aefd
85346fb
 
f41cf72
e2d080d
f92aefd
ecd24a9
f92aefd
ecd24a9
 
 
 
f92aefd
 
ecd24a9
fec01e0
f41cf72
fec01e0
ecd24a9
f41cf72
e2d080d
 
f92aefd
e2d080d
 
f92aefd
85346fb
 
f92aefd
 
85346fb
 
 
 
 
 
 
f92aefd
 
 
 
 
 
 
 
 
 
85346fb
f92aefd
f41cf72
85346fb
 
 
fe8f4fb
f41cf72
85346fb
 
ecd24a9
 
4b90eb9
ecd24a9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import sys
import struct
import traceback
import gradio as gr
from llama_cpp import Llama
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from huggingface_hub import hf_hub_download, login
from fastapi import FastAPI, Request

# --- GLOBAL DIAGNOSTICS & LOGGING ---
DIAGNOSTIC_LOG = []

def log_status(msg):
    """Echo *msg* to stdout and retain a copy in the in-memory diagnostic log."""
    DIAGNOSTIC_LOG.append(msg)
    print(msg)

# --- CONFIGURATION ---
# Hugging Face repo/file holding the AES-GCM-wrapped GGUF container (.mguf).
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted model is staged in /tmp; initialize_weights() reuses it if present.
TEMP_DECRYPTED = "/tmp/model_stable_v3.gguf"
# Credentials injected via environment; both must be set for boot to proceed.
HF_TOKEN = os.environ.get("HF_TOKEN")
SECRET_KEY_HEX = os.environ.get("DECRYPTION_KEY")  # hex-encoded AES-GCM key

# --- SOVEREIGN BOOTLOADER ---
def initialize_weights():
    """Download and unwrap the encrypted model into TEMP_DECRYPTED.

    Container layout (as consumed here): 12-byte AES-GCM nonce, a little-endian
    uint32 header length, the AES-GCM-encrypted GGUF header, then the remaining
    payload which is copied through unmodified in 64 MiB chunks.

    Returns:
        True when the decrypted model file is available, False on any failure
        (missing credentials, download error, truncated/corrupt container).
    """
    try:
        # Fast path: a previous boot already staged the decrypted model.
        if os.path.exists(TEMP_DECRYPTED):
            return True

        if not HF_TOKEN or not SECRET_KEY_HEX:
            log_status("❌ [SECURITY] Credentials missing.")
            return False

        login(token=HF_TOKEN)
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")

        log_status("🔓 [DECRYPT] Unlocking GGUF weights...")
        key = bytes.fromhex(SECRET_KEY_HEX)
        aes = AESGCM(key)

        with open(path, "rb") as f_in, open(TEMP_DECRYPTED, "wb") as f_out:
            nonce = f_in.read(12)
            len_bytes = f_in.read(4)
            # Validate framing reads explicitly: a truncated download would
            # otherwise surface as an opaque struct/AEAD error.
            if len(nonce) != 12 or len(len_bytes) != 4:
                raise ValueError("Container truncated: missing nonce/header length")
            h_len = struct.unpack("<I", len_bytes)[0]
            header_ct = f_in.read(h_len)
            if len(header_ct) != h_len:
                raise ValueError("Container truncated: incomplete encrypted header")
            # decrypt() also verifies the GCM auth tag, so a wrong key or a
            # tampered header raises here rather than producing garbage weights.
            f_out.write(aes.decrypt(nonce, header_ct, None))
            # Payload after the header is stored in the clear; stream it through.
            while chunk := f_in.read(64 * 1024 * 1024):
                f_out.write(chunk)

        os.remove(path)
        log_status("✅ [SYSTEM] Weight integrity verified.")
        return True
    except Exception:
        # Remove a partially-written output file so the existence fast-path
        # above cannot treat a truncated model as valid on the next call.
        try:
            if os.path.exists(TEMP_DECRYPTED):
                os.remove(TEMP_DECRYPTED)
        except OSError:
            pass
        # Log the full traceback (the `traceback` import was previously
        # unused); str(e) alone hides where the boot actually failed.
        log_status(f"❌ [BOOT ERROR] {traceback.format_exc()}")
        return False

# --- ENGINE INITIALIZATION ---
# Module-level bootstrap: runs once at import time. `llm` stays None if either
# weight decryption or engine construction fails; the API and UI handlers
# check `llm` and report the system as offline in that case.
llm = None
if initialize_weights():
    try:
        log_status("🧠 [ENGINE] Initializing Llama...")
        llm = Llama(
            model_path=TEMP_DECRYPTED,
            n_ctx=2048,      # context window in tokens
            n_threads=2,     # Locked to 2-vCPU Free Tier limit
            n_batch=512,     # prompt-eval batch size
            use_mlock=True,  # Pin model to RAM
            verbose=False
        )
        log_status("🚀 [SYSTEM] Node Online.")
    except Exception as e:
        # Engine failure is survivable: the app still serves, but offline.
        log_status(f"❌ [ENGINE ERROR] Neural load failed: {e}")

# --- API CORE (CONVEX BRIDGE) ---
app = FastAPI()

@app.post("/run_inference")
async def run_inference(request: Request):
    """Single-turn inference endpoint for the Convex bridge.

    Expects a JSON body with "secretKey" (auth) and "prompt". Returns
    {"response": ...} on success, or {"error": ...} on any failure.
    """
    if not llm: return {"error": "System Offline"}

    try:
        data = await request.json()
    except Exception:
        # A malformed/missing JSON body previously propagated as a 500;
        # return the structured error shape the bridge already handles.
        return {"error": "Invalid request body"}

    # SECURITY NOTE(review): the AES decryption key doubles as the API token,
    # so any authorized API caller can also decrypt the weights — consider a
    # dedicated token. compare_digest avoids a timing side-channel, and the
    # explicit SECRET_KEY_HEX check closes the hole where an unset env var
    # made `None != None` evaluate False and let every request through.
    supplied = data.get("secretKey")
    if not SECRET_KEY_HEX or not isinstance(supplied, str) \
            or not hmac.compare_digest(supplied, SECRET_KEY_HEX):
        return {"error": "Unauthorized Access"}

    prompt = data.get("prompt", "")
    output = llm(f"<|user|>\n{prompt}<|end|>\n<|assistant|>", max_tokens=512, stop=["<|end|>"])
    return {"response": output['choices'][0]['text'].strip()}

# --- PREMIUM UI LOGIC (STREAMING FIX) ---
def ui_chat(msg, hist):
    """Gradio chat handler: stream the model's reply, yielding the growing text."""
    if not llm:
        yield "🚨 SYSTEM OFFLINE. CHECK LOGS."
        return

    # High-level __call__ API with stream=True yields CompletionChunk dicts.
    prompt = f"<|user|>\n{msg}<|end|>\n<|assistant|>"
    chunks = llm(
        prompt,
        max_tokens=512,
        stop=["<|end|>", "<|endoftext|>"],
        stream=True
    )

    accumulated = ""
    try:
        for piece in chunks:
            # Streaming chunks carry text under ['choices'][0]['text'];
            # there is no 'delta' or 'content' key in this API mode.
            text = piece['choices'][0].get('text', "")
            if text:
                accumulated += text
                yield accumulated
    except Exception as e:
        yield f"⚠️ Stream Interrupted: {str(e)}"

# --- BRANDED INTERFACE ---
# Hide the Gradio footer and force a near-black background for branding.
custom_css = "footer {visibility: hidden} .gradio-container {background-color: #050505 !important}"

demo = gr.ChatInterface(
    ui_chat,
    title="METANTHROPIC · PHI-3",
    theme=gr.themes.Soft(primary_hue="slate", neutral_hue="zinc"),
    css=custom_css
)

# Mount the Gradio UI at "/" on the same FastAPI app serving /run_inference.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # Direct-execution entry point; port 7860 is the Hugging Face Spaces
    # convention (the platform may instead serve `app` via its own runner).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)