# Source: Hugging Face Space "app.py" by ekjotsingh — commit fe8f4fb (verified)
import hmac
import os
import struct
import sys
import traceback

import gradio as gr
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, Request
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
# --- GLOBAL DIAGNOSTICS & LOGGING ---
DIAGNOSTIC_LOG = []


def log_status(msg):
    """Echo *msg* to stdout and keep a copy in the in-memory diagnostic log."""
    print(msg)
    DIAGNOSTIC_LOG.append(msg)
# --- CONFIGURATION ---
# Hub repository and filename of the AES-GCM-encrypted model artifact.
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Destination for the decrypted GGUF weights; its existence is used as the
# "already decrypted" marker by initialize_weights().
TEMP_DECRYPTED = "/tmp/model_stable_v3.gguf"
# Injected via environment; either may be None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
SECRET_KEY_HEX = os.environ.get("DECRYPTION_KEY")  # hex-encoded AES-GCM key
# --- SOVEREIGN BOOTLOADER ---
def initialize_weights():
    """Download the encrypted model and materialize decrypted weights at TEMP_DECRYPTED.

    Artifact layout (as consumed below): 12-byte AES-GCM nonce, a 4-byte
    little-endian ciphertext length, the encrypted GGUF header of that length,
    then the remaining weight data which is streamed through unchanged.

    Returns:
        True when TEMP_DECRYPTED is ready for loading, False on any failure
        (missing credentials, download error, failed GCM tag verification, ...).
    """
    try:
        # Already decrypted by a previous run — nothing to do.
        if os.path.exists(TEMP_DECRYPTED):
            return True
        if not HF_TOKEN or not SECRET_KEY_HEX:
            log_status("❌ [SECURITY] Credentials missing.")
            return False
        login(token=HF_TOKEN)
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
        log_status("🔓 [DECRYPT] Unlocking GGUF weights...")
        key = bytes.fromhex(SECRET_KEY_HEX)
        aes = AESGCM(key)
        # FIX: decrypt into a side file and publish it with an atomic rename.
        # Previously a failure mid-decryption left a truncated/corrupt file at
        # TEMP_DECRYPTED, which the existence check above would then accept as
        # valid on every subsequent start.
        partial = TEMP_DECRYPTED + ".part"
        try:
            with open(path, "rb") as f_in, open(partial, "wb") as f_out:
                nonce = f_in.read(12)
                h_len = struct.unpack("<I", f_in.read(4))[0]
                # AESGCM.decrypt verifies the auth tag; raises InvalidTag on tamper.
                f_out.write(aes.decrypt(nonce, f_in.read(h_len), None))
                # Remainder of the artifact is plaintext; copy in 64 MiB chunks.
                while chunk := f_in.read(64 * 1024 * 1024):
                    f_out.write(chunk)
            os.replace(partial, TEMP_DECRYPTED)
        finally:
            # Remove any leftover partial file (no-op after a successful rename).
            if os.path.exists(partial):
                os.remove(partial)
        os.remove(path)
        log_status("✅ [SYSTEM] Weight integrity verified.")
        return True
    except Exception as e:
        # Top-level boot boundary: log and report failure instead of crashing.
        log_status(f"❌ [BOOT ERROR] {str(e)}")
        return False
# --- ENGINE INITIALIZATION ---
# Module-level model handle. Stays None when bootstrapping or loading fails;
# every endpoint below checks it before use.
llm = None
if initialize_weights():
    try:
        log_status("🧠 [ENGINE] Initializing Llama...")
        llm = Llama(
            model_path=TEMP_DECRYPTED,
            n_ctx=2048,
            n_threads=2, # Locked to 2-vCPU Free Tier limit
            n_batch=512,
            use_mlock=True, # Pin model to RAM
            verbose=False
        )
        # NOTE(review): "πŸš€" is mojibake for "🚀" — normalize when convenient.
        log_status("πŸš€ [SYSTEM] Node Online.")
    except Exception as e:
        log_status(f"❌ [ENGINE ERROR] Neural load failed: {e}")
# --- API CORE (CONVEX BRIDGE) ---
app = FastAPI()


@app.post("/run_inference")
async def run_inference(request: Request):
    """Authenticated completion endpoint.

    Expects JSON {"secretKey": str, "prompt": str}; returns {"response": str}
    on success or {"error": str} when offline/unauthorized.
    """
    if not llm:
        return {"error": "System Offline"}
    data = await request.json()
    # FIX: use a constant-time comparison so the key cannot be recovered via
    # timing differences; `!=` short-circuits on the first mismatching byte.
    # NOTE(review): reusing DECRYPTION_KEY as the API auth token means a leak
    # of either compromises both — consider a dedicated API secret.
    supplied = str(data.get("secretKey", ""))
    if not SECRET_KEY_HEX or not hmac.compare_digest(supplied.encode(), SECRET_KEY_HEX.encode()):
        return {"error": "Unauthorized Access"}
    prompt = data.get("prompt", "")
    output = llm(f"<|user|>\n{prompt}<|end|>\n<|assistant|>", max_tokens=512, stop=["<|end|>"])
    return {"response": output['choices'][0]['text'].strip()}
# --- PREMIUM UI LOGIC (STREAMING FIX) ---
def ui_chat(msg, hist):
    """Gradio streaming chat handler: yields the progressively growing reply.

    `hist` (chat history) is accepted for the ChatInterface contract but unused.
    """
    if not llm:
        yield "🚨 SYSTEM OFFLINE. CHECK LOGS."
        return
    partial_text = ""
    try:
        # FIX: creating the stream is now inside the try as well — previously an
        # exception raised while *starting* generation (as opposed to while
        # consuming it) escaped the handler instead of being shown in the UI.
        stream_iterator = llm(
            f"<|user|>\n{msg}<|end|>\n<|assistant|>",
            max_tokens=512,
            stop=["<|end|>", "<|endoftext|>"],
            stream=True
        )
        for chunk in stream_iterator:
            # In stream mode the high-level __call__ API yields CompletionChunks
            # with text at ['choices'][0]['text'] — there is no 'delta'/'content'.
            token = chunk['choices'][0].get('text', "")
            if token:
                partial_text += token
                yield partial_text
    except Exception as e:
        yield f"⚠️ Stream Interrupted: {str(e)}"
# --- BRANDED INTERFACE ---
# Hide the Gradio footer and force a near-black page background.
custom_css = "footer {visibility: hidden} .gradio-container {background-color: #050505 !important}"
demo = gr.ChatInterface(
    ui_chat,
    # NOTE(review): "Β·" looks like mojibake for "·" — confirm the intended title.
    title="METANTHROPIC Β· PHI-3",
    theme=gr.themes.Soft(primary_hue="slate", neutral_hue="zinc"),
    css=custom_css
)
# Serve the chat UI at "/" on the same FastAPI app that exposes /run_inference.
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    # Direct-run entry point: serve the combined FastAPI + Gradio app on 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)