# Hugging Face Space entry point.
# (The "Spaces: Sleeping" lines here were page-status residue from the web scrape, not code.)
import hmac
import os
import struct
import sys
import traceback

import gradio as gr
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from fastapi import FastAPI, Request
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
# --- GLOBAL DIAGNOSTICS & LOGGING ---
# In-memory record of every status line, kept so the UI/operator can inspect boot history.
DIAGNOSTIC_LOG = []


def log_status(msg):
    """Echo *msg* to stdout and append a copy to DIAGNOSTIC_LOG."""
    print(msg)
    DIAGNOSTIC_LOG.append(msg)
# --- CONFIGURATION ---
# Encrypted model artifact on the Hub (.mguf container: 12-byte nonce,
# uint32 header length, AES-GCM-encrypted GGUF header, then raw tensor data —
# see initialize_weights for the exact layout).
SOURCE_REPO = "metanthropic/metanthropic-phi3-encrypted"
SOURCE_FILE = "metanthropic-phi3-v1.mguf"
# Decrypted GGUF is materialized here; its existence short-circuits re-download.
TEMP_DECRYPTED = "/tmp/model_stable_v3.gguf"
# Both secrets come from the Space's environment; if either is missing, boot aborts.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hex-encoded AES-GCM key; also reused as the API auth token by run_inference.
SECRET_KEY_HEX = os.environ.get("DECRYPTION_KEY")
# --- SOVEREIGN BOOTLOADER ---
def initialize_weights():
    """Fetch the encrypted .mguf artifact and materialize a decrypted GGUF at TEMP_DECRYPTED.

    Container layout (as consumed below): 12-byte AES-GCM nonce, little-endian
    uint32 header length, the encrypted GGUF header, then the tensor payload,
    which is copied through verbatim (only the header is encrypted).

    Returns True when TEMP_DECRYPTED is ready, False on any failure.
    """
    if os.path.exists(TEMP_DECRYPTED):
        return True
    if not HF_TOKEN or not SECRET_KEY_HEX:
        log_status("β [SECURITY] Credentials missing.")
        return False
    # FIX: write to a scratch path and publish atomically. The original wrote
    # straight to TEMP_DECRYPTED, so a crash mid-decrypt left a partial file
    # that made every later call short-circuit to "success" on corrupt weights.
    tmp_path = TEMP_DECRYPTED + ".part"
    try:
        login(token=HF_TOKEN)
        path = hf_hub_download(repo_id=SOURCE_REPO, filename=SOURCE_FILE, local_dir=".")
        log_status("π [DECRYPT] Unlocking GGUF weights...")
        aes = AESGCM(bytes.fromhex(SECRET_KEY_HEX))
        with open(path, "rb") as f_in, open(tmp_path, "wb") as f_out:
            nonce = f_in.read(12)
            (h_len,) = struct.unpack("<I", f_in.read(4))
            # GCM authenticates the header; the remaining payload is plaintext
            # and is streamed through in 64 MiB chunks.
            f_out.write(aes.decrypt(nonce, f_in.read(h_len), None))
            while chunk := f_in.read(64 * 1024 * 1024):
                f_out.write(chunk)
        os.replace(tmp_path, TEMP_DECRYPTED)  # atomic publish
        os.remove(path)  # drop the encrypted download to save disk
        log_status("β [SYSTEM] Weight integrity verified.")
        return True
    except Exception as e:
        log_status(f"β [BOOT ERROR] {str(e)}")
        # Use the already-imported traceback module so boot failures are debuggable.
        traceback.print_exc()
        # Remove any partial output so the next attempt does not short-circuit.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False
# --- ENGINE INITIALIZATION ---
# Module-level handle to the loaded model; stays None if boot or load fails,
# which downstream handlers treat as "system offline".
llm = None
if initialize_weights():
    try:
        log_status("π§ [ENGINE] Initializing Llama...")
        engine_opts = {
            "model_path": TEMP_DECRYPTED,
            "n_ctx": 2048,
            "n_threads": 2,     # Locked to 2-vCPU Free Tier limit
            "n_batch": 512,
            "use_mlock": True,  # Pin model to RAM
            "verbose": False,
        }
        llm = Llama(**engine_opts)
        log_status("π [SYSTEM] Node Online.")
    except Exception as exc:
        log_status(f"β [ENGINE ERROR] Neural load failed: {exc}")
# --- API CORE (CONVEX BRIDGE) ---
app = FastAPI()


# FIX: the original coroutine had no route decorator, so FastAPI never exposed
# it — the "Convex bridge" endpoint was dead code.
# NOTE(review): route path chosen here; confirm against the Convex bridge client.
@app.post("/api/generate")
async def run_inference(request: Request):
    """JSON endpoint: {"secretKey": ..., "prompt": ...} -> {"response": ...} or {"error": ...}."""
    if not llm:
        return {"error": "System Offline"}
    data = await request.json()
    # FIX: constant-time comparison for the shared secret (it doubles as the
    # decryption key), and coerce missing values so None can never match None.
    supplied = data.get("secretKey") or ""
    if not hmac.compare_digest(str(supplied), SECRET_KEY_HEX or ""):
        return {"error": "Unauthorized Access"}
    prompt = data.get("prompt", "")
    # Phi-3 chat template; generation stops at the end-of-turn marker.
    output = llm(f"<|user|>\n{prompt}<|end|>\n<|assistant|>", max_tokens=512, stop=["<|end|>"])
    return {"response": output['choices'][0]['text'].strip()}
# --- PREMIUM UI LOGIC (STREAMING FIX) ---
def ui_chat(msg, hist):
    """Gradio streaming chat handler: yields the progressively growing assistant reply.

    *hist* is the chat history supplied by gr.ChatInterface; it is not used
    because each turn is prompted independently.
    """
    if not llm:
        yield "π¨ SYSTEM OFFLINE. CHECK LOGS."
        return
    partial_text = ""
    try:
        # FIX: creating the stream can itself raise (e.g. context overflow);
        # the original called llm(...) outside the try, so such errors escaped
        # into Gradio instead of yielding the friendly message below.
        # High-level __call__ API with stream=True yields CompletionChunks whose
        # text lives at ['choices'][0]['text'] (no 'delta'/'content' in this mode).
        stream_iterator = llm(
            f"<|user|>\n{msg}<|end|>\n<|assistant|>",
            max_tokens=512,
            stop=["<|end|>", "<|endoftext|>"],
            stream=True,
        )
        for chunk in stream_iterator:
            token = chunk['choices'][0].get('text', "")
            if token:
                partial_text += token
                yield partial_text
    except Exception as e:
        yield f"β οΈ Stream Interrupted: {str(e)}"
# --- BRANDED INTERFACE ---
# Hide the Gradio footer and force a near-black page background.
custom_css = "footer {visibility: hidden} .gradio-container {background-color: #050505 !important}"
demo = gr.ChatInterface(
    ui_chat,
    title="METANTHROPIC Β· PHI-3",
    theme=gr.themes.Soft(primary_hue="slate", neutral_hue="zinc"),
    css=custom_css
)
# Serve the Gradio UI at "/" on the same FastAPI app that carries the JSON API.
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    import uvicorn
    # 7860 is the port Hugging Face Spaces expects a web app to listen on.
    uvicorn.run(app, host="0.0.0.0", port=7860)