"""Download the Roy GGUF model and replace this process with a llama.cpp server.

Intended as a container entrypoint (e.g. a Hugging Face Space): it fetches the
quantized model into /app, locates the llama.cpp server binary somewhere under
/app, then execs the server so it becomes PID of this process.
"""
import os
import sys

from huggingface_hub import hf_hub_download

# ===== MODEL CHANGED HERE =====
MODEL_FILE = "Roy-v1.IQ4_XS.gguf"

print("⬇ Downloading Roy model (IQ4_XS – optimized)...")
# hf_hub_download returns the actual local file path — use it rather than
# assuming the file lands exactly at /app/<MODEL_FILE>.
model_path = hf_hub_download(
    repo_id="mradermacher/Roy-v1-GGUF",
    filename=MODEL_FILE,
    local_dir="/app",
)

print("📁 Files in /app:")
for root, dirs, files in os.walk("/app"):
    for f in files:
        print(os.path.join(root, f))

# Auto-detect server binary; stop at the first match. (The original kept
# scanning, so a later hit could silently replace an earlier one.)
server_path = None
for root, dirs, files in os.walk("/app"):
    for f in files:
        if f in ("server", "llama-server"):
            server_path = os.path.join(root, f)
            break
    if server_path:
        break

if not server_path:
    print("❌ Server binary not found")
    sys.exit(1)

print("🚀 Launching:", server_path)
# ===== PERFORMANCE TUNING ADDED =====
# os.execv replaces the current Python process with the server binary.
os.execv(server_path, [
    server_path,
    "-m", model_path,
    "--host", "0.0.0.0",
    "--port", "7860",
    # Speed settings
    "--ctx-size", "256",    # smaller context = faster
    "--n-predict", "120",   # limit response length
    "--threads", "4",       # match HF CPU
])