"""Download the Roy GGUF model and replace this process with a llama.cpp server.

Intended as a container entrypoint (e.g. a Hugging Face Space): it fetches the
quantized model into /app, locates the llama.cpp server binary somewhere under
/app, then execs the server so it becomes PID of this process.
"""
import os
import sys

from huggingface_hub import hf_hub_download

# ===== MODEL CHANGED HERE =====
MODEL_FILE = "Roy-v1.IQ4_XS.gguf"

print("⬇ Downloading Roy model (IQ4_XS – optimized)...")
# hf_hub_download returns the actual local file path — use it rather than
# assuming the file lands exactly at /app/<MODEL_FILE>.
model_path = hf_hub_download(
    repo_id="mradermacher/Roy-v1-GGUF",
    filename=MODEL_FILE,
    local_dir="/app",
)

print("📁 Files in /app:")
for root, dirs, files in os.walk("/app"):
    for f in files:
        print(os.path.join(root, f))

# Auto-detect server binary; stop at the first match. (The original kept
# scanning, so a later hit could silently replace an earlier one.)
server_path = None
for root, dirs, files in os.walk("/app"):
    for f in files:
        if f in ("server", "llama-server"):
            server_path = os.path.join(root, f)
            break
    if server_path:
        break

if not server_path:
    print("❌ Server binary not found")
    sys.exit(1)

print("🚀 Launching:", server_path)
# ===== PERFORMANCE TUNING ADDED =====
# os.execv replaces the current Python process with the server binary.
os.execv(server_path, [
    server_path,
    "-m", model_path,
    "--host", "0.0.0.0",
    "--port", "7860",
    # Speed settings
    "--ctx-size", "256",    # smaller context = faster
    "--n-predict", "120",   # limit response length
    "--threads", "4",       # match HF CPU
])