File size: 1,091 Bytes
728fdc9
 
cd6c204
728fdc9
453d3bf
e97f198
453d3bf
e97f198
728fdc9
 
 
e97f198
728fdc9
 
 
72cb8c0
 
 
 
453d3bf
 
f3ba1cb
 
 
cd6c204
72cb8c0
 
 
 
 
f3ba1cb
453d3bf
728fdc9
453d3bf
 
f3ba1cb
e97f198
453d3bf
728fdc9
e97f198
 
453d3bf
 
 
 
72cb8c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from huggingface_hub import hf_hub_download
import os
import sys

print("⬇ Downloading Roy model (IQ4_XS – optimized)...")

# ===== MODEL CHANGED HERE =====
MODEL_FILE = "Roy-v1.IQ4_XS.gguf"

# hf_hub_download returns the actual on-disk path of the downloaded file.
# Capture it instead of re-deriving "/app/<MODEL_FILE>" by hand, so the
# launch below still works even if the hub client nests the file under
# local_dir (e.g. cache/snapshot layouts).
model_path = hf_hub_download(
    repo_id="mradermacher/Roy-v1-GGUF",
    filename=MODEL_FILE,
    local_dir="/app"
)

print("πŸ“ Files in /app:")
for root, dirs, files in os.walk("/app"):
    for f in files:
        print(os.path.join(root, f))


def _find_server_binary(base="/app"):
    """Return the path of the llama.cpp server binary under *base*, or None.

    Stops at the first match instead of walking the entire tree; accepts
    both the old ("server") and new ("llama-server") binary names.
    """
    for root, _dirs, files in os.walk(base):
        for name in files:
            if name in ("server", "llama-server"):
                return os.path.join(root, name)
    return None


# Auto-detect server binary
server_path = _find_server_binary()

if not server_path:
    print("❌ Server binary not found")
    sys.exit(1)

print("πŸš€ Launching:", server_path)

# ===== PERFORMANCE TUNING ADDED =====
# os.execv replaces this Python process with the server — nothing after
# this call runs.
os.execv(server_path, [
    server_path,
    "-m", model_path,

    "--host", "0.0.0.0",
    "--port", "7860",

    # Speed settings
    "--ctx-size", "256",      # smaller context = faster
    "--n-predict", "120",     # limit response length
    "--threads", "4",         # match HF CPU
])