Update app.py
Browse files
app.py
CHANGED
|
@@ -85,7 +85,7 @@ print("=" * 60)
|
|
| 85 |
def setup_and_start_backend():
|
| 86 |
# 1. Download Model
|
| 87 |
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 88 |
-
print(f"[SETUP] Verifying
|
| 89 |
model_path = hf_hub_download(
|
| 90 |
repo_id=GGUF_REPO,
|
| 91 |
filename=GGUF_FILE,
|
|
@@ -93,9 +93,9 @@ def setup_and_start_backend():
|
|
| 93 |
local_dir_use_symlinks=False
|
| 94 |
)
|
| 95 |
|
| 96 |
-
# 2. Download Pre-compiled Binary
|
| 97 |
if not LLAMA_EXE.exists():
|
| 98 |
-
print("[SETUP] Bypassing
|
| 99 |
LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
|
| 100 |
zip_path = LLAMA_BIN_DIR / "llama.zip"
|
| 101 |
url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
|
|
@@ -113,33 +113,43 @@ def setup_and_start_backend():
|
|
| 113 |
os.rename(found_exe, str(LLAMA_EXE))
|
| 114 |
break
|
| 115 |
|
| 116 |
-
#
|
| 117 |
-
|
| 118 |
-
threads =
|
|
|
|
| 119 |
|
| 120 |
-
print(f"[SETUP] Starting Native llama-server engine on {threads}
|
| 121 |
-
|
|
|
|
|
|
|
| 122 |
str(LLAMA_EXE),
|
| 123 |
"-m", model_path,
|
| 124 |
"-c", "4096",
|
| 125 |
-
"--port",
|
| 126 |
"--host", "127.0.0.1",
|
| 127 |
"-t", threads
|
| 128 |
-
], stdout=subprocess.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
# 4. Wait for Server to wake up
|
| 131 |
-
for
|
| 132 |
try:
|
| 133 |
-
if requests.get("http://127.0.0.1:
|
| 134 |
-
print("[SETUP] llama-server backend is ONLINE and ready!")
|
| 135 |
-
return True
|
| 136 |
except requests.exceptions.ConnectionError:
|
| 137 |
time.sleep(1)
|
| 138 |
|
| 139 |
-
print("[SETUP] FAILED to start llama-server backend.")
|
| 140 |
-
return False
|
| 141 |
|
| 142 |
-
backend_ready = setup_and_start_backend()
|
| 143 |
|
| 144 |
# ──────────────────────────────────────────────────────────────────
|
| 145 |
# CHAT MEMORY
|
|
@@ -193,7 +203,7 @@ def generate_response(user_input: str, session_id: str) -> str:
|
|
| 193 |
|
| 194 |
try:
|
| 195 |
# Request completion natively from our C++ binary
|
| 196 |
-
res = requests.post("http://127.0.0.1:
|
| 197 |
response = res.get("content", "").strip()
|
| 198 |
except Exception as exc:
|
| 199 |
print(f"[GENERATE] Error communicating with llama-server: {exc}")
|
|
|
|
| 85 |
def setup_and_start_backend():
|
| 86 |
# 1. Download Model
|
| 87 |
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 88 |
+
print(f"[SETUP] Verifying Model: {GGUF_FILE} ...")
|
| 89 |
model_path = hf_hub_download(
|
| 90 |
repo_id=GGUF_REPO,
|
| 91 |
filename=GGUF_FILE,
|
|
|
|
| 93 |
local_dir_use_symlinks=False
|
| 94 |
)
|
| 95 |
|
| 96 |
+
# 2. Download Pre-compiled Binary
|
| 97 |
if not LLAMA_EXE.exists():
|
| 98 |
+
print("[SETUP] Bypassing PIP - Downloading pre-compiled C++ binary directly...")
|
| 99 |
LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
|
| 100 |
zip_path = LLAMA_BIN_DIR / "llama.zip"
|
| 101 |
url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
|
|
|
|
| 113 |
os.rename(found_exe, str(LLAMA_EXE))
|
| 114 |
break
|
| 115 |
|
| 116 |
+
# Hugging Face Free tier reports 16 cores but throttles to 2.
|
| 117 |
+
# 15 threads will crash the sandbox container. 4 is the safe maximum.
|
| 118 |
+
threads = "4"
|
| 119 |
+
port = "8089" # Using 8089 to prevent HF internal routing conflicts
|
| 120 |
|
| 121 |
+
print(f"[SETUP] Starting Native llama-server engine on {threads} threads, port {port}...")
|
| 122 |
+
|
| 123 |
+
# We use subprocess.PIPE to read the internal logs of the C++ binary!
|
| 124 |
+
proc = subprocess.Popen([
|
| 125 |
str(LLAMA_EXE),
|
| 126 |
"-m", model_path,
|
| 127 |
"-c", "4096",
|
| 128 |
+
"--port", port,
|
| 129 |
"--host", "127.0.0.1",
|
| 130 |
"-t", threads
|
| 131 |
+
], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
| 132 |
+
|
| 133 |
+
# Stream C++ logs directly to our console
|
| 134 |
+
def stream_logs():
|
| 135 |
+
for line in proc.stdout:
|
| 136 |
+
print(f"[ENGINE LOG] {line.strip()}")
|
| 137 |
+
|
| 138 |
+
threading.Thread(target=stream_logs, daemon=True).start()
|
| 139 |
|
| 140 |
# 4. Wait for Server to wake up
|
| 141 |
+
for attempt in range(30):
|
| 142 |
try:
|
| 143 |
+
if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
|
| 144 |
+
print("\n[SETUP] llama-server backend is ONLINE and ready!\n")
|
| 145 |
+
return True, port
|
| 146 |
except requests.exceptions.ConnectionError:
|
| 147 |
time.sleep(1)
|
| 148 |
|
| 149 |
+
print("\n[SETUP] FAILED to start llama-server backend. Check the [ENGINE LOG] lines above.\n")
|
| 150 |
+
return False, port
|
| 151 |
|
| 152 |
+
backend_ready, engine_port = setup_and_start_backend()
|
| 153 |
|
| 154 |
# ──────────────────────────────────────────────────────────────────
|
| 155 |
# CHAT MEMORY
|
|
|
|
| 203 |
|
| 204 |
try:
|
| 205 |
# Request completion natively from our C++ binary
|
| 206 |
+
res = requests.post(f"http://127.0.0.1:{engine_port}/completion", json=payload, timeout=60).json()
|
| 207 |
response = res.get("content", "").strip()
|
| 208 |
except Exception as exc:
|
| 209 |
print(f"[GENERATE] Error communicating with llama-server: {exc}")
|