Ana-2

Runtime error

App Files Files Community

OrbitMC committed on Apr 1

Commit

40d6eda

verified ·

1 Parent(s): c776b5a

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -18

app.py CHANGED Viewed

@@ -85,7 +85,7 @@ print("=" * 60)
 def setup_and_start_backend():
     # 1. Download Model
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
-    print(f"[SETUP] Verifying/Downloading Model: {GGUF_FILE} ...")
     model_path = hf_hub_download(
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
@@ -93,9 +93,9 @@ def setup_and_start_backend():
         local_dir_use_symlinks=False
     )
-    # 2. Download Pre-compiled Binary (Instant)
     if not LLAMA_EXE.exists():
-        print("[SETUP] Bypassing python pip - Downloading pre-compiled C++ binary directly...")
         LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
         zip_path = LLAMA_BIN_DIR / "llama.zip"
         url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
@@ -113,33 +113,43 @@ def setup_and_start_backend():
                     os.rename(found_exe, str(LLAMA_EXE))
                 break
-    # 3. Boot Server in Background
-    cpu_cores = os.cpu_count() or 2
-    threads = str(max(1, cpu_cores - 1))
-    print(f"[SETUP] Starting Native llama-server engine on {threads} CPU threads...")
-    subprocess.Popen([
         str(LLAMA_EXE),
         "-m", model_path,
         "-c", "4096",
-        "--port", "8080",
         "--host", "127.0.0.1",
         "-t", threads
-    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     # 4. Wait for Server to wake up
-    for _ in range(30):
         try:
-            if requests.get("http://127.0.0.1:8080/").status_code == 200:
-                print("[SETUP] llama-server backend is ONLINE and ready!")
-                return True
         except requests.exceptions.ConnectionError:
             time.sleep(1)
-    print("[SETUP] FAILED to start llama-server backend.")
-    return False
-backend_ready = setup_and_start_backend()
 # ══════════════════════════════════════════════════════════════════
 # CHAT MEMORY
@@ -193,7 +203,7 @@ def generate_response(user_input: str, session_id: str) -> str:
     try:
         # Request completion natively from our C++ binary
-        res = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=60).json()
         response = res.get("content", "").strip()
     except Exception as exc:
         print(f"[GENERATE] Error communicating with llama-server: {exc}")

 def setup_and_start_backend():
     # 1. Download Model
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    print(f"[SETUP] Verifying Model: {GGUF_FILE} ...")
     model_path = hf_hub_download(
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         local_dir_use_symlinks=False
     )
+    # 2. Download Pre-compiled Binary
     if not LLAMA_EXE.exists():
+        print("[SETUP] Bypassing PIP - Downloading pre-compiled C++ binary directly...")
         LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
         zip_path = LLAMA_BIN_DIR / "llama.zip"
         url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
                     os.rename(found_exe, str(LLAMA_EXE))
                 break
+    # Hugging Face Free tier reports 16 cores but throttles to 2.
+    # 15 threads will crash the sandbox container. 4 is the safe maximum.
+    threads = "4"
+    port = "8089" # Using 8089 to prevent HF internal routing conflicts
+    print(f"[SETUP] Starting Native llama-server engine on {threads} threads, port {port}...")
+    # We use subprocess.PIPE to read the internal logs of the C++ binary!
+    proc = subprocess.Popen([
         str(LLAMA_EXE),
         "-m", model_path,
         "-c", "4096",
+        "--port", port,
         "--host", "127.0.0.1",
         "-t", threads
+    ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    # Stream C++ logs directly to our console
+    def stream_logs():
+        for line in proc.stdout:
+            print(f"[ENGINE LOG] {line.strip()}")
+    threading.Thread(target=stream_logs, daemon=True).start()
     # 4. Wait for Server to wake up
+    for attempt in range(30):
         try:
+            if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
+                print("\n[SETUP] llama-server backend is ONLINE and ready!\n")
+                return True, port
         except requests.exceptions.ConnectionError:
             time.sleep(1)
+    print("\n[SETUP] FAILED to start llama-server backend. Check the [ENGINE LOG] lines above.\n")
+    return False, port
+backend_ready, engine_port = setup_and_start_backend()
 # ══════════════════════════════════════════════════════════════════
 # CHAT MEMORY
     try:
         # Request completion natively from our C++ binary
+        res = requests.post(f"http://127.0.0.1:{engine_port}/completion", json=payload, timeout=60).json()
         response = res.get("content", "").strip()
     except Exception as exc:
         print(f"[GENERATE] Error communicating with llama-server: {exc}")