Spaces:
OrbitMC
/
Runtime error

OrbitMC committed on
Commit
40d6eda
·
verified ·
1 Parent(s): c776b5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -18
app.py CHANGED
@@ -85,7 +85,7 @@ print("=" * 60)
85
  def setup_and_start_backend():
86
  # 1. Download Model
87
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
88
- print(f"[SETUP] Verifying/Downloading Model: {GGUF_FILE} ...")
89
  model_path = hf_hub_download(
90
  repo_id=GGUF_REPO,
91
  filename=GGUF_FILE,
@@ -93,9 +93,9 @@ def setup_and_start_backend():
93
  local_dir_use_symlinks=False
94
  )
95
 
96
- # 2. Download Pre-compiled Binary (Instant)
97
  if not LLAMA_EXE.exists():
98
- print("[SETUP] Bypassing python pip - Downloading pre-compiled C++ binary directly...")
99
  LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
100
  zip_path = LLAMA_BIN_DIR / "llama.zip"
101
  url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
@@ -113,33 +113,43 @@ def setup_and_start_backend():
113
  os.rename(found_exe, str(LLAMA_EXE))
114
  break
115
 
116
- # 3. Boot Server in Background
117
- cpu_cores = os.cpu_count() or 2
118
- threads = str(max(1, cpu_cores - 1))
 
119
 
120
- print(f"[SETUP] Starting Native llama-server engine on {threads} CPU threads...")
121
- subprocess.Popen([
 
 
122
  str(LLAMA_EXE),
123
  "-m", model_path,
124
  "-c", "4096",
125
- "--port", "8080",
126
  "--host", "127.0.0.1",
127
  "-t", threads
128
- ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
 
 
 
 
 
 
129
 
130
  # 4. Wait for Server to wake up
131
- for _ in range(30):
132
  try:
133
- if requests.get("http://127.0.0.1:8080/").status_code == 200:
134
- print("[SETUP] llama-server backend is ONLINE and ready!")
135
- return True
136
  except requests.exceptions.ConnectionError:
137
  time.sleep(1)
138
 
139
- print("[SETUP] FAILED to start llama-server backend.")
140
- return False
141
 
142
- backend_ready = setup_and_start_backend()
143
 
144
  # ══════════════════════════════════════════════════════════════════
145
  # CHAT MEMORY
@@ -193,7 +203,7 @@ def generate_response(user_input: str, session_id: str) -> str:
193
 
194
  try:
195
  # Request completion natively from our C++ binary
196
- res = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=60).json()
197
  response = res.get("content", "").strip()
198
  except Exception as exc:
199
  print(f"[GENERATE] Error communicating with llama-server: {exc}")
 
85
  def setup_and_start_backend():
86
  # 1. Download Model
87
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
88
+ print(f"[SETUP] Verifying Model: {GGUF_FILE} ...")
89
  model_path = hf_hub_download(
90
  repo_id=GGUF_REPO,
91
  filename=GGUF_FILE,
 
93
  local_dir_use_symlinks=False
94
  )
95
 
96
+ # 2. Download Pre-compiled Binary
97
  if not LLAMA_EXE.exists():
98
+ print("[SETUP] Bypassing PIP - Downloading pre-compiled C++ binary directly...")
99
  LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
100
  zip_path = LLAMA_BIN_DIR / "llama.zip"
101
  url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
 
113
  os.rename(found_exe, str(LLAMA_EXE))
114
  break
115
 
116
+ # Hugging Face Free tier reports 16 cores but throttles to 2.
117
+ # 15 threads will crash the sandbox container. 4 is the safe maximum.
118
+ threads = "4"
119
+ port = "8089" # Using 8089 to prevent HF internal routing conflicts
120
 
121
+ print(f"[SETUP] Starting Native llama-server engine on {threads} threads, port {port}...")
122
+
123
+ # We use subprocess.PIPE to read the internal logs of the C++ binary!
124
+ proc = subprocess.Popen([
125
  str(LLAMA_EXE),
126
  "-m", model_path,
127
  "-c", "4096",
128
+ "--port", port,
129
  "--host", "127.0.0.1",
130
  "-t", threads
131
+ ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
132
+
133
+ # Stream C++ logs directly to our console
134
+ def stream_logs():
135
+ for line in proc.stdout:
136
+ print(f"[ENGINE LOG] {line.strip()}")
137
+
138
+ threading.Thread(target=stream_logs, daemon=True).start()
139
 
140
  # 4. Wait for Server to wake up
141
+ for attempt in range(30):
142
  try:
143
+ if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
144
+ print("\n[SETUP] llama-server backend is ONLINE and ready!\n")
145
+ return True, port
146
  except requests.exceptions.ConnectionError:
147
  time.sleep(1)
148
 
149
+ print("\n[SETUP] FAILED to start llama-server backend. Check the [ENGINE LOG] lines above.\n")
150
+ return False, port
151
 
152
+ backend_ready, engine_port = setup_and_start_backend()
153
 
154
  # ══════════════════════════════════════════════════════════════════
155
  # CHAT MEMORY
 
203
 
204
  try:
205
  # Request completion natively from our C++ binary
206
+ res = requests.post(f"http://127.0.0.1:{engine_port}/completion", json=payload, timeout=60).json()
207
  response = res.get("content", "").strip()
208
  except Exception as exc:
209
  print(f"[GENERATE] Error communicating with llama-server: {exc}")