Spaces:
OrbitMC
/
Runtime error

OrbitMC committed on
Commit
3cd46db
·
verified ·
1 Parent(s): 6baa494

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -51
app.py CHANGED
@@ -5,16 +5,14 @@ import base64
5
  import threading
6
  import traceback
7
  import asyncio
 
 
 
 
 
8
  from pathlib import Path
9
  from flask import Flask, request, jsonify, send_from_directory, Response
10
-
11
- # ══════════════════════════════════════════════════════════════════
12
- # LLAMA.CPP BACKEND IMPORTS (Universal GGUF Support)
13
- # ══════════════════════════════════════════════════════════════════
14
- # Install via: pip install llama-cpp-python huggingface-hub
15
- from llama_cpp import Llama
16
  from huggingface_hub import hf_hub_download
17
-
18
  import edge_tts
19
 
20
  # ══════════════════════════════════════════════════════════════════
@@ -27,10 +25,12 @@ TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
27
  TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
28
  IMG_DIR = Path(__file__).parent / "img"
29
 
30
- # You can swap this with ANY GGUF model supported by llama.cpp
31
  GGUF_REPO = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
32
  GGUF_FILE = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
33
  MODEL_DIR = Path(__file__).parent / "models"
 
 
34
 
35
  # ══════════════════════════════════════════════════════════════════
36
  # SYSTEM PROMPT
@@ -76,38 +76,70 @@ def clean_for_tts(text: str) -> str:
76
  return clean
77
 
78
  # ══════════════════════════════════════════════════════════════════
79
- # MODEL LOADING (llama.cpp CPU)
80
  # ══════════════════════════════════════════════════════════════════
81
  print("=" * 60)
82
- print(" Visual AI -- Booting Systems (llama.cpp Backend)")
83
  print("=" * 60)
84
 
85
- model = None
86
-
87
- try:
88
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
89
-
90
- print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
91
  model_path = hf_hub_download(
92
  repo_id=GGUF_REPO,
93
  filename=GGUF_FILE,
94
  local_dir=str(MODEL_DIR),
95
  local_dir_use_symlinks=False
96
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- print(f"[MODEL] Loading {GGUF_FILE} on CPU with llama.cpp ...")
99
- # n_ctx determines context length.
100
- # n_threads utilizes optimal CPU cores automatically if set to None.
101
- model = Llama(
102
- model_path=model_path,
103
- n_ctx=4096,
104
- n_threads=max(1, os.cpu_count() - 1),
105
- verbose=False # Set to True to see llama.cpp debug logs
106
- )
107
- print(" OK Model loaded successfully!")
108
- except Exception as exc:
109
- print(f" FAILED Model load error: {exc}")
110
- traceback.print_exc()
 
 
 
 
 
 
 
 
 
 
111
 
112
  # ══════════════════════════════════════════════════════════════════
113
  # CHAT MEMORY
@@ -127,7 +159,7 @@ def add_to_memory(sid: str, role: str, content: str):
127
  sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
128
 
129
  # ══════════════════════════════════════════════════════════════════
130
- # RESPONSE GENERATION
131
  # ══════════════════════════════════════════════════════════════════
132
  STOP_TOKENS = [
133
  "<end_of_turn>", "<start_of_turn>",
@@ -135,38 +167,44 @@ STOP_TOKENS = [
135
  ]
136
 
137
  def generate_response(user_input: str, session_id: str) -> str:
138
- if model is None:
139
- return "[sad] My mind is offline right now. Please give me a moment."
140
 
141
  memory = get_memory(session_id)
142
  recent = memory[-(6 * 2):]
143
 
144
- # Build prompt string
145
  prompt = f"System: {SYSTEM_PROMPT}\n\n"
146
  for msg in recent:
147
  label = "User" if msg["role"] == "user" else "Ana"
148
  prompt += f"{label}: {msg['content']}\n"
149
  prompt += f"User: {user_input}\nAna:"
150
 
 
 
 
 
 
 
 
 
 
 
 
151
  try:
152
- # llama.cpp Generation
153
- output = model.create_completion(
154
- prompt=prompt,
155
- max_tokens=MAX_NEW_TOKENS,
156
- temperature=0.90,
157
- top_k=50,
158
- top_p=0.95,
159
- repeat_penalty=1.1,
160
- stop=STOP_TOKENS,
161
- echo=False
162
- )
163
- response = output["choices"][0]["text"].strip()
164
  except Exception as exc:
165
- print(f"[GENERATE] Error: {exc}")
166
  traceback.print_exc()
167
  return "[sad] Something went wrong in my mind. Could you say that again?"
168
 
169
  # Post-process cleanup
 
 
 
 
170
  if "\n\n" in response:
171
  response = response.split("\n\n")[0].strip()
172
 
@@ -244,10 +282,6 @@ body{
244
  justify-content:center;
245
  }
246
 
247
- /*
248
- object-fit: contain prevents cuts/overflow and displays the full image intact.
249
- No transitions = INSTANT image swapping.
250
- */
251
  #bgImg{
252
  width:100%;
253
  height:100%;
@@ -607,8 +641,8 @@ def clear():
607
  @app.route("/health")
608
  def health():
609
  return jsonify({
610
- "model_loaded": model is not None,
611
- "backend": "llama.cpp (CPU GGUF)",
612
  })
613
 
614
  if __name__ == "__main__":
 
5
  import threading
6
  import traceback
7
  import asyncio
8
+ import urllib.request
9
+ import zipfile
10
+ import subprocess
11
+ import time
12
+ import requests
13
  from pathlib import Path
14
  from flask import Flask, request, jsonify, send_from_directory, Response
 
 
 
 
 
 
15
  from huggingface_hub import hf_hub_download
 
16
  import edge_tts
17
 
18
  # ══════════════════════════════════════════════════════════════════
 
25
  TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
26
  IMG_DIR = Path(__file__).parent / "img"
27
 
28
+ # You can swap this with ANY GGUF model
29
  GGUF_REPO = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
30
  GGUF_FILE = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
31
  MODEL_DIR = Path(__file__).parent / "models"
32
+ LLAMA_BIN_DIR = Path(__file__).parent / "llama_bin"
33
+ LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
34
 
35
  # ══════════════════════════════════════════════════════════════════
36
  # SYSTEM PROMPT
 
76
  return clean
77
 
78
  # ══════════════════════════════════════════════════════════════════
79
+ # NATIVE LLAMA.CPP SERVER (BYPASS PIP COMPILATION ENTIRELY)
80
  # ══════════════════════════════════════════════════════════════════
81
  print("=" * 60)
82
+ print(" Visual AI -- Booting Systems (Native llama.cpp Backend)")
83
  print("=" * 60)
84
 
85
def setup_and_start_backend():
    """Prepare and launch the local llama-server backend.

    Downloads the GGUF model, fetches and unpacks the pre-compiled
    llama.cpp release if the server binary is not already present, starts
    llama-server on 127.0.0.1:8080, and polls it until it reports ready.

    This runs at import time, so every failure is logged and reported as
    ``False`` instead of being raised and taking the whole app down.

    Returns:
        bool: True once llama-server answers its health check with HTTP 200;
        False if setup fails, the server process exits early, or it is not
        ready after roughly 30 polling attempts.
    """
    try:
        # 1. Download Model
        MODEL_DIR.mkdir(parents=True, exist_ok=True)
        print(f"[SETUP] Verifying/Downloading Model: {GGUF_FILE} ...")
        model_path = hf_hub_download(
            repo_id=GGUF_REPO,
            filename=GGUF_FILE,
            local_dir=str(MODEL_DIR),
            local_dir_use_symlinks=False
        )

        # 2. Download Pre-compiled Binary (Instant)
        if not LLAMA_EXE.exists():
            print("[SETUP] Bypassing python pip - Downloading pre-compiled C++ binary directly...")
            LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
            zip_path = LLAMA_BIN_DIR / "llama.zip"
            url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
            try:
                urllib.request.urlretrieve(url, zip_path)
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(LLAMA_BIN_DIR)
            finally:
                # Never leave a partial or corrupt archive behind for the next boot.
                if zip_path.exists():
                    os.remove(zip_path)

            # Locate the binary in the unzipped folder
            # NOTE(review): if the release ships shared libraries next to the
            # binary, moving only the executable may break loading -- confirm
            # against the archive layout.
            for root, _, files in os.walk(LLAMA_BIN_DIR):
                if "llama-server" in files:
                    found_exe = os.path.join(root, "llama-server")
                    os.chmod(found_exe, 0o755)
                    if found_exe != str(LLAMA_EXE):
                        os.rename(found_exe, str(LLAMA_EXE))
                    break

        if not LLAMA_EXE.exists():
            # The archive did not contain the expected binary; launching
            # would only raise FileNotFoundError.
            print("[SETUP] FAILED to locate llama-server binary after extraction.")
            return False

        # 3. Boot Server in Background
        cpu_cores = os.cpu_count() or 2
        threads = str(max(1, cpu_cores - 1))

        print(f"[SETUP] Starting Native llama-server engine on {threads} CPU threads...")
        server = subprocess.Popen([
            str(LLAMA_EXE),
            "-m", model_path,
            "-c", "4096",
            "--port", "8080",
            "--host", "127.0.0.1",
            "-t", threads
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except Exception as exc:
        # Top-level boot boundary: log and degrade instead of crashing import.
        print(f"[SETUP] FAILED during backend setup: {exc}")
        traceback.print_exc()
        return False

    # 4. Wait for Server to wake up
    for _ in range(30):
        if server.poll() is not None:
            # The process already died; further polling cannot succeed.
            print(f"[SETUP] llama-server exited early with code {server.returncode}.")
            break
        try:
            # /health answers 503 while the model is loading and 200 once ready.
            # The timeout keeps a stalled socket from hanging the boot forever.
            if requests.get("http://127.0.0.1:8080/health", timeout=2).status_code == 200:
                print("[SETUP] llama-server backend is ONLINE and ready!")
                return True
        except requests.exceptions.RequestException:
            # Not accepting connections yet (or timed out); retry below.
            pass
        # Sleep on every unsuccessful attempt, including non-200 replies, so
        # the attempts are spread over time instead of being used up at once.
        time.sleep(1)

    print("[SETUP] FAILED to start llama-server backend.")
    return False
141
+
142
+ backend_ready = setup_and_start_backend()
143
 
144
  # ══════════════════════════════════════════════════════════════════
145
  # CHAT MEMORY
 
159
  sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
160
 
161
  # ══════════════════════════════════════════════════════════════════
162
+ # RESPONSE GENERATION (Proxied to local native binary)
163
  # ══════════════════════════════════════════════════════════════════
164
  STOP_TOKENS = [
165
  "<end_of_turn>", "<start_of_turn>",
 
167
  ]
168
 
169
  def generate_response(user_input: str, session_id: str) -> str:
170
+ if not backend_ready:
171
+ return "[sad] My core engine failed to start. Please check the logs."
172
 
173
  memory = get_memory(session_id)
174
  recent = memory[-(6 * 2):]
175
 
176
+ # Build prompt string explicitly
177
  prompt = f"System: {SYSTEM_PROMPT}\n\n"
178
  for msg in recent:
179
  label = "User" if msg["role"] == "user" else "Ana"
180
  prompt += f"{label}: {msg['content']}\n"
181
  prompt += f"User: {user_input}\nAna:"
182
 
183
+ payload = {
184
+ "prompt": prompt,
185
+ "n_predict": MAX_NEW_TOKENS,
186
+ "temperature": 0.90,
187
+ "top_k": 50,
188
+ "top_p": 0.95,
189
+ "repeat_penalty": 1.1,
190
+ "stop": STOP_TOKENS,
191
+ "stream": False
192
+ }
193
+
194
  try:
195
+ # Request completion natively from our C++ binary
196
+ res = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=60).json()
197
+ response = res.get("content", "").strip()
 
 
 
 
 
 
 
 
 
198
  except Exception as exc:
199
+ print(f"[GENERATE] Error communicating with llama-server: {exc}")
200
  traceback.print_exc()
201
  return "[sad] Something went wrong in my mind. Could you say that again?"
202
 
203
  # Post-process cleanup
204
+ for stop in STOP_TOKENS:
205
+ if stop in response:
206
+ response = response.split(stop)[0].strip()
207
+
208
  if "\n\n" in response:
209
  response = response.split("\n\n")[0].strip()
210
 
 
282
  justify-content:center;
283
  }
284
 
 
 
 
 
285
  #bgImg{
286
  width:100%;
287
  height:100%;
 
641
@app.route("/health")
def health():
    """Return a JSON status document describing the inference backend."""
    status = {
        "backend_ready": backend_ready,
        "type": "native-llama-server",
    }
    return jsonify(status)
647
 
648
  if __name__ == "__main__":