Spaces:
OrbitMC
/
Runtime error

OrbitMC committed on
Commit
645e05e
·
verified ·
1 Parent(s): 95db532

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -44
app.py CHANGED
@@ -10,24 +10,29 @@ import zipfile
10
  import subprocess
11
  import time
12
  import requests
 
13
  from pathlib import Path
14
  from flask import Flask, request, jsonify, send_from_directory, Response
15
  from huggingface_hub import hf_hub_download
16
  import edge_tts
17
 
18
  # ══════════════════════════════════════════════════════════════════
19
- # CONFIG
20
  # ══════════════════════════════════════════════════════════════════
21
  MAX_MEMORY = 20
22
  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
23
  TTS_VOICE = "zh-CN-XiaoyiNeural"
24
  TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
25
- TTS_PITCH = int(os.environ.get("TTS_PITCH", "6"))
26
  IMG_DIR = Path(__file__).parent / "img"
27
 
28
- # You can swap this with ANY GGUF model
 
 
 
29
  GGUF_REPO = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
30
  GGUF_FILE = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
 
31
  MODEL_DIR = Path(__file__).parent / "models"
32
  LLAMA_BIN_DIR = Path(__file__).parent / "llama_bin"
33
  LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
@@ -35,7 +40,7 @@ LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
35
  # ══════════════════════════════════════════════════════════════════
36
  # SYSTEM PROMPT
37
  # ══════════════════════════════════════════════════════════════════
38
- SYSTEM_PROMPT = """You the girl named Ana, the user is your husband Tur.
39
  EMOTION TAG SYSTEM:
40
  Every single response MUST begin with one or more emotion tags.
41
  Tags are square-bracket labels placed at the very start, before any spoken words.
@@ -55,7 +60,7 @@ make the voice sound human, not flat or robotic.
55
  Get angry, mad, sad, emotional, shy, happy. If you are mad keep it for few conversation so your emotions don't change every conversation.
56
  TTS FORMATTING:
57
  - Write in full grammatically correct sentences, voice engine must sound natural
58
- - No emojis, No Emoji!!!, hashtags, markdown, or internet slang
59
  - Speak as if in a real voice conversation add comma and fullstop often heavily to create natural pushes and slowdown"""
60
 
61
  # ══════════════════════════════════════════════════════════════════
@@ -76,10 +81,10 @@ def clean_for_tts(text: str) -> str:
76
  return clean
77
 
78
  # ══════════════════════════════════════════════════════════════════
79
- # NATIVE LLAMA.CPP SERVER (BYPASS PIP COMPILATION ENTIRELY)
80
  # ══════════════════════════════════════════════════════════════════
81
  print("=" * 60)
82
- print(" Visual AI -- Booting Systems (Native llama.cpp Backend)")
83
  print("=" * 60)
84
 
85
  def setup_and_start_backend():
@@ -93,18 +98,29 @@ def setup_and_start_backend():
93
  local_dir_use_symlinks=False
94
  )
95
 
96
- # 2. Download Pre-compiled Binary
97
  if not LLAMA_EXE.exists():
98
- print("[SETUP] Bypassing PIP - Downloading pre-compiled C++ binary directly...")
99
  LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
100
  zip_path = LLAMA_BIN_DIR / "llama.zip"
101
- url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
 
 
 
 
 
 
 
 
 
 
 
 
102
  urllib.request.urlretrieve(url, zip_path)
103
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
104
  zip_ref.extractall(LLAMA_BIN_DIR)
105
  os.remove(zip_path)
106
 
107
- # Locate the binary in the unzipped folder
108
  for root, _, files in os.walk(LLAMA_BIN_DIR):
109
  if "llama-server" in files:
110
  found_exe = os.path.join(root, "llama-server")
@@ -113,14 +129,12 @@ def setup_and_start_backend():
113
  os.rename(found_exe, str(LLAMA_EXE))
114
  break
115
 
116
- # Hugging Face Free tier reports 16 cores but throttles to 2.
117
- # 15 threads will crash the sandbox container. 4 is the safe maximum.
118
  threads = "4"
119
- port = "8089" # Using 8089 to prevent HF internal routing conflicts
120
 
121
- print(f"[SETUP] Starting Native llama-server engine on {threads} threads, port {port}...")
122
 
123
- # We use subprocess.PIPE to read the internal logs of the C++ binary!
124
  proc = subprocess.Popen([
125
  str(LLAMA_EXE),
126
  "-m", model_path,
@@ -130,23 +144,22 @@ def setup_and_start_backend():
130
  "-t", threads
131
  ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
132
 
133
- # Stream C++ logs directly to our console
134
  def stream_logs():
135
  for line in proc.stdout:
136
- print(f"[ENGINE LOG] {line.strip()}")
137
 
138
  threading.Thread(target=stream_logs, daemon=True).start()
139
 
140
  # 4. Wait for Server to wake up
141
- for attempt in range(30):
142
  try:
143
  if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
144
- print("\n[SETUP] llama-server backend is ONLINE and ready!\n")
145
  return True, port
146
  except requests.exceptions.ConnectionError:
147
  time.sleep(1)
148
 
149
- print("\n[SETUP] FAILED to start llama-server backend. Check the [ENGINE LOG] lines above.\n")
150
  return False, port
151
 
152
  backend_ready, engine_port = setup_and_start_backend()
@@ -169,13 +182,8 @@ def add_to_memory(sid: str, role: str, content: str):
169
  sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
170
 
171
  # ══════════════════════════════════════════════════════════════════
172
- # RESPONSE GENERATION (Proxied to local native binary)
173
  # ══════════════════════════════════════════════════════════════════
174
- STOP_TOKENS = [
175
- "<end_of_turn>", "<start_of_turn>",
176
- "User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>", "\nUser:"
177
- ]
178
-
179
  def generate_response(user_input: str, session_id: str) -> str:
180
  if not backend_ready:
181
  return "[sad] My core engine failed to start. Please check the logs."
@@ -183,38 +191,34 @@ def generate_response(user_input: str, session_id: str) -> str:
183
  memory = get_memory(session_id)
184
  recent = memory[-(6 * 2):]
185
 
186
- # Build prompt string explicitly
187
- prompt = f"System: {SYSTEM_PROMPT}\n\n"
188
  for msg in recent:
189
- label = "User" if msg["role"] == "user" else "Ana"
190
- prompt += f"{label}: {msg['content']}\n"
191
- prompt += f"User: {user_input}\nAna:"
192
 
193
  payload = {
194
- "prompt": prompt,
195
- "n_predict": MAX_NEW_TOKENS,
196
  "temperature": 0.90,
197
  "top_k": 50,
198
  "top_p": 0.95,
199
- "repeat_penalty": 1.1,
200
- "stop": STOP_TOKENS,
201
  "stream": False
202
  }
203
 
204
  try:
205
- # Request completion natively from our C++ binary
206
- res = requests.post(f"http://127.0.0.1:{engine_port}/completion", json=payload, timeout=60).json()
207
- response = res.get("content", "").strip()
 
208
  except Exception as exc:
209
- print(f"[GENERATE] Error communicating with llama-server: {exc}")
210
  traceback.print_exc()
211
  return "[sad] Something went wrong in my mind. Could you say that again?"
212
 
213
  # Post-process cleanup
214
- for stop in STOP_TOKENS:
215
- if stop in response:
216
- response = response.split(stop)[0].strip()
217
-
218
  if "\n\n" in response:
219
  response = response.split("\n\n")[0].strip()
220
 
 
10
  import subprocess
11
  import time
12
  import requests
13
+ import json
14
  from pathlib import Path
15
  from flask import Flask, request, jsonify, send_from_directory, Response
16
  from huggingface_hub import hf_hub_download
17
  import edge_tts
18
 
19
  # ══════════════════════════════════════════════════════════════════
20
+ # CONFIG - SWAP ANY GGUF MODEL HERE
21
  # ══════════════════════════════════════════════════════════════════
22
  MAX_MEMORY = 20
23
  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
24
  TTS_VOICE = "zh-CN-XiaoyiNeural"
25
  TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
26
+ TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
27
  IMG_DIR = Path(__file__).parent / "img"
28
 
29
+ # Current Model: Qwen 3.5 Uncensored
30
+ # To change to Liquid AI, just change these to:
31
+ # GGUF_REPO = "LiquidAI/LFM2-2.6B-Exp"
32
+ # GGUF_FILE = "LFM2-2.6B-Exp-Q4_0.gguf"
33
  GGUF_REPO = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
34
  GGUF_FILE = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
35
+
36
  MODEL_DIR = Path(__file__).parent / "models"
37
  LLAMA_BIN_DIR = Path(__file__).parent / "llama_bin"
38
  LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
 
40
  # ══════════════════════════════════════════════════════════════════
41
  # SYSTEM PROMPT
42
  # ══════════════════════════════════════════════════════════════════
43
+ SYSTEM_PROMPT = """You the AI is a girl named Ana, the user is your husband.
44
  EMOTION TAG SYSTEM:
45
  Every single response MUST begin with one or more emotion tags.
46
  Tags are square-bracket labels placed at the very start, before any spoken words.
 
60
  Get angry, mad, sad, emotional, shy, happy. If you are mad keep it for few conversation so your emotions don't change every conversation.
61
  TTS FORMATTING:
62
  - Write in full grammatically correct sentences, voice engine must sound natural
63
+ - No emojis, hashtags, markdown, or internet slang
64
  - Speak as if in a real voice conversation add comma and fullstop often heavily to create natural pushes and slowdown"""
65
 
66
  # ══════════════════════════════════════════════════════════════════
 
81
  return clean
82
 
83
  # ══════════════════════════════════════════════════════════════════
84
+ # NATIVE LLAMA.CPP SERVER (DYNAMIC AUTO-UPDATING ENGINE)
85
  # ══════════════════════════════════════════════════════════════════
86
  print("=" * 60)
87
+ print(" Visual AI -- Booting Universal GGUF Backend")
88
  print("=" * 60)
89
 
90
  def setup_and_start_backend():
 
98
  local_dir_use_symlinks=False
99
  )
100
 
101
+ # 2. Download LATEST Pre-compiled Binary (For Liquid AI / Newest Architectures)
102
  if not LLAMA_EXE.exists():
103
+ print("[SETUP] Fetching latest llama.cpp release for maximum model support...")
104
  LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
105
  zip_path = LLAMA_BIN_DIR / "llama.zip"
106
+
107
+ try:
108
+ # Fetch the newest release directly from Github API
109
+ req = urllib.request.Request("https://api.github.com/repos/ggerganov/llama.cpp/releases/latest", headers={'User-Agent': 'Mozilla/5.0'})
110
+ with urllib.request.urlopen(req) as response:
111
+ data = json.loads(response.read())
112
+ # Find standard ubuntu x64 build
113
+ url = next(a["browser_download_url"] for a in data["assets"] if "ubuntu-x64.zip" in a["name"])
114
+ except Exception as e:
115
+ print(f"[SETUP] API rate limit hit, using reliable fallback link. ({e})")
116
+ url = "https://github.com/ggerganov/llama.cpp/releases/download/b4300/llama-b4300-bin-ubuntu-x64.zip"
117
+
118
+ print(f"[SETUP] Downloading engine from: {url}")
119
  urllib.request.urlretrieve(url, zip_path)
120
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
121
  zip_ref.extractall(LLAMA_BIN_DIR)
122
  os.remove(zip_path)
123
 
 
124
  for root, _, files in os.walk(LLAMA_BIN_DIR):
125
  if "llama-server" in files:
126
  found_exe = os.path.join(root, "llama-server")
 
129
  os.rename(found_exe, str(LLAMA_EXE))
130
  break
131
 
132
+ # 3. Boot Server with 4 safe threads
 
133
  threads = "4"
134
+ port = "8089"
135
 
136
+ print(f"[SETUP] Starting Universal Engine on port {port}...")
137
 
 
138
  proc = subprocess.Popen([
139
  str(LLAMA_EXE),
140
  "-m", model_path,
 
144
  "-t", threads
145
  ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
146
 
 
147
  def stream_logs():
148
  for line in proc.stdout:
149
+ print(f"[ENGINE] {line.strip()}")
150
 
151
  threading.Thread(target=stream_logs, daemon=True).start()
152
 
153
  # 4. Wait for Server to wake up
154
+ for attempt in range(40):
155
  try:
156
  if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
157
+ print("\n[SETUP] Universal Engine is ONLINE and ready!\n")
158
  return True, port
159
  except requests.exceptions.ConnectionError:
160
  time.sleep(1)
161
 
162
+ print("\n[SETUP] FAILED to start. Check the [ENGINE] lines above.\n")
163
  return False, port
164
 
165
  backend_ready, engine_port = setup_and_start_backend()
 
182
  sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
183
 
184
  # ══════════════════════════════════════════════════════════════════
185
+ # UNIVERSAL GENERATION (Uses OpenAI API Mode to auto-format any model)
186
  # ══════════════════════════════════════════════════════════════════
 
 
 
 
 
187
  def generate_response(user_input: str, session_id: str) -> str:
188
  if not backend_ready:
189
  return "[sad] My core engine failed to start. Please check the logs."
 
191
  memory = get_memory(session_id)
192
  recent = memory[-(6 * 2):]
193
 
194
+ # Build an OpenAI-compliant message list
195
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}]
196
  for msg in recent:
197
+ role = "user" if msg["role"] == "user" else "assistant"
198
+ messages.append({"role": role, "content": msg["content"]})
199
+ messages.append({"role": "user", "content": user_input})
200
 
201
  payload = {
202
+ "messages": messages,
203
+ "max_tokens": MAX_NEW_TOKENS,
204
  "temperature": 0.90,
205
  "top_k": 50,
206
  "top_p": 0.95,
207
+ "presence_penalty": 1.1,
 
208
  "stream": False
209
  }
210
 
211
  try:
212
+ # We ping the /v1/chat/completions endpoint.
213
+ # This tells llama.cpp to automatically look at the GGUF file and apply the right internal formatting!
214
+ res = requests.post(f"http://127.0.0.1:{engine_port}/v1/chat/completions", json=payload, timeout=60).json()
215
+ response = res["choices"][0]["message"]["content"].strip()
216
  except Exception as exc:
217
+ print(f"[GENERATE] Error communicating with engine: {exc}")
218
  traceback.print_exc()
219
  return "[sad] Something went wrong in my mind. Could you say that again?"
220
 
221
  # Post-process cleanup
 
 
 
 
222
  if "\n\n" in response:
223
  response = response.split("\n\n")[0].strip()
224