Update app.py
Browse files
app.py
CHANGED
|
@@ -10,24 +10,29 @@ import zipfile
|
|
| 10 |
import subprocess
|
| 11 |
import time
|
| 12 |
import requests
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
from flask import Flask, request, jsonify, send_from_directory, Response
|
| 15 |
from huggingface_hub import hf_hub_download
|
| 16 |
import edge_tts
|
| 17 |
|
| 18 |
# ──────────────────────────────────────────────────────────────────
|
| 19 |
-
# CONFIG
|
| 20 |
# ──────────────────────────────────────────────────────────────────
|
| 21 |
MAX_MEMORY = 20
|
| 22 |
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
|
| 23 |
TTS_VOICE = "zh-CN-XiaoyiNeural"
|
| 24 |
TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
|
| 25 |
-
TTS_PITCH = int(os.environ.get("TTS_PITCH", "
|
| 26 |
IMG_DIR = Path(__file__).parent / "img"
|
| 27 |
|
| 28 |
-
#
|
|
|
|
|
|
|
|
|
|
| 29 |
GGUF_REPO = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
|
| 30 |
GGUF_FILE = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
|
|
|
|
| 31 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 32 |
LLAMA_BIN_DIR = Path(__file__).parent / "llama_bin"
|
| 33 |
LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
|
|
@@ -35,7 +40,7 @@ LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
|
|
| 35 |
# ──────────────────────────────────────────────────────────────────
|
| 36 |
# SYSTEM PROMPT
|
| 37 |
# ──────────────────────────────────────────────────────────────────
|
| 38 |
-
SYSTEM_PROMPT = """You the girl named Ana, the user is your husband
|
| 39 |
EMOTION TAG SYSTEM:
|
| 40 |
Every single response MUST begin with one or more emotion tags.
|
| 41 |
Tags are square-bracket labels placed at the very start, before any spoken words.
|
|
@@ -55,7 +60,7 @@ make the voice sound human, not flat or robotic.
|
|
| 55 |
Get angry, mad, sad, emotional, shy, happy. If you are mad keep it for few conversation so your emotions don't change every conversation.
|
| 56 |
TTS FORMATTING:
|
| 57 |
- Write in full grammatically correct sentences, voice engine must sound natural
|
| 58 |
-
- No emojis,
|
| 59 |
- Speak as if in a real voice conversation add comma and fullstop often heavily to create natural pushes and slowdown"""
|
| 60 |
|
| 61 |
# ──────────────────────────────────────────────────────────────────
|
|
@@ -76,10 +81,10 @@ def clean_for_tts(text: str) -> str:
|
|
| 76 |
return clean
|
| 77 |
|
| 78 |
# ──────────────────────────────────────────────────────────────────
|
| 79 |
-
# NATIVE LLAMA.CPP SERVER (
|
| 80 |
# ──────────────────────────────────────────────────────────────────
|
| 81 |
print("=" * 60)
|
| 82 |
-
print(" Visual AI -- Booting
|
| 83 |
print("=" * 60)
|
| 84 |
|
| 85 |
def setup_and_start_backend():
|
|
@@ -93,18 +98,29 @@ def setup_and_start_backend():
|
|
| 93 |
local_dir_use_symlinks=False
|
| 94 |
)
|
| 95 |
|
| 96 |
-
# 2. Download Pre-compiled Binary
|
| 97 |
if not LLAMA_EXE.exists():
|
| 98 |
-
print("[SETUP]
|
| 99 |
LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
|
| 100 |
zip_path = LLAMA_BIN_DIR / "llama.zip"
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
urllib.request.urlretrieve(url, zip_path)
|
| 103 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 104 |
zip_ref.extractall(LLAMA_BIN_DIR)
|
| 105 |
os.remove(zip_path)
|
| 106 |
|
| 107 |
-
# Locate the binary in the unzipped folder
|
| 108 |
for root, _, files in os.walk(LLAMA_BIN_DIR):
|
| 109 |
if "llama-server" in files:
|
| 110 |
found_exe = os.path.join(root, "llama-server")
|
|
@@ -113,14 +129,12 @@ def setup_and_start_backend():
|
|
| 113 |
os.rename(found_exe, str(LLAMA_EXE))
|
| 114 |
break
|
| 115 |
|
| 116 |
-
#
|
| 117 |
-
# 15 threads will crash the sandbox container. 4 is the safe maximum.
|
| 118 |
threads = "4"
|
| 119 |
-
port = "8089"
|
| 120 |
|
| 121 |
-
print(f"[SETUP] Starting
|
| 122 |
|
| 123 |
-
# We use subprocess.PIPE to read the internal logs of the C++ binary!
|
| 124 |
proc = subprocess.Popen([
|
| 125 |
str(LLAMA_EXE),
|
| 126 |
"-m", model_path,
|
|
@@ -130,23 +144,22 @@ def setup_and_start_backend():
|
|
| 130 |
"-t", threads
|
| 131 |
], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
| 132 |
|
| 133 |
-
# Stream C++ logs directly to our console
|
| 134 |
def stream_logs():
|
| 135 |
for line in proc.stdout:
|
| 136 |
-
print(f"[ENGINE
|
| 137 |
|
| 138 |
threading.Thread(target=stream_logs, daemon=True).start()
|
| 139 |
|
| 140 |
# 4. Wait for Server to wake up
|
| 141 |
-
for attempt in range(
|
| 142 |
try:
|
| 143 |
if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
|
| 144 |
-
print("\n[SETUP]
|
| 145 |
return True, port
|
| 146 |
except requests.exceptions.ConnectionError:
|
| 147 |
time.sleep(1)
|
| 148 |
|
| 149 |
-
print("\n[SETUP] FAILED to start
|
| 150 |
return False, port
|
| 151 |
|
| 152 |
backend_ready, engine_port = setup_and_start_backend()
|
|
@@ -169,13 +182,8 @@ def add_to_memory(sid: str, role: str, content: str):
|
|
| 169 |
sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
|
| 170 |
|
| 171 |
# ──────────────────────────────────────────────────────────────────
|
| 172 |
-
#
|
| 173 |
# ──────────────────────────────────────────────────────────────────
|
| 174 |
-
STOP_TOKENS = [
|
| 175 |
-
"<end_of_turn>", "<start_of_turn>",
|
| 176 |
-
"User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>", "\nUser:"
|
| 177 |
-
]
|
| 178 |
-
|
| 179 |
def generate_response(user_input: str, session_id: str) -> str:
|
| 180 |
if not backend_ready:
|
| 181 |
return "[sad] My core engine failed to start. Please check the logs."
|
|
@@ -183,38 +191,34 @@ def generate_response(user_input: str, session_id: str) -> str:
|
|
| 183 |
memory = get_memory(session_id)
|
| 184 |
recent = memory[-(6 * 2):]
|
| 185 |
|
| 186 |
-
# Build
|
| 187 |
-
|
| 188 |
for msg in recent:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
|
| 193 |
payload = {
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
"temperature": 0.90,
|
| 197 |
"top_k": 50,
|
| 198 |
"top_p": 0.95,
|
| 199 |
-
"
|
| 200 |
-
"stop": STOP_TOKENS,
|
| 201 |
"stream": False
|
| 202 |
}
|
| 203 |
|
| 204 |
try:
|
| 205 |
-
#
|
| 206 |
-
|
| 207 |
-
|
|
|
|
| 208 |
except Exception as exc:
|
| 209 |
-
print(f"[GENERATE] Error communicating with
|
| 210 |
traceback.print_exc()
|
| 211 |
return "[sad] Something went wrong in my mind. Could you say that again?"
|
| 212 |
|
| 213 |
# Post-process cleanup
|
| 214 |
-
for stop in STOP_TOKENS:
|
| 215 |
-
if stop in response:
|
| 216 |
-
response = response.split(stop)[0].strip()
|
| 217 |
-
|
| 218 |
if "\n\n" in response:
|
| 219 |
response = response.split("\n\n")[0].strip()
|
| 220 |
|
|
|
|
| 10 |
import subprocess
|
| 11 |
import time
|
| 12 |
import requests
|
| 13 |
+
import json
|
| 14 |
from pathlib import Path
|
| 15 |
from flask import Flask, request, jsonify, send_from_directory, Response
|
| 16 |
from huggingface_hub import hf_hub_download
|
| 17 |
import edge_tts
|
| 18 |
|
| 19 |
# ──────────────────────────────────────────────────────────────────
|
| 20 |
+
# CONFIG - SWAP ANY GGUF MODEL HERE
|
| 21 |
# ──────────────────────────────────────────────────────────────────
|
| 22 |
MAX_MEMORY = 20
|
| 23 |
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
|
| 24 |
TTS_VOICE = "zh-CN-XiaoyiNeural"
|
| 25 |
TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
|
| 26 |
+
TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
|
| 27 |
IMG_DIR = Path(__file__).parent / "img"
|
| 28 |
|
| 29 |
+
# Current Model: Qwen 3.5 Uncensored
|
| 30 |
+
# To change to Liquid AI, just change these to:
|
| 31 |
+
# GGUF_REPO = "LiquidAI/LFM2-2.6B-Exp"
|
| 32 |
+
# GGUF_FILE = "LFM2-2.6B-Exp-Q4_0.gguf"
|
| 33 |
GGUF_REPO = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
|
| 34 |
GGUF_FILE = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
|
| 35 |
+
|
| 36 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 37 |
LLAMA_BIN_DIR = Path(__file__).parent / "llama_bin"
|
| 38 |
LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
|
|
|
|
| 40 |
# ──────────────────────────────────────────────────────────────────
|
| 41 |
# SYSTEM PROMPT
|
| 42 |
# ──────────────────────────────────────────────────────────────────
|
| 43 |
+
SYSTEM_PROMPT = """You the AI is a girl named Ana, the user is your husband.
|
| 44 |
EMOTION TAG SYSTEM:
|
| 45 |
Every single response MUST begin with one or more emotion tags.
|
| 46 |
Tags are square-bracket labels placed at the very start, before any spoken words.
|
|
|
|
| 60 |
Get angry, mad, sad, emotional, shy, happy. If you are mad keep it for few conversation so your emotions don't change every conversation.
|
| 61 |
TTS FORMATTING:
|
| 62 |
- Write in full grammatically correct sentences, voice engine must sound natural
|
| 63 |
+
- No emojis, hashtags, markdown, or internet slang
|
| 64 |
- Speak as if in a real voice conversation add comma and fullstop often heavily to create natural pushes and slowdown"""
|
| 65 |
|
| 66 |
# ──────────────────────────────────────────────────────────────────
|
|
|
|
| 81 |
return clean
|
| 82 |
|
| 83 |
# ──────────────────────────────────────────────────────────────────
|
| 84 |
+
# NATIVE LLAMA.CPP SERVER (DYNAMIC AUTO-UPDATING ENGINE)
|
| 85 |
# ──────────────────────────────────────────────────────────────────
|
| 86 |
print("=" * 60)
|
| 87 |
+
print(" Visual AI -- Booting Universal GGUF Backend")
|
| 88 |
print("=" * 60)
|
| 89 |
|
| 90 |
def setup_and_start_backend():
|
|
|
|
| 98 |
local_dir_use_symlinks=False
|
| 99 |
)
|
| 100 |
|
| 101 |
+
# 2. Download LATEST Pre-compiled Binary (For Liquid AI / Newest Architectures)
|
| 102 |
if not LLAMA_EXE.exists():
|
| 103 |
+
print("[SETUP] Fetching latest llama.cpp release for maximum model support...")
|
| 104 |
LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
|
| 105 |
zip_path = LLAMA_BIN_DIR / "llama.zip"
|
| 106 |
+
|
| 107 |
+
try:
|
| 108 |
+
# Fetch the newest release directly from the GitHub API
|
| 109 |
+
req = urllib.request.Request("https://api.github.com/repos/ggerganov/llama.cpp/releases/latest", headers={'User-Agent': 'Mozilla/5.0'})
|
| 110 |
+
with urllib.request.urlopen(req) as response:
|
| 111 |
+
data = json.loads(response.read())
|
| 112 |
+
# Find the standard Ubuntu x64 build among the release assets
|
| 113 |
+
url = next(a["browser_download_url"] for a in data["assets"] if "ubuntu-x64.zip" in a["name"])
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"[SETUP] API rate limit hit, using reliable fallback link. ({e})")
|
| 116 |
+
url = "https://github.com/ggerganov/llama.cpp/releases/download/b4300/llama-b4300-bin-ubuntu-x64.zip"
|
| 117 |
+
|
| 118 |
+
print(f"[SETUP] Downloading engine from: {url}")
|
| 119 |
urllib.request.urlretrieve(url, zip_path)
|
| 120 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 121 |
zip_ref.extractall(LLAMA_BIN_DIR)
|
| 122 |
os.remove(zip_path)
|
| 123 |
|
|
|
|
| 124 |
for root, _, files in os.walk(LLAMA_BIN_DIR):
|
| 125 |
if "llama-server" in files:
|
| 126 |
found_exe = os.path.join(root, "llama-server")
|
|
|
|
| 129 |
os.rename(found_exe, str(LLAMA_EXE))
|
| 130 |
break
|
| 131 |
|
| 132 |
+
# 3. Boot Server with 4 safe threads
|
|
|
|
| 133 |
threads = "4"
|
| 134 |
+
port = "8089"
|
| 135 |
|
| 136 |
+
print(f"[SETUP] Starting Universal Engine on port {port}...")
|
| 137 |
|
|
|
|
| 138 |
proc = subprocess.Popen([
|
| 139 |
str(LLAMA_EXE),
|
| 140 |
"-m", model_path,
|
|
|
|
| 144 |
"-t", threads
|
| 145 |
], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
| 146 |
|
|
|
|
| 147 |
def stream_logs():
|
| 148 |
for line in proc.stdout:
|
| 149 |
+
print(f"[ENGINE] {line.strip()}")
|
| 150 |
|
| 151 |
threading.Thread(target=stream_logs, daemon=True).start()
|
| 152 |
|
| 153 |
# 4. Wait for Server to wake up
|
| 154 |
+
for attempt in range(40):
|
| 155 |
try:
|
| 156 |
if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
|
| 157 |
+
print("\n[SETUP] Universal Engine is ONLINE and ready!\n")
|
| 158 |
return True, port
|
| 159 |
except requests.exceptions.ConnectionError:
|
| 160 |
time.sleep(1)
|
| 161 |
|
| 162 |
+
print("\n[SETUP] FAILED to start. Check the [ENGINE] lines above.\n")
|
| 163 |
return False, port
|
| 164 |
|
| 165 |
backend_ready, engine_port = setup_and_start_backend()
|
|
|
|
| 182 |
sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
|
| 183 |
|
| 184 |
# ──────────────────────────────────────────────────────────────────
|
| 185 |
+
# UNIVERSAL GENERATION (Uses OpenAI API Mode to auto-format any model)
|
| 186 |
# ──────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
def generate_response(user_input: str, session_id: str) -> str:
|
| 188 |
if not backend_ready:
|
| 189 |
return "[sad] My core engine failed to start. Please check the logs."
|
|
|
|
| 191 |
memory = get_memory(session_id)
|
| 192 |
recent = memory[-(6 * 2):]
|
| 193 |
|
| 194 |
+
# Build an OpenAI-compliant message list
|
| 195 |
+
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
| 196 |
for msg in recent:
|
| 197 |
+
role = "user" if msg["role"] == "user" else "assistant"
|
| 198 |
+
messages.append({"role": role, "content": msg["content"]})
|
| 199 |
+
messages.append({"role": "user", "content": user_input})
|
| 200 |
|
| 201 |
payload = {
|
| 202 |
+
"messages": messages,
|
| 203 |
+
"max_tokens": MAX_NEW_TOKENS,
|
| 204 |
"temperature": 0.90,
|
| 205 |
"top_k": 50,
|
| 206 |
"top_p": 0.95,
|
| 207 |
+
"presence_penalty": 1.1,
|
|
|
|
| 208 |
"stream": False
|
| 209 |
}
|
| 210 |
|
| 211 |
try:
|
| 212 |
+
# We ping the /v1/chat/completions endpoint.
|
| 213 |
+
# llama-server then applies the chat template stored in the GGUF metadata, so no model-specific prompt formatting is needed here.
|
| 214 |
+
res = requests.post(f"http://127.0.0.1:{engine_port}/v1/chat/completions", json=payload, timeout=60).json()
|
| 215 |
+
response = res["choices"][0]["message"]["content"].strip()
|
| 216 |
except Exception as exc:
|
| 217 |
+
print(f"[GENERATE] Error communicating with engine: {exc}")
|
| 218 |
traceback.print_exc()
|
| 219 |
return "[sad] Something went wrong in my mind. Could you say that again?"
|
| 220 |
|
| 221 |
# Post-process cleanup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
if "\n\n" in response:
|
| 223 |
response = response.split("\n\n")[0].strip()
|
| 224 |
|