Update app.py
Browse files
app.py
CHANGED
|
@@ -5,16 +5,14 @@ import base64
|
|
| 5 |
import threading
|
| 6 |
import traceback
|
| 7 |
import asyncio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from flask import Flask, request, jsonify, send_from_directory, Response
|
| 10 |
-
|
| 11 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 12 |
-
# LLAMA.CPP BACKEND IMPORTS (Universal GGUF Support)
|
| 13 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
-
# Install via: pip install llama-cpp-python huggingface-hub
|
| 15 |
-
from llama_cpp import Llama
|
| 16 |
from huggingface_hub import hf_hub_download
|
| 17 |
-
|
| 18 |
import edge_tts
|
| 19 |
|
| 20 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -27,10 +25,12 @@ TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
|
|
| 27 |
TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
|
| 28 |
IMG_DIR = Path(__file__).parent / "img"
|
| 29 |
|
| 30 |
-
# You can swap this with ANY GGUF model
|
| 31 |
GGUF_REPO = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
|
| 32 |
GGUF_FILE = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
|
| 33 |
MODEL_DIR = Path(__file__).parent / "models"
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
# SYSTEM PROMPT
|
|
@@ -76,38 +76,70 @@ def clean_for_tts(text: str) -> str:
|
|
| 76 |
return clean
|
| 77 |
|
| 78 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
-
#
|
| 80 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
print("=" * 60)
|
| 82 |
-
print(" Visual AI -- Booting Systems (llama.cpp Backend)")
|
| 83 |
print("=" * 60)
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
try:
|
| 88 |
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 89 |
-
|
| 90 |
-
print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
|
| 91 |
model_path = hf_hub_download(
|
| 92 |
repo_id=GGUF_REPO,
|
| 93 |
filename=GGUF_FILE,
|
| 94 |
local_dir=str(MODEL_DIR),
|
| 95 |
local_dir_use_symlinks=False
|
| 96 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
print(f"[
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
)
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 113 |
# CHAT MEMORY
|
|
@@ -127,7 +159,7 @@ def add_to_memory(sid: str, role: str, content: str):
|
|
| 127 |
sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
|
| 128 |
|
| 129 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
-
# RESPONSE GENERATION
|
| 131 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 132 |
STOP_TOKENS = [
|
| 133 |
"<end_of_turn>", "<start_of_turn>",
|
|
@@ -135,38 +167,44 @@ STOP_TOKENS = [
|
|
| 135 |
]
|
| 136 |
|
| 137 |
def generate_response(user_input: str, session_id: str) -> str:
|
| 138 |
-
if
|
| 139 |
-
return "[sad] My
|
| 140 |
|
| 141 |
memory = get_memory(session_id)
|
| 142 |
recent = memory[-(6 * 2):]
|
| 143 |
|
| 144 |
-
# Build prompt string
|
| 145 |
prompt = f"System: {SYSTEM_PROMPT}\n\n"
|
| 146 |
for msg in recent:
|
| 147 |
label = "User" if msg["role"] == "user" else "Ana"
|
| 148 |
prompt += f"{label}: {msg['content']}\n"
|
| 149 |
prompt += f"User: {user_input}\nAna:"
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
try:
|
| 152 |
-
#
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
max_tokens=MAX_NEW_TOKENS,
|
| 156 |
-
temperature=0.90,
|
| 157 |
-
top_k=50,
|
| 158 |
-
top_p=0.95,
|
| 159 |
-
repeat_penalty=1.1,
|
| 160 |
-
stop=STOP_TOKENS,
|
| 161 |
-
echo=False
|
| 162 |
-
)
|
| 163 |
-
response = output["choices"][0]["text"].strip()
|
| 164 |
except Exception as exc:
|
| 165 |
-
print(f"[GENERATE] Error: {exc}")
|
| 166 |
traceback.print_exc()
|
| 167 |
return "[sad] Something went wrong in my mind. Could you say that again?"
|
| 168 |
|
| 169 |
# Post-process cleanup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
if "\n\n" in response:
|
| 171 |
response = response.split("\n\n")[0].strip()
|
| 172 |
|
|
@@ -244,10 +282,6 @@ body{
|
|
| 244 |
justify-content:center;
|
| 245 |
}
|
| 246 |
|
| 247 |
-
/*
|
| 248 |
-
object-fit: contain prevents cuts/overflow and displays the full image intact.
|
| 249 |
-
No transitions = INSTANT image swapping.
|
| 250 |
-
*/
|
| 251 |
#bgImg{
|
| 252 |
width:100%;
|
| 253 |
height:100%;
|
|
@@ -607,8 +641,8 @@ def clear():
|
|
| 607 |
@app.route("/health")
|
| 608 |
def health():
|
| 609 |
return jsonify({
|
| 610 |
-
"
|
| 611 |
-
"
|
| 612 |
})
|
| 613 |
|
| 614 |
if __name__ == "__main__":
|
|
|
|
| 5 |
import threading
|
| 6 |
import traceback
|
| 7 |
import asyncio
|
| 8 |
+
import urllib.request
|
| 9 |
+
import zipfile
|
| 10 |
+
import subprocess
|
| 11 |
+
import time
|
| 12 |
+
import requests
|
| 13 |
from pathlib import Path
|
| 14 |
from flask import Flask, request, jsonify, send_from_directory, Response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 16 |
import edge_tts
|
| 17 |
|
| 18 |
# ──────────────────────────────────────────────────────────────────
|
|
|
|
| 25 |
TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
|
| 26 |
IMG_DIR = Path(__file__).parent / "img"
|
| 27 |
|
| 28 |
+
# You can swap this with ANY GGUF model
|
| 29 |
GGUF_REPO = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
|
| 30 |
GGUF_FILE = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
|
| 31 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 32 |
+
LLAMA_BIN_DIR = Path(__file__).parent / "llama_bin"
|
| 33 |
+
LLAMA_EXE = LLAMA_BIN_DIR / "llama-server"
|
| 34 |
|
| 35 |
# ──────────────────────────────────────────────────────────────────
|
| 36 |
# SYSTEM PROMPT
|
|
|
|
| 76 |
return clean
|
| 77 |
|
| 78 |
# ──────────────────────────────────────────────────────────────────
|
| 79 |
+
# NATIVE LLAMA.CPP SERVER (BYPASS PIP COMPILATION ENTIRELY)
|
| 80 |
# ──────────────────────────────────────────────────────────────────
|
| 81 |
print("=" * 60)
|
| 82 |
+
print(" Visual AI -- Booting Systems (Native llama.cpp Backend)")
|
| 83 |
print("=" * 60)
|
| 84 |
|
| 85 |
+
def setup_and_start_backend() -> bool:
    """Download the model and llama-server binary, launch the server, wait for it.

    Steps:
      1. Fetch GGUF_FILE from GGUF_REPO into MODEL_DIR via hf_hub_download.
      2. Make sure the pre-built llama-server executable exists at LLAMA_EXE.
      3. Spawn llama-server on 127.0.0.1:8080 with (cpu_count - 1) threads.
      4. Poll the server until it answers HTTP 200 or the attempts run out.

    Returns:
        True when the server answered HTTP 200, False when the binary could
        not be obtained, the process could not be started, or it never
        became ready.
    """
    # 1. Download Model
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    print(f"[SETUP] Verifying/Downloading Model: {GGUF_FILE} ...")
    model_path = hf_hub_download(
        repo_id=GGUF_REPO,
        filename=GGUF_FILE,
        local_dir=str(MODEL_DIR),
        local_dir_use_symlinks=False
    )

    # 2. Download Pre-compiled Binary (Instant)
    if not _ensure_llama_binary():
        print("[SETUP] FAILED to start llama-server backend.")
        return False

    # 3. Boot Server in Background
    cpu_cores = os.cpu_count() or 2
    # Leave one core free for the Flask process itself.
    threads = str(max(1, cpu_cores - 1))

    print(f"[SETUP] Starting Native llama-server engine on {threads} CPU threads...")
    try:
        proc = subprocess.Popen([
            str(LLAMA_EXE),
            "-m", model_path,
            "-c", "4096",
            "--port", "8080",
            "--host", "127.0.0.1",
            "-t", threads
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as exc:
        # e.g. missing executable bit or a binary built for another platform.
        print(f"[SETUP] Could not launch llama-server: {exc}")
        print("[SETUP] FAILED to start llama-server backend.")
        return False

    # 4. Wait for Server to wake up
    if _wait_for_server(proc):
        print("[SETUP] llama-server backend is ONLINE and ready!")
        return True

    # Do not leave a half-started server running when we report failure.
    if proc.poll() is None:
        proc.terminate()
    print("[SETUP] FAILED to start llama-server backend.")
    return False


def _ensure_llama_binary() -> bool:
    """Make sure LLAMA_EXE exists, downloading and unpacking it if needed.

    Returns True when LLAMA_EXE is present afterwards, False otherwise.
    """
    if LLAMA_EXE.exists():
        return True

    print("[SETUP] Bypassing python pip - Downloading pre-compiled C++ binary directly...")
    LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
    zip_path = LLAMA_BIN_DIR / "llama.zip"
    url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
    try:
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(LLAMA_BIN_DIR)
    except (OSError, zipfile.BadZipFile) as exc:
        # URLError/HTTPError are OSError subclasses; report instead of crashing at import.
        print(f"[SETUP] Could not fetch llama-server binary: {exc}")
        return False
    finally:
        # Remove the archive whether or not extraction succeeded.
        if zip_path.exists():
            os.remove(zip_path)

    # Locate the binary in the unzipped folder
    for root, _, files in os.walk(LLAMA_BIN_DIR):
        if "llama-server" in files:
            found_exe = os.path.join(root, "llama-server")
            os.chmod(found_exe, 0o755)
            if found_exe != str(LLAMA_EXE):
                os.rename(found_exe, str(LLAMA_EXE))
            break

    if not LLAMA_EXE.exists():
        print("[SETUP] llama-server executable not found in downloaded archive.")
        return False
    return True


def _wait_for_server(proc, attempts: int = 30, delay: float = 1.0) -> bool:
    """Poll the local llama-server until it answers HTTP 200.

    Args:
        proc: the subprocess.Popen handle of the launched server.
        attempts: maximum number of polls.
        delay: seconds to sleep after each unsuccessful poll.

    Returns:
        True on the first HTTP 200 reply; False if the process exits or the
        attempts are exhausted.
    """
    for _ in range(attempts):
        if proc.poll() is not None:
            # The child already died; further polling cannot succeed.
            print(f"[SETUP] llama-server exited early with code {proc.returncode}.")
            return False
        try:
            if requests.get("http://127.0.0.1:8080/", timeout=5).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            # Not accepting connections yet (or the request timed out); retry.
            pass
        # Sleep after every unsuccessful poll, including non-200 replies,
        # so the attempts are not used up in a tight loop.
        time.sleep(delay)
    return False
|
| 141 |
+
|
| 142 |
+
backend_ready = setup_and_start_backend()
|
| 143 |
|
| 144 |
# ──────────────────────────────────────────────────────────────────
|
| 145 |
# CHAT MEMORY
|
|
|
|
| 159 |
sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
|
| 160 |
|
| 161 |
# ──────────────────────────────────────────────────────────────────
|
| 162 |
+
# RESPONSE GENERATION (Proxied to local native binary)
|
| 163 |
# ──────────────────────────────────────────────────────────────────
|
| 164 |
STOP_TOKENS = [
|
| 165 |
"<end_of_turn>", "<start_of_turn>",
|
|
|
|
| 167 |
]
|
| 168 |
|
| 169 |
def generate_response(user_input: str, session_id: str) -> str:
|
| 170 |
+
if not backend_ready:
|
| 171 |
+
return "[sad] My core engine failed to start. Please check the logs."
|
| 172 |
|
| 173 |
memory = get_memory(session_id)
|
| 174 |
recent = memory[-(6 * 2):]
|
| 175 |
|
| 176 |
+
# Build prompt string explicitly
|
| 177 |
prompt = f"System: {SYSTEM_PROMPT}\n\n"
|
| 178 |
for msg in recent:
|
| 179 |
label = "User" if msg["role"] == "user" else "Ana"
|
| 180 |
prompt += f"{label}: {msg['content']}\n"
|
| 181 |
prompt += f"User: {user_input}\nAna:"
|
| 182 |
|
| 183 |
+
payload = {
|
| 184 |
+
"prompt": prompt,
|
| 185 |
+
"n_predict": MAX_NEW_TOKENS,
|
| 186 |
+
"temperature": 0.90,
|
| 187 |
+
"top_k": 50,
|
| 188 |
+
"top_p": 0.95,
|
| 189 |
+
"repeat_penalty": 1.1,
|
| 190 |
+
"stop": STOP_TOKENS,
|
| 191 |
+
"stream": False
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
try:
|
| 195 |
+
# Request completion natively from our C++ binary
|
| 196 |
+
res = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=60).json()
|
| 197 |
+
response = res.get("content", "").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
except Exception as exc:
|
| 199 |
+
print(f"[GENERATE] Error communicating with llama-server: {exc}")
|
| 200 |
traceback.print_exc()
|
| 201 |
return "[sad] Something went wrong in my mind. Could you say that again?"
|
| 202 |
|
| 203 |
# Post-process cleanup
|
| 204 |
+
for stop in STOP_TOKENS:
|
| 205 |
+
if stop in response:
|
| 206 |
+
response = response.split(stop)[0].strip()
|
| 207 |
+
|
| 208 |
if "\n\n" in response:
|
| 209 |
response = response.split("\n\n")[0].strip()
|
| 210 |
|
|
|
|
| 282 |
justify-content:center;
|
| 283 |
}
|
| 284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
#bgImg{
|
| 286 |
width:100%;
|
| 287 |
height:100%;
|
|
|
|
| 641 |
@app.route("/health")
def health():
    """Return a JSON status document for the /health endpoint.

    The payload carries the module-level ``backend_ready`` flag and a fixed
    string identifying the backend type.
    """
    status = {}
    status["backend_ready"] = backend_ready
    status["type"] = "native-llama-server"
    return jsonify(status)
|
| 647 |
|
| 648 |
if __name__ == "__main__":
|