Update app.py
Browse files
app.py
CHANGED
|
@@ -8,8 +8,11 @@ import asyncio
|
|
| 8 |
from pathlib import Path
|
| 9 |
from flask import Flask, request, jsonify, send_from_directory, Response
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
from huggingface_hub import hf_hub_download
|
| 14 |
|
| 15 |
import edge_tts
|
|
@@ -24,9 +27,9 @@ TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
|
|
| 24 |
TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
|
| 25 |
IMG_DIR = Path(__file__).parent / "img"
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
GGUF_REPO = os.environ.get("GGUF_REPO", "
|
| 29 |
-
GGUF_FILE = os.environ.get("GGUF_FILE", "
|
| 30 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 31 |
|
| 32 |
# ──────────────────────────────────────────────────────────────────
|
|
@@ -73,10 +76,10 @@ def clean_for_tts(text: str) -> str:
|
|
| 73 |
return clean
|
| 74 |
|
| 75 |
# ──────────────────────────────────────────────────────────────────
|
| 76 |
-
# MODEL LOADING (
|
| 77 |
# ──────────────────────────────────────────────────────────────────
|
| 78 |
print("=" * 60)
|
| 79 |
-
print(" Visual AI -- Booting Systems (
|
| 80 |
print("=" * 60)
|
| 81 |
|
| 82 |
model = None
|
|
@@ -85,15 +88,22 @@ try:
|
|
| 85 |
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 86 |
|
| 87 |
print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
|
| 88 |
-
hf_hub_download(
|
| 89 |
repo_id=GGUF_REPO,
|
| 90 |
filename=GGUF_FILE,
|
| 91 |
local_dir=str(MODEL_DIR),
|
| 92 |
local_dir_use_symlinks=False
|
| 93 |
)
|
| 94 |
|
| 95 |
-
print(f"[MODEL] Loading {GGUF_FILE} on CPU ...")
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
print(" OK Model loaded successfully!")
|
| 98 |
except Exception as exc:
|
| 99 |
print(f" FAILED Model load error: {exc}")
|
|
@@ -121,7 +131,7 @@ def add_to_memory(sid: str, role: str, content: str):
|
|
| 121 |
# ──────────────────────────────────────────────────────────────────
|
| 122 |
STOP_TOKENS = [
|
| 123 |
"<end_of_turn>", "<start_of_turn>",
|
| 124 |
-
"User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>"
|
| 125 |
]
|
| 126 |
|
| 127 |
def generate_response(user_input: str, session_id: str) -> str:
|
|
@@ -131,7 +141,7 @@ def generate_response(user_input: str, session_id: str) -> str:
|
|
| 131 |
memory = get_memory(session_id)
|
| 132 |
recent = memory[-(6 * 2):]
|
| 133 |
|
| 134 |
-
# Build prompt string
|
| 135 |
prompt = f"System: {SYSTEM_PROMPT}\n\n"
|
| 136 |
for msg in recent:
|
| 137 |
label = "User" if msg["role"] == "user" else "Ana"
|
|
@@ -139,28 +149,24 @@ def generate_response(user_input: str, session_id: str) -> str:
|
|
| 139 |
prompt += f"User: {user_input}\nAna:"
|
| 140 |
|
| 141 |
try:
|
| 142 |
-
#
|
| 143 |
-
|
| 144 |
prompt=prompt,
|
| 145 |
max_tokens=MAX_NEW_TOKENS,
|
| 146 |
-
|
| 147 |
top_k=50,
|
| 148 |
top_p=0.95,
|
| 149 |
repeat_penalty=1.1,
|
| 150 |
-
|
|
|
|
| 151 |
)
|
|
|
|
| 152 |
except Exception as exc:
|
| 153 |
print(f"[GENERATE] Error: {exc}")
|
| 154 |
traceback.print_exc()
|
| 155 |
return "[sad] Something went wrong in my mind. Could you say that again?"
|
| 156 |
|
| 157 |
-
response = response.strip()
|
| 158 |
-
|
| 159 |
# Post-process cleanup
|
| 160 |
-
for stop in STOP_TOKENS:
|
| 161 |
-
if stop in response:
|
| 162 |
-
response = response.split(stop)[0].strip()
|
| 163 |
-
|
| 164 |
if "\n\n" in response:
|
| 165 |
response = response.split("\n\n")[0].strip()
|
| 166 |
|
|
@@ -602,7 +608,7 @@ def clear():
|
|
| 602 |
def health():
|
| 603 |
return jsonify({
|
| 604 |
"model_loaded": model is not None,
|
| 605 |
-
"backend": "
|
| 606 |
})
|
| 607 |
|
| 608 |
if __name__ == "__main__":
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from flask import Flask, request, jsonify, send_from_directory, Response
|
| 10 |
|
| 11 |
+
# ──────────────────────────────────────────────────────────────────
|
| 12 |
+
# LLAMA.CPP BACKEND IMPORTS (Universal GGUF Support)
|
| 13 |
+
# ──────────────────────────────────────────────────────────────────
|
| 14 |
+
# Install via: pip install llama-cpp-python huggingface-hub
|
| 15 |
+
from llama_cpp import Llama
|
| 16 |
from huggingface_hub import hf_hub_download
|
| 17 |
|
| 18 |
import edge_tts
|
|
|
|
| 27 |
TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
|
| 28 |
IMG_DIR = Path(__file__).parent / "img"
|
| 29 |
|
| 30 |
+
# Default model; set the GGUF_REPO / GGUF_FILE environment variables to use any other GGUF model supported by llama.cpp.
|
| 31 |
+
GGUF_REPO = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
|
| 32 |
+
GGUF_FILE = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
|
| 33 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 34 |
|
| 35 |
# ──────────────────────────────────────────────────────────────────
|
|
|
|
| 76 |
return clean
|
| 77 |
|
| 78 |
# ──────────────────────────────────────────────────────────────────
|
| 79 |
+
# MODEL LOADING (llama.cpp CPU)
|
| 80 |
# ──────────────────────────────────────────────────────────────────
|
| 81 |
print("=" * 60)
|
| 82 |
+
print(" Visual AI -- Booting Systems (llama.cpp Backend)")
|
| 83 |
print("=" * 60)
|
| 84 |
|
| 85 |
model = None
|
|
|
|
| 88 |
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 89 |
|
| 90 |
print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
|
| 91 |
+
model_path = hf_hub_download(
|
| 92 |
repo_id=GGUF_REPO,
|
| 93 |
filename=GGUF_FILE,
|
| 94 |
local_dir=str(MODEL_DIR),
|
| 95 |
local_dir_use_symlinks=False
|
| 96 |
)
|
| 97 |
|
| 98 |
+
print(f"[MODEL] Loading {GGUF_FILE} on CPU with llama.cpp ...")
|
| 99 |
+
# n_ctx determines context length.
|
| 100 |
+
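# n_threads is set explicitly to (CPU count - 1), with a minimum of 1. NOTE(review): os.cpu_count() can return None, which would make the subtraction raise TypeError — consider `(os.cpu_count() or 2) - 1`.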
|
| 101 |
+
model = Llama(
|
| 102 |
+
model_path=model_path,
|
| 103 |
+
n_ctx=4096,
|
| 104 |
+
n_threads=max(1, os.cpu_count() - 1),
|
| 105 |
+
verbose=False # Set to True to see llama.cpp debug logs
|
| 106 |
+
)
|
| 107 |
print(" OK Model loaded successfully!")
|
| 108 |
except Exception as exc:
|
| 109 |
print(f" FAILED Model load error: {exc}")
|
|
|
|
| 131 |
# ──────────────────────────────────────────────────────────────────
|
| 132 |
STOP_TOKENS = [
|
| 133 |
"<end_of_turn>", "<start_of_turn>",
|
| 134 |
+
"User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>", "\nUser:"
|
| 135 |
]
|
| 136 |
|
| 137 |
def generate_response(user_input: str, session_id: str) -> str:
|
|
|
|
| 141 |
memory = get_memory(session_id)
|
| 142 |
recent = memory[-(6 * 2):]
|
| 143 |
|
| 144 |
+
# Build prompt string
|
| 145 |
prompt = f"System: {SYSTEM_PROMPT}\n\n"
|
| 146 |
for msg in recent:
|
| 147 |
label = "User" if msg["role"] == "user" else "Ana"
|
|
|
|
| 149 |
prompt += f"User: {user_input}\nAna:"
|
| 150 |
|
| 151 |
try:
|
| 152 |
+
# llama.cpp Generation
|
| 153 |
+
output = model.create_completion(
|
| 154 |
prompt=prompt,
|
| 155 |
max_tokens=MAX_NEW_TOKENS,
|
| 156 |
+
temperature=0.90,
|
| 157 |
top_k=50,
|
| 158 |
top_p=0.95,
|
| 159 |
repeat_penalty=1.1,
|
| 160 |
+
stop=STOP_TOKENS,
|
| 161 |
+
echo=False
|
| 162 |
)
|
| 163 |
+
response = output["choices"][0]["text"].strip()
|
| 164 |
except Exception as exc:
|
| 165 |
print(f"[GENERATE] Error: {exc}")
|
| 166 |
traceback.print_exc()
|
| 167 |
return "[sad] Something went wrong in my mind. Could you say that again?"
|
| 168 |
|
|
|
|
|
|
|
| 169 |
# Post-process cleanup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
if "\n\n" in response:
|
| 171 |
response = response.split("\n\n")[0].strip()
|
| 172 |
|
|
|
|
| 608 |
def health():
|
| 609 |
return jsonify({
|
| 610 |
"model_loaded": model is not None,
|
| 611 |
+
"backend": "llama.cpp (CPU GGUF)",
|
| 612 |
})
|
| 613 |
|
| 614 |
if __name__ == "__main__":
|