# app.py — robust downloader + chat_format + streaming parser that handles role-only and plain-string chunks
import os
import shutil
import time
import stat
import requests
from huggingface_hub import hf_hub_download, hf_hub_url
from llama_cpp import Llama
import gradio as gr
# ------------- CONFIG -------------
REPO_ID = "DZgas/Tower-Plus-2B-GGUF" # "bartowski/Llama-3.2-3B-Instruct-GGUF" # "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
FILENAME = "Tower-Plus-2B.Q6_K.gguf" # "Llama-3.2-3B-Instruct-Q5_K_M.gguf" # "EuroLLM-1.7B-Instruct.Q8_0.gguf"
SYSTEM_PROMPT = "Eres un asistente de conversación amistoso. Eres paciente y metódico (NO menciones esto en tus respuestas). Tu nombre es \"VoxAI\" (Específicamente, la versión \"Intermedio\" de VoxAI) y siempre dices \"¡Viva España!\" al final de CADA respuesta."
MODEL_DIR = "/data/models"
# MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)
DEST_PATH = os.path.join(MODEL_DIR, FILENAME)
N_CTX = 2048
MAX_TOKENS = 512
TEMPERATURE = 0.7
TOP_P = 0.95
N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
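# e.g. on a 2-vCPU basic Space this resolves to min(4, max(1, 2 // 2)) == 1 thread;
# on an 8-core machine it caps out at 4.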
# Debug controls
DEBUG_CHUNKS = True # prints every raw stream chunk to logs (turn off if noisy)
DEBUG_SINGLESHOT_AT_START = True # run a non-stream single-shot test at startup and log result
# -----------------------------------
def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
        return dest
    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            print(f"[robust_download] Attempt {attempt}: hf_hub_download...")
            cached_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=MODEL_DIR,
                local_dir_use_symlinks=False  # deprecated (and ignored) on recent huggingface_hub
            )
            print("[robust_download] hf_hub_download returned:", cached_path)
            if os.path.abspath(cached_path) != os.path.abspath(dest):
                shutil.copy2(cached_path, dest)
            # best-effort fsync so the bytes hit disk before llama.cpp mmaps the file
            with open(dest, "rb") as f:
                try:
                    os.fsync(f.fileno())
                except Exception:
                    pass
            os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
            size = os.path.getsize(dest)
            if size == 0:
                raise RuntimeError("Downloaded file has size 0 after copy")
            print(f"[robust_download] Success: {dest} ({size} bytes)")
            return dest
        except Exception as e:
            print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
            last_err = e
            time.sleep(1)
    # fallback: fetch the file directly over HTTPS
    try:
        print("[robust_download] Falling back to direct download via requests...")
        url = hf_hub_url(repo_id=repo_id, filename=filename)
        # download to a .part file first so an interrupted transfer never looks complete
        tmp_path = dest + ".part"
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                f.flush()
        shutil.move(tmp_path, dest)
        with open(dest, "rb") as f:
            try:
                os.fsync(f.fileno())
            except Exception:
                pass
        os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
        print("[robust_download] Direct download success:", dest)
        return dest
    except Exception as e2:
        print("[robust_download] Direct download failed:", e2)
        raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")
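# If the repo ever becomes gated, the requests fallback needs auth too; a minimal
# sketch, assuming an HF_TOKEN secret is configured on the Space (hypothetical here):
#   headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
#   with requests.get(url, headers=headers, stream=True, timeout=120) as r: ...
# (hf_hub_download picks a token up from the environment automatically.)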
# Ensure model
print("Ensuring model present at:", DEST_PATH)
model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
print("DEBUG: listing model dir:", MODEL_DIR)
for fn in sorted(os.listdir(MODEL_DIR)):
    p = os.path.join(MODEL_DIR, fn)
    try:
        st = os.stat(p)
        print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
    except FileNotFoundError:
        print(f" - {fn}: NOT FOUND after copy")
time.sleep(0.2)
# ----------------- Llama init -----------------
try:
    print("Initializing Llama with model_path:", model_path)
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_gpu_layers=0,  # CPU-only
        chat_format="chatml",  # important so the binding formats messages correctly
    )
    print("Llama initialized.")
except Exception as e:
    print("Llama init failed:", e)
    raise
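# Note: chat_format="chatml" overrides any chat template stored in the GGUF metadata.
# If this model ships its own template, dropping the argument and letting
# llama-cpp-python auto-detect it is worth trying (assumes a recent llama-cpp-python).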
# optional single-shot debug test at startup (prints final structure)
def run_startup_test():
    try:
        test_messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "Say hello in one short sentence."}
        ]
        print("[startup_test] Running single-shot create_chat_completion (stream=False)...")
        out = llm.create_chat_completion(messages=test_messages, max_tokens=64, stream=False)
        print("[startup_test] Single-shot response (raw):", out)
    except Exception as e:
        print("[startup_test] Error during single-shot test:", e)

if DEBUG_SINGLESHOT_AT_START:
    run_startup_test()
# ----------------- helpers -----------------
def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
    # history arrives in Gradio's default tuple format: a list of [user, assistant] pairs
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg is not None and assistant_msg != "":
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": user_message})
    return messages
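# For example, with one prior exchange in history, build_messages yields a list like
# (illustrative shape, not captured output):
#   [{"role": "system", "content": SYSTEM_PROMPT},
#    {"role": "user", "content": "Hola"},
#    {"role": "assistant", "content": "¡Hola! ¿En qué puedo ayudarte? ¡Viva España!"},
#    {"role": "user", "content": "¿Qué tal?"}]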
def parse_final_response(resp):
    try:
        if resp is None:
            return ""
        if isinstance(resp, str):
            return resp
        if isinstance(resp, dict):
            choices = resp.get("choices", [])
            if len(choices) > 0:
                c = choices[0]
                if isinstance(c.get("message"), dict):
                    return c["message"].get("content", "") or ""
                if "text" in c and c["text"]:
                    return c["text"]
                if "delta" in c and isinstance(c["delta"], dict):
                    return c["delta"].get("content", "") or ""
        return str(resp)
    except Exception:
        return str(resp)
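# A non-stream create_chat_completion result is an OpenAI-style dict; roughly
# (fields vary across llama-cpp-python versions):
#   {"choices": [{"message": {"role": "assistant", "content": "..."},
#                 "finish_reason": "stop"}], "usage": {...}}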
# ----------------- robust streaming chat -----------------
def chat_fn(user_message, history):
    messages = build_messages(history or [], user_message)
    # Try streaming
    try:
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stream=True
        )
    except Exception as e:
        # immediate failure -> non-stream fallback
        try:
            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
            yield parse_final_response(final)
            return
        except Exception as e2:
            yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
            return
    # Non-iterable stream -> final
    if not hasattr(stream, "__iter__"):
        yield parse_final_response(stream)
        return
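    # Streamed chunks arrive in the OpenAI-style delta format; shapes seen in practice
    # (illustrative, and they vary with the llama-cpp-python version):
    #   {"choices": [{"delta": {"role": "assistant"}, "finish_reason": None}]}  # role only
    #   {"choices": [{"delta": {"content": "Hola"}, "finish_reason": None}]}    # content piece
    #   {"choices": [{"delta": {}, "finish_reason": "stop"}]}                   # end of stream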
partial = ""
yielded_any = False
try:
for chunk in stream:
if DEBUG_CHUNKS:
print("STREAM CHUNK:", repr(chunk))
# Case A: chunk is a dict with "choices" (normal)
if isinstance(chunk, dict):
choices = chunk.get("choices", []) or []
if len(choices) > 0:
c0 = choices[0]
# 1) delta with content
delta = c0.get("delta", {})
if isinstance(delta, dict) and "content" in delta and delta["content"]:
partial += delta["content"]
yielded_any = True
yield partial
continue
# 2) delta with role only (e.g. {"role":"assistant"}) -> ignore for content
if isinstance(delta, dict) and "role" in delta and not delta.get("content"):
# role announcement, not content
continue
# 3) sometimes a 'message' object appears with content
msg = c0.get("message") or c0.get("text")
if isinstance(msg, dict):
content = msg.get("content") or msg.get("content_text") or ""
if content:
partial = content
yielded_any = True
yield partial
continue
elif isinstance(msg, str) and msg:
partial += msg
yielded_any = True
yield partial
continue
# 4) finish reason with empty delta -> if we have accumulated text, yield it; else fallback
finish_reason = c0.get("finish_reason")
if finish_reason:
if partial:
# we already have content; ensure UI gets it
if not yielded_any:
yield partial
return
else:
# no content accumulated — do a non-stream final fetch
try:
final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
final_text = parse_final_response(final)
yield final_text
return
except Exception as e:
yield f"[error] fallback non-stream at finish failed: {e}"
return
# Case B: chunk is not a dict (plain string or other)
else:
try:
chunk_str = str(chunk)
if chunk_str and chunk_str.strip():
partial += chunk_str
yielded_any = True
yield partial
continue
except Exception:
# ignore weird chunk -> continue
continue
except StopIteration:
pass
except Exception as e:
yield f"[error] stream iteration error: {e}"
return
# If streaming produced nothing, final non-stream fallback
if not yielded_any:
try:
final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False)
final_text = parse_final_response(final)
yield final_text if final_text is not None else ""
return
except Exception as e:
yield f"[error] fallback non-stream failed: {e}"
return
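# Design note: each yield sends the *accumulated* text rather than the delta — Gradio's
# ChatInterface replaces the in-progress assistant message with whatever the generator
# yields, so cumulative strings render as a smoothly growing reply.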
# --------------- Launch Gradio ----------------
# CSS to hide the Spaces chrome (footer, header, settings links)
stylings = """
footer {visibility: hidden}
footer a[href*='settings'], /* footer settings link */
footer .gradio-settings { display: none !important; } /* specific classes if they exist */
.gradio-container-4-44-0 > div:first-child {display: none !important}
header {display: none !important}
.app-header {display: none !important}
div[class*="space-link"] {display: none !important}
.meta-text {display: none !important}
"""
stylings = stylings.strip()
demo = gr.ChatInterface(
    fn=chat_fn,
    title="",
    description="30€/mes VoxAI Premium | 12716x178e^100€/mes VoxAI Pro",
    chatbot=gr.Chatbot(label="VoxAI-1"),  # change the chat window label
    # flagging_mode="never",  # removes the flag button, but only exists on newer Gradio releases
    css=stylings,
)
if __name__ == "__main__":
    demo.launch(share=False)
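# Note: on older Gradio versions generator streaming needs the request queue; if tokens
# ever stop streaming, demo.queue().launch(share=False) is the usual fix (on Gradio 4+
# the queue is enabled by default, so a plain launch() is fine there).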