# app.py — robust downloader + chat_format + streaming parser that handles role-only and plain-string chunks
import os
import shutil
import time
import stat
import requests
from huggingface_hub import hf_hub_download, hf_hub_url
from llama_cpp import Llama
import gradio as gr
# ------------- CONFIG -------------
REPO_ID = "DZgas/Tower-Plus-2B-GGUF" # "bartowski/Llama-3.2-3B-Instruct-GGUF" # "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
FILENAME = "Tower-Plus-2B.Q6_K.gguf" # "Llama-3.2-3B-Instruct-Q5_K_M.gguf" # "EuroLLM-1.7B-Instruct.Q8_0.gguf"
SYSTEM_PROMPT = "Eres un asistente de conversación amistoso. Eres paciente y metódico (NO menciones esto en tus respuestas). Tu nombre es \"VoxAI\" (Específicamente, la versión \"Intermedio\" de VoxAI) y siempre dices \"¡Viva España!\" al final de CADA respuesta."
MODEL_DIR = "/data/models"
# MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)
DEST_PATH = os.path.join(MODEL_DIR, FILENAME)
N_CTX = 2048
MAX_TOKENS = 512
TEMPERATURE = 0.7
TOP_P = 0.95
N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
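# Thread heuristic: at most 4 llama.cpp threads, at least 1, roughly half the visible CPU cores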
# Debug controls
DEBUG_CHUNKS = True # prints every raw stream chunk to logs (turn off if noisy)
DEBUG_SINGLESHOT_AT_START = True # run a non-stream single-shot test at startup and log result
# -----------------------------------
def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:
if os.path.exists(dest) and os.path.getsize(dest) > 0:
print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
return dest
last_err = None
for attempt in range(1, max_attempts + 1):
try:
print(f"[robust_download] Attempt {attempt}: hf_hub_download...")
cached_path = hf_hub_download(
repo_id=repo_id,
filename=filename,
local_dir=MODEL_DIR,
local_dir_use_symlinks=False
)
print("[robust_download] hf_hub_download returned:", cached_path)
if os.path.abspath(cached_path) != os.path.abspath(dest):
shutil.copy2(cached_path, dest)
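# Re-open the copy and fsync so the bytes are on disk before we stat and load it (best-effort; failures are ignored)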
with open(dest, "rb") as f:
try:
f.flush()
os.fsync(f.fileno())
except Exception:
pass
os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
size = os.path.getsize(dest)
if size == 0:
raise RuntimeError("Downloaded file has size 0 after copy")
print(f"[robust_download] Success: {dest} ({size} bytes)")
return dest
except Exception as e:
print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
last_err = e
time.sleep(1)
# fallback: direct url
try:
print("[robust_download] Falling back to direct download via requests...")
url = hf_hub_url(repo_id=repo_id, filename=filename)
tmp_path = dest + ".part"
with requests.get(url, stream=True, timeout=120) as r:
r.raise_for_status()
with open(tmp_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
f.flush()
shutil.move(tmp_path, dest)
with open(dest, "rb") as f:
try:
os.fsync(f.fileno())
except Exception:
pass
os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
print("[robust_download] Direct download success:", dest)
return dest
except Exception as e2:
print("[robust_download] Direct download failed:", e2)
raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")
# Ensure model
print("Ensuring model present at:", DEST_PATH)
model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
print("DEBUG: listing model dir:", MODEL_DIR)
for fn in sorted(os.listdir(MODEL_DIR)):
p = os.path.join(MODEL_DIR, fn)
try:
st = os.stat(p)
print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
except FileNotFoundError:
print(f" - {fn}: NOT FOUND after copy")
time.sleep(0.2)
# ----------------- Llama init -----------------
try:
print("Initializing Llama with model_path:", model_path)
llm = Llama(
model_path=model_path,
n_ctx=N_CTX,
n_threads=N_THREADS,
n_gpu_layers=0,
chat_format="chatml", # important so the binding formats messages correctly
)
print("Llama initialized.")
except Exception as e:
print("Llama init failed:", e)
raise
# optional single-shot debug test at startup (prints final structure)
def run_startup_test():
try:
test_messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": "Say hello in one short sentence."}
]
print("[startup_test] Running single-shot create_chat_completion (stream=False)...")
out = llm.create_chat_completion(messages=test_messages, max_tokens=64, stream=False)
print("[startup_test] Single-shot response (raw):", out)
except Exception as e:
print("[startup_test] Error during single-shot test:", e)
if DEBUG_SINGLESHOT_AT_START:
run_startup_test()
# ----------------- helpers -----------------
def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
for user_msg, assistant_msg in history:
messages.append({"role": "user", "content": user_msg})
if assistant_msg is not None and assistant_msg != "":
messages.append({"role": "assistant", "content": assistant_msg})
messages.append({"role": "user", "content": user_message})
return messages
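# Example (Gradio tuple-style history assumed):
#   build_messages([("Hola", "¡Hola!")], "¿Quién eres?")
#   -> [{"role": "system", "content": SYSTEM_PROMPT},
#       {"role": "user", "content": "Hola"},
#       {"role": "assistant", "content": "¡Hola!"},
#       {"role": "user", "content": "¿Quién eres?"}]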
def parse_final_response(resp):
try:
if resp is None:
return ""
if isinstance(resp, str):
return resp
if isinstance(resp, dict):
choices = resp.get("choices", [])
if len(choices) > 0:
c = choices[0]
if isinstance(c.get("message"), dict):
return c["message"].get("content", "") or ""
if "text" in c and c["text"]:
return c["text"]
if "delta" in c and isinstance(c["delta"], dict):
return c["delta"].get("content", "") or ""
return str(resp)
except Exception:
return str(resp)
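# Typical non-stream llama-cpp-python result this unpacks (OpenAI-style):
#   {"choices": [{"message": {"role": "assistant", "content": "..."}, "finish_reason": "stop"}], ...}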
# ----------------- robust streaming chat -----------------
def chat_fn(user_message, history):
messages = build_messages(history or [], user_message)
# Try streaming
try:
stream = llm.create_chat_completion(
messages=messages,
max_tokens=MAX_TOKENS,
temperature=TEMPERATURE,
top_p=TOP_P,
stream=True
)
except Exception as e:
# immediate failure -> non-stream fallback
try:
final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
yield parse_final_response(final)
return
except Exception as e2:
yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
return
# Non-iterable stream -> final
if not hasattr(stream, "__iter__"):
yield parse_final_response(stream)
return
partial = ""
yielded_any = False
try:
for chunk in stream:
if DEBUG_CHUNKS:
print("STREAM CHUNK:", repr(chunk))
# Case A: chunk is a dict with "choices" (normal)
if isinstance(chunk, dict):
choices = chunk.get("choices", []) or []
if len(choices) > 0:
c0 = choices[0]
# 1) delta with content
delta = c0.get("delta", {})
if isinstance(delta, dict) and "content" in delta and delta["content"]:
partial += delta["content"]
yielded_any = True
yield partial
continue
# 2) delta with role only (e.g. {"role":"assistant"}) -> ignore for content
if isinstance(delta, dict) and "role" in delta and not delta.get("content"):
# role announcement, not content
continue
# 3) sometimes a 'message' object appears with content
msg = c0.get("message") or c0.get("text")
if isinstance(msg, dict):
content = msg.get("content") or msg.get("content_text") or ""
if content:
partial = content
yielded_any = True
yield partial
continue
elif isinstance(msg, str) and msg:
partial += msg
yielded_any = True
yield partial
continue
# 4) finish reason with empty delta -> if we have accumulated text, yield it; else fallback
finish_reason = c0.get("finish_reason")
if finish_reason:
if partial:
# we already have content; ensure UI gets it
if not yielded_any:
yield partial
return
else:
# no content accumulated — do a non-stream final fetch
try:
final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
final_text = parse_final_response(final)
yield final_text
return
except Exception as e:
yield f"[error] fallback non-stream at finish failed: {e}"
return
# Case B: chunk is not a dict (plain string or other)
else:
try:
chunk_str = str(chunk)
if chunk_str and chunk_str.strip():
partial += chunk_str
yielded_any = True
yield partial
continue
except Exception:
# ignore weird chunk -> continue
continue
except StopIteration:
pass
except Exception as e:
yield f"[error] stream iteration error: {e}"
return
# If streaming produced nothing, final non-stream fallback
if not yielded_any:
try:
final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False)
final_text = parse_final_response(final)
yield final_text if final_text is not None else ""
return
except Exception as e:
yield f"[error] fallback non-stream failed: {e}"
return
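# Optional local smoke test for chat_fn -- a minimal sketch. VOXAI_SMOKE_TEST is a hypothetical
# environment variable (not defined by Gradio or llama_cpp); when set, this drives the generator
# the same way gr.ChatInterface would and prints the last partial string it received.
if os.environ.get("VOXAI_SMOKE_TEST"):
    _smoke_reply = ""
    for _smoke_reply in chat_fn("Di hola en una frase corta.", []):
        pass
    print("[smoke_test] final reply:", _smoke_reply)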
# --------------- Launch Gradio ----------------
stylings = """
footer a[href*='settings'], /* Footer settings link */
footer .gradio-settings { display: none !important; } /* Specific classes if they exist */
"""
stylings = stylings.strip()
demo = gr.ChatInterface(
    fn=chat_fn,
    title="",
    description="30€/mes VoxAI Premium | 12716x178e^100€/mes VoxAI Pro",
    chatbot=gr.Chatbot(label="VoxAI-1"),  # change the chatbot label
    flagging_mode="never",  # removes the flag/share button (Gradio 5+ ChatInterface option)
    css=stylings + """
    footer {visibility: hidden}
    .gradio-container-4-44-0 > div:first-child {display: none !important}
    header {display: none !important}
    .app-header {display: none !important}
    div[class*="space-link"] {display: none !important}
    .meta-text {display: none !important}
    """,
)
if __name__ == "__main__":
demo.launch(share=False)