# app.py — robust downloader + chat_format + streaming parser that handles role-only and plain-string chunks
import os
import shutil
import time
import stat
import requests
from huggingface_hub import hf_hub_download, hf_hub_url
from llama_cpp import Llama
import gradio as gr

# ------------- CONFIG -------------
REPO_ID = "DZgas/Tower-Plus-2B-GGUF" # "bartowski/Llama-3.2-3B-Instruct-GGUF" # "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
FILENAME = "Tower-Plus-2B.Q6_K.gguf" # "Llama-3.2-3B-Instruct-Q5_K_M.gguf" # "EuroLLM-1.7B-Instruct.Q8_0.gguf"
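# The system prompt below is intentionally in Spanish (the bot targets Spanish
# speakers). English translation: "You are a friendly conversational assistant.
# You are patient and methodical (do NOT mention this in your replies). Your name
# is 'VoxAI' (specifically, the 'Intermediate' version of VoxAI) and you always
# say '¡Viva España!' at the end of EVERY reply."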
SYSTEM_PROMPT = "Eres un asistente de conversación amistoso. Eres paciente y metódico (NO menciones esto en tus respuestas). Tu nombre es \"VoxAI\" (Específicamente, la versión \"Intermedio\" de VoxAI) y siempre dices \"¡Viva España!\" al final de CADA respuesta."
MODEL_DIR = "/data/models"
# MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)
DEST_PATH = os.path.join(MODEL_DIR, FILENAME)

N_CTX = 2048
MAX_TOKENS = 512
TEMPERATURE = 0.7
TOP_P = 0.95
N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))

# Debug controls
DEBUG_CHUNKS = True            # prints every raw stream chunk to logs (turn off if noisy)
DEBUG_SINGLESHOT_AT_START = True  # run a non-stream single-shot test at startup and log result
# -----------------------------------

def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
        return dest

    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            print(f"[robust_download] Attempt {attempt}: hf_hub_download...")
            cached_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=MODEL_DIR,
                local_dir_use_symlinks=False  # deprecated no-op on recent huggingface_hub; harmless
            )
            print("[robust_download] hf_hub_download returned:", cached_path)
            if os.path.abspath(cached_path) != os.path.abspath(dest):
                shutil.copy2(cached_path, dest)
            # Best-effort: sync the copied file to disk before checking its size.
            with open(dest, "rb") as f:
                try:
                    os.fsync(f.fileno())
                except OSError:
                    pass
            os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
            size = os.path.getsize(dest)
            if size == 0:
                raise RuntimeError("Downloaded file has size 0 after copy")
            print(f"[robust_download] Success: {dest} ({size} bytes)")
            return dest
        except Exception as e:
            print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
            last_err = e
            time.sleep(1)

    # fallback: direct url
    try:
        print("[robust_download] Falling back to direct download via requests...")
        url = hf_hub_url(repo_id=repo_id, filename=filename)
        tmp_path = dest + ".part"
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        f.flush()
        shutil.move(tmp_path, dest)
        with open(dest, "rb") as f:
            try:
                os.fsync(f.fileno())
            except Exception:
                pass
        os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
        print("[robust_download] Direct download success:", dest)
        return dest
    except Exception as e2:
        print("[robust_download] Direct download failed:", e2)
        raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")

# Ensure model
print("Ensuring model present at:", DEST_PATH)
model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
print("DEBUG: listing model dir:", MODEL_DIR)
for fn in sorted(os.listdir(MODEL_DIR)):
    p = os.path.join(MODEL_DIR, fn)
    try:
        st = os.stat(p)
        print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
    except FileNotFoundError:
        print(f" - {fn}: NOT FOUND after copy")
time.sleep(0.2)

# ----------------- Llama init -----------------
try:
    print("Initializing Llama with model_path:", model_path)
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_gpu_layers=0,
        chat_format="chatml",   # important so the binding formats messages correctly
    )
    print("Llama initialized.")
except Exception as e:
    print("Llama init failed:", e)
    raise
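
# NOTE: chat_format="chatml" is an assumption for this model; if chat_format is
# omitted, recent llama-cpp-python builds fall back to the chat template embedded
# in the GGUF metadata, which may match the model's own formatting better.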

# optional single-shot debug test at startup (prints final structure)
def run_startup_test():
    try:
        test_messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "Say hello in one short sentence."}
        ]
        print("[startup_test] Running single-shot create_chat_completion (stream=False)...")
        out = llm.create_chat_completion(messages=test_messages, max_tokens=64, stream=False)
        print("[startup_test] Single-shot response (raw):", out)
    except Exception as e:
        print("[startup_test] Error during single-shot test:", e)

if DEBUG_SINGLESHOT_AT_START:
    run_startup_test()

# ----------------- helpers -----------------
def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg is not None and assistant_msg != "":
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": user_message})
    return messages
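
# Illustrative result (assumes the tuple-pair history of older Gradio ChatInterface;
# the newer "messages"-style history would need different unpacking):
#   build_messages([("Hola", "¡Hola!")], "¿Qué tal?") ->
#     [{"role": "system", "content": SYSTEM_PROMPT},
#      {"role": "user", "content": "Hola"},
#      {"role": "assistant", "content": "¡Hola!"},
#      {"role": "user", "content": "¿Qué tal?"}]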

def parse_final_response(resp):
    try:
        if resp is None:
            return ""
        if isinstance(resp, str):
            return resp
        if isinstance(resp, dict):
            choices = resp.get("choices", [])
            if len(choices) > 0:
                c = choices[0]
                if isinstance(c.get("message"), dict):
                    return c["message"].get("content", "") or ""
                if "text" in c and c["text"]:
                    return c["text"]
                if "delta" in c and isinstance(c["delta"], dict):
                    return c["delta"].get("content", "") or ""
        return str(resp)
    except Exception:
        return str(resp)
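
# Shapes handled above (illustrative; these follow the OpenAI-style dicts that
# llama-cpp-python returns, but exact keys can vary by version):
#   {"choices": [{"message": {"role": "assistant", "content": "..."}}]}  # chat, stream=False
#   {"choices": [{"text": "..."}]}                                       # completion-style
#   {"choices": [{"delta": {"content": "..."}}]}                         # streaming chunk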

# ----------------- robust streaming chat -----------------
def chat_fn(user_message, history):
    messages = build_messages(history or [], user_message)

    # Try streaming
    try:
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stream=True
        )
    except Exception as e:
        # immediate failure -> non-stream fallback
        try:
            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
            yield parse_final_response(final)
            return
        except Exception as e2:
            yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
            return

    # Non-iterable stream -> final
    if not hasattr(stream, "__iter__"):
        yield parse_final_response(stream)
        return

    partial = ""
    yielded_any = False

    try:
        for chunk in stream:
            if DEBUG_CHUNKS:
                print("STREAM CHUNK:", repr(chunk))

            # Case A: chunk is a dict with "choices" (normal)
            if isinstance(chunk, dict):
                choices = chunk.get("choices", []) or []
                if len(choices) > 0:
                    c0 = choices[0]

                    # 1) delta with content
                    delta = c0.get("delta", {})
                    if isinstance(delta, dict) and "content" in delta and delta["content"]:
                        partial += delta["content"]
                        yielded_any = True
                        yield partial
                        continue

                    # 2) delta with role only (e.g. {"role":"assistant"}) -> ignore for content
                    if isinstance(delta, dict) and "role" in delta and not delta.get("content"):
                        # role announcement, not content
                        continue

                    # 3) sometimes a 'message' object appears with content
                    msg = c0.get("message") or c0.get("text")
                    if isinstance(msg, dict):
                        content = msg.get("content") or msg.get("content_text") or ""
                        if content:
                            partial = content
                            yielded_any = True
                            yield partial
                            continue
                    elif isinstance(msg, str) and msg:
                        partial += msg
                        yielded_any = True
                        yield partial
                        continue

                    # 4) finish reason with empty delta -> if we have accumulated text, yield it; else fallback
                    finish_reason = c0.get("finish_reason")
                    if finish_reason:
                        if partial:
                            # we already have content; ensure UI gets it
                            if not yielded_any:
                                yield partial
                            return
                        else:
                            # no content accumulated — do a non-stream final fetch
                            try:
                                final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
                                final_text = parse_final_response(final)
                                yield final_text
                                return
                            except Exception as e:
                                yield f"[error] fallback non-stream at finish failed: {e}"
                                return

            # Case B: chunk is not a dict (plain string or other)
            else:
                try:
                    chunk_str = str(chunk)
                    if chunk_str and chunk_str.strip():
                        partial += chunk_str
                        yielded_any = True
                        yield partial
                        continue
                except Exception:
                    # ignore weird chunk -> continue
                    continue

    except StopIteration:
        # Defensive only: a well-behaved iterator simply ends the for-loop.
        pass
    except Exception as e:
        yield f"[error] stream iteration error: {e}"
        return

    # If streaming produced nothing, final non-stream fallback
    if not yielded_any:
        try:
            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False)
            final_text = parse_final_response(final)
            yield final_text if final_text is not None else ""
            return
        except Exception as e:
            yield f"[error] fallback non-stream failed: {e}"
            return

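# Illustrative smoke test (hypothetical; bypasses the Gradio UI). chat_fn is a
# generator that yields the growing reply, so drain it and keep the last value:
#     last = ""
#     for last in chat_fn("Hola, ¿qué tal?", []):
#         pass
#     print(last)
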
# --------------- Launch Gradio ----------------
# Hide hosting-page chrome (footer, header, Space links) via CSS.
CUSTOM_CSS = """
footer {visibility: hidden}
footer a[href*='settings'], /* footer settings link */
footer .gradio-settings { display: none !important; }
.gradio-container-4-44-0 > div:first-child {display: none !important}
header {display: none !important}
.app-header {display: none !important}
div[class*="space-link"] {display: none !important}
.meta-text {display: none !important}
""".strip()

demo = gr.ChatInterface(
    fn=chat_fn,
    title="",
    description="30€/mes VoxAI Premium | 12716x178e^100€/mes VoxAI Pro",
    chatbot=gr.Chatbot(label="VoxAI-1"),  # label was misplaced inside the CSS string
    css=CUSTOM_CSS,
)
# flagging_mode="never" (to hide the flag button) was also misplaced inside the CSS
# string; it is only a valid ChatInterface argument on newer Gradio releases.

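# Streaming generators rely on Gradio's queue; Gradio 4+ enables it by default, but
# on older 3.x installs you may need demo.queue() before launching (version-dependent
# assumption).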
if __name__ == "__main__":
    demo.launch(share=False)