import os
import requests
from huggingface_hub import hf_hub_download, hf_hub_url
from llama_cpp import Llama
import gradio as gr
# -------------------------
# Config: change if you want
# -------------------------
REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."
# local path we'll store the model
MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
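# Rough sizing note (approximate, not from the model card): a Q8_0 quant of a
# 1.7B-parameter model is on the order of ~1.8 GB on disk, so make sure the
# environment has enough free space before the first download.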
# -------------------------
# Helper: robust download
# -------------------------
def download_from_hf(repo_id: str, filename: str, dest: str) -> str:
    """Download via huggingface_hub if possible; fall back to a direct URL via requests."""
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        print(f"Model already exists at {dest}")
        return dest
    try:
        print("Trying hf_hub_download...")
        path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
        # hf_hub_download may return a path inside its cache (often a symlink);
        # resolve it and move the real file into our models folder under the expected name.
        if os.path.abspath(path) != os.path.abspath(dest):
            os.replace(os.path.realpath(path), dest)
            path = dest
        print("Downloaded via hf_hub_download:", path)
        return path
    except Exception as e:
        print("hf_hub_download failed:", e)
        # Fallback: construct the direct URL and download it via requests.
        try:
            print("Falling back to direct URL via requests...")
            url = hf_hub_url(repo_id=repo_id, filename=filename)
            # This is the public Hub URL; it works for public repos without auth.
            # If you have a direct "?download=true" URL, you can paste it here instead.
            print("Downloading from:", url)
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(dest, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            print("Downloaded fallback to:", dest)
            return dest
        except Exception as e2:
            raise RuntimeError(f"Both hf_hub_download and direct download failed: {e2}") from e2
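# For reference, the direct-URL fallback follows the usual Hub "resolve" pattern
# (placeholders below are hypothetical, not values from this app):
#   https://huggingface.co/<repo_id>/resolve/main/<filename>?download=true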
# -------------------------
# Ensure model is present
# -------------------------
model_path = download_from_hf(REPO_ID, FILENAME, MODEL_PATH)
# -------------------------
# Load the model (llama-cpp-python)
# -------------------------
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # lower if you need less memory
    n_threads=4,
    n_gpu_layers=0,   # CPU-only. If you have GPU layers available, adjust.
    # stream is set per-call in create_chat_completion below.
)
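# Optional one-off smoke test (left commented out to keep startup fast); a minimal
# sketch, assuming the model loaded correctly:
# print(llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hello"}],
#     max_tokens=16,
# )["choices"][0]["message"]["content"])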
# -------------------------
# Chat formatting helpers
# -------------------------
def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
    """
    Convert history (a list of [user, assistant] pairs) into the chat-messages format
    expected by create_chat_completion, then append the current user_message at the end.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # history is a list of [user, assistant] pairs
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg is not None and assistant_msg != "":
            messages.append({"role": "assistant", "content": assistant_msg})
    # finally, add the current user message
    messages.append({"role": "user", "content": user_message})
    return messages
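# For example, build_messages([["Hi", "Hello!"]], "How are you?") produces (sketch):
#   [{"role": "system", "content": SYSTEM_PROMPT},
#    {"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"},
#    {"role": "user", "content": "How are you?"}]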
# -------------------------
# Streaming generator for Gradio
# -------------------------
def chat_fn(user_message, history):
    """
    Gradio ChatInterface accepts either a plain reply string or a generator that
    yields partial strings. Here we stream partial assistant text as it arrives from
    llama-cpp-python's create_chat_completion(..., stream=True).
    """
    # history is a list of [user, assistant] pairs from Gradio
    messages = build_messages(history or [], user_message)
    # create_chat_completion returns an iterator when stream=True
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.2,
        top_p=0.95,
        stream=True,
    )
    # accumulate incremental content and yield progressively longer replies
    partial = ""
    for chunk in stream:
        # chunk structure: {"id": ..., "object": "chat.completion.chunk",
        #                   "choices": [{"delta": {"content": "..."}}, ...]}
        try:
            if "choices" in chunk and len(chunk["choices"]) > 0:
                delta = chunk["choices"][0].get("delta", {})
                if "content" in delta:
                    partial += delta["content"]
                    yield partial
        except Exception:
            # ignore a malformed chunk and keep streaming
            continue
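# Note: each yield sends the full assistant reply accumulated so far (not just the
# new delta), which is what gr.ChatInterface expects from a streaming generator.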
# -------------------------
# Launch Gradio
# -------------------------
demo = gr.ChatInterface(
    fn=chat_fn,
    title="EuroLLM 1.7B (GGUF) — streaming chat",
    description="Model: mradermacher/EuroLLM-1.7B-Instruct (Q8_0). System prompt enabled. Streaming ON.",
)
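# Note: on older Gradio 3.x, generator-based streaming requires demo.queue() before
# launch(); recent Gradio versions enable queuing by default.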
if __name__ == "__main__":
    demo.launch()