import os

import requests
from huggingface_hub import hf_hub_download, hf_hub_url
from llama_cpp import Llama
import gradio as gr

# -------------------------
# Config: change if you want
# -------------------------
REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."
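
# Optional sketch, not part of the original config: the repo and file can also be
# overridden via environment variables, so a different GGUF can be tried without
# editing code. The variable names MODEL_REPO_ID / MODEL_FILENAME are assumptions;
# with the variables unset, the defaults above are used unchanged.
REPO_ID = os.environ.get("MODEL_REPO_ID", REPO_ID)
FILENAME = os.environ.get("MODEL_FILENAME", FILENAME)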

# Local path where we'll store the model
MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# -------------------------
# Helper: robust download
# -------------------------
def download_from_hf(repo_id: str, filename: str, dest: str) -> str:
    """Download using huggingface_hub if possible; fall back to a direct URL via requests."""
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        print(f"Model already exists at {dest}")
        return dest

    try:
        print("Trying hf_hub_download...")
        path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
        # hf_hub_download may return a cache path; move it to dest if needed
        if os.path.abspath(path) != os.path.abspath(dest):
            # move the cached file into our models folder with the expected name
            os.replace(path, dest)
            path = dest
        print("Downloaded via hf_hub_download:", path)
        return path
    except Exception as e:
        print("hf_hub_download failed:", e)

    # Fallback: construct the direct URL and download via requests
    try:
        print("Falling back to direct URL via requests...")
        url = hf_hub_url(repo_id=repo_id, filename=filename)
        # hf_hub_url returns the public Hub URL, which usually works for public repos.
        # If you have a direct URL (e.g. one ending in ?download=true), you can paste it here instead.
        print("Downloading from:", url)
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(dest, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print("Downloaded fallback to:", dest)
        return dest
    except Exception as e2:
        raise RuntimeError(f"Both hf_hub_download and direct download failed: {e2}") from e2
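
# Alternative sketch (assumption, not used above): recent huggingface_hub versions
# accept a local_dir argument, which writes the file straight into MODEL_DIR and
# avoids moving files out of the hub cache:
#
#     path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=MODEL_DIR)
#
# If your huggingface_hub version supports it, this can replace the cache_dir +
# os.replace steps inside download_from_hf.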

# -------------------------
# Ensure model is present
# -------------------------
model_path = download_from_hf(REPO_ID, FILENAME, MODEL_PATH)

# -------------------------
# Load the model (llama-cpp-python)
# -------------------------
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # lower if you need less memory
    n_threads=4,
    n_gpu_layers=0,   # CPU-only. If you have GPU layers available, adjust.
    # stream is set per-call in create_chat_completion below.
)
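
# Note (assumption, not from the original): n_threads=4 is a conservative default;
# on hosts with more cores, something like n_threads=os.cpu_count() usually speeds
# up CPU-only inference.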

# -------------------------
# Chat formatting helpers
# -------------------------
def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
    """
    Convert history (a list of [user, assistant] pairs) into the chat-messages format
    expected by create_chat_completion, then append the current user_message at the end.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # history is a list of [user, assistant] pairs
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg is not None and assistant_msg != "":
            messages.append({"role": "assistant", "content": assistant_msg})
    # now add the current user message
    messages.append({"role": "user", "content": user_message})
    return messages
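
# For example, with one prior exchange in the history, build_messages produces
# something like (illustrative values):
#
#     [{"role": "system", "content": SYSTEM_PROMPT},
#      {"role": "user", "content": "Hi"},
#      {"role": "assistant", "content": "Hello! How can I help?"},
#      {"role": "user", "content": "<current user_message>"}]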

# -------------------------
# Streaming generator for Gradio
# -------------------------
def chat_fn(user_message, history):
    """
    Gradio's ChatInterface accepts either a plain return value (the full reply string)
    or a generator that yields progressively longer partial strings. Here we stream
    partial assistant text as it arrives from create_chat_completion(..., stream=True).
    """
    # history is a list of [user, assistant] pairs from Gradio
    messages = build_messages(history or [], user_message)

    # create_chat_completion returns an iterator of chunks when stream=True
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.2,
        top_p=0.95,
        stream=True,
    )

    # accumulate incremental content and yield progressively longer replies
    partial = ""
    for chunk in stream:
        # chunk structure: {"id": ..., "object": "chat.completion.chunk",
        #                   "choices": [{"delta": {"content": "..."}}, ...]}
        try:
            if "choices" in chunk and len(chunk["choices"]) > 0:
                delta = chunk["choices"][0].get("delta", {})
                if "content" in delta:
                    partial += delta["content"]
                    yield partial
        except Exception:
            # ignore a malformed chunk and keep streaming
            continue
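
# Quick check outside Gradio (illustrative; run manually if you want to exercise the
# model without the UI). The last value yielded by chat_fn is the complete reply:
#
#     reply = ""
#     for reply in chat_fn("Say hello in one sentence.", []):
#         pass
#     print(reply)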

# -------------------------
# Launch Gradio
# -------------------------
demo = gr.ChatInterface(
    fn=chat_fn,
    title="EuroLLM 1.7B (GGUF) — streaming chat",
    description="Model: mradermacher/EuroLLM-1.7B-Instruct (Q8_0). System prompt enabled. Streaming ON.",
)
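
# Note (assumption): on older Gradio versions, streaming generators need the queue
# enabled, e.g. demo.queue() before launching; recent versions enable it by default.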

if __name__ == "__main__":
    demo.launch()