```python
import os

import requests
import gradio as gr
from llama_cpp import Llama

MODEL_URL = "https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF/resolve/main/Ministral-3b-instruct.Q4_1.gguf?download=true"  # truncated for clarity
MODEL_PATH = "Ministral-3b-instruct.Q4_1.gguf"

# Download the model weights on first run.
if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Model downloaded.")

# Load the model, tuned for CPU-only inference.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,      # reduced context window to limit memory use
    n_threads=2,     # modest thread count for a shared CPU
    n_gpu_layers=0,  # CPU only: offload no layers to a GPU
    chat_format="chatml",
)

def chat_interface(message, history):
    if history is None:
        history = []
    # Rebuild the ChatML message list from Gradio's (user, bot) history pairs.
    chat_prompt = []
    for user_msg, bot_msg in history:
        chat_prompt.append({"role": "user", "content": user_msg})
        chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})

    response = llm.create_chat_completion(messages=chat_prompt, stream=False)
    # gr.ChatInterface tracks the history itself, so return only the reply text.
    return response["choices"][0]["message"]["content"]

gr.ChatInterface(fn=chat_interface, title="Ministral 3B Chat").launch()
```
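Because `create_chat_completion` is called with `stream=False`, the UI shows nothing until the full reply is generated, which can feel slow on a 2-thread CPU. A streaming variant is a small change: llama-cpp-python can yield partial chunks in the OpenAI-style delta format, and `gr.ChatInterface` accepts a generator function that yields the reply-so-far. A minimal sketch, assuming the standard chunk layout where incremental text arrives under `choices[0]["delta"]["content"]` (`chat_interface_streaming` is a hypothetical name, not part of the original Space):

```python
def chat_interface_streaming(message, history):
    # Same prompt construction as above.
    chat_prompt = []
    for user_msg, bot_msg in (history or []):
        chat_prompt.append({"role": "user", "content": user_msg})
        chat_prompt.append({"role": "assistant", "content": bot_msg})
    chat_prompt.append({"role": "user", "content": message})

    reply = ""
    # With stream=True, llama-cpp-python yields chunks as they are generated;
    # incremental text (when present) sits under choices[0]["delta"]["content"].
    for chunk in llm.create_chat_completion(messages=chat_prompt, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            reply += delta["content"]
            yield reply  # gr.ChatInterface re-renders the growing reply

# Swap in the generator as the fn to enable token-by-token display:
# gr.ChatInterface(fn=chat_interface_streaming, title="Ministral 3B Chat").launch()
```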