import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# --- Configuration ---
# 1. Update with your model's repo ID and file name
MODEL_REPO = "Kezovic/iris-f16gguf-test"  # Hugging Face Hub repo that hosts the GGUF file
MODEL_FILE = "llama-3.2-1b-instruct.F16.gguf"  # exact filename within that repo
# Adjust context window and other params as needed
CONTEXT_WINDOW = 4096  # tokens of context handed to llama.cpp (becomes n_ctx)
MAX_NEW_TOKENS = 512  # hard cap on tokens generated per reply
TEMPERATURE = 1.5  # NOTE(review): 1.5 is unusually high for chat — confirm this is intentional
# --- Model Loading Function ---
def load_llm():
    """Fetch the GGUF weights from the Hub and return a ready Llama instance."""
    print("Downloading model...")
    # hf_hub_download caches the file locally and hands back its filesystem path.
    weights_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

    # n_ctx sets the context window; n_threads=2 matches the free CPU-core
    # limit on Hugging Face Spaces.
    model = Llama(
        model_path=weights_path,
        n_ctx=CONTEXT_WINDOW,
        n_threads=2,
        verbose=False,  # flip to True when debugging
        min_p=0.1,
    )
    print("Model loaded successfully!")
    return model
# Load the model once at import time so every request reuses the same instance.
llm = load_llm()
# --- Inference Function ---
def generate(prompt, history):
    """Generate an assistant reply for *prompt*, conditioned on the chat history.

    Args:
        prompt: The user's latest message.
        history: Prior turns supplied by gr.ChatInterface — either
            (user, assistant) pairs or {"role", "content"} message dicts,
            depending on the Gradio version.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # Rebuild the running transcript so the model actually sees prior turns.
    # (Previously `history` was ignored, so the chat had no memory.)
    parts = []
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one dict per message.
            tag = "Human" if turn.get("role") == "user" else "Assistant"
            parts.append(f"### {tag}: {turn.get('content', '')}\n")
        else:
            # tuple/list format: (user_message, assistant_message) per turn.
            user_msg, bot_msg = turn
            if user_msg:
                parts.append(f"### Human: {user_msg}\n")
            if bot_msg:
                parts.append(f"### Assistant: {bot_msg}\n")
    parts.append(f"### Human: {prompt}\n### Assistant:")
    full_prompt = "".join(parts)

    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],  # stop generation at the next user turn
        echo=False,  # do not repeat the prompt in the completion
    )
    # llama-cpp-python returns an OpenAI-style completion dict.
    return output['choices'][0]['text'].strip()
# --- Gradio Interface ---
# Use the ChatInterface for a quick, functional chat UI
# Build the chat UI around the generate() callback, then start the server.
demo = gr.ChatInterface(
    fn=generate,
    title=f"Chat with {MODEL_FILE}",
    description="A GGUF LLM hosted on Hugging Face CPU Space using llama-cpp-python.",
)
demo.launch()