import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# --- Configuration ---
# 1. Update with your model's repo ID and file name
MODEL_REPO = "Kezovic/iris-f16gguf-test"
MODEL_FILE = "llama-3.2-1b-instruct.F16.gguf"

# Adjust the context window and other params as needed
CONTEXT_WINDOW = 4096
MAX_NEW_TOKENS = 512
TEMPERATURE = 1.5

# --- Model Loading Function ---
def load_llm():
    """Downloads the GGUF model and initializes llama-cpp-python."""
    print("Downloading model...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE
    )

    # Initialize the LLM with the downloaded model path.
    # n_ctx is the context window size.
    # n_threads=2 matches the two vCPU cores available on a free CPU Space.
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW,
        n_threads=2,
        verbose=False  # Set to True for debugging
    )
    print("Model loaded successfully!")
    return llm

# Load the model only once when the Space starts
llm = load_llm()

# --- Inference Function ---
def generate(prompt, history):
    """Generates a response using the Llama model."""
    # Use a basic prompt template (adjust for your model's specific format)
    full_prompt = f"### Human: {prompt}\n### Assistant:"

    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        min_p=0.1,            # min_p is a sampling parameter, so it belongs in the call, not the Llama() constructor
        stop=["### Human:"],  # Stop generation at the next user turn
        echo=False
    )

    # Extract the text from the response object
    response_text = output['choices'][0]['text'].strip()
    return response_text

# --- Gradio Interface ---
# Use ChatInterface for a quick, functional chat UI
gr.ChatInterface(
    generate,
    title=f"Chat with {MODEL_FILE}",
    description="A GGUF LLM hosted on a Hugging Face CPU Space using llama-cpp-python."
).launch()
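
# --- Requirements (assumed) ---
# A minimal sketch of the requirements.txt the Space would also need, inferred
# from the imports above (the file itself is not part of the original script):
#   gradio
#   llama-cpp-python
#   huggingface_hub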