import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

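# Model repository on the Hugging Face Hub and the GGUF file to download from it.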
MODEL_REPO = "Kezovic/iris-f16gguf-test"
MODEL_FILE = "llama-3.2-1b-instruct.F16.gguf"

CONTEXT_WINDOW = 4096   # prompt + completion token budget (passed as n_ctx)
MAX_NEW_TOKENS = 512    # cap on tokens generated per reply
TEMPERATURE = 0.7       # sampling temperature


def load_llm():
    """Download the GGUF model file and initialize llama-cpp-python."""
    print("Downloading model...")
    # hf_hub_download caches the file locally, so restarts reuse the download.
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
    )
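
    # n_threads=2 matches the two vCPUs of a typical free CPU Space (an
    # assumption about the hosting hardware); raise it on larger machines.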
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW,
        n_threads=2,
        verbose=False,
    )
    print("Model loaded successfully!")
    return llm


# Load the model once at startup so every chat request reuses the same
# in-memory instance.
llm = load_llm()


def generate(prompt, history):
    """Generate a response to `prompt`, conditioned on the chat history."""
    # Rebuild prior turns so the model keeps conversational context, then
    # append the new user message. Assumes Gradio's default pair-style
    # history: a list of [user_message, assistant_message] pairs.
    full_prompt = ""
    for user_msg, bot_msg in history:
        full_prompt += f"### Human: {user_msg}\n### Assistant: {bot_msg}\n"
    full_prompt += f"### Human: {prompt}\n### Assistant:"

    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False,
    )

    # llama-cpp-python returns an OpenAI-style completion dict.
    response_text = output["choices"][0]["text"].strip()
    return response_text
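
# Note: Llama 3.2 Instruct carries its own chat template in the GGUF metadata;
# llm.create_chat_completion(messages=[...]) should apply that native format
# instead of the generic "### Human:" framing used above.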


gr.ChatInterface(
    generate,
    title=f"Chat with {MODEL_FILE}",
    description="A GGUF LLM hosted on a Hugging Face CPU Space using llama-cpp-python.",
).launch()
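
# On a Hugging Face Space this script is typically saved as app.py, with
# gradio, llama-cpp-python, and huggingface_hub listed in requirements.txt.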