import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF weights from the Hugging Face Hub repo
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)

# Load the model with llama.cpp (CPU-only: n_gpu_layers=0)
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    chat_format=None,  # we build the raw prompt ourselves below
    n_gpu_layers=0,
)

# Build the inference prompt in the Llama 3 chat template used for
# fine-tuning. llama.cpp prepends <|begin_of_text|> (BOS) when tokenizing,
# so it is omitted here; each turn is closed with <|eot_id|>.
def format_prompt(user_message):
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

def respond(user_input):
    prompt = format_prompt(user_input)
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Stop at the end-of-turn token so the model doesn't keep
        # generating new turns and loop.
        stop=["<|eot_id|>", "<|start_header_id|>"],
    )
    return output["choices"][0]["text"]

# Gradio UI
gr.Interface(
    fn=respond,
    inputs=gr.Textbox(label="Ask"),
    outputs=gr.Textbox(label="Answer"),
    title="Llama3.2-3B Fine-tuned Assistant",
).launch()
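
# --- Optional: streaming responses ----------------------------------------
# A minimal sketch, assuming the same `llm` and `format_prompt` defined
# above. llama-cpp-python yields completion chunks when stream=True, and
# Gradio streams the output of a generator function into the Textbox as it
# yields. To use it, define this above the UI block and pass
# fn=respond_stream to gr.Interface instead of fn=respond.
def respond_stream(user_input):
    prompt = format_prompt(user_input)
    partial = ""
    for chunk in llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True,  # yield tokens as they are generated
    ):
        partial += chunk["choices"][0]["text"]
        yield partial  # Gradio re-renders the Textbox on each yield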