import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF weights from your HF repo (cached locally by hf_hub_download)
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)
# Load the model with llama.cpp
llm = Llama(
    model_path=model_path,
    n_ctx=4096,            # context window
    chat_format=None,      # raw completions; the prompt is built by hand below
    n_gpu_layers=0,        # keep all layers on CPU
    add_bos_token=False,   # the prompt string already starts with <|begin_of_text|>
    add_eos_token=False,
)
# Build the inference prompt according to the dataset's fine-tuning format
def format_prompt(user_message):
    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        "You are a helpful assistant.\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"{user_message}\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )
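
# Assumption, not from the original: the handwritten template above omits the
# "\n\n" after each header and the <|eot_id|> terminators that the stock
# Llama 3 chat template uses. If the fine-tuning data followed the official
# template instead, a variant like this sketch may match it better:
def format_prompt_llama3(user_message):
    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )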
def respond(user_input):
    prompt = format_prompt(user_input)
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Stop before the model starts a new turn. The original stop strings
        # ["<|user|>", "<|system|>"] never occur in this prompt format, so
        # they could not stop generation and the model would loop.
        stop=["<|start_header_id|>", "<|eot_id|>"],
    )
    return output["choices"][0]["text"]
# Gradio UI
gr.Interface(
    fn=respond,
    inputs=gr.Textbox(label="Ask"),
    outputs=gr.Textbox(label="Answer"),
    title="Llama3.2-3B Fine-tuned Assistant",
).launch()
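
# Usage sketch (assumptions, not from the original: the app is running
# locally on Gradio's default port 7860 and gradio_client is installed).
# From another process you could query the Interface's default endpoint:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("What is GGUF?", api_name="/predict"))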