import torch
import gradio as gr
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel

# Load the 4-bit base model and tokenizer
max_seq_length = 4096
dtype = None  # None = auto-detect (float16 or bfloat16 depending on GPU)
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Load the LoRA adapters on top of the base model
LORA_ADAPTER_PATH = "Sumit404/Llama-3.2-3B-Instruct-bnb-4bit-finetuned"  # Replace with your repo ID
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)

# Set up the tokenizer and model for inference
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",
)
tokenizer.pad_token = tokenizer.eos_token
FastLanguageModel.for_inference(model)


def generate_text(prompt):
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=True,
    ).to("cuda")
    # A single prompt is never padded, so an all-ones mask is correct here.
    # Comparing against pad_token_id would also mask EOS tokens, since
    # pad_token was set to eos_token above.
    attention_mask = torch.ones_like(inputs)
    outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=128,  # Increase for potentially longer answers
        use_cache=True,
        temperature=0.6,
        min_p=0.1,
    )
    # Decode only the newly generated tokens. skip_special_tokens strips the
    # Llama-3 header tokens, so searching the decoded text for
    # "<|start_header_id|>assistant<|end_header_id|>" would never match.
    generated_tokens = outputs[0][inputs.shape[1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


# Create the Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Fine-tuned Llama-3.2 Instruct Model",
    description="Ask a question to the fine-tuned model.",
)

# share=True exposes a public URL (needed when running in Colab)
interface.launch(share=True)