"""Gradio demo serving a LoRA-finetuned Qwen2.5-7B model via Unsloth."""

import gradio as gr
import torch
from unsloth import FastLanguageModel

# ----------------------------
# Load LoRA-finetuned model
# ----------------------------
max_seq_length = 1024

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="umarfarzan/my-finetuned-model2-lora",
    max_seq_length=max_seq_length,
    dtype=None,            # let Unsloth pick the best dtype for the hardware
    load_in_4bit=True,     # 4-bit quantization to reduce memory footprint
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path

# ----------------------------
# Inference function
# ----------------------------
# Alpaca-style prompt template: instruction, optional input context,
# and an empty Response slot for the model to complete.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def generate_response(instruction, input_text=""):
    """Generate a model completion for an Alpaca-formatted prompt.

    Args:
        instruction: The task description shown under "### Instruction:".
        input_text: Optional extra context shown under "### Input:".

    Returns:
        The decoded model continuation only (the echoed prompt is stripped).
    """
    prompt = alpaca_prompt.format(instruction, input_text, "")
    # Move inputs to wherever the (possibly quantized) model actually lives;
    # hard-coding "cpu" crashes when the 4-bit model is placed on CUDA.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            use_cache=True,
        )

    # Decode only the newly generated tokens so the UI does not echo the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.batch_decode(
        outputs[:, prompt_len:], skip_special_tokens=True
    )[0]


# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("## LoRA Qwen2.5-7B Demo (CPU)")
    instruction_input = gr.Textbox(label="Instruction", lines=3)
    context_input = gr.Textbox(label="Input (Optional)", lines=2)
    output_box = gr.Textbox(label="Output", lines=10)
    submit_btn = gr.Button("Generate")

    submit_btn.click(
        generate_response,
        inputs=[instruction_input, context_input],
        outputs=output_box,
    )

# Guard the server launch so importing this module (e.g. for tests or
# deployment tooling) does not immediately start a web server.
if __name__ == "__main__":
    demo.launch()