Spaces: Runtime error
import gradio as gr
from unsloth import FastLanguageModel
import torch

# ----------------------------
# Load LoRA-finetuned model
# ----------------------------
max_seq_length = 1024

# NOTE(review): 4-bit loading goes through bitsandbytes, which generally
# requires a CUDA GPU — the claim that it "still works on CPU" is likely
# why this Space reports a runtime error; confirm on the target hardware.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="umarfarzan/my-finetuned-model2-lora",
    max_seq_length=max_seq_length,
    dtype=None,         # let the library auto-select a dtype
    load_in_4bit=True,  # int4 quantization to reduce memory
)
# Switch the model into inference mode (enables Unsloth's fast paths).
FastLanguageModel.for_inference(model)
# ----------------------------
# Inference function
# ----------------------------
# Alpaca-style prompt template: instruction, optional input context, and an
# empty ### Response slot that the model is asked to complete.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
def generate_response(instruction, input_text=""):
    """Generate a model completion for an Alpaca-formatted prompt.

    Args:
        instruction: Task description placed in the ``### Instruction`` slot.
        input_text: Optional context placed in the ``### Input`` slot.

    Returns:
        The full decoded text — prompt plus generated continuation — since
        the output tokens are decoded without stripping the prompt prefix.
    """
    prompt = alpaca_prompt.format(instruction, input_text, "")
    # Tokenize and keep tensors on CPU (this Space has no GPU).
    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,  # softens the distribution for sampling
        top_p=0.9,        # nucleus sampling cutoff
        do_sample=True,
        use_cache=True,   # reuse KV cache across decode steps
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("## LoRA Qwen2.5-7B Demo (CPU)")
    instruction_input = gr.Textbox(label="Instruction", lines=3)
    context_input = gr.Textbox(label="Input (Optional)", lines=2)
    output_box = gr.Textbox(label="Output", lines=10)
    submit_btn = gr.Button("Generate")
    # Wire the button: (instruction, context) -> generated text.
    submit_btn.click(
        generate_response,
        inputs=[instruction_input, context_input],
        outputs=output_box,
    )

demo.launch()