# Inspaire — app.py
# Author: umarfarzan (Hugging Face Space; commit 4e98186, verified)
import gradio as gr
from unsloth import FastLanguageModel
import torch
# ----------------------------
# Load LoRA-finetuned model
# ----------------------------
# Maximum context length (prompt + generated tokens) the model accepts.
max_seq_length = 1024
# Load the LoRA-finetuned checkpoint via unsloth's fast loader; returns the
# model and its matching tokenizer in one call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="umarfarzan/my-finetuned-model2-lora",
    max_seq_length=max_seq_length,
    dtype=None,  # let unsloth auto-select a dtype for the current hardware
    load_in_4bit=True  # still works on CPU with int4 quantization
    # NOTE(review): 4-bit loading normally requires bitsandbytes on GPU —
    # confirm the "works on CPU" claim actually holds in this Space.
)
# Switch unsloth's internal state to inference mode (disables training paths).
FastLanguageModel.for_inference(model)
# ----------------------------
# Inference function
# ----------------------------
# Alpaca-style prompt template. The three {} slots are filled, in order, with
# (instruction, input, response); at inference time the response slot is left
# empty so the model generates the completion after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
def generate_response(instruction, input_text=""):
    """Generate a completion for an Alpaca-style prompt.

    Args:
        instruction: Task description placed in the "### Instruction" slot.
        input_text: Optional extra context for the "### Input" slot.

    Returns:
        Only the newly generated response text (the prompt is stripped;
        the original implementation echoed the whole prompt back too).
    """
    prompt = alpaca_prompt.format(instruction, input_text, "")
    # Send inputs to wherever the model actually lives instead of
    # hard-coding "cpu" — the 4-bit model may be placed on GPU.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    # inference_mode: no autograd bookkeeping during generation.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            use_cache=True,
        )
    # generate() returns prompt + completion; decode only the tokens
    # produced after the prompt so the UI shows just the response.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.batch_decode(
        outputs[:, prompt_len:], skip_special_tokens=True
    )[0]
# ----------------------------
# Gradio UI
# ----------------------------
# Single-page demo: instruction + optional context in, model completion out.
with gr.Blocks() as demo:
    gr.Markdown("## LoRA Qwen2.5-7B Demo (CPU)")

    # Input widgets.
    instruction_box = gr.Textbox(label="Instruction", lines=3)
    context_box = gr.Textbox(label="Input (Optional)", lines=2)

    # Output + trigger.
    result_box = gr.Textbox(label="Output", lines=10)
    generate_btn = gr.Button("Generate")

    generate_btn.click(
        fn=generate_response,
        inputs=[instruction_box, context_box],
        outputs=result_box,
    )

demo.launch()