"""Gradio demo serving a LoRA-finetuned Qwen2.5-7B model via Unsloth."""

import gradio as gr
import torch
from unsloth import FastLanguageModel

# ----------------------------
# Load LoRA-finetuned model
# ----------------------------
max_seq_length = 1024

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="umarfarzan/my-finetuned-model2-lora",
    max_seq_length=max_seq_length,
    dtype=None,            # let Unsloth pick the best dtype for the hardware
    load_in_4bit=True,     # 4-bit quantization to reduce memory footprint
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path

# ----------------------------
# Inference function
# ----------------------------
# Alpaca-style prompt template: instruction, optional input context,
# and an empty Response slot for the model to complete.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def generate_response(instruction, input_text=""):
    """Generate a model completion for an Alpaca-formatted prompt.

    Args:
        instruction: The task description shown under "### Instruction:".
        input_text: Optional extra context shown under "### Input:".

    Returns:
        The decoded model continuation only (the echoed prompt is stripped).
    """
    prompt = alpaca_prompt.format(instruction, input_text, "")
    # Move inputs to wherever the (possibly quantized) model actually lives;
    # hard-coding "cpu" crashes when the 4-bit model is placed on CUDA.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            use_cache=True,
        )

    # Decode only the newly generated tokens so the UI does not echo the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.batch_decode(
        outputs[:, prompt_len:], skip_special_tokens=True
    )[0]


# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("## LoRA Qwen2.5-7B Demo (CPU)")
    instruction_input = gr.Textbox(label="Instruction", lines=3)
    context_input = gr.Textbox(label="Input (Optional)", lines=2)
    output_box = gr.Textbox(label="Output", lines=10)
    submit_btn = gr.Button("Generate")

    submit_btn.click(
        generate_response,
        inputs=[instruction_input, context_input],
        outputs=output_box,
    )

# Guard the server launch so importing this module (e.g. for tests or
# deployment tooling) does not immediately start a web server.
if __name__ == "__main__":
    demo.launch()