# CHAT_BOX / app.py
# Gradio chat Space serving a Phi-4 model with a LoRA adapter attached.
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit" # base that you finetuned from
ADAPTER_ID = "Anabury/My_Finetuned_Phi-4" # your adapter repo
# tokenizer (either base or adapter works; use base)
# NOTE(review): trust_remote_code executes repo-provided code — acceptable only
# because the base repo is the one this Space was finetuned from.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# load base model (4-bit quant is fine on Spaces GPU/CPU)
# device_map="auto" lets accelerate place layers on GPU if present, else CPU;
# fp16 only when CUDA is available, since fp16 on CPU is slow/unsupported.
base = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
device_map="auto",
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
trust_remote_code=True
)
# attach your LoRA adapter
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()
def chat(message, history):
    """Generate a reply to *message* and append the exchange to *history*.

    Args:
        message: The user's input text (used directly as the prompt; adapt
            if your adapter repo defines a chat template).
        history: List of (user, assistant) tuples from the Chatbot component;
            may be None on the first turn.

    Returns:
        The updated history twice, matching the two Chatbot outputs wired
        up in the UI.
    """
    history = history or []  # Gradio may pass None before the first message
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    # remember how many tokens the prompt occupies so we can strip it from
    # the generated sequence below
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    # BUGFIX: generate() returns prompt + continuation; decoding output[0]
    # whole would echo the user's prompt back in the reply. Decode only the
    # newly generated tokens.
    reply = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    history.append((message, reply))
    return history, history
# Assemble the chat UI: a title, the conversation pane, a text input, and a
# button that wipes the conversation.
with gr.Blocks() as demo:
    gr.Markdown("# Phi-4 Chat (LoRA)")
    conversation = gr.Chatbot(height=420)
    user_input = gr.Textbox(placeholder="Ask me anything…")
    reset_btn = gr.Button("Clear")
    # Pressing Enter sends (message, current history) to chat(); both outputs
    # feed the same Chatbot component, mirroring chat()'s two return values.
    user_input.submit(chat, [user_input, conversation], [conversation, conversation])
    # Clearing simply replaces the Chatbot value with an empty list.
    reset_btn.click(lambda: [], None, conversation, queue=False)
demo.launch()