# Scraped page header (not code) — preserved as comments so the file parses:
# WalidEAGLE's picture
# optimized the code - eval/no_grad
# ff45a77
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr
import torch
# Pick the compute device once; the model and all input tensors are moved here.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Base GPT-2 weights; the Shakespeare LoRA adapter below is layered on top.
base_model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
lora_model = PeftModel.from_pretrained(base_model, "WalidEAGLE/lora-finetuned-gpt2-shakespeare")
lora_model.to(device)
# Inference-only mode: disables dropout and other training-time behavior.
lora_model.eval()
# Tokenizer shipped with the fine-tuned repo (GPT-2 vocabulary).
tokenizer = AutoTokenizer.from_pretrained("WalidEAGLE/lora-finetuned-gpt2-shakespeare")
def chat_stream(prompt, history):
    """Generate a Shakespearean continuation of *prompt* and stream it word by word.

    Parameters
    ----------
    prompt : str
        The user's current message.
    history : list
        Chat history supplied by ``gr.ChatInterface``. Unused — generation is
        stateless and conditions only on the current prompt.

    Yields
    ------
    str
        The response accumulated so far, growing by one word per yield
        (ChatInterface renders successive yields as streaming text).
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = lora_model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )
    # BUG FIX: generate() returns prompt tokens + continuation for decoder-only
    # models like GPT-2. Slice off the prompt so the bot does not echo the
    # user's own message back at the start of every reply.
    prompt_len = inputs["input_ids"].shape[1]
    full_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    # Simulated streaming: generation has already completed; yield the text
    # incrementally for a typing effect in the UI.
    response = ""
    for word in full_text.split():
        response += word + " "
        yield response
# Build the chat UI around chat_stream; because the handler is a generator,
# ChatInterface displays each yielded value as incrementally streamed text.
demo = gr.ChatInterface(
    fn=chat_stream,
    title="🎭 Shakespearean GPT-2 Chatbot",
    description="Talk like Shakespeare! Enter a prompt and watch the Shakespearean text stream in real time.",
)
demo.launch()