import torch
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
# Hugging Face Hub repository id ("<username>/<repo>"); replace with your own.
MODEL_REPO = "i3-lab/i3-GPT2"

# Load the tokenizer and the language model from that repository.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_REPO)
model = GPT2LMHeadModel.from_pretrained(MODEL_REPO)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
def _as_text(value):
    """Return *value* as prompt text, mapping ``None`` to an empty string."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def _history_to_turns(history):
    """Normalize chat history into a list of ``(user, assistant)`` text pairs.

    Two history layouts are accepted:

    * pair style: each entry is a two-item sequence ``(user, assistant)``;
    * message style: each entry is a dict with ``"role"`` and ``"content"``
      keys. Entries whose role is neither ``"user"`` nor ``"assistant"`` are
      ignored, and a user message that never received a reply is kept with an
      empty reply.
    """
    turns = []
    pending_user = None
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            if role == "user":
                if pending_user is not None:
                    # Previous user message was never answered; keep it anyway.
                    turns.append((pending_user, ""))
                pending_user = _as_text(entry.get("content"))
            elif role == "assistant":
                turns.append((pending_user or "", _as_text(entry.get("content"))))
                pending_user = None
            continue
        user_msg, assistant_msg = entry
        turns.append((_as_text(user_msg), _as_text(assistant_msg)))
    if pending_user is not None:
        turns.append((pending_user, ""))
    return turns


def generate_response(message, history):
    """Generate the assistant's reply to *message* given the chat *history*.

    Args:
        message: The latest user message.
        history: Earlier conversation turns, either as ``(user, assistant)``
            pairs or as ``{"role": ..., "content": ...}`` dicts. May be empty
            or ``None``.

    Returns:
        The newly generated text, stripped of surrounding whitespace and cut
        off at the first ``"User:"`` marker the model produces.
    """
    max_new_tokens = 150

    # Prompt layout: "User: ...\nAssistant: ...<|endoftext|>\n" per past turn,
    # then the open turn. (The original comment states this mirrors the
    # training script's format.)
    past_turns = "".join(
        f"User: {user_msg}\nAssistant: {assistant_msg}<|endoftext|>\n"
        for user_msg, assistant_msg in _history_to_turns(history)
    )
    prompt = f"{past_turns}User: {message}\nAssistant:"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Keep only the most recent tokens so that prompt + generated tokens fit
    # within the model's position limit; older context is dropped first.
    budget = max(model.config.n_positions - max_new_tokens, 1)
    input_ids = inputs["input_ids"][:, -budget:]
    attention_mask = inputs["attention_mask"][:, -budget:]

    # Generate
    with torch.no_grad():
        output_tokens = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
        )

    # Extract only the newly generated text (everything after the prompt).
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(
        output_tokens[0][prompt_length:], skip_special_tokens=True
    )

    # Clean up formatting (cutting off if the model generates a new 'User:' tag)
    return response.split("User:")[0].strip()
# Build the Gradio chat interface around the generation callback.
demo = gr.ChatInterface(
    fn=generate_response,
    title="i3-GPT",
    examples=[
        "Tell me a joke.",
        "What is the capital of France?",
        "How does a lightbulb work?",
    ],
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()