|
|
import gradio as gr
|
|
|
from transformers import BlipProcessor, BlipForConditionalGeneration
|
|
|
from PIL import Image
|
|
|
import torch
|
|
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
|
|
|
|
|
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint identifiers, named once so each processor/tokenizer and its
# model are guaranteed to come from the same checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
GPT2_CHECKPOINT = "gpt2"

# BLIP: image -> short caption.
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model = model.to(device)

# GPT-2: caption-based prompt -> longer paragraph.
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
gpt2_model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)
gpt2_model = gpt2_model.to(device)
|
|
|
|
|
|
def generate_paragraph(image):
    """Describe an uploaded image as a paragraph of text.

    The image is captioned with BLIP, the caption is embedded in a prompt,
    and GPT-2 continues that prompt using beam search. Only the text GPT-2
    adds after the prompt is returned.

    Args:
        image: The uploaded picture as a PIL image, or ``None`` when the
            form was submitted without an image.

    Returns:
        The generated continuation with surrounding whitespace stripped, or
        a short instruction message when no image was supplied.
    """
    # An empty image input arrives as None; answer with a hint instead of
    # failing on the attribute access below.
    if image is None:
        return "Please upload an image first."

    # Normalise the colour mode (e.g. RGBA, L, P) to RGB before captioning.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Stage 1: BLIP turns the image into a short caption.
    inputs = processor(images=image, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_length=50)
    caption = processor.decode(output_ids[0], skip_special_tokens=True)

    # Stage 2: GPT-2 expands the caption into a paragraph.
    prompt = f"Write a detailed paragraph about this image: {caption}\n\nDetails:"
    tokens = tokenizer.encode(prompt, return_tensors='pt').to(device)
    outputs = gpt2_model.generate(
        tokens,
        # A single unpadded sequence: every position is a real token. Passing
        # the mask explicitly avoids ambiguity, since pad and EOS share an id.
        attention_mask=torch.ones_like(tokens),
        max_length=150,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence begins with the prompt's own token ids. Cutting
    # them off by position is exact, whereas matching the prompt as a text
    # prefix breaks whenever decoding does not reproduce the prompt verbatim
    # (for example when spacing around punctuation is cleaned up).
    continuation = outputs[0][tokens.shape[-1]:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()
|
|
|
|
|
|
# Web UI: the uploaded picture is handed to generate_paragraph as a PIL
# image and the returned paragraph is shown in a textbox.
image_input = gr.Image(type="pil")

iface = gr.Interface(
    fn=generate_paragraph,
    inputs=image_input,
    outputs="textbox",
    title="Image Paragraph Description Generator",
    description="Upload an image to get a detailed paragraph description generated.",
)

# Start serving the app.
iface.launch()
|
|
|
|