Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| from PIL import Image | |
| import torch | |
| from transformers import GPT2Tokenizer, GPT2LMHeadModel | |
# Run inference on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# BLIP captioning checkpoint: the processor prepares images and decodes
# caption ids, the model generates them.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(device)

# GPT-2 tokenizer and language model used to continue a text prompt.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_model = gpt2_model.to(device)
def generate_paragraph(image):
    """Describe an image as a paragraph of generated text.

    The image is captioned with the BLIP model, the caption is embedded in a
    fixed prompt, and GPT-2 continues that prompt. Only the continuation is
    returned; the prompt itself is removed.

    Args:
        image: The image to describe. It must expose ``mode`` and
            ``convert`` (the interface passes a PIL image). ``None`` is
            accepted and means no image was supplied.

    Returns:
        str: The generated continuation with surrounding whitespace
        stripped, or a short notice when ``image`` is ``None``.
    """
    if image is None:
        # Submitting the form without an upload yields None; answer in the
        # output textbox instead of failing on ``image.mode``.
        return "Please upload an image."

    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Step 1: image -> short caption.
    inputs = processor(images=image, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_length=50)
    caption = processor.decode(output_ids[0], skip_special_tokens=True)

    # Step 2: caption -> longer text via prompt continuation.
    prompt = f"Write a detailed paragraph about this image: {caption}\n\nDetails:"
    tokens = tokenizer.encode(prompt, return_tensors='pt').to(device)
    outputs = gpt2_model.generate(
        tokens,
        max_length=150,  # counts the prompt tokens as well as the new ones
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens. Drop them by
    # token count rather than by matching the decoded string: a string match
    # fails silently whenever decoding does not reproduce the prompt
    # character for character, which would leak the prompt into the result.
    continuation = outputs[0][tokens.shape[-1]:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()
# Web UI: a single image upload in, the generated paragraph out as text.
iface = gr.Interface(
    # Page text shown above the form.
    title="Image Paragraph Description Generator",
    description="Upload an image to get a detailed paragraph description generated.",
    # Uploaded images are handed to the callback as PIL images.
    inputs=gr.Image(type="pil"),
    outputs="textbox",
    fn=generate_paragraph,
)

# Start serving the interface.
iface.launch()