# app.py — Advanced Image Description Generator (BLIP-2 + BLIP large).
# NOTE(review): the original file began with non-Python web-page residue
# ("<user>'s picture / Update app.py / <commit> verified / raw / history
# blame / 6.91 kB") copied from the hosting UI; it has been commented out
# here so the module parses.
import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
# Initial setup: load both captioning models at import time so the app is
# ready to serve requests as soon as the Gradio UI launches. The first run
# downloads the weights from the Hugging Face Hub (several GB).
print("Loading models...")
# Main model for detailed captions (BLIP-2 with the OPT-2.7B language model)
blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
# Secondary model for emotion and detail detection — used below to produce
# the short baseline caption that anchors the final output.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# Move models to GPU if available; CPU inference works but is much slower.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
blip2_model.to(device)
blip_model.to(device)
print(f"Models loaded. Using device: {device}")
def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
    """
    Generate an advanced description of the image with varying levels of detail.

    Args:
        image: Input PIL image, or None if nothing was uploaded.
        detail_level: Level of detail (1-5). Higher values lengthen the
            generated text; >= 4 adds a structured-elements breakdown.
        emotion_focus: Focus on emotions (0-5). > 2 adds a mood/atmosphere
            instruction to the BLIP-2 prompt; > 3 tags the suggested prompt.
        style_focus: Focus on artistic style (0-5). > 2 adds a style/lighting
            instruction to the BLIP-2 prompt; > 3 tags the suggested prompt.

    Returns:
        A Markdown-formatted string containing the basic caption, the
        detailed description, optional structured elements, and a suggested
        AI image prompt — or a human-readable error message on failure.
    """
    if image is None:
        return "Please upload an image to generate a description."
    try:
        with torch.no_grad():
            # Short baseline caption from the BLIP-large captioning model.
            inputs = blip_processor(image, return_tensors="pt").to(device)
            basic_outputs = blip_model.generate(**inputs, max_length=50)
            basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)

            # Build the BLIP-2 prompt from the slider settings. Only enabled
            # instructions are joined, so a low emotion/style focus no longer
            # leaves stray ". ." separators in the prompt (previous bug).
            prompt_parts = [
                "Describe this image with extreme detail, focus on "
                f"{'all elements including tiny details' if detail_level > 3 else 'main elements'}"
            ]
            if emotion_focus > 2:
                prompt_parts.append("Describe the mood, emotions, and atmosphere conveyed in this image")
            if style_focus > 2:
                prompt_parts.append("Describe the artistic style, lighting, colors, and composition")
            prompt_text = ". ".join(prompt_parts)

            # Process with BLIP-2; higher detail levels allow longer outputs.
            inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
            max_length = 150 + (detail_level * 50)
            outputs = blip2_model.generate(
                **inputs,
                max_length=max_length,
                num_beams=5,
                min_length=50,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0
            )
            detailed_description = blip2_processor.decode(outputs[0], skip_special_tokens=True)

        # --- Format results for AI image generation (Markdown) ---
        formatted_result = f"## Basic Caption:\n{basic_caption}\n\n"
        formatted_result += f"## Detailed Description for AI Image Recreation:\n{detailed_description}\n\n"

        if detail_level >= 4:
            # Keyword sniffing over the generated text to decide which
            # structured-element placeholders to list.
            lowered = detailed_description.lower()
            elements = []
            if "person" in lowered or "people" in lowered:
                elements.append("subjects")
            if any(word in lowered for word in ["background", "scene", "setting"]):
                elements.append("setting")
            if any(word in lowered for word in ["light", "shadow", "bright", "dark"]):
                elements.append("lighting")
            if any(word in lowered for word in ["color", "red", "blue", "green", "yellow", "tone"]):
                elements.append("colors")
            formatted_result += "## Structured Elements:\n"
            for element in elements:
                formatted_result += (
                    f"- {element.capitalize()}: "
                    f"[Extract relevant details about {element} from the description]\n"
                )

        # Suggested prompt: skip empty fragments so low emotion/style focus
        # no longer produces dangling ", ," separators (previous bug).
        prompt_fragments = [
            basic_caption,
            ", ".join(detailed_description.split(".")[:3]),
            "high detail" if detail_level > 3 else "moderate detail",
        ]
        if emotion_focus > 3:
            prompt_fragments.append("emotional")
        if style_focus > 3:
            prompt_fragments.append("artistic")
        formatted_result += "\n## Suggested AI Image Prompt:\n"
        formatted_result += ", ".join(fragment for fragment in prompt_fragments if fragment)
        return formatted_result
    except Exception as e:
        # Previous code mislabelled the CUDA device name as a "Traceback";
        # report the actual traceback so failures are debuggable.
        import traceback
        return f"Error generating description: {str(e)}\n\nTraceback: {traceback.format_exc()}"
# Assemble the Gradio interface: image + sliders on the left, description
# output on the right, usage notes underneath.
with gr.Blocks(title="Advanced Image Description Generator") as demo:
    gr.Markdown("# Advanced Image Description Generator for AI Image Recreation")
    gr.Markdown("Upload an image to generate a detailed description that can help AI image generators recreate similar images.")

    with gr.Row():
        # Left half: inputs.
        with gr.Column(scale=1):
            image_input = gr.Image(label="Upload Image", type="pil")
            with gr.Row():
                slider_detail = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Detail Level")
                slider_emotion = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Emotion Focus")
                slider_style = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Style/Artistic Focus")
            generate_button = gr.Button("Generate Description")
        # Right half: output.
        with gr.Column(scale=1):
            description_output = gr.Textbox(label="Image Description", lines=20)

    # Wire the button to the generator function.
    generate_button.click(
        fn=generate_advanced_description,
        inputs=[image_input, slider_detail, slider_emotion, slider_style],
        outputs=description_output,
    )

    gr.Markdown("""
    ## How to Use
    1. Upload an image
    2. Adjust the sliders to control description detail:
    - Detail Level: How comprehensive the description should be
    - Emotion Focus: Emphasis on mood and feelings
    - Style Focus: Emphasis on artistic elements
    3. Click "Generate Description"
    4. Use the generated text to prompt AI image generators
    ## About
    This app uses BLIP-2 and BLIP large models to analyze images and generate detailed descriptions
    suitable for recreating similar images with AI image generators like Stable Diffusion, DALL-E, or Midjourney.
    """)

# Launch the app when run as a script.
if __name__ == "__main__":
    demo.launch()