# Hugging Face Spaces app source (the Space status page reported "Runtime error" at capture time)
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration | |
# Initial setup: load both captioning models once at import time so the
# Gradio callback can reuse them.
print("Loading models...")

# Main model for detailed captions
blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# Secondary model for emotion and detail detection
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Place both models on the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for _model in (blip2_model, blip_model):
    _model.to(device)

print(f"Models loaded. Using device: {device}")
def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
    """
    Generate an advanced description of the image with varying levels of detail.

    Args:
        image: Input PIL image, or None when nothing has been uploaded.
        detail_level: Level of detail (1-5).
        emotion_focus: Focus on emotions (0-5).
        style_focus: Focus on artistic style (0-5).

    Returns:
        A markdown-formatted string containing a basic caption, a detailed
        description, optionally a structured-element breakdown, and a
        suggested AI image prompt — or an error message on failure.
    """
    if image is None:
        return "Please upload an image to generate a description."
    try:
        with torch.no_grad():
            # Basic caption from the BLIP large model.
            inputs = blip_processor(image, return_tensors="pt").to(device)
            basic_outputs = blip_model.generate(**inputs, max_length=50)
            basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)

            # Build the BLIP-2 prompt from the slider settings.  Only
            # non-empty parts are joined, so disabled focus areas no longer
            # leave stray ". ." separators in the prompt.
            prompt_parts = [
                f"Describe this image with extreme detail, focus on "
                f"{'all elements including tiny details' if detail_level > 3 else 'main elements'}"
            ]
            if emotion_focus > 2:
                prompt_parts.append("Describe the mood, emotions, and atmosphere conveyed in this image")
            if style_focus > 2:
                prompt_parts.append("Describe the artistic style, lighting, colors, and composition")
            prompt_text = ". ".join(prompt_parts)

            # Detailed description from BLIP-2; longer output for higher
            # detail levels.
            inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
            max_length = 150 + (detail_level * 50)
            outputs = blip2_model.generate(
                **inputs,
                max_length=max_length,
                num_beams=5,
                min_length=50,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0,
            )
            detailed_description = blip2_processor.decode(outputs[0], skip_special_tokens=True)

        # Format results for AI image generation.
        lower_desc = detailed_description.lower()
        formatted_result = f"## Basic Caption:\n{basic_caption}\n\n"
        formatted_result += f"## Detailed Description for AI Image Recreation:\n{detailed_description}\n\n"

        # Add a structured breakdown at high detail levels, based on simple
        # keyword detection in the generated description.
        if detail_level >= 4:
            element_keywords = {
                "subjects": ("person", "people"),
                "setting": ("background", "scene", "setting"),
                "lighting": ("light", "shadow", "bright", "dark"),
                "colors": ("color", "red", "blue", "green", "yellow", "tone"),
            }
            elements = [
                name for name, words in element_keywords.items()
                if any(word in lower_desc for word in words)
            ]
            formatted_result += "## Structured Elements:\n"
            for element in elements:
                formatted_result += (
                    f"- {element.capitalize()}: "
                    f"[Extract relevant details about {element} from the description]\n"
                )

        # Suggested prompt: skip empty fragments so disabled focus flags no
        # longer produce ", ," runs in the output.
        prompt_fragments = [
            basic_caption,
            ", ".join(detailed_description.split(".")[:3]),
            "high detail" if detail_level > 3 else "moderate detail",
        ]
        if emotion_focus > 3:
            prompt_fragments.append("emotional")
        if style_focus > 3:
            prompt_fragments.append("artistic")
        formatted_result += "\n## Suggested AI Image Prompt:\n"
        formatted_result += ", ".join(prompt_fragments)
        return formatted_result
    except Exception as e:
        # Report the real traceback.  The original code labelled the GPU
        # device name as a "Traceback", which was misleading and useless
        # for debugging.
        import traceback
        return f"Error generating description: {str(e)}\n\nTraceback: {traceback.format_exc()}"
# Create the Gradio interface.  The `demo` name is what Hugging Face
# Spaces looks for when launching the app, so it must not be renamed.
with gr.Blocks(title="Advanced Image Description Generator") as demo:
    gr.Markdown("# Advanced Image Description Generator for AI Image Recreation")
    gr.Markdown("Upload an image to generate a detailed description that can help AI image generators recreate similar images.")

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: image upload plus the three tuning sliders.
            image_input = gr.Image(label="Upload Image", type="pil")
            with gr.Row():
                slider_detail = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Detail Level")
                slider_emotion = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Emotion Focus")
                slider_style = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Style/Artistic Focus")
            generate_btn = gr.Button("Generate Description")
        with gr.Column(scale=1):
            # Right column: the generated description text.
            description_box = gr.Textbox(label="Image Description", lines=20)

    generate_btn.click(
        fn=generate_advanced_description,
        inputs=[image_input, slider_detail, slider_emotion, slider_style],
        outputs=description_box,
    )

    gr.Markdown("""
    ## How to Use
    1. Upload an image
    2. Adjust the sliders to control description detail:
       - Detail Level: How comprehensive the description should be
       - Emotion Focus: Emphasis on mood and feelings
       - Style Focus: Emphasis on artistic elements
    3. Click "Generate Description"
    4. Use the generated text to prompt AI image generators

    ## About
    This app uses BLIP-2 and BLIP large models to analyze images and generate detailed descriptions
    suitable for recreating similar images with AI image generators like Stable Diffusion, DALL-E, or Midjourney.
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()