gowshiselva committed on
Commit
2e6a234
·
verified ·
1 Parent(s): 2cbccec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -72
app.py CHANGED
@@ -1,115 +1,153 @@
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
- from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
 
5
 
6
  # Initial setup
7
  print("Loading models...")
8
 
9
- # Main model for detailed captions
10
- blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
11
- blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
12
-
13
- # Secondary model for emotion and detail detection
14
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
15
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
16
 
17
- # Move models to GPU if available
18
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
- blip2_model.to(device)
20
  blip_model.to(device)
21
 
22
- print(f"Models loaded. Using device: {device}")
 
 
 
 
 
 
23
 
24
  def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
25
  """
26
- Generate an advanced description of the image with varying levels of detail.
27
-
28
- Args:
29
- image: Input image
30
- detail_level: Level of detail (1-5)
31
- emotion_focus: Focus on emotions (0-5)
32
- style_focus: Focus on artistic style (0-5)
33
  """
34
  if image is None:
35
  return "Please upload an image to generate a description."
36
 
37
  try:
38
- # Generate both basic and detailed descriptions
39
  with torch.no_grad():
40
- # Get basic caption from BLIP large
41
- inputs = blip_processor(image, return_tensors="pt").to(device)
42
- basic_outputs = blip_model.generate(**inputs, max_length=50)
43
- basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)
44
 
45
- # Create prompt text based on sliders
46
- detail_text = f"Describe this image with extreme detail, focus on {'all elements including tiny details' if detail_level > 3 else 'main elements'}"
47
- emotion_text = "Describe the mood, emotions, and atmosphere conveyed in this image" if emotion_focus > 2 else ""
48
- style_text = "Describe the artistic style, lighting, colors, and composition" if style_focus > 2 else ""
49
 
50
- # Combine texts based on focus areas
51
- prompt_text = f"{detail_text}. {emotion_text}. {style_text}"
 
52
 
53
- # Process with BLIP-2
54
- inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
 
55
 
56
- max_length = 150 + (detail_level * 50)
 
 
 
 
 
57
 
58
- outputs = blip2_model.generate(
59
- **inputs,
60
- max_length=max_length,
61
- num_beams=5,
62
- min_length=50,
63
- top_p=0.9,
64
- repetition_penalty=1.5,
65
- length_penalty=1.0
66
- )
67
- detailed_description = blip2_processor.decode(outputs[0], skip_special_tokens=True)
68
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # Format results for AI image generation
70
  formatted_result = ""
71
 
72
  # Add basic subject identification
73
  formatted_result += f"## Basic Caption:\n{basic_caption}\n\n"
74
 
75
- # Add detailed description
76
- formatted_result += f"## Detailed Description for AI Image Recreation:\n{detailed_description}\n\n"
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Add formatting guide based on detail level
79
  if detail_level >= 4:
80
- # Extract potential elements for structured description
81
- elements = []
82
- if "person" in detailed_description.lower() or "people" in detailed_description.lower():
83
- elements.append("subjects")
84
- if any(word in detailed_description.lower() for word in ["background", "scene", "setting"]):
85
- elements.append("setting")
86
- if any(word in detailed_description.lower() for word in ["light", "shadow", "bright", "dark"]):
87
- elements.append("lighting")
88
- if any(word in detailed_description.lower() for word in ["color", "red", "blue", "green", "yellow", "tone"]):
89
- elements.append("colors")
90
-
91
- # Create a structured breakdown
92
- formatted_result += "## Structured Elements:\n"
93
- for element in elements:
94
- formatted_result += f"- {element.capitalize()}: " + \
95
- f"[Extract relevant details about {element} from the description]\n"
96
-
97
- # Add prompt suggestion
98
- formatted_result += "\n## Suggested AI Image Prompt:\n"
99
- formatted_result += f"{basic_caption}, {', '.join(detailed_description.split('.')[:3])}, " + \
100
- f"{'high detail' if detail_level > 3 else 'moderate detail'}, " + \
101
- f"{'emotional' if emotion_focus > 3 else ''}, " + \
102
- f"{'artistic' if style_focus > 3 else ''}"
 
 
 
 
 
 
 
 
103
 
104
  return formatted_result
105
 
106
  except Exception as e:
107
- return f"Error generating description: {str(e)}\n\nTraceback: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}"
108
 
109
  # Create Gradio interface
110
  with gr.Blocks(title="Advanced Image Description Generator") as demo:
111
- gr.Markdown("# Advanced Image Description Generator for AI Image Recreation")
112
- gr.Markdown("Upload an image to generate a detailed description that can help AI image generators recreate similar images.")
113
 
114
  with gr.Row():
115
  with gr.Column(scale=1):
@@ -140,8 +178,8 @@ with gr.Blocks(title="Advanced Image Description Generator") as demo:
140
  4. Use the generated text to prompt AI image generators
141
 
142
  ## About
143
- This app uses BLIP-2 and BLIP large models to analyze images and generate detailed descriptions
144
- suitable for recreating similar images with AI image generators like Stable Diffusion, DALL-E, or Midjourney.
145
  """)
146
 
147
  # Launch the app
 
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration
5
+ import re
6
 
7
  # Initial setup
8
  print("Loading models...")
9
 
10
+ # Use a single, more reliable model for comprehensive descriptions
 
 
 
 
11
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
12
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
13
 
14
+ # Move model to GPU if available
15
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
16
  blip_model.to(device)
17
 
18
+ print(f"Model loaded. Using device: {device}")
19
+
20
def generate_caption(image, prompt):
    """Encode the image plus a conditioning text prompt with BLIP,
    generate up to 100 new tokens, and decode to a plain string."""
    model_inputs = blip_processor(image, prompt, return_tensors="pt")
    model_inputs = model_inputs.to(device)
    generated_ids = blip_model.generate(**model_inputs, max_new_tokens=100)
    caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption
25
 
26
def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
    """
    Generate a multi-part image description by querying the captioning
    model with several targeted prompts.

    Args:
        image: PIL image to describe; None returns a user-facing error string.
        detail_level: 1-5; >= 4 adds a fine-detail section and extra qualifiers.
        emotion_focus: 0-5; >= 3 adds a mood/emotion section.
        style_focus: 0-5; >= 3 adds a color/style section.

    Returns:
        Markdown-formatted description plus a suggested AI-generation prompt,
        or an error-message string on failure.
    """
    if image is None:
        return "Please upload an image to generate a description."

    def clean_response(response, prompt):
        # Strip the conditioning prompt if the model echoed it back.
        # BLIP-large decodes through an uncased tokenizer, so the output is
        # lower-cased; compare case-insensitively — an exact startswith()
        # against the capitalized prompt would never match.
        if response.lower().startswith(prompt.lower()):
            response = response[len(prompt):].strip()
        return response

    def ask(prompt):
        # Generate for one prompt and immediately remove any prompt echo.
        return clean_response(generate_caption(image, prompt), prompt)

    try:
        with torch.no_grad():
            # Core caption (intentionally left uncleaned, matching prior behavior)
            # plus targeted aspect descriptions.
            basic_caption = generate_caption(image, "a detailed caption of this image:")
            subject_desc = ask("Describe the main subjects in this image with details about their appearance:")
            setting_desc = ask("Describe the setting, location, and background of this image:")
            lighting_desc = ask("Describe the lighting conditions and time of day in this image:")

            # Optional aspects, gated by the slider settings.
            color_desc = ask("Describe the color scheme, visual composition, and artistic style of this image:") if style_focus >= 3 else ""
            emotion_desc = ask("Describe the mood, emotional tone, and atmosphere conveyed in this image:") if emotion_focus >= 3 else ""
            detail_desc = ask("Describe the fine details, textures, and small elements visible in this image:") if detail_level >= 4 else ""

        # Format results for AI image generation.
        formatted_result = f"## Basic Caption:\n{basic_caption}\n\n"
        formatted_result += "## Detailed Description for AI Image Recreation:\n\n"
        formatted_result += f"**Main Subject(s):** {subject_desc}\n\n"
        formatted_result += f"**Setting/Background:** {setting_desc}\n\n"
        formatted_result += f"**Lighting/Atmosphere:** {lighting_desc}\n\n"
        if style_focus >= 3:
            formatted_result += f"**Visual Style/Colors:** {color_desc}\n\n"
        if emotion_focus >= 3:
            formatted_result += f"**Mood/Emotional Tone:** {emotion_desc}\n\n"
        if detail_level >= 4:
            formatted_result += f"**Fine Details/Textures:** {detail_desc}\n\n"

        # Build a comma-separated prompt from the first sentence of each
        # substantive description (> 10 chars filters out empty/degenerate output).
        descriptions = [basic_caption.strip(".")]
        if len(subject_desc) > 10:
            descriptions.append(subject_desc.split(".")[0])
        if len(setting_desc) > 10:
            descriptions.append(setting_desc.split(".")[0])
        if style_focus >= 3 and len(color_desc) > 10:
            descriptions.append(color_desc.split(".")[0])

        formatted_result += "## Suggested AI Image Generation Prompt:\n\n"
        ai_prompt = ", ".join(descriptions)

        # Append quality/style qualifiers driven by the sliders.
        qualifiers = []
        if detail_level >= 4:
            qualifiers += ["highly detailed", "intricate"]
        if emotion_focus >= 4:
            qualifiers += ["emotional", "evocative"]
        if style_focus >= 4:
            qualifiers += ["artistic composition", "professional photography"]
        if qualifiers:
            ai_prompt += ", " + ", ".join(qualifiers)

        formatted_result += ai_prompt
        return formatted_result

    except Exception as e:
        # Surface errors to the UI instead of crashing the Gradio app.
        return f"Error generating description: {str(e)}"
146
 
147
  # Create Gradio interface
148
  with gr.Blocks(title="Advanced Image Description Generator") as demo:
149
+ gr.Markdown("# Advanced Image Description Generator for AI Recreation")
150
+ gr.Markdown("Upload an image to generate detailed descriptions that help AI image generators recreate similar images.")
151
 
152
  with gr.Row():
153
  with gr.Column(scale=1):
 
178
  4. Use the generated text to prompt AI image generators
179
 
180
  ## About
181
+ This app analyzes images and generates detailed descriptions suitable for recreating
182
+ similar images with AI image generators like Stable Diffusion, Midjourney, or DALL-E.
183
  """)
184
 
185
  # Launch the app