Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,23 +1,23 @@
 import gradio as gr
 import torch
 from PIL import Image
-from transformers import
+from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
 
 # Initial setup
 print("Loading models...")
 
 # Main model for detailed captions
+blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
 blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
-blip2_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
 
 # Secondary model for emotion and detail detection
-
-
+blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 # Move models to GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 blip2_model.to(device)
-
+blip_model.to(device)
 
 print(f"Models loaded. Using device: {device}")
 
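Note that nothing in this hunk changes precision: both checkpoints load in float32, and blip2-opt-2.7b alone is on the order of 15 GB in that format, which can exhaust the memory of a basic Space and is a plausible contributor to the "Runtime error" badge above. A minimal sketch of half-precision loading, assuming the same checkpoint; this is not part of the commit:

    import torch
    from transformers import Blip2Processor, Blip2ForConditionalGeneration

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Half precision roughly halves memory on GPU; keep float32 on CPU,
    # where float16 inference is poorly supported.
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
    blip2_model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
    ).to(device)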
@@ -34,32 +34,30 @@ def generate_advanced_description(image, detail_level, emotion_focus, style_focus
     if image is None:
         return "Please upload an image to generate a description."
 
-    # Process image for both models
-    blip_inputs = blip_processor(images=image, return_tensors="pt").to(device)
-
-    # Basic prompts for different aspects
-    detail_prompt = f"Describe this image with extreme detail, focus on {'all elements including tiny details' if detail_level > 3 else 'main elements'}"
-    emotion_prompt = "Describe the mood, emotions, and atmosphere conveyed in this image" if emotion_focus > 2 else ""
-    style_prompt = "Describe the artistic style, lighting, colors, and composition" if style_focus > 2 else ""
-
-    # Combine prompts based on focus areas
-    combined_prompt = f"{detail_prompt}. {emotion_prompt}. {style_prompt}"
-
     try:
         # Generate both basic and detailed descriptions
         with torch.no_grad():
             # Get basic caption from BLIP large
-
+            inputs = blip_processor(image, return_tensors="pt").to(device)
+            basic_outputs = blip_model.generate(**inputs, max_length=50)
             basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)
 
-            #
-
-
-
+            # Create prompt text based on sliders
+            detail_text = f"Describe this image with extreme detail, focus on {'all elements including tiny details' if detail_level > 3 else 'main elements'}"
+            emotion_text = "Describe the mood, emotions, and atmosphere conveyed in this image" if emotion_focus > 2 else ""
+            style_text = "Describe the artistic style, lighting, colors, and composition" if style_focus > 2 else ""
+
+            # Combine texts based on focus areas
+            prompt_text = f"{detail_text}. {emotion_text}. {style_text}"
+
+            # Process with BLIP-2
+            inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
+
+            max_length = 150 + (detail_level * 50)
 
             outputs = blip2_model.generate(
-                **
-                max_length=
+                **inputs,
+                max_length=max_length,
                 num_beams=5,
                 min_length=50,
                 top_p=0.9,
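Two details of the new generate call are worth knowing. First, max_length caps prompt tokens plus generated tokens together, so the long combined prompt eats into the caption budget; max_new_tokens bounds only the newly generated text. Second, top_p has no effect under plain beam search; it only applies when do_sample=True. A sketch of the same call with those adjustments (parameter values carried over from the diff; enabling sampling is an assumption about the author's intent):

    outputs = blip2_model.generate(
        **inputs,
        max_new_tokens=150 + (detail_level * 50),  # counts only generated tokens
        min_new_tokens=50,                         # likewise replaces min_length
        num_beams=5,
        do_sample=True,  # required for top_p to have any effect
        top_p=0.9,
    )
    caption = blip2_processor.decode(outputs[0], skip_special_tokens=True)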
@@ -106,7 +104,7 @@ def generate_advanced_description(image, detail_level, emotion_focus, style_focus
         return formatted_result
 
     except Exception as e:
-        return f"Error generating description: {str(e)}"
+        return f"Error generating description: {str(e)}\n\nTraceback: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}"
 
 # Create Gradio interface
 with gr.Blocks(title="Advanced Image Description Generator") as demo:
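The rewritten except branch labels its extra output "Traceback", but torch.cuda.get_device_name(0) returns only the GPU's model name; no stack trace is captured. If the goal is to surface the actual failure behind the runtime error, the standard-library traceback module provides one. A sketch, where describe_error is a hypothetical helper that the existing except branch would call as `return describe_error(e)`:

    import traceback
    import torch

    def describe_error(exc: Exception) -> str:
        # format_exc() must run while `exc` is being handled
        # for the stack trace to be populated.
        device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
        return (
            f"Error generating description: {exc}\n\n"
            f"Traceback:\n{traceback.format_exc()}"
            f"Device: {device_name}"
        )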
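The diff ends at the opening of the Blocks context, so the interface wiring itself is not shown. A minimal layout consistent with generate_advanced_description's signature and the slider thresholds used above (all widget names, ranges, and defaults here are illustrative guesses, not from the commit):

    with gr.Blocks(title="Advanced Image Description Generator") as demo:
        image_input = gr.Image(type="pil", label="Image")
        detail_level = gr.Slider(1, 5, value=3, step=1, label="Detail level")
        emotion_focus = gr.Slider(1, 5, value=2, step=1, label="Emotion focus")
        style_focus = gr.Slider(1, 5, value=2, step=1, label="Style focus")
        output = gr.Textbox(label="Description")
        gr.Button("Generate").click(
            generate_advanced_description,
            inputs=[image_input, detail_level, emotion_focus, style_focus],
            outputs=output,
        )

    demo.launch()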