gowshiselva committed on
Commit
2e6a234
·
verified ·
1 Parent(s): 2cbccec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -72
app.py CHANGED
@@ -1,115 +1,153 @@
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
- from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
 
5
 
6
  # Initial setup
7
  print("Loading models...")
8
 
9
- # Main model for detailed captions
10
- blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
11
- blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
12
-
13
- # Secondary model for emotion and detail detection
14
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
15
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
16
 
17
- # Move models to GPU if available
18
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
- blip2_model.to(device)
20
  blip_model.to(device)
21
 
22
- print(f"Models loaded. Using device: {device}")
 
 
 
 
 
 
23
 
24
  def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
25
  """
26
- Generate an advanced description of the image with varying levels of detail.
27
-
28
- Args:
29
- image: Input image
30
- detail_level: Level of detail (1-5)
31
- emotion_focus: Focus on emotions (0-5)
32
- style_focus: Focus on artistic style (0-5)
33
  """
34
  if image is None:
35
  return "Please upload an image to generate a description."
36
 
37
  try:
38
- # Generate both basic and detailed descriptions
39
  with torch.no_grad():
40
- # Get basic caption from BLIP large
41
- inputs = blip_processor(image, return_tensors="pt").to(device)
42
- basic_outputs = blip_model.generate(**inputs, max_length=50)
43
- basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)
44
 
45
- # Create prompt text based on sliders
46
- detail_text = f"Describe this image with extreme detail, focus on {'all elements including tiny details' if detail_level > 3 else 'main elements'}"
47
- emotion_text = "Describe the mood, emotions, and atmosphere conveyed in this image" if emotion_focus > 2 else ""
48
- style_text = "Describe the artistic style, lighting, colors, and composition" if style_focus > 2 else ""
49
 
50
- # Combine texts based on focus areas
51
- prompt_text = f"{detail_text}. {emotion_text}. {style_text}"
 
52
 
53
- # Process with BLIP-2
54
- inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
 
55
 
56
- max_length = 150 + (detail_level * 50)
 
 
 
 
 
57
 
58
- outputs = blip2_model.generate(
59
- **inputs,
60
- max_length=max_length,
61
- num_beams=5,
62
- min_length=50,
63
- top_p=0.9,
64
- repetition_penalty=1.5,
65
- length_penalty=1.0
66
- )
67
- detailed_description = blip2_processor.decode(outputs[0], skip_special_tokens=True)
68
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # Format results for AI image generation
70
  formatted_result = ""
71
 
72
  # Add basic subject identification
73
  formatted_result += f"## Basic Caption:\n{basic_caption}\n\n"
74
 
75
- # Add detailed description
76
- formatted_result += f"## Detailed Description for AI Image Recreation:\n{detailed_description}\n\n"
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Add formatting guide based on detail level
79
  if detail_level >= 4:
80
- # Extract potential elements for structured description
81
- elements = []
82
- if "person" in detailed_description.lower() or "people" in detailed_description.lower():
83
- elements.append("subjects")
84
- if any(word in detailed_description.lower() for word in ["background", "scene", "setting"]):
85
- elements.append("setting")
86
- if any(word in detailed_description.lower() for word in ["light", "shadow", "bright", "dark"]):
87
- elements.append("lighting")
88
- if any(word in detailed_description.lower() for word in ["color", "red", "blue", "green", "yellow", "tone"]):
89
- elements.append("colors")
90
-
91
- # Create a structured breakdown
92
- formatted_result += "## Structured Elements:\n"
93
- for element in elements:
94
- formatted_result += f"- {element.capitalize()}: " + \
95
- f"[Extract relevant details about {element} from the description]\n"
96
-
97
- # Add prompt suggestion
98
- formatted_result += "\n## Suggested AI Image Prompt:\n"
99
- formatted_result += f"{basic_caption}, {', '.join(detailed_description.split('.')[:3])}, " + \
100
- f"{'high detail' if detail_level > 3 else 'moderate detail'}, " + \
101
- f"{'emotional' if emotion_focus > 3 else ''}, " + \
102
- f"{'artistic' if style_focus > 3 else ''}"
 
 
 
 
 
 
 
 
103
 
104
  return formatted_result
105
 
106
  except Exception as e:
107
- return f"Error generating description: {str(e)}\n\nTraceback: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}"
108
 
109
  # Create Gradio interface
110
  with gr.Blocks(title="Advanced Image Description Generator") as demo:
111
- gr.Markdown("# Advanced Image Description Generator for AI Image Recreation")
112
- gr.Markdown("Upload an image to generate a detailed description that can help AI image generators recreate similar images.")
113
 
114
  with gr.Row():
115
  with gr.Column(scale=1):
@@ -140,8 +178,8 @@ with gr.Blocks(title="Advanced Image Description Generator") as demo:
140
  4. Use the generated text to prompt AI image generators
141
 
142
  ## About
143
- This app uses BLIP-2 and BLIP large models to analyze images and generate detailed descriptions
144
- suitable for recreating similar images with AI image generators like Stable Diffusion, DALL-E, or Midjourney.
145
  """)
146
 
147
  # Launch the app
 
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration
5
+ import re
6
 
7
  # Initial setup
8
  print("Loading models...")
9
 
10
+ # Use a single, more reliable model for comprehensive descriptions
 
 
 
 
11
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
12
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
13
 
14
+ # Move model to GPU if available
15
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
16
  blip_model.to(device)
17
 
18
+ print(f"Model loaded. Using device: {device}")
19
+
20
def generate_caption(image, prompt):
    """Encode the image plus a conditioning text prompt with BLIP,
    generate up to 100 new tokens, and decode to a plain string."""
    model_inputs = blip_processor(image, prompt, return_tensors="pt")
    model_inputs = model_inputs.to(device)
    generated_ids = blip_model.generate(**model_inputs, max_new_tokens=100)
    caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption
25
 
26
def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
    """
    Generate a multi-part image description by querying the captioning
    model with several targeted prompts.

    Args:
        image: PIL image to describe; None returns a user-facing error string.
        detail_level: 1-5; >= 4 adds a fine-detail section and extra qualifiers.
        emotion_focus: 0-5; >= 3 adds a mood/emotion section.
        style_focus: 0-5; >= 3 adds a color/style section.

    Returns:
        Markdown-formatted description plus a suggested AI-generation prompt,
        or an error-message string on failure.
    """
    if image is None:
        return "Please upload an image to generate a description."

    def clean_response(response, prompt):
        # Strip the conditioning prompt if the model echoed it back.
        # BLIP-large decodes through an uncased tokenizer, so the output is
        # lower-cased; compare case-insensitively — an exact startswith()
        # against the capitalized prompt would never match.
        if response.lower().startswith(prompt.lower()):
            response = response[len(prompt):].strip()
        return response

    def ask(prompt):
        # Generate for one prompt and immediately remove any prompt echo.
        return clean_response(generate_caption(image, prompt), prompt)

    try:
        with torch.no_grad():
            # Core caption (intentionally left uncleaned, matching prior behavior)
            # plus targeted aspect descriptions.
            basic_caption = generate_caption(image, "a detailed caption of this image:")
            subject_desc = ask("Describe the main subjects in this image with details about their appearance:")
            setting_desc = ask("Describe the setting, location, and background of this image:")
            lighting_desc = ask("Describe the lighting conditions and time of day in this image:")

            # Optional aspects, gated by the slider settings.
            color_desc = ask("Describe the color scheme, visual composition, and artistic style of this image:") if style_focus >= 3 else ""
            emotion_desc = ask("Describe the mood, emotional tone, and atmosphere conveyed in this image:") if emotion_focus >= 3 else ""
            detail_desc = ask("Describe the fine details, textures, and small elements visible in this image:") if detail_level >= 4 else ""

        # Format results for AI image generation.
        formatted_result = f"## Basic Caption:\n{basic_caption}\n\n"
        formatted_result += "## Detailed Description for AI Image Recreation:\n\n"
        formatted_result += f"**Main Subject(s):** {subject_desc}\n\n"
        formatted_result += f"**Setting/Background:** {setting_desc}\n\n"
        formatted_result += f"**Lighting/Atmosphere:** {lighting_desc}\n\n"
        if style_focus >= 3:
            formatted_result += f"**Visual Style/Colors:** {color_desc}\n\n"
        if emotion_focus >= 3:
            formatted_result += f"**Mood/Emotional Tone:** {emotion_desc}\n\n"
        if detail_level >= 4:
            formatted_result += f"**Fine Details/Textures:** {detail_desc}\n\n"

        # Build a comma-separated prompt from the first sentence of each
        # substantive description (> 10 chars filters out empty/degenerate output).
        descriptions = [basic_caption.strip(".")]
        if len(subject_desc) > 10:
            descriptions.append(subject_desc.split(".")[0])
        if len(setting_desc) > 10:
            descriptions.append(setting_desc.split(".")[0])
        if style_focus >= 3 and len(color_desc) > 10:
            descriptions.append(color_desc.split(".")[0])

        formatted_result += "## Suggested AI Image Generation Prompt:\n\n"
        ai_prompt = ", ".join(descriptions)

        # Append quality/style qualifiers driven by the sliders.
        qualifiers = []
        if detail_level >= 4:
            qualifiers += ["highly detailed", "intricate"]
        if emotion_focus >= 4:
            qualifiers += ["emotional", "evocative"]
        if style_focus >= 4:
            qualifiers += ["artistic composition", "professional photography"]
        if qualifiers:
            ai_prompt += ", " + ", ".join(qualifiers)

        formatted_result += ai_prompt
        return formatted_result

    except Exception as e:
        # Surface errors to the UI instead of crashing the Gradio app.
        return f"Error generating description: {str(e)}"
146
 
147
  # Create Gradio interface
148
  with gr.Blocks(title="Advanced Image Description Generator") as demo:
149
+ gr.Markdown("# Advanced Image Description Generator for AI Recreation")
150
+ gr.Markdown("Upload an image to generate detailed descriptions that help AI image generators recreate similar images.")
151
 
152
  with gr.Row():
153
  with gr.Column(scale=1):
 
178
  4. Use the generated text to prompt AI image generators
179
 
180
  ## About
181
+ This app analyzes images and generates detailed descriptions suitable for recreating
182
+ similar images with AI image generators like Stable Diffusion, Midjourney, or DALL-E.
183
  """)
184
 
185
  # Launch the app