gowshiselva committed on
Commit
2cbccec
·
verified ·
1 Parent(s): 90a72f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -24
app.py CHANGED
@@ -1,23 +1,23 @@
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
- from transformers import AutoProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, Blip2ForConditionalGeneration
5
 
6
  # Initial setup
7
  print("Loading models...")
8
 
9
  # Main model for detailed captions
 
10
  blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
11
- blip2_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
12
 
13
  # Secondary model for emotion and detail detection
14
- blip_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
15
- blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
16
 
17
  # Move models to GPU if available
18
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
  blip2_model.to(device)
20
- blip_large.to(device)
21
 
22
  print(f"Models loaded. Using device: {device}")
23
 
@@ -34,32 +34,30 @@ def generate_advanced_description(image, detail_level, emotion_focus, style_focu
34
  if image is None:
35
  return "Please upload an image to generate a description."
36
 
37
- # Process image for both models
38
- blip_inputs = blip_processor(images=image, return_tensors="pt").to(device)
39
-
40
- # Basic prompts for different aspects
41
- detail_prompt = f"Describe this image with extreme detail, focus on {'all elements including tiny details' if detail_level > 3 else 'main elements'}"
42
- emotion_prompt = "Describe the mood, emotions, and atmosphere conveyed in this image" if emotion_focus > 2 else ""
43
- style_prompt = "Describe the artistic style, lighting, colors, and composition" if style_focus > 2 else ""
44
-
45
- # Combine prompts based on focus areas
46
- combined_prompt = f"{detail_prompt}. {emotion_prompt}. {style_prompt}"
47
-
48
  try:
49
  # Generate both basic and detailed descriptions
50
  with torch.no_grad():
51
  # Get basic caption from BLIP large
52
- basic_outputs = blip_large.generate(**blip_inputs, max_length=50)
 
53
  basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)
54
 
55
- # Get detailed description from BLIP-2
56
- # BLIP-2 requires text input to be processed with the image
57
- text = "a detailed description: " + combined_prompt
58
- blip2_inputs = blip2_processor(image, text=text, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
59
 
60
  outputs = blip2_model.generate(
61
- **blip2_inputs,
62
- max_length=150 + (detail_level * 50),
63
  num_beams=5,
64
  min_length=50,
65
  top_p=0.9,
@@ -106,7 +104,7 @@ def generate_advanced_description(image, detail_level, emotion_focus, style_focu
106
  return formatted_result
107
 
108
  except Exception as e:
109
- return f"Error generating description: {str(e)}"
110
 
111
  # Create Gradio interface
112
  with gr.Blocks(title="Advanced Image Description Generator") as demo:
 
1
  import gradio as gr
2
  import torch
3
  from PIL import Image
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
5
 
6
  # Initial setup
7
  print("Loading models...")
8
 
9
  # Main model for detailed captions
10
+ blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
11
  blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
 
12
 
13
  # Secondary model for emotion and detail detection
14
+ blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
15
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
16
 
17
  # Move models to GPU if available
18
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
  blip2_model.to(device)
20
+ blip_model.to(device)
21
 
22
  print(f"Models loaded. Using device: {device}")
23
 
 
34
  if image is None:
35
  return "Please upload an image to generate a description."
36
 
 
 
 
 
 
 
 
 
 
 
 
37
  try:
38
  # Generate both basic and detailed descriptions
39
  with torch.no_grad():
40
  # Get basic caption from BLIP large
41
+ inputs = blip_processor(image, return_tensors="pt").to(device)
42
+ basic_outputs = blip_model.generate(**inputs, max_length=50)
43
  basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)
44
 
45
+ # Create prompt text based on sliders
46
+ detail_text = f"Describe this image with extreme detail, focus on {'all elements including tiny details' if detail_level > 3 else 'main elements'}"
47
+ emotion_text = "Describe the mood, emotions, and atmosphere conveyed in this image" if emotion_focus > 2 else ""
48
+ style_text = "Describe the artistic style, lighting, colors, and composition" if style_focus > 2 else ""
49
+
50
+ # Combine texts based on focus areas
51
+ prompt_text = f"{detail_text}. {emotion_text}. {style_text}"
52
+
53
+ # Process with BLIP-2
54
+ inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
55
+
56
+ max_length = 150 + (detail_level * 50)
57
 
58
  outputs = blip2_model.generate(
59
+ **inputs,
60
+ max_length=max_length,
61
  num_beams=5,
62
  min_length=50,
63
  top_p=0.9,
 
104
  return formatted_result
105
 
106
  except Exception as e:
107
+ return f"Error generating description: {str(e)}\n\nTraceback: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}"
108
 
109
  # Create Gradio interface
110
  with gr.Blocks(title="Advanced Image Description Generator") as demo: