Afsha001 committed on
Commit
0ae3fd7
·
verified ·
1 Parent(s): 4b24723

update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -19
app.py CHANGED
@@ -56,11 +56,13 @@ if not GOOGLE_API_KEY:
56
  st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
57
  st.stop()
58
 
59
- # Configure Gemini — after GOOGLE_API_KEY is defined
60
  genai.configure(api_key=GOOGLE_API_KEY)
61
 
62
  # ============================================================================
63
  # LOAD LOCAL MODELS
 
 
64
  # ============================================================================
65
  @st.cache_resource
66
  def load_local_models():
@@ -74,6 +76,7 @@ def load_local_models():
74
  )
75
  gc.collect()
76
 
 
77
  blip_processor = BlipProcessor.from_pretrained(
78
  "Salesforce/blip-image-captioning-large"
79
  )
@@ -83,6 +86,7 @@ def load_local_models():
83
  )
84
  blip_itm_model.eval()
85
 
 
86
  dino_processor = AutoProcessor.from_pretrained(
87
  "IDEA-Research/grounding-dino-base"
88
  )
@@ -92,6 +96,7 @@ def load_local_models():
92
  )
93
  dino_model.eval()
94
 
 
95
  qwen_tokenizer = AutoTokenizer.from_pretrained(
96
  "Qwen/Qwen2.5-1.5B-Instruct"
97
  )
@@ -121,30 +126,49 @@ def image_to_data_uri(image: Image.Image) -> str:
121
  return f"data:image/jpeg;base64,{b64}"
122
 
123
  # ============================================================================
124
- # STEP 1 — GEMINI 2.0 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
 
 
125
  # ============================================================================
126
  def generate_captions_gemini(image: Image.Image) -> list:
127
 
128
  model = genai.GenerativeModel("gemini-2.0-flash")
129
 
130
- prompts = [
131
- "Describe this image in detail covering the overall scene with every possible detail in simple language.",
132
- "Describe the people in this image — their clothing colors, style, and what they are doing.",
133
- "Describe the background, setting, and surroundings visible in this image.",
134
- "Describe all the objects, plants, and items visible around the people in this image.",
135
- "Write a full description of this image covering who is in it, what is happening, their appearance, and where it takes place."
136
- ]
137
 
138
- captions = []
 
 
 
 
139
 
140
- for prompt in prompts:
141
- try:
142
- response = model.generate_content([prompt, image])
143
- cap = response.text.strip().lower()
144
- captions.append(cap if cap else "a scene shown in the image")
145
- except Exception as e:
146
- st.warning(f"Gemini error: {str(e)[:80]}")
147
- captions.append("a scene shown in the image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  seen, unique = set(), []
150
  for c in captions:
@@ -245,7 +269,7 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
245
  return [0.0] * len(captions)
246
 
247
  # ============================================================================
248
- # STEP 5 — MAJORITY VOTING
249
  # ============================================================================
250
  def majority_voting(captions, itm, jina, cosine) -> tuple:
251
  itm_r = np.argsort(itm)[::-1]
 
56
  st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
57
  st.stop()
58
 
59
+ # Configure Gemini after credentials are defined
60
  genai.configure(api_key=GOOGLE_API_KEY)
61
 
62
  # ============================================================================
63
  # LOAD LOCAL MODELS
64
+ # Local: BLIP ITM, DINO, Qwen2.5
65
+ # API: Gemini 2.0 Flash, Jina Reranker
66
  # ============================================================================
67
  @st.cache_resource
68
  def load_local_models():
 
76
  )
77
  gc.collect()
78
 
79
+ # BLIP — ITM scoring and cosine similarity
80
  blip_processor = BlipProcessor.from_pretrained(
81
  "Salesforce/blip-image-captioning-large"
82
  )
 
86
  )
87
  blip_itm_model.eval()
88
 
89
+ # DINO — object detection
90
  dino_processor = AutoProcessor.from_pretrained(
91
  "IDEA-Research/grounding-dino-base"
92
  )
 
96
  )
97
  dino_model.eval()
98
 
99
+ # Qwen2.5-1.5B — caption fusion
100
  qwen_tokenizer = AutoTokenizer.from_pretrained(
101
  "Qwen/Qwen2.5-1.5B-Instruct"
102
  )
 
126
  return f"data:image/jpeg;base64,{b64}"
127
 
128
  # ============================================================================
129
+ # STEP 1 — GEMINI 2.0 FLASH: GENERATE 5 DIVERSE CAPTIONS
130
+ # Single API call — all 5 captions in one request
131
+ # Avoids 429 rate limit that occurred with 5 separate calls
132
  # ============================================================================
133
  def generate_captions_gemini(image: Image.Image) -> list:
134
 
135
  model = genai.GenerativeModel("gemini-2.0-flash")
136
 
137
+ prompt = """Look at this image carefully and write 5 different captions from different perspectives.
 
 
 
 
 
 
138
 
139
+ 1. Overall scene: describing the image in every possible detail in simple language.
140
+ 2. People: Describe the people, their clothing colors, style, and what they are doing.
141
+ 3. Background: Describe the background, setting, and surroundings.
142
+ 4. Objects: Describe the objects, plants, and items visible in the image.
143
+ 5. Full description: A complete description covering who is in the image, what they are doing, their appearance, and where the scene takes place.
144
 
145
+ Reply in this exact format:
146
+ CAPTION_1: [your caption here]
147
+ CAPTION_2: [your caption here]
148
+ CAPTION_3: [your caption here]
149
+ CAPTION_4: [your caption here]
150
+ CAPTION_5: [your caption here]"""
151
+
152
+ try:
153
+ response = model.generate_content([prompt, image])
154
+ raw_text = response.text.strip()
155
+
156
+ captions = []
157
+ for i in range(1, 6):
158
+ marker = f"CAPTION_{i}:"
159
+ next_marker = f"CAPTION_{i+1}:" if i < 5 else None
160
+
161
+ if marker in raw_text:
162
+ start = raw_text.index(marker) + len(marker)
163
+ end = raw_text.index(next_marker) if next_marker and next_marker in raw_text else len(raw_text)
164
+ cap = raw_text[start:end].strip().lower()
165
+ captions.append(cap if cap else "a scene shown in the image")
166
+ else:
167
+ captions.append("a scene shown in the image")
168
+
169
+ except Exception as e:
170
+ st.warning(f"Gemini error: {str(e)[:80]}")
171
+ captions = ["a scene shown in the image"] * 5
172
 
173
  seen, unique = set(), []
174
  for c in captions:
 
269
  return [0.0] * len(captions)
270
 
271
  # ============================================================================
272
+ # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
273
  # ============================================================================
274
  def majority_voting(captions, itm, jina, cosine) -> tuple:
275
  itm_r = np.argsort(itm)[::-1]