Afsha001 committed on
Commit
4b24723
·
verified ·
1 Parent(s): c2bd5ca

update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -15
app.py CHANGED
@@ -22,8 +22,8 @@ st.set_page_config(
22
  # ============================================================================
23
  # CREDENTIALS
24
  # ============================================================================
25
- JINA_KEY = os.environ.get("JINA_KEY", "")
26
- GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
27
 
28
  JINA_URL = "https://api.jina.ai/v1/rerank"
29
  JINA_HEADERS = {
@@ -56,14 +56,11 @@ if not GOOGLE_API_KEY:
56
  st.error("GOOGLE_API_KEY missing. Go to Space Settings β†’ Secrets and add it.")
57
  st.stop()
58
 
59
- # Configure Gemini API
60
  genai.configure(api_key=GOOGLE_API_KEY)
61
 
62
  # ============================================================================
63
  # LOAD LOCAL MODELS
64
- # Florence-2-Large removed — replaced by Gemini 1.5 Flash API
65
- # Saves 1.6GB RAM and 2-3 min startup time
66
- # Local: BLIP ITM, DINO, Qwen2.5
67
  # ============================================================================
68
  @st.cache_resource
69
  def load_local_models():
@@ -77,7 +74,6 @@ def load_local_models():
77
  )
78
  gc.collect()
79
 
80
- # BLIP — ITM scoring and cosine similarity
81
  blip_processor = BlipProcessor.from_pretrained(
82
  "Salesforce/blip-image-captioning-large"
83
  )
@@ -87,7 +83,6 @@ def load_local_models():
87
  )
88
  blip_itm_model.eval()
89
 
90
- # DINO — object detection
91
  dino_processor = AutoProcessor.from_pretrained(
92
  "IDEA-Research/grounding-dino-base"
93
  )
@@ -97,7 +92,6 @@ def load_local_models():
97
  )
98
  dino_model.eval()
99
 
100
- # Qwen2.5-1.5B — caption fusion
101
  qwen_tokenizer = AutoTokenizer.from_pretrained(
102
  "Qwen/Qwen2.5-1.5B-Instruct"
103
  )
@@ -127,17 +121,14 @@ def image_to_data_uri(image: Image.Image) -> str:
127
  return f"data:image/jpeg;base64,{b64}"
128
 
129
  # ============================================================================
130
- # STEP 1 — GEMINI 1.5 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
131
- # 5 different prompts — each focuses on a different aspect of the image
132
- # Gemini sees the image directly as a VLM — no hallucination from task tokens
133
- # API response ~2-4 sec per caption — 5 captions in ~15-20 sec total
134
  # ============================================================================
135
  def generate_captions_gemini(image: Image.Image) -> list:
136
 
137
  model = genai.GenerativeModel("gemini-2.0-flash")
138
 
139
  prompts = [
140
- "Describe this image in detail covering the overall scene.",
141
  "Describe the people in this image β€” their clothing colors, style, and what they are doing.",
142
  "Describe the background, setting, and surroundings visible in this image.",
143
  "Describe all the objects, plants, and items visible around the people in this image.",
@@ -155,7 +146,6 @@ def generate_captions_gemini(image: Image.Image) -> list:
155
  st.warning(f"Gemini error: {str(e)[:80]}")
156
  captions.append("a scene shown in the image")
157
 
158
- # Deduplicate while keeping order
159
  seen, unique = set(), []
160
  for c in captions:
161
  if c not in seen:
 
22
  # ============================================================================
23
  # CREDENTIALS
24
  # ============================================================================
25
+ JINA_KEY = os.environ.get("JINA_KEY", "")
26
+ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
27
 
28
  JINA_URL = "https://api.jina.ai/v1/rerank"
29
  JINA_HEADERS = {
 
56
  st.error("GOOGLE_API_KEY missing. Go to Space Settings β†’ Secrets and add it.")
57
  st.stop()
58
 
59
+ # Configure Gemini — after GOOGLE_API_KEY is defined
60
  genai.configure(api_key=GOOGLE_API_KEY)
61
 
62
  # ============================================================================
63
  # LOAD LOCAL MODELS
 
 
 
64
  # ============================================================================
65
  @st.cache_resource
66
  def load_local_models():
 
74
  )
75
  gc.collect()
76
 
 
77
  blip_processor = BlipProcessor.from_pretrained(
78
  "Salesforce/blip-image-captioning-large"
79
  )
 
83
  )
84
  blip_itm_model.eval()
85
 
 
86
  dino_processor = AutoProcessor.from_pretrained(
87
  "IDEA-Research/grounding-dino-base"
88
  )
 
92
  )
93
  dino_model.eval()
94
 
 
95
  qwen_tokenizer = AutoTokenizer.from_pretrained(
96
  "Qwen/Qwen2.5-1.5B-Instruct"
97
  )
 
121
  return f"data:image/jpeg;base64,{b64}"
122
 
123
  # ============================================================================
124
+ # STEP 1 — GEMINI 2.0 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
 
 
 
125
  # ============================================================================
126
  def generate_captions_gemini(image: Image.Image) -> list:
127
 
128
  model = genai.GenerativeModel("gemini-2.0-flash")
129
 
130
  prompts = [
131
+ "Describe this image in detail covering the overall scene with every possible detail in simple language.",
132
  "Describe the people in this image β€” their clothing colors, style, and what they are doing.",
133
  "Describe the background, setting, and surroundings visible in this image.",
134
  "Describe all the objects, plants, and items visible around the people in this image.",
 
146
  st.warning(f"Gemini error: {str(e)[:80]}")
147
  captions.append("a scene shown in the image")
148
 
 
149
  seen, unique = set(), []
150
  for c in captions:
151
  if c not in seen: