Afsha001 committed on
Commit
4f06c78
·
verified ·
1 Parent(s): c1c04ba

update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -49
app.py CHANGED
@@ -29,13 +29,9 @@ JINA_KEY = os.environ.get("JINA_KEY", "")
29
 
30
  # ============================================================================
31
  # API ENDPOINTS
32
- # GIT-Large-COCO: raw bytes, no Content-Type (replaces Florence-2-Large)
33
- # Qwen2.5: model-specific endpoint
34
  # Jina: query=plain string, documents=list of data URI strings
35
  # ============================================================================
36
- GIT_URL = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
37
- GIT_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
38
-
39
  QWEN_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
40
  HF_HEADERS = {
41
  "Authorization": f"Bearer {HF_TOKEN}",
@@ -69,11 +65,16 @@ if not JINA_KEY:
69
  st.stop()
70
 
71
  # ============================================================================
72
- # LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
 
 
 
73
  # ============================================================================
74
  @st.cache_resource
75
  def load_local_models():
76
  from transformers import (
 
 
77
  BlipProcessor,
78
  BlipForImageTextRetrieval,
79
  AutoProcessor,
@@ -81,6 +82,19 @@ def load_local_models():
81
  )
82
  gc.collect()
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  blip_processor = BlipProcessor.from_pretrained(
85
  "Salesforce/blip-image-captioning-large"
86
  )
@@ -90,6 +104,7 @@ def load_local_models():
90
  )
91
  blip_itm_model.eval()
92
 
 
93
  dino_processor = AutoProcessor.from_pretrained(
94
  "IDEA-Research/grounding-dino-base"
95
  )
@@ -99,7 +114,7 @@ def load_local_models():
99
  )
100
  dino_model.eval()
101
 
102
- return blip_processor, blip_itm_model, dino_processor, dino_model
103
 
104
  # ============================================================================
105
  # HELPERS
@@ -115,47 +130,31 @@ def image_to_data_uri(image: Image.Image) -> str:
115
  return f"data:image/jpeg;base64,{b64}"
116
 
117
  # ============================================================================
118
- # STEP 1 — GIT-LARGE-COCO: GENERATE 5 CAPTIONS
119
- # Replaces Florence-2-Large (not available on HF serverless API)
120
- # microsoft/git-large-coco gives detailed captions, confirmed on HF API
121
- # Called 5 times with different sampling params for caption diversity
122
  # ============================================================================
123
- def generate_captions_git(image: Image.Image) -> list:
124
- img_bytes = image_to_bytes(image)
125
-
126
- parameter_sets = [
127
- {"max_new_tokens": 50},
128
- {"max_new_tokens": 80},
129
- {"max_new_tokens": 60, "temperature": 1.2, "do_sample": True},
130
- {"max_new_tokens": 70, "temperature": 1.5, "do_sample": True},
131
- {"max_new_tokens": 40, "temperature": 0.8, "do_sample": True},
132
  ]
133
 
134
  captions = []
135
 
136
- for i, params in enumerate(parameter_sets):
137
  try:
138
- response = requests.post(
139
- GIT_URL,
140
- headers=GIT_HEADERS,
141
- data=img_bytes,
142
- params={"wait_for_model": True},
143
- timeout=60
144
- )
145
- if response.status_code == 200:
146
- result = response.json()
147
- if isinstance(result, list):
148
- cap = result[0].get("generated_text", "").strip().lower()
149
- elif isinstance(result, dict):
150
- cap = result.get("generated_text", "").strip().lower()
151
- else:
152
- cap = ""
153
- captions.append(cap if cap else "a scene shown in the image")
154
- else:
155
- st.warning(f"GIT API error {response.status_code}")
156
- captions.append("a scene shown in the image")
157
  except Exception as e:
158
- st.warning(f"GIT exception: {str(e)[:80]}")
159
  captions.append("a scene shown in the image")
160
 
161
  seen, unique = set(), []
@@ -165,6 +164,7 @@ def generate_captions_git(image: Image.Image) -> list:
165
  unique.append(c)
166
  while len(unique) < 5:
167
  unique.append(unique[0])
 
168
  return unique[:5]
169
 
170
  # ============================================================================
@@ -373,7 +373,7 @@ with st.sidebar:
373
  st.markdown("---")
374
  st.markdown("### Pipeline Steps")
375
  st.markdown("""
376
- **1. GIT-Large-COCO** (API)
377
  Generate 5 captions
378
 
379
  **2. BLIP ITM** (Local)
@@ -395,8 +395,8 @@ Object detection
395
  Caption fusion
396
  """)
397
  st.markdown("---")
398
- st.markdown("**Local:** BLIP ITM, DINO")
399
- st.markdown("**API:** GIT-Large, Jina, Qwen2.5")
400
 
401
  # ============================================================================
402
  # MAIN UI
@@ -421,14 +421,14 @@ if uploaded_file is not None:
421
  with col_run:
422
  if st.button("Generate Caption", type="primary", use_container_width=True):
423
 
424
- with st.spinner("Loading local models (first run takes 1-2 min)..."):
425
- blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
426
 
427
  progress = st.progress(0)
428
  status = st.empty()
429
 
430
- status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
431
- captions = generate_captions_git(input_image)
432
  progress.progress(14)
433
 
434
  with st.expander("5 Generated Captions", expanded=True):
@@ -491,4 +491,3 @@ if uploaded_file is not None:
491
  f"line-height:1.6;'>{final}</div>",
492
  unsafe_allow_html=True
493
  )
494
-
 
29
 
30
  # ============================================================================
31
  # API ENDPOINTS
32
+ # Qwen2.5: model-specific endpoint for caption fusion
 
33
  # Jina: query=plain string, documents=list of data URI strings
34
  # ============================================================================
 
 
 
35
  QWEN_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
36
  HF_HEADERS = {
37
  "Authorization": f"Bearer {HF_TOKEN}",
 
65
  st.stop()
66
 
67
  # ============================================================================
68
+ # LOAD LOCAL MODELS
69
+ # Moondream2: caption generation
70
+ # BLIP ITM: image-text matching + cosine similarity
71
+ # DINO: object detection
72
  # ============================================================================
73
  @st.cache_resource
74
  def load_local_models():
75
  from transformers import (
76
+ AutoModelForCausalLM,
77
+ AutoTokenizer,
78
  BlipProcessor,
79
  BlipForImageTextRetrieval,
80
  AutoProcessor,
 
82
  )
83
  gc.collect()
84
 
85
+ # Moondream2 — Vision Language Model for caption generation
86
+ moon_tokenizer = AutoTokenizer.from_pretrained(
87
+ "vikhyatk/moondream2",
88
+ trust_remote_code=True
89
+ )
90
+ moon_model = AutoModelForCausalLM.from_pretrained(
91
+ "vikhyatk/moondream2",
92
+ trust_remote_code=True,
93
+ torch_dtype=torch.float32
94
+ )
95
+ moon_model.eval()
96
+
97
+ # BLIP — for ITM scoring and cosine similarity
98
  blip_processor = BlipProcessor.from_pretrained(
99
  "Salesforce/blip-image-captioning-large"
100
  )
 
104
  )
105
  blip_itm_model.eval()
106
 
107
+ # DINO — for object detection
108
  dino_processor = AutoProcessor.from_pretrained(
109
  "IDEA-Research/grounding-dino-base"
110
  )
 
114
  )
115
  dino_model.eval()
116
 
117
+ return moon_tokenizer, moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
118
 
119
  # ============================================================================
120
  # HELPERS
 
130
  return f"data:image/jpeg;base64,{b64}"
131
 
132
  # ============================================================================
133
+ # STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
134
+ # vikhyatk/moondream2: small VLM (~2GB), runs on CPU
135
+ # 5 different prompts produce diverse caption perspectives
136
+ # No API needed — fully local and reliable
137
  # ============================================================================
138
+ def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
139
+
140
+ prompts = [
141
+ "Describe this image in detail.",
142
+ "What is happening in this image?",
143
+ "Describe the people, objects, and setting in this image.",
144
+ "What do you see in this photograph?",
145
+ "Describe the scene including background and foreground in detail."
 
146
  ]
147
 
148
  captions = []
149
 
150
+ for prompt in prompts:
151
  try:
152
+ enc_image = moon_mod.encode_image(image)
153
+ cap = moon_mod.answer_question(enc_image, prompt, moon_tok)
154
+ cap = cap.strip().lower()
155
+ captions.append(cap if cap else "a scene shown in the image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  except Exception as e:
157
+ st.warning(f"Moondream error: {str(e)[:80]}")
158
  captions.append("a scene shown in the image")
159
 
160
  seen, unique = set(), []
 
164
  unique.append(c)
165
  while len(unique) < 5:
166
  unique.append(unique[0])
167
+
168
  return unique[:5]
169
 
170
  # ============================================================================
 
373
  st.markdown("---")
374
  st.markdown("### Pipeline Steps")
375
  st.markdown("""
376
+ **1. Moondream2** (Local)
377
  Generate 5 captions
378
 
379
  **2. BLIP ITM** (Local)
 
395
  Caption fusion
396
  """)
397
  st.markdown("---")
398
+ st.markdown("**Local:** Moondream2, BLIP ITM, DINO")
399
+ st.markdown("**API:** Jina, Qwen2.5")
400
 
401
  # ============================================================================
402
  # MAIN UI
 
421
  with col_run:
422
  if st.button("Generate Caption", type="primary", use_container_width=True):
423
 
424
+ with st.spinner("Loading local models (first run takes 2-3 min)..."):
425
+ moon_tok, moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
426
 
427
  progress = st.progress(0)
428
  status = st.empty()
429
 
430
+ status.info("Step 1/7: Generating captions with Moondream2...")
431
+ captions = generate_captions_moondream(input_image, moon_tok, moon_mod)
432
  progress.progress(14)
433
 
434
  with st.expander("5 Generated Captions", expanded=True):
 
491
  f"line-height:1.6;'>{final}</div>",
492
  unsafe_allow_html=True
493
  )