Afsha001 committed on
Commit
3465e8b
·
verified ·
1 Parent(s): 61adf2e
Files changed (1) hide show
  1. app.py +11 -23
app.py CHANGED
@@ -66,15 +66,14 @@ if not JINA_KEY:
66
 
67
  # ============================================================================
68
  # LOAD LOCAL MODELS
69
- # Moondream2: caption generation
70
  # BLIP ITM: image-text matching + cosine similarity
71
  # DINO: object detection
72
  # ============================================================================
73
  @st.cache_resource
74
  def load_local_models():
 
75
  from transformers import (
76
- AutoModelForCausalLM,
77
- AutoTokenizer,
78
  BlipProcessor,
79
  BlipForImageTextRetrieval,
80
  AutoProcessor,
@@ -82,17 +81,8 @@ def load_local_models():
82
  )
83
  gc.collect()
84
 
85
- # Moondream2 — Vision Language Model for caption generation
86
- moon_tokenizer = AutoTokenizer.from_pretrained(
87
- "vikhyatk/moondream2",
88
- trust_remote_code=True
89
- )
90
- moon_model = AutoModelForCausalLM.from_pretrained(
91
- "vikhyatk/moondream2",
92
- trust_remote_code=True,
93
- torch_dtype=torch.float32
94
- )
95
- moon_model.eval()
96
 
97
  # BLIP — for ITM scoring and cosine similarity
98
  blip_processor = BlipProcessor.from_pretrained(
@@ -114,7 +104,7 @@ def load_local_models():
114
  )
115
  dino_model.eval()
116
 
117
- return moon_tokenizer, moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
118
 
119
  # ============================================================================
120
  # HELPERS
@@ -131,11 +121,10 @@ def image_to_data_uri(image: Image.Image) -> str:
131
 
132
  # ============================================================================
133
  # STEP 1 β€” MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
134
- # vikhyatk/moondream2 — small VLM (~2GB), runs on CPU
135
  # 5 different prompts produce diverse caption perspectives
136
- # No API needed — fully local and reliable
137
  # ============================================================================
138
- def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
139
 
140
  prompts = [
141
  "Describe this image in detail.",
@@ -149,9 +138,8 @@ def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
149
 
150
  for prompt in prompts:
151
  try:
152
- enc_image = moon_mod.encode_image(image)
153
- cap = moon_mod.answer_question(enc_image, prompt, moon_tok)
154
- cap = cap.strip().lower()
155
  captions.append(cap if cap else "a scene shown in the image")
156
  except Exception as e:
157
  st.warning(f"Moondream error: {str(e)[:80]}")
@@ -422,13 +410,13 @@ if uploaded_file is not None:
422
  if st.button("Generate Caption", type="primary", use_container_width=True):
423
 
424
  with st.spinner("Loading local models (first run takes 2-3 min)..."):
425
- moon_tok, moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
426
 
427
  progress = st.progress(0)
428
  status = st.empty()
429
 
430
  status.info("Step 1/7: Generating captions with Moondream2...")
431
- captions = generate_captions_moondream(input_image, moon_tok, moon_mod)
432
  progress.progress(14)
433
 
434
  with st.expander("5 Generated Captions", expanded=True):
 
66
 
67
  # ============================================================================
68
  # LOAD LOCAL MODELS
69
+ # Moondream2: caption generation via official moondream package
70
  # BLIP ITM: image-text matching + cosine similarity
71
  # DINO: object detection
72
  # ============================================================================
73
  @st.cache_resource
74
  def load_local_models():
75
+ import moondream as md
76
  from transformers import (
 
 
77
  BlipProcessor,
78
  BlipForImageTextRetrieval,
79
  AutoProcessor,
 
81
  )
82
  gc.collect()
83
 
84
+ # Moondream2 — official package avoids transformers version conflict
85
+ moon_model = md.vl(model="moondream-2b")
 
 
 
 
 
 
 
 
 
86
 
87
  # BLIP — for ITM scoring and cosine similarity
88
  blip_processor = BlipProcessor.from_pretrained(
 
104
  )
105
  dino_model.eval()
106
 
107
+ return moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
108
 
109
  # ============================================================================
110
  # HELPERS
 
121
 
122
  # ============================================================================
123
  # STEP 1 β€” MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
124
+ # Official moondream package — no transformers conflict
125
  # 5 different prompts produce diverse caption perspectives
 
126
  # ============================================================================
127
+ def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
128
 
129
  prompts = [
130
  "Describe this image in detail.",
 
138
 
139
  for prompt in prompts:
140
  try:
141
+ result = moon_mod.query(image, prompt)
142
+ cap = result["answer"].strip().lower()
 
143
  captions.append(cap if cap else "a scene shown in the image")
144
  except Exception as e:
145
  st.warning(f"Moondream error: {str(e)[:80]}")
 
410
  if st.button("Generate Caption", type="primary", use_container_width=True):
411
 
412
  with st.spinner("Loading local models (first run takes 2-3 min)..."):
413
+ moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
414
 
415
  progress = st.progress(0)
416
  status = st.empty()
417
 
418
  status.info("Step 1/7: Generating captions with Moondream2...")
419
+ captions = generate_captions_moondream(input_image, moon_mod)
420
  progress.progress(14)
421
 
422
  with st.expander("5 Generated Captions", expanded=True):