Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 18 days ago

Commit

3465e8b

verified ·

1 Parent(s): 61adf2e

update

Browse files

Files changed (1) hide show

app.py +11 -23

app.py CHANGED Viewed

@@ -66,15 +66,14 @@ if not JINA_KEY:
 # ============================================================================
 # LOAD LOCAL MODELS
-# Moondream2: caption generation
 # BLIP ITM:   image-text matching + cosine similarity
 # DINO:       object detection
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
-        AutoModelForCausalLM,
-        AutoTokenizer,
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoProcessor,
@@ -82,17 +81,8 @@ def load_local_models():
     )
     gc.collect()
-    # Moondream2 — Vision Language Model for caption generation
-    moon_tokenizer = AutoTokenizer.from_pretrained(
-        "vikhyatk/moondream2",
-        trust_remote_code=True
-    )
-    moon_model = AutoModelForCausalLM.from_pretrained(
-        "vikhyatk/moondream2",
-        trust_remote_code=True,
-        torch_dtype=torch.float32
-    )
-    moon_model.eval()
     # BLIP — for ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
@@ -114,7 +104,7 @@ def load_local_models():
     )
     dino_model.eval()
-    return moon_tokenizer, moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
 # ============================================================================
 # HELPERS
@@ -131,11 +121,10 @@ def image_to_data_uri(image: Image.Image) -> str:
 # ============================================================================
 # STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
-# vikhyatk/moondream2 — small VLM (~2GB), runs on CPU
 # 5 different prompts produce diverse caption perspectives
-# No API needed — fully local and reliable
 # ============================================================================
-def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
     prompts = [
         "Describe this image in detail.",
@@ -149,9 +138,8 @@ def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
     for prompt in prompts:
         try:
-            enc_image = moon_mod.encode_image(image)
-            cap = moon_mod.answer_question(enc_image, prompt, moon_tok)
-            cap = cap.strip().lower()
             captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
             st.warning(f"Moondream error: {str(e)[:80]}")
@@ -422,13 +410,13 @@ if uploaded_file is not None:
         if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 2-3 min)..."):
-                moon_tok, moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
             status.info("Step 1/7: Generating captions with Moondream2...")
-            captions = generate_captions_moondream(input_image, moon_tok, moon_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):

 # ============================================================================
 # LOAD LOCAL MODELS
+# Moondream2: caption generation via official moondream package
 # BLIP ITM:   image-text matching + cosine similarity
 # DINO:       object detection
 # ============================================================================
 @st.cache_resource
 def load_local_models():
+    import moondream as md
     from transformers import (
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoProcessor,
     )
     gc.collect()
+    # Moondream2 — official package avoids transformers version conflict
+    moon_model = md.vl(model="moondream-2b")
     # BLIP — for ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
     )
     dino_model.eval()
+    return moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
 # ============================================================================
 # HELPERS
 # ============================================================================
 # STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
+# Official moondream package — no transformers conflict
 # 5 different prompts produce diverse caption perspectives
 # ============================================================================
+def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
     prompts = [
         "Describe this image in detail.",
     for prompt in prompts:
         try:
+            result = moon_mod.query(image, prompt)
+            cap    = result["answer"].strip().lower()
             captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
             st.warning(f"Moondream error: {str(e)[:80]}")
         if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 2-3 min)..."):
+                moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
             status.info("Step 1/7: Generating captions with Moondream2...")
+            captions = generate_captions_moondream(input_image, moon_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):