Spaces:
Running
Running
update
Browse files
app.py
CHANGED
|
@@ -66,15 +66,14 @@ if not JINA_KEY:
|
|
| 66 |
|
| 67 |
# ============================================================================
|
| 68 |
# LOAD LOCAL MODELS
|
| 69 |
-
# Moondream2: caption generation
|
| 70 |
# BLIP ITM: image-text matching + cosine similarity
|
| 71 |
# DINO: object detection
|
| 72 |
# ============================================================================
|
| 73 |
@st.cache_resource
|
| 74 |
def load_local_models():
|
|
|
|
| 75 |
from transformers import (
|
| 76 |
-
AutoModelForCausalLM,
|
| 77 |
-
AutoTokenizer,
|
| 78 |
BlipProcessor,
|
| 79 |
BlipForImageTextRetrieval,
|
| 80 |
AutoProcessor,
|
|
@@ -82,17 +81,8 @@ def load_local_models():
|
|
| 82 |
)
|
| 83 |
gc.collect()
|
| 84 |
|
| 85 |
-
# Moondream2 —
|
| 86 |
-
|
| 87 |
-
"vikhyatk/moondream2",
|
| 88 |
-
trust_remote_code=True
|
| 89 |
-
)
|
| 90 |
-
moon_model = AutoModelForCausalLM.from_pretrained(
|
| 91 |
-
"vikhyatk/moondream2",
|
| 92 |
-
trust_remote_code=True,
|
| 93 |
-
torch_dtype=torch.float32
|
| 94 |
-
)
|
| 95 |
-
moon_model.eval()
|
| 96 |
|
| 97 |
# BLIP — for ITM scoring and cosine similarity
|
| 98 |
blip_processor = BlipProcessor.from_pretrained(
|
|
@@ -114,7 +104,7 @@ def load_local_models():
|
|
| 114 |
)
|
| 115 |
dino_model.eval()
|
| 116 |
|
| 117 |
-
return
|
| 118 |
|
| 119 |
# ============================================================================
|
| 120 |
# HELPERS
|
|
@@ -131,11 +121,10 @@ def image_to_data_uri(image: Image.Image) -> str:
|
|
| 131 |
|
| 132 |
# ============================================================================
|
| 133 |
# STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
|
| 134 |
-
#
|
| 135 |
# 5 different prompts produce diverse caption perspectives
|
| 136 |
-
# No API needed — fully local and reliable
|
| 137 |
# ============================================================================
|
| 138 |
-
def generate_captions_moondream(image: Image.Image,
|
| 139 |
|
| 140 |
prompts = [
|
| 141 |
"Describe this image in detail.",
|
|
@@ -149,9 +138,8 @@ def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
|
|
| 149 |
|
| 150 |
for prompt in prompts:
|
| 151 |
try:
|
| 152 |
-
|
| 153 |
-
cap
|
| 154 |
-
cap = cap.strip().lower()
|
| 155 |
captions.append(cap if cap else "a scene shown in the image")
|
| 156 |
except Exception as e:
|
| 157 |
st.warning(f"Moondream error: {str(e)[:80]}")
|
|
@@ -422,13 +410,13 @@ if uploaded_file is not None:
|
|
| 422 |
if st.button("Generate Caption", type="primary", use_container_width=True):
|
| 423 |
|
| 424 |
with st.spinner("Loading local models (first run takes 2-3 min)..."):
|
| 425 |
-
|
| 426 |
|
| 427 |
progress = st.progress(0)
|
| 428 |
status = st.empty()
|
| 429 |
|
| 430 |
status.info("Step 1/7: Generating captions with Moondream2...")
|
| 431 |
-
captions = generate_captions_moondream(input_image,
|
| 432 |
progress.progress(14)
|
| 433 |
|
| 434 |
with st.expander("5 Generated Captions", expanded=True):
|
|
|
|
| 66 |
|
| 67 |
# ============================================================================
|
| 68 |
# LOAD LOCAL MODELS
|
| 69 |
+
# Moondream2: caption generation via official moondream package
|
| 70 |
# BLIP ITM: image-text matching + cosine similarity
|
| 71 |
# DINO: object detection
|
| 72 |
# ============================================================================
|
| 73 |
@st.cache_resource
|
| 74 |
def load_local_models():
|
| 75 |
+
import moondream as md
|
| 76 |
from transformers import (
|
|
|
|
|
|
|
| 77 |
BlipProcessor,
|
| 78 |
BlipForImageTextRetrieval,
|
| 79 |
AutoProcessor,
|
|
|
|
| 81 |
)
|
| 82 |
gc.collect()
|
| 83 |
|
| 84 |
+
# Moondream2 — official package avoids transformers version conflict
|
| 85 |
+
moon_model = md.vl(model="moondream-2b")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
# BLIP — for ITM scoring and cosine similarity
|
| 88 |
blip_processor = BlipProcessor.from_pretrained(
|
|
|
|
| 104 |
)
|
| 105 |
dino_model.eval()
|
| 106 |
|
| 107 |
+
return moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
|
| 108 |
|
| 109 |
# ============================================================================
|
| 110 |
# HELPERS
|
|
|
|
| 121 |
|
| 122 |
# ============================================================================
|
| 123 |
# STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
|
| 124 |
+
# Official moondream package — no transformers conflict
|
| 125 |
# 5 different prompts produce diverse caption perspectives
|
|
|
|
| 126 |
# ============================================================================
|
| 127 |
+
def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
|
| 128 |
|
| 129 |
prompts = [
|
| 130 |
"Describe this image in detail.",
|
|
|
|
| 138 |
|
| 139 |
for prompt in prompts:
|
| 140 |
try:
|
| 141 |
+
result = moon_mod.query(image, prompt)
|
| 142 |
+
cap = result["answer"].strip().lower()
|
|
|
|
| 143 |
captions.append(cap if cap else "a scene shown in the image")
|
| 144 |
except Exception as e:
|
| 145 |
st.warning(f"Moondream error: {str(e)[:80]}")
|
|
|
|
| 410 |
if st.button("Generate Caption", type="primary", use_container_width=True):
|
| 411 |
|
| 412 |
with st.spinner("Loading local models (first run takes 2-3 min)..."):
|
| 413 |
+
moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
|
| 414 |
|
| 415 |
progress = st.progress(0)
|
| 416 |
status = st.empty()
|
| 417 |
|
| 418 |
status.info("Step 1/7: Generating captions with Moondream2...")
|
| 419 |
+
captions = generate_captions_moondream(input_image, moon_mod)
|
| 420 |
progress.progress(14)
|
| 421 |
|
| 422 |
with st.expander("5 Generated Captions", expanded=True):
|