Spaces:
Running
Running
replace Qwen2-VL with Florence-2 API for caption generation
Browse files
app.py
CHANGED
|
@@ -19,7 +19,7 @@ JINA_KEY = os.environ.get("JINA_KEY", "")
|
|
| 19 |
DEVICE = "cpu"
|
| 20 |
|
| 21 |
# ── Correct API endpoints ──
|
| 22 |
-
|
| 23 |
QWEN_LM_URL = "https://api-inference.huggingface.co/v1/chat/completions"
|
| 24 |
JINA_URL = "https://api.jina.ai/v1/rerank"
|
| 25 |
HF_HEADERS = {"Authorization": "Bearer " + HF_TOKEN, "Content-Type": "application/json"}
|
|
@@ -254,7 +254,7 @@ with st.sidebar:
|
|
| 254 |
st.title(" Image Caption Fusion")
|
| 255 |
st.markdown("---")
|
| 256 |
st.markdown("### Pipeline Steps")
|
| 257 |
-
st.markdown("1.
|
| 258 |
st.markdown("2. BLIP ITM — Image-text matching")
|
| 259 |
st.markdown("3. Jina Reranker M0 — Semantic reranking")
|
| 260 |
st.markdown("4. Cosine Similarity — Embedding similarity")
|
|
@@ -263,7 +263,7 @@ with st.sidebar:
|
|
| 263 |
st.markdown("7. Qwen2.5-1.5B — Caption fusion")
|
| 264 |
st.markdown("---")
|
| 265 |
st.markdown("**Local:** BLIP ITM, DINO")
|
| 266 |
-
st.markdown("**API:**
|
| 267 |
|
| 268 |
# ── MAIN UI ──
|
| 269 |
st.title(" Image Caption Fusion System")
|
|
@@ -285,7 +285,7 @@ if uploaded:
|
|
| 285 |
progress = st.progress(0)
|
| 286 |
status = st.empty()
|
| 287 |
|
| 288 |
-
status.info(" Step 1/7 — Generating
|
| 289 |
captions = generate_captions_api(image)
|
| 290 |
progress.progress(14)
|
| 291 |
with st.expander(" 5 Generated Captions"):
|
|
|
|
| 19 |
DEVICE = "cpu"
|
| 20 |
|
| 21 |
# ── Correct API endpoints ──
|
| 22 |
+
FLORENCE_URL = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
|
| 23 |
QWEN_LM_URL = "https://api-inference.huggingface.co/v1/chat/completions"
|
| 24 |
JINA_URL = "https://api.jina.ai/v1/rerank"
|
| 25 |
HF_HEADERS = {"Authorization": "Bearer " + HF_TOKEN, "Content-Type": "application/json"}
|
|
|
|
| 254 |
st.title(" Image Caption Fusion")
|
| 255 |
st.markdown("---")
|
| 256 |
st.markdown("### Pipeline Steps")
|
| 257 |
+
st.markdown("1. Florence-2 — Generate 4 captions + BLIP local")
|
| 258 |
st.markdown("2. BLIP ITM — Image-text matching")
|
| 259 |
st.markdown("3. Jina Reranker M0 — Semantic reranking")
|
| 260 |
st.markdown("4. Cosine Similarity — Embedding similarity")
|
|
|
|
| 263 |
st.markdown("7. Qwen2.5-1.5B — Caption fusion")
|
| 264 |
st.markdown("---")
|
| 265 |
st.markdown("**Local:** BLIP ITM, DINO")
|
| 266 |
+
st.markdown("**API:** Florence-2, Jina, Qwen2.5")
|
| 267 |
|
| 268 |
# ── MAIN UI ──
|
| 269 |
st.title(" Image Caption Fusion System")
|
|
|
|
| 285 |
progress = st.progress(0)
|
| 286 |
status = st.empty()
|
| 287 |
|
| 288 |
+
status.info(" Step 1/7 — Generating captions with Florence-2 + BLIP...")
|
| 289 |
captions = generate_captions_api(image)
|
| 290 |
progress.progress(14)
|
| 291 |
with st.expander(" 5 Generated Captions"):
|