Spaces:
Running
Running
Commit ·
ddbec80
1
Parent(s): 4482ecc
Remove project overview and structure analysis sections into tabs
Browse files
app.py
CHANGED
|
@@ -215,48 +215,6 @@ unsafe_allow_html=True
|
|
| 215 |
)
|
| 216 |
|
| 217 |
|
| 218 |
-
st.markdown("""
|
| 219 |
-
### π Project Overview
|
| 220 |
-
|
| 221 |
-
This project focuses on **automatic image caption generation using transformer-based vision-language models**.
|
| 222 |
-
|
| 223 |
-
The system takes an input image and generates a natural language description of the scene.
|
| 224 |
-
|
| 225 |
-
Three architectures are evaluated:
|
| 226 |
-
|
| 227 |
-
• **BLIP (Bootstrapping Language Image Pretraining)** — multimodal transformer designed specifically for vision-language tasks
|
| 228 |
-
• **ViT-GPT2** — Vision Transformer encoder combined with GPT2 text decoder
|
| 229 |
-
• **GIT (Generative Image-to-Text Transformer)** — unified transformer architecture for image-to-text generation
|
| 230 |
-
|
| 231 |
-
The goal of this project is to **compare model architectures, caption quality, and generation performance** using the COCO dataset.
|
| 232 |
-
|
| 233 |
-
---
|
| 234 |
-
|
| 235 |
-
### π― Project Objective
|
| 236 |
-
|
| 237 |
-
Improve caption generation performance through **fine-tuning and decoding optimization**.
|
| 238 |
-
|
| 239 |
-
Training pipeline:
|
| 240 |
-
|
| 241 |
-
**Step 1 — Dataset Preparation**
|
| 242 |
-
- Use **MS COCO captions dataset**
|
| 243 |
-
- Train on a **10k–50k image-caption subset**
|
| 244 |
-
|
| 245 |
-
**Step 2 — Model Fine-Tuning**
|
| 246 |
-
- Fine-tune **BLIP or VisionEncoderDecoder models**
|
| 247 |
-
|
| 248 |
-
**Step 3 — Training Configuration**
|
| 249 |
-
- Train with image resolution **224–384 px**
|
| 250 |
-
- Train for **3 epochs**
|
| 251 |
-
|
| 252 |
-
**Step 4 — Memory Optimization**
|
| 253 |
-
- Use **gradient checkpointing** to reduce GPU memory usage
|
| 254 |
-
|
| 255 |
-
**Step 5 — Target Performance**
|
| 256 |
-
- Achieve **10%+ improvement in CIDEr score** compared to baseline models
|
| 257 |
-
|
| 258 |
-
These steps allow the system to learn stronger **image-text alignment and caption generation capability**.
|
| 259 |
-
""")
|
| 260 |
|
| 261 |
|
| 262 |
# ================================
|
|
@@ -419,51 +377,50 @@ Brighter regions indicate higher importance for the caption generation process.
|
|
| 419 |
# ================================
|
| 420 |
|
| 421 |
st.divider()
|
| 422 |
-
st.header("π Model Architecture Comparison")
|
| 423 |
-
|
| 424 |
-
data = {
|
| 425 |
-
"Model":["BLIP","ViT-GPT2","GIT"],
|
| 426 |
-
"Architecture":[
|
| 427 |
-
"Vision Transformer + Text Decoder",
|
| 428 |
-
"ViT Encoder + GPT2 Decoder",
|
| 429 |
-
"Unified Transformer"
|
| 430 |
-
],
|
| 431 |
-
"Parameters":["~224M","~210M","~150M"],
|
| 432 |
-
"Training Time":["~1h 34m / epoch","~1h 20m / epoch","~11 min / epoch"],
|
| 433 |
-
"CIDEr Score":["0.61","0.60","0.17"]
|
| 434 |
-
}
|
| 435 |
-
|
| 436 |
-
df = pd.DataFrame(data)
|
| 437 |
|
| 438 |
-
st.
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
)
|
| 216 |
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
|
| 220 |
# ================================
|
|
|
|
| 377 |
# ================================
|
| 378 |
|
| 379 |
st.divider()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
+
tab1, tab2 = st.tabs(["π Model Architecture Comparison", "π Experiment Analysis"])
|
| 382 |
+
|
| 383 |
+
with tab1:
|
| 384 |
+
st.header("Model Architecture Comparison")
|
| 385 |
+
|
| 386 |
+
data = {
|
| 387 |
+
"Model":["BLIP","ViT-GPT2","GIT"],
|
| 388 |
+
"Architecture":[
|
| 389 |
+
"Vision Transformer + Text Decoder",
|
| 390 |
+
"ViT Encoder + GPT2 Decoder",
|
| 391 |
+
"Unified Transformer"
|
| 392 |
+
],
|
| 393 |
+
"Parameters":["~224M","~210M","~150M"],
|
| 394 |
+
"Training Time":["~1h 34m / epoch","~1h 20m / epoch","~11 min / epoch"],
|
| 395 |
+
"CIDEr Score":["0.61","0.60","0.17"]
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
df = pd.DataFrame(data)
|
| 399 |
+
|
| 400 |
+
st.table(df)
|
| 401 |
+
|
| 402 |
+
with tab2:
|
| 403 |
+
st.header("Experiment Analysis")
|
| 404 |
+
|
| 405 |
+
st.subheader("Beam Size vs Caption Quality")
|
| 406 |
+
|
| 407 |
+
fig1 = plot_beam_experiment()
|
| 408 |
+
st.pyplot(fig1, use_container_width=True)
|
| 409 |
+
|
| 410 |
+
st.markdown("""
|
| 411 |
+
Beam search controls how many candidate captions are explored during generation.
|
| 412 |
+
Increasing beam size improves caption quality initially but eventually leads to diminishing returns.
|
| 413 |
+
""")
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
st.divider()
|
| 417 |
+
|
| 418 |
+
st.subheader("Caption Length vs Model Performance")
|
| 419 |
+
|
| 420 |
+
fig2 = plot_caption_length()
|
| 421 |
+
st.pyplot(fig2, use_container_width=True)
|
| 422 |
+
|
| 423 |
+
st.markdown("""
|
| 424 |
+
Caption length impacts performance because longer captions require more detailed reasoning about the scene.
|
| 425 |
+
Models generally perform better on shorter captions.
|
| 426 |
+
""")
|