Spaces:
Running
on
Zero
Running
on
Zero
File size: 13,691 Bytes
ed076e1 3c69511 dbd3ab1 b1c5b87 dbd3ab1 c17c199 dbd3ab1 c17c199 3c69511 c17c199 3c69511 c17c199 3c69511 c17c199 3c69511 c17c199 9863532 3c69511 c17c199 dbd3ab1 c17c199 3c69511 dbd3ab1 b1c5b87 dbd3ab1 b1c5b87 3c69511 b1c5b87 d898359 c17c199 dbd3ab1 b1c5b87 d898359 ed076e1 f661be0 ed076e1 f661be0 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 b1c5b87 ed076e1 dbd3ab1 ed076e1 3c69511 ed076e1 c220cef ed076e1 dbd3ab1 b1c5b87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 |
import os
import torch
import gradio as gr
import spaces
from transformers import pipeline
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
OUTPUT_TOKENS = 256
original_pipeline = None
ft_pipe = None
FORCE_CPU = os.getenv("FORCE_CPU", "0") == "1"
DEVICE_TYPE = "cuda" if (torch.cuda.is_available() and not FORCE_CPU) else "cpu"
if DEVICE_TYPE == "cuda":
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
def _get_dtype(device: str):
if device == "cuda":
if os.getenv("USE_FP16", "0") == "1":
return torch.float16
if os.getenv("USE_BF16", "0") == "1":
is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
if callable(is_bf16_supported) and is_bf16_supported():
return torch.bfloat16
return torch.float32
return torch.float32
def _make_pipe(model_id: str, device_type: str):
dtype = _get_dtype(device_type)
device_arg = 0 if device_type == "cuda" else -1
pipe = pipeline(
"image-text-to-text",
model=model_id,
device=device_arg,
torch_dtype=dtype,
)
model = getattr(pipe, "model", None)
generation_config = getattr(model, "generation_config", None)
if generation_config is not None:
generation_config.do_sample = False
generation_config.max_new_tokens = OUTPUT_TOKENS
try:
generation_config.max_length = None
except Exception:
pass
return pipe
ACTIVE_DEVICE_TYPE = DEVICE_TYPE
def _load_pipes(device_type: str):
global original_pipeline, ft_pipe, ACTIVE_DEVICE_TYPE
ACTIVE_DEVICE_TYPE = device_type
print(f"[INFO] Using device_type={ACTIVE_DEVICE_TYPE}")
original_pipeline = _make_pipe(BASE_MODEL_ID, ACTIVE_DEVICE_TYPE)
ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID, ACTIVE_DEVICE_TYPE)
_load_pipes(DEVICE_TYPE)
def _extract_generated_text(pipe_output) -> str:
try:
item0 = pipe_output[0]
if isinstance(item0, dict) and "generated_text" in item0:
gt = item0["generated_text"]
else:
gt = pipe_output[0][0]["generated_text"]
if isinstance(gt, str):
return gt
if isinstance(gt, list) and gt:
last = gt[-1]
if isinstance(last, dict) and "content" in last:
return last["content"]
return str(gt)
except Exception:
return str(pipe_output)
def create_message(input_image):
return [{'role': 'user',
'content': [{'type': 'image',
'image': input_image},
{'type': 'text',
'text': "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"}]}]
@spaces.GPU
def extract_foods_from_image(input_image):
if input_image is None:
return "Please upload an image", "Please upload an image"
input_image = input_image.convert("RGB")
input_image = input_image.resize(size=(512, 512))
input_message = create_message(input_image=input_image)
try:
original_pipeline_output = original_pipeline(text=[input_message])
outputs_pretrained = _extract_generated_text(original_pipeline_output)
ft_pipe_output = ft_pipe(text=[input_message])
outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
except RuntimeError as e:
msg = str(e)
is_cuda_linear_failure = (
"CUBLAS_STATUS_INVALID_VALUE" in msg
or "cublasGemmEx" in msg
or ("CUDA error" in msg and "CUBLAS" in msg)
)
if ACTIVE_DEVICE_TYPE == "cuda" and is_cuda_linear_failure:
try:
print("[WARN] CUDA GEMM failed, falling back to CPU.")
_load_pipes("cpu")
if torch.cuda.is_available():
torch.cuda.empty_cache()
original_pipeline_output = original_pipeline(text=[input_message])
outputs_pretrained = _extract_generated_text(original_pipeline_output)
ft_pipe_output = ft_pipe(text=[input_message])
outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
except Exception:
raise e
else:
raise
return outputs_pretrained, outputs_fine_tuned
CUSTOM_CSS = """
/* Global Theme */
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}
/* Header Styling */
.header-container {
text-align: center;
padding: 2rem 1rem;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 16px;
margin-bottom: 1.5rem;
box-shadow: 0 4px 20px rgba(102, 126, 234, 0.3);
}
.header-title {
font-size: 2.2rem !important;
font-weight: 700 !important;
color: white !important;
margin-bottom: 0.5rem !important;
text-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.header-subtitle {
font-size: 1.1rem !important;
color: rgba(255,255,255,0.9) !important;
font-weight: 400 !important;
}
/* Card Styling */
.info-card {
background: linear-gradient(145deg, #f8fafc 0%, #f1f5f9 100%);
border: 1px solid #e2e8f0;
border-radius: 12px;
padding: 1.25rem;
margin-bottom: 1rem;
transition: all 0.3s ease;
}
.info-card:hover {
box-shadow: 0 4px 12px rgba(0,0,0,0.08);
transform: translateY(-2px);
}
.tech-badge {
display: inline-block;
background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);
color: white;
padding: 0.35rem 0.75rem;
border-radius: 20px;
font-size: 0.8rem;
font-weight: 500;
margin: 0.2rem;
box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3);
}
/* Output Section */
.output-section {
background: #fafbfc;
border-radius: 12px;
padding: 1rem;
border: 1px solid #e5e7eb;
}
.comparison-header {
font-size: 0.9rem;
font-weight: 600;
color: #374151;
margin-bottom: 0.5rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
/* Button Styling */
.primary-btn {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
padding: 0.75rem 2rem !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}
.primary-btn:hover {
transform: translateY(-2px) !important;
box-shadow: 0 6px 20px rgba(102, 126, 234, 0.5) !important;
}
/* Links */
.resource-link {
color: #4f46e5;
text-decoration: none;
font-weight: 500;
transition: color 0.2s;
}
.resource-link:hover {
color: #7c3aed;
text-decoration: underline;
}
/* Footer */
.footer-section {
text-align: center;
padding: 1.5rem;
margin-top: 1.5rem;
border-top: 1px solid #e5e7eb;
color: #6b7280;
font-size: 0.9rem;
}
/* Accordion */
.accordion-header {
font-weight: 600 !important;
color: #1f2937 !important;
}
"""
with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft(), title="FoodExtract Vision | Fine-tuned VLM Demo") as demo:
# Header Section
gr.HTML("""
<div class="header-container">
<h1 class="header-title">๐ฝ๏ธ FoodExtract Vision</h1>
<p class="header-subtitle">Fine-tuned SmolVLM2-500M for Structured Food & Drink Extraction</p>
</div>
""")
# Project Overview Cards
with gr.Row():
with gr.Column(scale=1):
gr.HTML("""
<div class="info-card">
<h3 style="margin: 0 0 0.75rem 0; color: #1f2937; font-size: 1rem;">๐ฏ Project Objective</h3>
<p style="margin: 0; color: #4b5563; font-size: 0.9rem; line-height: 1.6;">
Extract food and drink items from images in a <strong>structured JSON format</strong>.
This demo compares the base model vs. fine-tuned model to showcase the improvement in output consistency.
</p>
</div>
""")
with gr.Column(scale=1):
gr.HTML("""
<div class="info-card">
<h3 style="margin: 0 0 0.75rem 0; color: #1f2937; font-size: 1rem;">๐ ๏ธ Tech Stack</h3>
<div>
<span class="tech-badge">SmolVLM2-500M</span>
<span class="tech-badge">Transformers</span>
<span class="tech-badge">PyTorch</span>
<span class="tech-badge">Gradio</span>
<span class="tech-badge">HF Spaces</span>
</div>
</div>
""")
# Resources Section
with gr.Accordion("๐ Model & Dataset Resources", open=False):
gr.Markdown("""
| Resource | Link |
|----------|------|
| **Base Model** | [HuggingFaceTB/SmolVLM2-500M-Video-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) |
| **Fine-tuned Model** | [CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune](https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune) |
| **Training Dataset** | [mrdbourke/FoodExtract-1k-Vision](https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision) (1k food + 500 non-food images) |
""")
gr.Markdown("---")
# Main Demo Section
gr.HTML('<h2 style="text-align: center; color: #1f2937; margin-bottom: 1rem;">๐ฌ Live Demo: Model Comparison</h2>')
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(type="pil", label="๐ท Upload Food Image", height=350)
submit_btn = gr.Button("๐ Extract Food Items", variant="primary", elem_classes=["primary-btn"])
with gr.Column(scale=1):
with gr.Group():
gr.HTML('<div class="comparison-header">โ Base Model (No Fine-tuning)</div>')
output_original = gr.Textbox(
lines=6,
label="",
placeholder="Base model output will appear here...",
show_label=False
)
with gr.Group():
gr.HTML('<div class="comparison-header">โ
Fine-tuned Model</div>')
output_finetuned = gr.Textbox(
lines=6,
label="",
placeholder="Fine-tuned model output will appear here...",
show_label=False
)
submit_btn.click(
fn=extract_foods_from_image,
inputs=[input_image],
outputs=[output_original, output_finetuned]
)
# Examples Section
gr.Markdown("### ๐ธ Example Images")
gr.Examples(
examples=[
["examples/food1.jpeg"],
["examples/food2.jpg"],
["examples/food4.jpeg"]
],
inputs=[input_image],
outputs=[output_original, output_finetuned],
fn=extract_foods_from_image,
cache_examples=False
)
gr.Markdown("---")
# Technical Details Accordion
with gr.Accordion("๐ Input Prompt & Expected Output Format", open=False):
gr.Markdown("""
**Both models receive the same input prompt:**
```text
Classify the given input image into food or not and if edible food or drink items
are present, extract those to a list. If no food/drink items are visible, return empty lists.
Only return valid JSON in the following form:
```
**Expected JSON Output Structure:**
```json
{
"is_food": 1,
"image_title": "Fresh Garden Salad",
"food_items": ["lettuce", "tomatoes", "cucumber"],
"drink_items": []
}
```
""")
with gr.Accordion("๐ฎ Future Improvements", open=False):
gr.Markdown("""
- **Remove Input Prompt**: Train the model for direct image โ JSON conversion to reduce inference tokens
- **Expand Training Data**: Current dataset is limited to 1.5k images; real-world data would improve generalization
- **Fix Repetitive Generation**: Address occasional repetitive outputs (e.g., "onions", "onions", "onions")
- **Multi-language Support**: Extend to support food extraction in multiple languages
""")
# Footer
gr.HTML("""
<div class="footer-section">
<p style="margin: 0;">Built with โค๏ธ by <strong>Jarvis Zhang</strong> |
<a href="https://huggingface.co/CreatorJarvis" target="_blank" style="color: #4f46e5;">๐ค Hugging Face</a> |
<a href="https://github.com/JarvisZhang24" target="_blank" style="color: #4f46e5;">๐ป GitHub</a>
</p>
<p style="margin: 0.5rem 0 0 0; font-size: 0.8rem; color: #9ca3af;">Fine-tuning Demo โข Vision Language Model โข Structured Output Generation</p>
</div>
""")
if __name__ == "__main__":
demo.launch(share=False)
|