Spaces:

berkeruveyik
/

FoodExtract-Vision

Sleeping

File size: 4,449 Bytes
import torch
import gradio as gr
import spaces
from transformers import pipeline

BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
FINE_TUNED_MODEL_ID = 'berkeruveyik/FoodExtraqt-Vision-SmoLVLM2-500M-fine-tune-v3'
OUTPUT_TOKENS = 256

print(f"[INFO] Loading Original Model")
original_pipeline = pipeline("image-text-to-text", model=BASE_MODEL_ID, dtype=torch.bfloat16, device_map="auto")

print(f"[INFO] Loading Fine-tuned Model")
ft_pipe = pipeline("image-text-to-text", model=FINE_TUNED_MODEL_ID, dtype=torch.bfloat16, device_map="auto")

def create_message(input_image):
    return [{'role': 'user', 'content': [{'type': 'image', 'image': input_image}, {'type': 'text', 'text': "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n  'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n  'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n  'food_items': [], # list[str] - list of visible edible food item nouns\n  'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"}]}]

@spaces.GPU
def extract_foods_from_image(input_image):
    input_image = input_image.resize(size=(512, 512))
    input_message = create_message(input_image=input_image)
    original_pipeline_output = original_pipeline(text=[input_message], max_new_tokens=OUTPUT_TOKENS)
    outputs_pretrained = original_pipeline_output[0][0]["generated_text"][-1]["content"]
    ft_pipe_output = ft_pipe(text=[input_message], max_new_tokens=OUTPUT_TOKENS)
    outputs_fine_tuned = ft_pipe_output[0][0]["generated_text"][-1]["content"]
    return outputs_pretrained, outputs_fine_tuned

demo_title = "🍕🔍 FoodExtract-Vision: Fine-tuned SmolVLM2-500M"
demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct\n* **Fine-tuning dataset:** https://huggingface.co/datasets/berkeruveyik/vlm-food-4k-not-food-dataset\n* **Fine-tuned model:** https://huggingface.co/berkeruveyik/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v3\n\n## 📋 Overview\n\nThis demo showcases the power of fine-tuning for structured output generation. Compare a base vision-language model against its fine-tuned version specialized in extracting food and drink items from images in JSON format.\n\nThe **base model** often fails to follow the required output structure, producing inconsistent or unstructured responses. The **fine-tuned model** reliably generates valid JSON outputs matching the specified schema.\n\n## 🎯 Task Description\n\nBoth models receive identical input prompts requesting food/drink classification and extraction:\n\n````\nClassify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n  'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n  'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n  'food_items': [], # list[str] - list of visible edible food item nouns\n  'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n````\n\n## 🔧 Training Details\n\nThe fine-tuned model was trained on **3,698 images** from the vlm-food-4k-not-food-dataset:\n- **Food images:** Multiple categories from the Food270 dataset including various cuisines, ingredients, and prepared dishes\n- **Non-food images:** Random internet images to teach the model to correctly identify non-food content\n- Each image is labeled with structured JSON outputs including classification, titles, and extracted food/drink items"""

demo = gr.Interface(fn=extract_foods_from_image, inputs=gr.Image(type="pil"), title=demo_title, description=demo_description, outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"), gr.Textbox(lines=4, label="Fine-tuned Model")], examples=[["./examples/36741.jpg"], ["./examples/IMG_3808.JPG"], ["./examples/istockphoto-175500494-612x612.jpg"]])

if __name__ == "__main__":
    demo.launch(share=True)