Spaces:

CreatorJarvis
/

FoodExtract-Vision

Running on Zero

App Files Files Community

CreatorJarvis committed 21 days ago

Commit

d898359

verified ·

1 Parent(s): dbd3ab1

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -71

app.py CHANGED Viewed

@@ -1,73 +1,19 @@
 import torch
 import gradio as gr
 import spaces
-from transformers import pipeline
 BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
 FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
 OUTPUT_TOKENS = 256
-# Load original base model (no fine-tuning)
-print(f"[INFO] Loading Original Model")
-original_pipeline = pipeline(
-    "image-text-to-text",
-    model=BASE_MODEL_ID,
-    dtype=torch.bfloat16,
-    device_map="auto"
-)
-# Load fine-tuned model
-print(f"[INFO] Loading Fine-tuned Model")
-ft_pipe = pipeline(
-    "image-text-to-text",
-    model=FINE_TUNED_MODEL_ID,
-    dtype=torch.bfloat16,
-    device_map="auto"
-)
-def create_message(input_image):
-    return [{'role': 'user',
- 'content': [{'type': 'image',
-   'image': input_image},
-  {'type': 'text',
-   'text': "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n  'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n  'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n  'food_items': [], # list[str] - list of visible edible food item nouns\n  'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"}]}]
-@spaces.GPU
-def extract_foods_from_image(input_image):
-    input_image = input_image.resize((512, 512))
-    input_message = create_message(input_image=input_image)
-    # Get outputs from base model (not fine-tuned)
-    original_pipeline_output = original_pipeline(text=[input_message],
-                                                 max_new_tokens=OUTPUT_TOKENS)
-    outputs_pretrained = original_pipeline_output[0][0]["generated_text"][-1]["content"]
-    # Get outputs from fine-tuned model (fine-tuned on food images)
-    ft_pipe_output = ft_pipe(text=[input_message],
-                             max_new_tokens=OUTPUT_TOKENS)
-    outputs_fine_tuned = ft_pipe_output[0][0]["generated_text"][-1]["content"]
-    return outputs_pretrained, outputs_fine_tuned
-demo_title = "🥑➡️📝 FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
-demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
-* **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
-* **Fine-tuned model:** https://huggingface.co/mrdbourke/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
-## Overview
-Extract food and drink items in a structured way from images.
-The original model outputs fail to capture the desired structure. But the fine-tuned model sticks to the output structure quite well.
-However, the fine-tuned model could definitely be improved with respects to its ability to extract the right food/drink items.
-Both models use the input prompt:
-````
-Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
 Only return valid JSON in the following form:
@@ -79,24 +25,102 @@ Only return valid JSON in the following form:
   'drink_items': [] # list[str] - list of visible edible drink item nouns
 }
 ```
-````
-Except one model has been fine-tuned on the structured data whereas the other hasn't.
-Notable next steps would be:
-* **Remove the input prompt:** Just train the model to go straight from image -> text (no text prompt on input), this would save on inference tokens.
-* **Fine-tune on more real-world data:** Right now the model is only trained on 1k food images (from Food101) and 500 not food (random internet images), training on real world data would likely significantly improve performance.
-* **Fix the repetitive generation:** The model can sometimes get stuck in a repetitive generation pattern, e.g. "onions", "onions", "onions", etc. We could look into patterns to help reduce this.
 """
 demo = gr.Interface(
     fn=extract_foods_from_image,
-    inputs=gr.Image(type="pil"),
     title=demo_title,
     description=demo_description,
-    outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
-             gr.Textbox(lines=4, label="Fine-tuned Model")],
 )
 if __name__ == "__main__":
-    demo.launch(share=False)

 import torch
 import gradio as gr
 import spaces
+from transformers import AutoProcessor, AutoModelForImageTextToText
 BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
 FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
 OUTPUT_TOKENS = 256
+SYSTEM_MESSAGE = """You are an expert food and drink image extractor.
+You provide structured data to visual inputs classifying them as edible food/drink or not.
+As well as titling the image with a simple food/drink related caption.
+Finally you extract any and all visible food/drink items to lists.
+"""
+USER_PROMPT = """Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
 Only return valid JSON in the following form:
   'drink_items': [] # list[str] - list of visible edible drink item nouns
 }
 ```
+"""
+processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)
+print(f"[INFO] Loading Base Model: {BASE_MODEL_ID}")
+base_model = AutoModelForImageTextToText.from_pretrained(
+    BASE_MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    _attn_implementation="eager"
+).eval()
+print(f"[INFO] Loading Fine-tuned Model: {FINE_TUNED_MODEL_ID}")
+ft_model = AutoModelForImageTextToText.from_pretrained(
+    FINE_TUNED_MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    _attn_implementation="eager"
+).eval()
+@spaces.GPU
+def extract_foods_from_image(input_image):
+    if input_image is None:
+        return "Please upload an image", "Please upload an image"
+    input_image = input_image.resize((512, 512))
+    messages = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": SYSTEM_MESSAGE}]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": USER_PROMPT}
+            ]
+        }
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        images=[input_image]
+    )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    base_model.to(device)
+    ft_model.to(device)
+    with torch.no_grad():
+        base_output = base_model.generate(
+            **inputs,
+            max_new_tokens=OUTPUT_TOKENS,
+            do_sample=False
+        )
+        ft_output = ft_model.generate(
+            **inputs,
+            max_new_tokens=OUTPUT_TOKENS,
+            do_sample=False
+        )
+    base_result = processor.decode(base_output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    ft_result = processor.decode(ft_output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    return base_result, ft_result
+demo_title = "🥑➡️📝 FoodExtract-Vision: Base vs Fine-tuned SmolVLM2-500M"
+demo_description = """
+## Model Comparison
+Compare the **base model** vs **fine-tuned model** for food extraction from images.
+| Model | Link |
+|-------|------|
+| Base Model | [HuggingFaceTB/SmolVLM2-500M-Video-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) |
+| Fine-tuned Model | [CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune](https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune) |
+Upload an image to see how the fine-tuned model better follows the structured JSON output format.
 """
 demo = gr.Interface(
     fn=extract_foods_from_image,
+    inputs=gr.Image(type="pil", label="Upload Image"),
     title=demo_title,
     description=demo_description,
+    outputs=[
+        gr.Textbox(lines=8, label="🔵 Base Model (Original)"),
+        gr.Textbox(lines=8, label="🟢 Fine-tuned Model")
+    ],
+    cache_examples=False
 )
 if __name__ == "__main__":
+    demo.launch()