Spaces:
Running
on
Zero
| import torch | |
| import gradio as gr | |
| import spaces | |
| from transformers import pipeline | |
# Hub IDs: the untouched base VLM and its food-extraction fine-tune.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
FINE_TUNED_MODEL_ID = "mrdbourke/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1"

# Maximum number of tokens each model may generate per request.
OUTPUT_TOKENS = 256

# Load the original base model (no fine-tuning) so the demo can show a
# side-by-side comparison against the fine-tuned variant.
# fix: the print calls were f-strings with no placeholders (ruff F541).
print("[INFO] Loading Original Model")
original_pipeline = pipeline(
    "image-text-to-text",
    model=BASE_MODEL_ID,
    dtype=torch.bfloat16,
    device_map="auto",
)

# Load the fine-tuned model (trained to emit the structured JSON output).
print("[INFO] Loading Fine-tuned Model")
ft_pipe = pipeline(
    "image-text-to-text",
    model=FINE_TUNED_MODEL_ID,
    dtype=torch.bfloat16,
    device_map="auto",
)
def create_message(input_image):
    """Build the single-turn chat message pairing *input_image* with the extraction prompt.

    Returns a one-element list in the chat-template format expected by the
    image-text-to-text pipelines: a user turn whose content holds the image
    followed by the JSON-extraction instruction text.
    """
    prompt = "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"
    user_content = [
        {"type": "image", "image": input_image},
        {"type": "text", "text": prompt},
    ]
    return [{"role": "user", "content": user_content}]
def extract_foods_from_image(input_image):
    """Run both models on *input_image* and return (base_output, fine_tuned_output).

    The image is resized to 512x512 before being wrapped in the chat prompt,
    then each pipeline generates up to OUTPUT_TOKENS tokens.
    """
    resized = input_image.resize(size=(512, 512))
    message = create_message(input_image=resized)

    def _generate(pipe):
        # The pipeline returns a nested list; the assistant reply is the
        # "content" field of the final generated chat message.
        result = pipe(text=[message], max_new_tokens=OUTPUT_TOKENS)
        return result[0][0]["generated_text"][-1]["content"]

    # Base (not fine-tuned) model first, then the fine-tuned model.
    return _generate(original_pipeline), _generate(ft_pipe)
# Title and markdown description rendered at the top of the Gradio interface.
# NOTE(review): the leading emoji in the title appear mojibake-garbled in this
# copy of the file — confirm the intended characters against the deployed Space.
demo_title = "π₯β‘οΈπ FoodExtract-Vision with a fine-tuned SmolVLM2-500M"

# fix: the "Base model" link previously pointed at SmolVLM-500M-Instruct, but
# the model this app actually loads (BASE_MODEL_ID) is
# SmolVLM2-500M-Video-Instruct — the link now matches the loaded model.
demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
* **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
* **Fine-tuned model:** https://huggingface.co/mrdbourke/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
## Overview
Extract food and drink items in a structured way from images.
The original model outputs fail to capture the desired structure. But the fine-tuned model sticks to the output structure quite well.
However, the fine-tuned model could definitely be improved with respects to its ability to extract the right food/drink items.
Both models use the input prompt:
````
Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
Only return valid JSON in the following form:
```json
{
'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)
'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present
'food_items': [], # list[str] - list of visible edible food item nouns
'drink_items': [] # list[str] - list of visible edible drink item nouns
}
```
````
Except one model has been fine-tuned on the structured data whereas the other hasn't.
Notable next steps would be:
* **Remove the input prompt:** Just train the model to go straight from image -> text (no text prompt on input), this would save on inference tokens.
* **Fine-tune on more real-world data:** Right now the model is only trained on 1k food images (from Food101) and 500 not food (random internet images), training on real world data would likely significantly improve performance.
* **Fix the repetitive generation:** The model can sometimes get stuck in a repetitive generation pattern, e.g. "onions", "onions", "onions", etc. We could look into patterns to help reduce this.
"""
# Wire the extraction function into a side-by-side comparison UI: one image
# input, two text outputs (base model vs fine-tuned model).
demo = gr.Interface(
    fn=extract_foods_from_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
        gr.Textbox(lines=4, label="Fine-tuned Model"),
    ],
    title=demo_title,
    description=demo_description,
    examples=[
        ["examples/camera.jpeg"],
        ["examples/Tandoori-Chicken.jpg"],
        ["examples/fries.jpeg"],
    ],
)

if __name__ == "__main__":
    demo.launch(share=False)