CreatorJarvis committed on
Commit
d898359
Β·
verified Β·
1 Parent(s): dbd3ab1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -71
app.py CHANGED
@@ -1,73 +1,19 @@
1
  import torch
2
  import gradio as gr
3
-
4
  import spaces
5
- from transformers import pipeline
6
 
7
  BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
8
  FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
9
  OUTPUT_TOKENS = 256
10
 
11
- # Load original base model (no fine-tuning)
12
- print(f"[INFO] Loading Original Model")
13
- original_pipeline = pipeline(
14
- "image-text-to-text",
15
- model=BASE_MODEL_ID,
16
- dtype=torch.bfloat16,
17
- device_map="auto"
18
- )
19
-
20
- # Load fine-tuned model
21
- print(f"[INFO] Loading Fine-tuned Model")
22
- ft_pipe = pipeline(
23
- "image-text-to-text",
24
- model=FINE_TUNED_MODEL_ID,
25
- dtype=torch.bfloat16,
26
- device_map="auto"
27
- )
28
-
29
def create_message(input_image):
    """Build a single-turn chat message pairing *input_image* with the extraction prompt.

    Returns a one-element conversation list in the chat format expected by the
    image-text-to-text pipelines: a user turn whose content holds the image
    followed by the instruction text.
    """
    extraction_prompt = "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"
    image_part = {'type': 'image', 'image': input_image}
    text_part = {'type': 'text', 'text': extraction_prompt}
    return [{'role': 'user', 'content': [image_part, text_part]}]
35
-
36
@spaces.GPU
def extract_foods_from_image(input_image):
    """Run both pipelines on *input_image* and return their raw text outputs.

    The image is resized to 512x512, wrapped in a chat message via
    create_message, and fed to the base and fine-tuned pipelines in turn.

    Returns:
        Tuple of (base model text, fine-tuned model text).
    """
    resized = input_image.resize((512, 512))
    message = create_message(input_image=resized)

    # Base (not fine-tuned) model: take the assistant turn appended by the pipeline.
    base_raw = original_pipeline(text=[message], max_new_tokens=OUTPUT_TOKENS)
    outputs_pretrained = base_raw[0][0]["generated_text"][-1]["content"]

    # Fine-tuned model, same extraction of the final (assistant) message.
    ft_raw = ft_pipe(text=[message], max_new_tokens=OUTPUT_TOKENS)
    outputs_fine_tuned = ft_raw[0][0]["generated_text"][-1]["content"]

    return outputs_pretrained, outputs_fine_tuned
53
-
54
- demo_title = "πŸ₯‘βž‘οΈπŸ“ FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
55
- demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
56
- * **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
57
- * **Fine-tuned model:** https://huggingface.co/mrdbourke/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
58
-
59
- ## Overview
60
-
61
- Extract food and drink items in a structured way from images.
62
-
63
- The original model outputs fail to capture the desired structure. But the fine-tuned model sticks to the output structure quite well.
64
-
65
- However, the fine-tuned model could definitely be improved with respects to its ability to extract the right food/drink items.
66
-
67
- Both models use the input prompt:
68
 
69
- ````
70
- Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
71
 
72
  Only return valid JSON in the following form:
73
 
@@ -79,24 +25,102 @@ Only return valid JSON in the following form:
79
  'drink_items': [] # list[str] - list of visible edible drink item nouns
80
  }
81
  ```
82
- ````
 
 
83
 
84
- Except one model has been fine-tuned on the structured data whereas the other hasn't.
 
 
 
 
 
85
 
86
- Notable next steps would be:
87
- * **Remove the input prompt:** Just train the model to go straight from image -> text (no text prompt on input), this would save on inference tokens.
88
- * **Fine-tune on more real-world data:** Right now the model is only trained on 1k food images (from Food101) and 500 not food (random internet images), training on real world data would likely significantly improve performance.
89
- * **Fix the repetitive generation:** The model can sometimes get stuck in a repetitive generation pattern, e.g. "onions", "onions", "onions", etc. We could look into patterns to help reduce this.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  """
91
 
92
  demo = gr.Interface(
93
  fn=extract_foods_from_image,
94
- inputs=gr.Image(type="pil"),
95
  title=demo_title,
96
  description=demo_description,
97
- outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
98
- gr.Textbox(lines=4, label="Fine-tuned Model")],
 
 
 
 
99
  )
100
 
101
  if __name__ == "__main__":
102
- demo.launch(share=False)
 
1
  import torch
2
  import gradio as gr
 
3
  import spaces
4
+ from transformers import AutoProcessor, AutoModelForImageTextToText
5
 
6
  BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
7
  FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
8
  OUTPUT_TOKENS = 256
9
 
10
+ SYSTEM_MESSAGE = """You are an expert food and drink image extractor.
11
+ You provide structured data to visual inputs classifying them as edible food/drink or not.
12
+ As well as titling the image with a simple food/drink related caption.
13
+ Finally you extract any and all visible food/drink items to lists.
14
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ USER_PROMPT = """Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
 
17
 
18
  Only return valid JSON in the following form:
19
 
 
25
  'drink_items': [] # list[str] - list of visible edible drink item nouns
26
  }
27
  ```
28
+ """
29
+
30
# One processor serves both checkpoints: the fine-tune shares the base architecture.
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

def _load_vlm(model_id):
    """Load an image-text-to-text checkpoint in bfloat16, eager attention, eval mode."""
    return AutoModelForImageTextToText.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        _attn_implementation="eager",
    ).eval()

print(f"[INFO] Loading Base Model: {BASE_MODEL_ID}")
base_model = _load_vlm(BASE_MODEL_ID)

print(f"[INFO] Loading Fine-tuned Model: {FINE_TUNED_MODEL_ID}")
ft_model = _load_vlm(FINE_TUNED_MODEL_ID)
45
+
46
@spaces.GPU
def extract_foods_from_image(input_image):
    """Generate structured food/drink extractions from *input_image* with both models.

    Args:
        input_image: PIL image from the Gradio input, or None if nothing uploaded.

    Returns:
        Tuple of (base model output text, fine-tuned model output text).
    """
    if input_image is None:
        return "Please upload an image", "Please upload an image"

    input_image = input_image.resize((512, 512))

    # Embed the image directly in the chat message: apply_chat_template collects
    # images from the conversation content itself, so no separate `images=` kwarg
    # is needed (recent transformers versions reject that kwarg).
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_MESSAGE}]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": input_image},
                {"type": "text", "text": USER_PROMPT}
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Cast floating-point tensors (pixel_values) to the models' bfloat16 dtype;
    # leaving them in float32 causes a dtype mismatch at generation time.
    inputs = {
        k: v.to(device, dtype=torch.bfloat16) if torch.is_floating_point(v) else v.to(device)
        for k, v in inputs.items()
    }
    base_model.to(device)
    ft_model.to(device)

    with torch.no_grad():
        base_output = base_model.generate(
            **inputs,
            max_new_tokens=OUTPUT_TOKENS,
            do_sample=False
        )
        ft_output = ft_model.generate(
            **inputs,
            max_new_tokens=OUTPUT_TOKENS,
            do_sample=False
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    prompt_len = inputs["input_ids"].shape[1]
    base_result = processor.decode(base_output[0][prompt_len:], skip_special_tokens=True)
    ft_result = processor.decode(ft_output[0][prompt_len:], skip_special_tokens=True)

    return base_result, ft_result
97
+
98
# --- Gradio UI wiring ---
demo_title = "πŸ₯‘βž‘οΈπŸ“ FoodExtract-Vision: Base vs Fine-tuned SmolVLM2-500M"

demo_description = """
## Model Comparison

Compare the **base model** vs **fine-tuned model** for food extraction from images.

| Model | Link |
|-------|------|
| Base Model | [HuggingFaceTB/SmolVLM2-500M-Video-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) |
| Fine-tuned Model | [CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune](https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune) |

Upload an image to see how the fine-tuned model better follows the structured JSON output format.
"""

# Single-function interface: one image in, side-by-side model outputs.
demo = gr.Interface(
    fn=extract_foods_from_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[
        gr.Textbox(lines=8, label="πŸ”΅ Base Model (Original)"),
        gr.Textbox(lines=8, label="🟒 Fine-tuned Model"),
    ],
    title=demo_title,
    description=demo_description,
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()