CreatorJarvis commited on
Commit
dbd3ab1
·
0 Parent(s):

Reset history to remove binaries

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +26 -0
  3. app.py +102 -0
  4. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FoodExtract-Vision Fine-tuned VLM Structured Data Extractor
3
+ emoji: 🍟➡️📝
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ license: apache-2.0
10
+ ---
11
+
12
+ Fine-tuned SmolVLM2-500M to extract food and drink items from images.
13
+
14
+ Input can be any kind of image and output will be a formatted string such as the following:
15
+
16
+ ```json
17
+ {'is_food': 0, 'image_title': '', 'food_items': [], 'drink_items': []}
18
+ ```
19
+
20
+ Or for an image of food:
21
+
22
+ ```json
23
+ {'is_food': 1, 'image_title': 'fried calamari', 'food_items': ['fried calamari'], 'drink_items': []}
24
+ ```
25
+
26
+ Note: This README.md was authored in a live tutorial recorded for YouTube (link coming soon).
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import gradio as gr

import spaces
from transformers import pipeline

# Model identifiers: the pretrained base VLM and its food-extraction fine-tune.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
# Maximum number of tokens each pipeline may generate per request.
OUTPUT_TOKENS = 256

# Load original base model (no fine-tuning) so its raw output can be shown
# side-by-side with the fine-tuned model's structured output in the demo.
print("[INFO] Loading Original Model")  # plain string: no placeholders, so no f-string
original_pipeline = pipeline(
    "image-text-to-text",
    model=BASE_MODEL_ID,
    dtype=torch.bfloat16,
    device_map="auto",  # let accelerate place weights on the available device
)

# Load fine-tuned model (trained to emit the structured JSON food/drink schema).
print("[INFO] Loading Fine-tuned Model")
ft_pipe = pipeline(
    "image-text-to-text",
    model=FINE_TUNED_MODEL_ID,
    dtype=torch.bfloat16,
    device_map="auto",
)
def create_message(input_image):
    """Build the single-turn chat payload sent to both pipelines.

    Args:
        input_image: The (PIL) image the models should classify/extract from.

    Returns:
        A one-element conversation list: a user message whose content pairs
        the image with the fixed JSON-extraction instruction prompt.
    """
    prompt = "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"
    image_part = {'type': 'image', 'image': input_image}
    text_part = {'type': 'text', 'text': prompt}
    return [{'role': 'user', 'content': [image_part, text_part]}]
def _extract_reply(pipeline_output):
    # Helper: pull the assistant's final message text out of the nested
    # pipeline result ([batch][candidate]["generated_text"][last turn]).
    return pipeline_output[0][0]["generated_text"][-1]["content"]

@spaces.GPU
def extract_foods_from_image(input_image):
    """Run the base and fine-tuned VLM on one image and return both replies.

    Args:
        input_image: A PIL image (resized to 512x512 before inference).

    Returns:
        Tuple of (base-model reply text, fine-tuned-model reply text) —
        shown in the two Gradio output textboxes.
    """
    input_image = input_image.resize((512, 512))
    input_message = create_message(input_image=input_image)

    # Same prompt/message through both pipelines so outputs are comparable.
    outputs_pretrained = _extract_reply(
        original_pipeline(text=[input_message], max_new_tokens=OUTPUT_TOKENS)
    )
    outputs_fine_tuned = _extract_reply(
        ft_pipe(text=[input_message], max_new_tokens=OUTPUT_TOKENS)
    )

    return outputs_pretrained, outputs_fine_tuned
# Gradio UI copy. The model links below now mirror BASE_MODEL_ID and
# FINE_TUNED_MODEL_ID defined at the top of the file (the previous text
# pointed at SmolVLM-500M-Instruct and a different fine-tune repo).
demo_title = "🥑➡️📝 FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
* **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
* **Fine-tuned model:** https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune

## Overview

Extract food and drink items in a structured way from images.

The original model outputs fail to capture the desired structure. But the fine-tuned model sticks to the output structure quite well.

However, the fine-tuned model could definitely be improved with respect to its ability to extract the right food/drink items.

Both models use the input prompt:

````
Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.

Only return valid JSON in the following form:

```json
{
 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)
 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present
 'food_items': [], # list[str] - list of visible edible food item nouns
 'drink_items': [] # list[str] - list of visible edible drink item nouns
}
```
````

Except one model has been fine-tuned on the structured data whereas the other hasn't.

Notable next steps would be:
* **Remove the input prompt:** Just train the model to go straight from image -> text (no text prompt on input), this would save on inference tokens.
* **Fine-tune on more real-world data:** Right now the model is only trained on 1k food images (from Food101) and 500 not food (random internet images), training on real world data would likely significantly improve performance.
* **Fix the repetitive generation:** The model can sometimes get stuck in a repetitive generation pattern, e.g. "onions", "onions", "onions", etc. We could look into patterns to help reduce this.
"""

# Two-textbox comparison UI: base model output vs fine-tuned model output.
demo = gr.Interface(
    fn=extract_foods_from_image,
    inputs=gr.Image(type="pil"),
    title=demo_title,
    description=demo_description,
    outputs=[
        gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
        gr.Textbox(lines=4, label="Fine-tuned Model"),
    ],
)

if __name__ == "__main__":
    demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ num2words
2
+ transformers
3
+ torch
4
+ accelerate
5
+ gradio>=5.0.0
6
+ torchvision
7
+ pytz
8
+ spaces