CreatorJarvis committed on
Commit
c220cef
·
verified ·
1 Parent(s): c4bebab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -178
app.py CHANGED
@@ -1,181 +1,4 @@
1
- # import os
2
- # import torch
3
- # import gradio as gr
4
- # import spaces
5
- # from transformers import pipeline
6
-
7
- # BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
8
- # FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
9
- # OUTPUT_TOKENS = 256
10
- # original_pipeline = None
11
- # ft_pipe = None
12
-
13
- # FORCE_CPU = os.getenv("FORCE_CPU", "0") == "1"
14
- # DEVICE_TYPE = "cuda" if (torch.cuda.is_available() and not FORCE_CPU) else "cpu"
15
- # if DEVICE_TYPE == "cuda":
16
- # torch.backends.cuda.matmul.allow_tf32 = True
17
- # torch.backends.cudnn.allow_tf32 = True
18
-
19
- # def _get_dtype(device: str):
20
- # if device == "cuda":
21
- # if os.getenv("USE_FP16", "0") == "1":
22
- # return torch.float16
23
- # if os.getenv("USE_BF16", "0") == "1":
24
- # is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
25
- # if callable(is_bf16_supported) and is_bf16_supported():
26
- # return torch.bfloat16
27
- # return torch.float32
28
- # return torch.float32
29
-
30
- # def _make_pipe(model_id: str, device_type: str):
31
- # dtype = _get_dtype(device_type)
32
- # device_arg = 0 if device_type == "cuda" else -1
33
- # pipe = pipeline(
34
- # "image-text-to-text",
35
- # model=model_id,
36
- # device=device_arg,
37
- # torch_dtype=dtype,
38
- # )
39
- # model = getattr(pipe, "model", None)
40
- # generation_config = getattr(model, "generation_config", None)
41
- # if generation_config is not None:
42
- # generation_config.do_sample = False
43
- # generation_config.max_new_tokens = OUTPUT_TOKENS
44
- # try:
45
- # generation_config.max_length = None
46
- # except Exception:
47
- # pass
48
- # return pipe
49
-
50
- # ACTIVE_DEVICE_TYPE = DEVICE_TYPE
51
-
52
- # def _load_pipes(device_type: str):
53
- # global original_pipeline, ft_pipe, ACTIVE_DEVICE_TYPE
54
- # ACTIVE_DEVICE_TYPE = device_type
55
- # print(f"[INFO] Using device_type={ACTIVE_DEVICE_TYPE}")
56
- # original_pipeline = _make_pipe(BASE_MODEL_ID, ACTIVE_DEVICE_TYPE)
57
- # ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID, ACTIVE_DEVICE_TYPE)
58
-
59
- # _load_pipes(DEVICE_TYPE)
60
-
61
- # def _extract_generated_text(pipe_output) -> str:
62
- # try:
63
- # item0 = pipe_output[0]
64
- # if isinstance(item0, dict) and "generated_text" in item0:
65
- # gt = item0["generated_text"]
66
- # else:
67
- # gt = pipe_output[0][0]["generated_text"]
68
-
69
- # if isinstance(gt, str):
70
- # return gt
71
- # if isinstance(gt, list) and gt:
72
- # last = gt[-1]
73
- # if isinstance(last, dict) and "content" in last:
74
- # return last["content"]
75
- # return str(gt)
76
- # except Exception:
77
- # return str(pipe_output)
78
-
79
- # def create_message(input_image):
80
- # return [{'role': 'user',
81
- # 'content': [{'type': 'image',
82
- # 'image': input_image},
83
- # {'type': 'text',
84
- # 'text': "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"}]}]
85
-
86
- # @spaces.GPU
87
- # def extract_foods_from_image(input_image):
88
- # if input_image is None:
89
- # return "Please upload an image", "Please upload an image"
90
-
91
- # input_image = input_image.convert("RGB")
92
- # input_image = input_image.resize(size=(512, 512))
93
- # input_message = create_message(input_image=input_image)
94
-
95
- # try:
96
- # original_pipeline_output = original_pipeline(text=[input_message])
97
- # outputs_pretrained = _extract_generated_text(original_pipeline_output)
98
-
99
- # ft_pipe_output = ft_pipe(text=[input_message])
100
- # outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
101
- # except RuntimeError as e:
102
- # msg = str(e)
103
- # is_cuda_linear_failure = (
104
- # "CUBLAS_STATUS_INVALID_VALUE" in msg
105
- # or "cublasGemmEx" in msg
106
- # or ("CUDA error" in msg and "CUBLAS" in msg)
107
- # )
108
- # if ACTIVE_DEVICE_TYPE == "cuda" and is_cuda_linear_failure:
109
- # try:
110
- # print("[WARN] CUDA GEMM failed, falling back to CPU.")
111
- # _load_pipes("cpu")
112
- # if torch.cuda.is_available():
113
- # torch.cuda.empty_cache()
114
- # original_pipeline_output = original_pipeline(text=[input_message])
115
- # outputs_pretrained = _extract_generated_text(original_pipeline_output)
116
- # ft_pipe_output = ft_pipe(text=[input_message])
117
- # outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
118
- # except Exception:
119
- # raise e
120
- # else:
121
- # raise
122
-
123
- # return outputs_pretrained, outputs_fine_tuned
124
-
125
- # demo_title = "🥑➡️📝 FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
126
- # demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
127
- # * **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
128
- # * **Fine-tuned model:** https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
129
-
130
- # ## Overview
131
-
132
- # Extract food and drink items in a structured way from images.
133
-
134
- # The original model outputs fail to capture the desired structure. But the fine-tuned model sticks to the output structure quite well.
135
-
136
- # However, the fine-tuned model could definitely be improved with respect to its ability to extract the right food/drink items.
137
-
138
- # Both models use the input prompt:
139
-
140
- # ````
141
- # Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.
142
-
143
- # Only return valid JSON in the following form:
144
-
145
- # ```json
146
- # {
147
- # 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)
148
- # 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present
149
- # 'food_items': [], # list[str] - list of visible edible food item nouns
150
- # 'drink_items': [] # list[str] - list of visible edible drink item nouns
151
- # }
152
- # ```
153
- # ````
154
-
155
- # Except one model has been fine-tuned on the structured data whereas the other hasn't.
156
-
157
- # Notable next steps would be:
158
- # * **Remove the input prompt:** Just train the model to go straight from image -> text (no text prompt on input), this would save on inference tokens.
159
- # * **Fine-tune on more real-world data:** Right now the model is only trained on 1k food images (from Food101) and 500 not food (random internet images), training on real world data would likely significantly improve performance.
160
- # * **Fix the repetitive generation:** The model can sometimes get stuck in a repetitive generation pattern, e.g. "onions", "onions", "onions", etc. We could look into patterns to help reduce this.
161
- # """
162
-
163
- # demo = gr.Interface(
164
- # fn=extract_foods_from_image,
165
- # inputs=gr.Image(type="pil"),
166
- # title=demo_title,
167
- # description=demo_description,
168
- # outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
169
- # gr.Textbox(lines=4, label="Fine-tuned Model")],
170
- # examples=[["examples/food1.jpeg"],
171
- # ["examples/food2.jpg"],
172
- # ["examples/food3.jpg"],
173
- # ["examples/food4.jpeg"]],
174
-
175
- # )
176
 
177
- # if __name__ == "__main__":
178
- # demo.launch(share=False)
179
 
180
  import os
181
  import torch
@@ -558,7 +381,7 @@ with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft(), title="FoodExtract Vision
558
  <div class="footer-section">
559
  <p style="margin: 0;">Built with ❤️ by <strong>Jarvis Zhang</strong> |
560
  <a href="https://huggingface.co/CreatorJarvis" target="_blank" style="color: #4f46e5;">🤗 Hugging Face</a> |
561
- <a href="https://github.com/CreatorJarvis" target="_blank" style="color: #4f46e5;">💻 GitHub</a>
562
  </p>
563
  <p style="margin: 0.5rem 0 0 0; font-size: 0.8rem; color: #9ca3af;">Fine-tuning Demo • Vision Language Model • Structured Output Generation</p>
564
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
 
 
2
 
3
  import os
4
  import torch
 
381
  <div class="footer-section">
382
  <p style="margin: 0;">Built with ❤️ by <strong>Jarvis Zhang</strong> |
383
  <a href="https://huggingface.co/CreatorJarvis" target="_blank" style="color: #4f46e5;">🤗 Hugging Face</a> |
384
+ <a href="https://github.com/JarvisZhang24" target="_blank" style="color: #4f46e5;">💻 GitHub</a>
385
  </p>
386
  <p style="margin: 0.5rem 0 0 0; font-size: 0.8rem; color: #9ca3af;">Fine-tuning Demo • Vision Language Model • Structured Output Generation</p>
387
  </div>