CreatorJarvis commited on
Commit
3c69511
·
verified ·
1 Parent(s): b1c5b87

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -23
app.py CHANGED
@@ -1,6 +1,6 @@
 
1
  import torch
2
  import gradio as gr
3
-
4
  import spaces
5
  from transformers import pipeline
6
 
@@ -8,23 +8,66 @@ BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
8
  FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
9
  OUTPUT_TOKENS = 256
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # Load original base model (no fine-tuning)
12
  print(f"[INFO] Loading Original Model")
13
- original_pipeline = pipeline(
14
- "image-text-to-text",
15
- model=BASE_MODEL_ID,
16
- dtype=torch.bfloat16,
17
- device_map="auto"
18
- )
19
 
20
  # Load fine-tuned model
21
  print(f"[INFO] Loading Fine-tuned Model")
22
- ft_pipe = pipeline(
23
- "image-text-to-text",
24
- model=FINE_TUNED_MODEL_ID,
25
- dtype=torch.bfloat16,
26
- device_map="auto"
27
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def create_message(input_image):
30
  return [{'role': 'user',
@@ -35,26 +78,28 @@ def create_message(input_image):
35
 
36
  @spaces.GPU
37
  def extract_foods_from_image(input_image):
 
 
 
 
38
  input_image = input_image.resize(size=(512, 512))
39
  input_message = create_message(input_image=input_image)
40
 
41
  # Get outputs from base model (not fine-tuned)
42
- original_pipeline_output = original_pipeline(text=[input_message],
43
- max_new_tokens=OUTPUT_TOKENS)
44
 
45
- outputs_pretrained = original_pipeline_output[0][0]["generated_text"][-1]["content"]
46
 
47
  # Get outputs from fine-tuned model (fine-tuned on food images)
48
- ft_pipe_output = ft_pipe(text=[input_message],
49
- max_new_tokens=OUTPUT_TOKENS)
50
- outputs_fine_tuned = ft_pipe_output[0][0]["generated_text"][-1]["content"]
51
 
52
  return outputs_pretrained, outputs_fine_tuned
53
 
54
  demo_title = "🥑➡️📝 FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
55
  demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
56
  * **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
57
- * **Fine-tuned model:** https://huggingface.co/mrdbourke/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
58
 
59
  ## Overview
60
 
@@ -96,9 +141,7 @@ demo = gr.Interface(
96
  description=demo_description,
97
  outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
98
  gr.Textbox(lines=4, label="Fine-tuned Model")],
99
- examples=[["examples/camera.jpeg"],
100
- ["examples/Tandoori-Chicken.jpg"],
101
- ["examples/fries.jpeg"]],
102
  )
103
 
104
  if __name__ == "__main__":
 
1
+ import os
2
  import torch
3
  import gradio as gr
 
4
  import spaces
5
  from transformers import pipeline
6
 
 
8
  FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
9
  OUTPUT_TOKENS = 256
10
 
11
+ DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
12
+ if DEVICE_TYPE == "cuda":
13
+ torch.backends.cuda.matmul.allow_tf32 = True
14
+ torch.backends.cudnn.allow_tf32 = True
15
+
16
+ def _get_dtype(device: str):
17
+ if device == "cuda":
18
+ if os.getenv("USE_BF16", "0") == "1":
19
+ is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
20
+ if callable(is_bf16_supported) and is_bf16_supported():
21
+ return torch.bfloat16
22
+ return torch.float16
23
+ return torch.float32
24
+
25
+ DTYPE = _get_dtype(DEVICE_TYPE)
26
+
27
def _make_pipe(model_id: str):
    """Build an image-text-to-text pipeline for *model_id* on the chosen device.

    Uses the module-level DEVICE_TYPE/DTYPE, then pins deterministic (greedy)
    decoding and the OUTPUT_TOKENS budget on the model's generation config
    when the pipeline exposes one.
    """
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        device=0 if DEVICE_TYPE == "cuda" else -1,  # transformers device index
        dtype=DTYPE,
    )
    gen_cfg = getattr(getattr(pipe, "model", None), "generation_config", None)
    if gen_cfg is not None:
        gen_cfg.do_sample = False
        gen_cfg.max_new_tokens = OUTPUT_TOKENS
        try:
            # Clear max_length so max_new_tokens alone bounds generation;
            # best-effort because some configs reject the assignment.
            gen_cfg.max_length = None
        except Exception:
            pass
    return pipe
45
+
46
  # Load original base model (no fine-tuning)
47
  print(f"[INFO] Loading Original Model")
48
+ original_pipeline = _make_pipe(BASE_MODEL_ID)
 
 
 
 
 
49
 
50
  # Load fine-tuned model
51
  print(f"[INFO] Loading Fine-tuned Model")
52
+ ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID)
53
+
54
+ def _extract_generated_text(pipe_output) -> str:
55
+ try:
56
+ item0 = pipe_output[0]
57
+ if isinstance(item0, dict) and "generated_text" in item0:
58
+ gt = item0["generated_text"]
59
+ else:
60
+ gt = pipe_output[0][0]["generated_text"]
61
+
62
+ if isinstance(gt, str):
63
+ return gt
64
+ if isinstance(gt, list) and gt:
65
+ last = gt[-1]
66
+ if isinstance(last, dict) and "content" in last:
67
+ return last["content"]
68
+ return str(gt)
69
+ except Exception:
70
+ return str(pipe_output)
71
 
72
  def create_message(input_image):
73
  return [{'role': 'user',
 
78
 
79
@spaces.GPU
def extract_foods_from_image(input_image):
    """Run the base and fine-tuned models on one image.

    Returns a pair ``(base_output, fine_tuned_output)`` of extracted text,
    or a placeholder prompt for both when no image was provided.
    """
    if input_image is None:
        return "Please upload an image", "Please upload an image"

    # Normalize to RGB and a fixed 512x512 input before building the chat message.
    prepared = input_image.convert("RGB").resize(size=(512, 512))
    message = create_message(input_image=prepared)

    # Base model (not fine-tuned)
    base_raw = original_pipeline(text=[message])
    outputs_pretrained = _extract_generated_text(base_raw)

    # Fine-tuned model (trained on food images)
    ft_raw = ft_pipe(text=[message])
    outputs_fine_tuned = _extract_generated_text(ft_raw)

    return outputs_pretrained, outputs_fine_tuned
98
 
99
  demo_title = "🥑➡️📝 FoodExtract-Vision with a fine-tuned SmolVLM2-500M"
100
  demo_description = """* **Base model:** https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
101
  * **Fine-tuning dataset:** https://huggingface.co/datasets/mrdbourke/FoodExtract-1k-Vision (1k food images and 500 not food images)
102
+ * **Fine-tuned model:** https://huggingface.co/CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1
103
 
104
  ## Overview
105
 
 
141
  description=demo_description,
142
  outputs=[gr.Textbox(lines=4, label="Original Model (not fine-tuned)"),
143
  gr.Textbox(lines=4, label="Fine-tuned Model")],
144
+
 
 
145
  )
146
 
147
  if __name__ == "__main__":