ninjals committed on
Commit
b7c19ad
·
verified ·
1 Parent(s): a8f359b

Uploading FoodExtract-Vision demo app.py from YouTube tutorial video

Browse files
Files changed (2) hide show
  1. app.py +44 -32
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,53 +1,65 @@
1
  import torch
2
  import gradio as gr
3
-
4
  import spaces
5
  from transformers import pipeline
6
 
7
  BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
8
- # FINE_TUNED_MODEL_ID = "mrdbourke/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1"
9
  FINE_TUNED_MODEL_ID = "ninjals/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1-VIDEO"
10
  OUTPUT_TOKENS = 256
11
-
12
- # Load original base model (no fine-tuning)
13
- print(f"[INFO] Loading Original Model")
14
- original_pipeline = pipeline(
15
- "image-text-to-text",
16
- model=BASE_MODEL_ID,
17
- dtype=torch.float16,
18
- device_map="auto"
19
- )
20
-
21
- # Load fine-tuned model
22
- print(f"[INFO] Loading Fine-tuned Model")
23
- ft_pipe = pipeline(
24
- "image-text-to-text",
25
- model=FINE_TUNED_MODEL_ID,
26
- dtype=torch.bfloat16,
27
- device_map="auto"
28
- )
 
 
 
 
 
 
 
29
 
30
  def create_message(input_image):
31
- return [{'role': 'user',
32
- 'content': [{'type': 'image',
33
- 'image': input_image},
34
- {'type': 'text',
35
- 'text': "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. If no food/drink items are visible, return empty lists.\n\nOnly return valid JSON in the following form:\n\n```json\n{\n 'is_food': 0, # int - 0 or 1 based on whether food/drinks are present (0 = no foods visible, 1 = foods visible)\n 'image_title': '', # str - short food-related title for what foods/drinks are visible in the image, leave blank if no foods present\n 'food_items': [], # list[str] - list of visible edible food item nouns\n 'drink_items': [] # list[str] - list of visible edible drink item nouns\n}\n```\n"}]}]
 
 
36
 
37
  @spaces.GPU
38
  def extract_foods_from_image(input_image):
 
 
 
39
  input_image = input_image.resize(size=(512, 512))
40
  input_message = create_message(input_image=input_image)
41
 
42
- # Get outputs from base model (not fine-tuned)
43
- original_pipeline_output = original_pipeline(text=[input_message],
44
- max_new_tokens=OUTPUT_TOKENS)
45
-
 
46
  outputs_pretrained = original_pipeline_output[0][0]["generated_text"][-1]["content"]
47
 
48
- # Get outputs from fine-tuned model (fine-tuned on food images)
49
- ft_pipe_output = ft_pipe(text=[input_message],
50
- max_new_tokens=OUTPUT_TOKENS)
 
51
  outputs_fine_tuned = ft_pipe_output[0][0]["generated_text"][-1]["content"]
52
 
53
  return outputs_pretrained, outputs_fine_tuned
 
1
  import torch
2
  import gradio as gr
 
3
  import spaces
4
  from transformers import pipeline
5
 
6
  BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
 
7
  FINE_TUNED_MODEL_ID = "ninjals/FoodExtract-Vision-SmolVLM2-500M-fine-tune-v1-VIDEO"
8
  OUTPUT_TOKENS = 256
9
+ # Use bfloat16 for both to avoid CUDA kernel mismatches
10
+ DTYPE = torch.bfloat16
11
+
12
+ # Initialize global variables for pipelines
13
+ original_pipeline = None
14
+ ft_pipe = None
15
+
def load_models():
    """Lazily instantiate the base and fine-tuned pipelines exactly once.

    Mutates the module globals ``original_pipeline`` and ``ft_pipe``;
    repeated calls are no-ops once both are set. Intended to be called
    from inside the ZeroGPU-decorated handler so the models are created
    where the GPU is actually available.
    """
    global original_pipeline, ft_pipe
    if original_pipeline is None:
        # Plain strings: the originals were f-strings with no placeholders.
        print("[INFO] Loading Original Model")
        original_pipeline = pipeline(
            "image-text-to-text",
            model=BASE_MODEL_ID,
            torch_dtype=DTYPE,
            device_map="auto",
        )
    if ft_pipe is None:
        print("[INFO] Loading Fine-tuned Model")
        ft_pipe = pipeline(
            "image-text-to-text",
            model=FINE_TUNED_MODEL_ID,
            torch_dtype=DTYPE,
            device_map="auto",
        )
34
 
def create_message(input_image):
    """Build the single-turn chat message the VLM pipelines expect.

    Pairs *input_image* with the fixed food-extraction prompt and wraps
    both in the image-text-to-text conversation format.
    """
    prompt = "Classify the given input image into food or not and if edible food or drink items are present, extract those to a list. Only return valid JSON."
    image_part = {'type': 'image', 'image': input_image}
    text_part = {'type': 'text', 'text': prompt}
    return [{'role': 'user', 'content': [image_part, text_part]}]
 
@spaces.GPU
def extract_foods_from_image(input_image):
    """Run one image through both pipelines and return their raw outputs.

    Returns a ``(base_output, fine_tuned_output)`` pair of generated text
    so the two models can be compared side by side in the UI.
    """
    # Models are created on first call, inside the ZeroGPU environment.
    load_models()

    resized = input_image.resize(size=(512, 512))
    message = create_message(input_image=resized)

    def _generate(pipe):
        # Pass only max_new_tokens (no max_length) to avoid the
        # conflicting-generation-arguments warning.
        result = pipe(text=[message], max_new_tokens=OUTPUT_TOKENS)
        return result[0][0]["generated_text"][-1]["content"]

    # Tuple evaluation is left-to-right, so the base model runs first,
    # matching the original call order.
    return _generate(original_pipeline), _generate(ft_pipe)
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  num2words
2
- transformers
3
  torch
4
  accelerate
5
  gradio
6
  torchvision
 
 
1
  num2words
2
+ transformers>=4.45.0
3
  torch
4
  accelerate
5
  gradio
6
  torchvision
7
+ qwen-vl-utils