CreatorJarvis committed on
Commit
c17c199
·
verified ·
1 Parent(s): fdb4ae4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -22
app.py CHANGED
@@ -7,30 +7,34 @@ from transformers import pipeline
7
  BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
8
  FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
9
  OUTPUT_TOKENS = 256
 
 
10
 
11
- DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
 
12
  if DEVICE_TYPE == "cuda":
13
  torch.backends.cuda.matmul.allow_tf32 = True
14
  torch.backends.cudnn.allow_tf32 = True
15
 
16
  def _get_dtype(device: str):
17
  if device == "cuda":
 
 
18
  if os.getenv("USE_BF16", "0") == "1":
19
  is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
20
  if callable(is_bf16_supported) and is_bf16_supported():
21
  return torch.bfloat16
22
- return torch.float16
23
  return torch.float32
24
 
25
- DTYPE = _get_dtype(DEVICE_TYPE)
26
-
27
- def _make_pipe(model_id: str):
28
- device_arg = 0 if DEVICE_TYPE == "cuda" else -1
29
  pipe = pipeline(
30
  "image-text-to-text",
31
  model=model_id,
32
- device=torch.float16,
33
- dtype=DTYPE,
34
  )
35
  model = getattr(pipe, "model", None)
36
  generation_config = getattr(model, "generation_config", None)
@@ -43,13 +47,16 @@ def _make_pipe(model_id: str):
43
  pass
44
  return pipe
45
 
46
- # Load original base model (no fine-tuning)
47
- print(f"[INFO] Loading Original Model")
48
- original_pipeline = _make_pipe(BASE_MODEL_ID)
 
 
 
 
 
49
 
50
- # Load fine-tuned model
51
- print(f"[INFO] Loading Fine-tuned Model")
52
- ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID)
53
 
54
  def _extract_generated_text(pipe_output) -> str:
55
  try:
@@ -85,14 +92,33 @@ def extract_foods_from_image(input_image):
85
  input_image = input_image.resize(size=(512, 512))
86
  input_message = create_message(input_image=input_image)
87
 
88
- # Get outputs from base model (not fine-tuned)
89
- original_pipeline_output = original_pipeline(text=[input_message])
90
-
91
- outputs_pretrained = _extract_generated_text(original_pipeline_output)
92
-
93
- # Get outputs from fine-tuned model (fine-tuned on food images)
94
- ft_pipe_output = ft_pipe(text=[input_message])
95
- outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  return outputs_pretrained, outputs_fine_tuned
98
 
 
7
  BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
8
  FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
9
  OUTPUT_TOKENS = 256
10
+ original_pipeline = None
11
+ ft_pipe = None
12
 
13
+ FORCE_CPU = os.getenv("FORCE_CPU", "0") == "1"
14
+ DEVICE_TYPE = "cuda" if (torch.cuda.is_available() and not FORCE_CPU) else "cpu"
15
  if DEVICE_TYPE == "cuda":
16
  torch.backends.cuda.matmul.allow_tf32 = True
17
  torch.backends.cudnn.allow_tf32 = True
18
 
19
  def _get_dtype(device: str):
20
  if device == "cuda":
21
+ if os.getenv("USE_FP16", "0") == "1":
22
+ return torch.float16
23
  if os.getenv("USE_BF16", "0") == "1":
24
  is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
25
  if callable(is_bf16_supported) and is_bf16_supported():
26
  return torch.bfloat16
27
+ return torch.float32
28
  return torch.float32
29
 
30
+ def _make_pipe(model_id: str, device_type: str):
31
+ dtype = _get_dtype(device_type)
32
+ device_arg = 0 if device_type == "cuda" else -1
 
33
  pipe = pipeline(
34
  "image-text-to-text",
35
  model=model_id,
36
+ device=device_arg,
37
+ dtype=dtype,
38
  )
39
  model = getattr(pipe, "model", None)
40
  generation_config = getattr(model, "generation_config", None)
 
47
  pass
48
  return pipe
49
 
50
+ ACTIVE_DEVICE_TYPE = DEVICE_TYPE
51
+
52
+ def _load_pipes(device_type: str):
53
+ global original_pipeline, ft_pipe, ACTIVE_DEVICE_TYPE
54
+ ACTIVE_DEVICE_TYPE = device_type
55
+ print(f"[INFO] Using device_type={ACTIVE_DEVICE_TYPE}")
56
+ original_pipeline = _make_pipe(BASE_MODEL_ID, ACTIVE_DEVICE_TYPE)
57
+ ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID, ACTIVE_DEVICE_TYPE)
58
 
59
+ _load_pipes(DEVICE_TYPE)
 
 
60
 
61
  def _extract_generated_text(pipe_output) -> str:
62
  try:
 
92
  input_image = input_image.resize(size=(512, 512))
93
  input_message = create_message(input_image=input_image)
94
 
95
+ try:
96
+ original_pipeline_output = original_pipeline(text=[input_message])
97
+ outputs_pretrained = _extract_generated_text(original_pipeline_output)
98
+
99
+ ft_pipe_output = ft_pipe(text=[input_message])
100
+ outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
101
+ except RuntimeError as e:
102
+ msg = str(e)
103
+ is_cuda_linear_failure = (
104
+ "CUBLAS_STATUS_INVALID_VALUE" in msg
105
+ or "cublasGemmEx" in msg
106
+ or ("CUDA error" in msg and "CUBLAS" in msg)
107
+ )
108
+ if ACTIVE_DEVICE_TYPE == "cuda" and is_cuda_linear_failure:
109
+ try:
110
+ print("[WARN] CUDA GEMM failed, falling back to CPU.")
111
+ _load_pipes("cpu")
112
+ if torch.cuda.is_available():
113
+ torch.cuda.empty_cache()
114
+ original_pipeline_output = original_pipeline(text=[input_message])
115
+ outputs_pretrained = _extract_generated_text(original_pipeline_output)
116
+ ft_pipe_output = ft_pipe(text=[input_message])
117
+ outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
118
+ except Exception:
119
+ raise e
120
+ else:
121
+ raise
122
 
123
  return outputs_pretrained, outputs_fine_tuned
124