Spaces:

CreatorJarvis
/

FoodExtract-Vision

Running on Zero

App Files Files Community

CreatorJarvis committed on Jan 30

Commit

c17c199

verified ·

1 Parent(s): fdb4ae4

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -22

app.py CHANGED Viewed

@@ -7,30 +7,34 @@ from transformers import pipeline
 BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
 FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
 OUTPUT_TOKENS = 256
-DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
 if DEVICE_TYPE == "cuda":
     torch.backends.cuda.matmul.allow_tf32 = True
     torch.backends.cudnn.allow_tf32 = True
 def _get_dtype(device: str):
     if device == "cuda":
         if os.getenv("USE_BF16", "0") == "1":
             is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
             if callable(is_bf16_supported) and is_bf16_supported():
                 return torch.bfloat16
-        return torch.float16
     return torch.float32
-DTYPE = _get_dtype(DEVICE_TYPE)
-def _make_pipe(model_id: str):
-    device_arg = 0 if DEVICE_TYPE == "cuda" else -1
     pipe = pipeline(
         "image-text-to-text",
         model=model_id,
-        device=torch.float16,
-        dtype=DTYPE,
     )
     model = getattr(pipe, "model", None)
     generation_config = getattr(model, "generation_config", None)
@@ -43,13 +47,16 @@ def _make_pipe(model_id: str):
             pass
     return pipe
-# Load original base model (no fine-tuning)
-print(f"[INFO] Loading Original Model")
-original_pipeline = _make_pipe(BASE_MODEL_ID)
-# Load fine-tuned model
-print(f"[INFO] Loading Fine-tuned Model")
-ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID)
 def _extract_generated_text(pipe_output) -> str:
     try:
@@ -85,14 +92,33 @@ def extract_foods_from_image(input_image):
     input_image = input_image.resize(size=(512, 512))
     input_message = create_message(input_image=input_image)
-    # Get outputs from base model (not fine-tuned)
-    original_pipeline_output = original_pipeline(text=[input_message])
-    outputs_pretrained = _extract_generated_text(original_pipeline_output)
-    # Get outputs from fine-tuned model (fine-tuned on food images)
-    ft_pipe_output = ft_pipe(text=[input_message])
-    outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
     return outputs_pretrained, outputs_fine_tuned

 BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
 FINE_TUNED_MODEL_ID = "CreatorJarvis/FoodExtract-Vision-SmolVLM2-500M-fine-tune"
 OUTPUT_TOKENS = 256
+original_pipeline = None
+ft_pipe = None
+FORCE_CPU = os.getenv("FORCE_CPU", "0") == "1"
+DEVICE_TYPE = "cuda" if (torch.cuda.is_available() and not FORCE_CPU) else "cpu"
 if DEVICE_TYPE == "cuda":
     torch.backends.cuda.matmul.allow_tf32 = True
     torch.backends.cudnn.allow_tf32 = True
 def _get_dtype(device: str):
     if device == "cuda":
+        if os.getenv("USE_FP16", "0") == "1":
+            return torch.float16
         if os.getenv("USE_BF16", "0") == "1":
             is_bf16_supported = getattr(torch.cuda, "is_bf16_supported", None)
             if callable(is_bf16_supported) and is_bf16_supported():
                 return torch.bfloat16
+        return torch.float32
     return torch.float32
+def _make_pipe(model_id: str, device_type: str):
+    dtype = _get_dtype(device_type)
+    device_arg = 0 if device_type == "cuda" else -1
     pipe = pipeline(
         "image-text-to-text",
         model=model_id,
+        device=device_arg,
+        dtype=dtype,
     )
     model = getattr(pipe, "model", None)
     generation_config = getattr(model, "generation_config", None)
             pass
     return pipe
+ACTIVE_DEVICE_TYPE = DEVICE_TYPE
+def _load_pipes(device_type: str):
+    global original_pipeline, ft_pipe, ACTIVE_DEVICE_TYPE
+    ACTIVE_DEVICE_TYPE = device_type
+    print(f"[INFO] Using device_type={ACTIVE_DEVICE_TYPE}")
+    original_pipeline = _make_pipe(BASE_MODEL_ID, ACTIVE_DEVICE_TYPE)
+    ft_pipe = _make_pipe(FINE_TUNED_MODEL_ID, ACTIVE_DEVICE_TYPE)
+_load_pipes(DEVICE_TYPE)
 def _extract_generated_text(pipe_output) -> str:
     try:
     input_image = input_image.resize(size=(512, 512))
     input_message = create_message(input_image=input_image)
+    try:
+        original_pipeline_output = original_pipeline(text=[input_message])
+        outputs_pretrained = _extract_generated_text(original_pipeline_output)
+        ft_pipe_output = ft_pipe(text=[input_message])
+        outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
+    except RuntimeError as e:
+        msg = str(e)
+        is_cuda_linear_failure = (
+            "CUBLAS_STATUS_INVALID_VALUE" in msg
+            or "cublasGemmEx" in msg
+            or ("CUDA error" in msg and "CUBLAS" in msg)
+        )
+        if ACTIVE_DEVICE_TYPE == "cuda" and is_cuda_linear_failure:
+            try:
+                print("[WARN] CUDA GEMM failed, falling back to CPU.")
+                _load_pipes("cpu")
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                original_pipeline_output = original_pipeline(text=[input_message])
+                outputs_pretrained = _extract_generated_text(original_pipeline_output)
+                ft_pipe_output = ft_pipe(text=[input_message])
+                outputs_fine_tuned = _extract_generated_text(ft_pipe_output)
+            except Exception:
+                raise e
+        else:
+            raise
     return outputs_pretrained, outputs_fine_tuned