1ForrestW1
/

moondream3-endpoint

Forrest Wargo committed on Oct 6, 2025

Commit

7cc210d

1 Parent(s): 34b89db

Ensure single-device placement (cuda or cpu) to avoid index device mismatch

Files changed (1) hide show

handler.py CHANGED Viewed

@@ -53,12 +53,18 @@ class EndpointHandler:
         os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
         # Load local repo (or remote if MODEL_ID points to hub id)
-        # Pass token when accessing gated repos
         hub_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") or os.environ.get("HF_HUB_TOKEN") or os.environ.get("HF_TOKEN")
         load_kwargs = {
             "trust_remote_code": True,
-            "torch_dtype": torch.bfloat16,
-            "device_map": "auto",
         }
         if hub_token:
             load_kwargs["token"] = hub_token

         os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
         # Load local repo (or remote if MODEL_ID points to hub id)
+        # Pass token when accessing gated repos and ensure consistent device placement
         hub_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") or os.environ.get("HF_HUB_TOKEN") or os.environ.get("HF_TOKEN")
+        if torch.cuda.is_available():
+            device_map = {"": "cuda"}
+            dtype = torch.bfloat16
+        else:
+            device_map = {"": "cpu"}
+            dtype = torch.float32
         load_kwargs = {
             "trust_remote_code": True,
+            "torch_dtype": dtype,
+            "device_map": device_map,
         }
         if hub_token:
             load_kwargs["token"] = hub_token