KinetoLabs Claude Opus 4.5 committed
Commit c4bfdfa · 1 Parent(s): 455c786

Fix multi-GPU support in vendored Qwen3-VL scripts


The original scripts used .to(device), which moves the entire model
onto a single GPU. With the vision model already occupying ~22 GB on
GPU 0, this caused an OOM when loading the embedding model.

Changes:
- Remove .to(device) calls in both Qwen3VLEmbedder and Qwen3VLReranker
- Add device_map="auto" to from_pretrained() for multi-GPU distribution
- Update device references to use model's distributed device

This allows the embedding and reranker models to be distributed
across available GPUs alongside the vision model.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
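
A minimal smoke test of the new behavior, not part of this commit: it assumes the vendored class is importable from scripts.qwen3_vl.qwen3_vl_embedding, that Accelerate records the placement in hf_device_map when device_map="auto" is used, and the model id is a placeholder.

# Hypothetical check that the embedder is sharded across GPUs rather than pinned to cuda:0.
import torch
from scripts.qwen3_vl.qwen3_vl_embedding import Qwen3VLEmbedder  # assumed import path

MODEL_ID = "path/or/hub-id-of-the-embedding-model"  # placeholder, not from the commit
embedder = Qwen3VLEmbedder(model_name_or_path=MODEL_ID, torch_dtype=torch.bfloat16)

# With device_map="auto", Accelerate exposes a module -> device mapping on the model.
print(getattr(embedder.model, "hf_device_map", "no device map (CPU or explicit device)"))
print("first-shard device:", embedder.device)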

scripts/qwen3_vl/qwen3_vl_embedding.py CHANGED
@@ -164,8 +164,6 @@ class Qwen3VLEmbedder:
         default_instruction: str = "Represent the user's input.",
         **kwargs,
     ):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
         self.max_length = max_length
         self.min_pixels = min_pixels
         self.max_pixels = max_pixels
@@ -175,14 +173,21 @@ class Qwen3VLEmbedder:
         self.max_frames = max_frames
         self.default_instruction = default_instruction
 
+        # Use device_map="auto" for multi-GPU distribution instead of .to(device)
+        # This is critical for HuggingFace Spaces with 4xL4 GPUs
+        if "device_map" not in kwargs and torch.cuda.is_available():
+            kwargs["device_map"] = "auto"
+
         self.model = Qwen3VLForEmbedding.from_pretrained(
             model_name_or_path, trust_remote_code=True, **kwargs
-        ).to(device)
+        )
         self.processor = Qwen3VLProcessor.from_pretrained(
             model_name_or_path, padding_side="right"
         )
         self.model.eval()
 
+        logger.info(f"Qwen3VLEmbedder loaded with device_map={kwargs.get('device_map', 'N/A')}")
+
     @property
     def device(self):
         return self.model.device
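
Because the constructor only injects device_map when the caller has not supplied one, placement can still be tuned per deployment through the forwarded **kwargs. A sketch, with illustrative per-GPU budgets chosen to leave room for the ~22 GB vision model already resident on GPU 0 (max_memory is a standard from_pretrained argument handled by Accelerate; the model id is a placeholder as above):

# Hypothetical per-GPU memory caps on a 4x L4 (24 GB each) deployment.
embedder = Qwen3VLEmbedder(
    model_name_or_path=MODEL_ID,
    device_map="auto",
    max_memory={0: "2GiB", 1: "20GiB", 2: "20GiB", 3: "20GiB"},  # illustrative budgets
    torch_dtype=torch.bfloat16,
)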
scripts/qwen3_vl/qwen3_vl_reranker.py CHANGED
@@ -74,9 +74,6 @@ class Qwen3VLReranker:
         default_instruction: str = "Given a search query, retrieve relevant candidates that answer the query.",
         **kwargs,
     ):
-
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
         self.max_length = max_length
         self.min_pixels = min_pixels
         self.max_pixels = max_pixels
@@ -87,9 +84,14 @@ class Qwen3VLReranker:
 
         self.default_instruction = default_instruction
 
+        # Use device_map="auto" for multi-GPU distribution instead of .to(device)
+        # This is critical for HuggingFace Spaces with 4xL4 GPUs
+        if "device_map" not in kwargs and torch.cuda.is_available():
+            kwargs["device_map"] = "auto"
+
         lm = Qwen3VLForConditionalGeneration.from_pretrained(
             model_name_or_path, trust_remote_code=True, **kwargs
-        ).to(self.device)
+        )
 
         self.model = lm.model
         self.processor = AutoProcessor.from_pretrained(
@@ -97,14 +99,18 @@ class Qwen3VLReranker:
         )
         self.model.eval()
 
+        # Get device from model (may be distributed across GPUs)
+        self._lm_device = next(lm.parameters()).device
+
         token_true_id = self.processor.tokenizer.get_vocab()["yes"]
         token_false_id = self.processor.tokenizer.get_vocab()["no"]
         self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
         self.score_linear.eval()
-        self.score_linear.to(self.device).to(self.model.dtype)
+        self.score_linear.to(self._lm_device).to(self.model.dtype)
 
         logger.info(
-            f"Initialized Qwen3VLReranker with yes/no scoring layer (device={self.device})"
+            f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
+            f"yes/no scoring layer initialized"
         )
 
     def get_binary_linear(self, model, token_yes, token_no):
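
One caveat on the new self._lm_device: next(lm.parameters()).device is the first shard, while the final hidden states that score_linear consumes live on the last shard when the model is split across GPUs. A hedged alternative sketch, assuming Accelerate has populated hf_device_map and that its last entry is the deepest placed module (both assumptions should be verified against this vendored model):

# Hypothetical alternative placement for the scoring head, inside the same constructor.
device_map = getattr(lm, "hf_device_map", None)
if device_map:
    last = list(device_map.values())[-1]  # device of the last mapped module
    score_device = torch.device(f"cuda:{last}") if isinstance(last, int) else torch.device(last)
else:
    score_device = next(lm.parameters()).device
self.score_linear.to(score_device).to(self.model.dtype)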