KinetoLabs Claude Opus 4.5 committed on
Commit
d1901ae
·
1 Parent(s): c4bfdfa

Fix multi-GPU compatibility issues (6 locations)

Browse files

1. Meta tensor error in reranker (BLOCKING):
- Use .clone() to force weight materialization before copying
- Create Linear layer with correct device/dtype from lm_head

2. Remove all .to(device) calls that break distributed models:
- scripts/qwen3_vl/qwen3_vl_embedding.py:388
- scripts/qwen3_vl/qwen3_vl_reranker.py:109, 374
- models/real.py:259
- rag/vectorstore.py:130
- rag/retriever.py:149

With device_map="auto", transformers handles device routing internally.
Manual .to(device) calls cause device mismatches on multi-GPU setups.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

models/real.py CHANGED
@@ -255,8 +255,8 @@ IMPORTANT: Return ONLY valid JSON, no additional text."""
255
  padding=True,
256
  )
257
 
258
- # Move inputs to model device
259
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
260
 
261
  # Log inference config being used
262
  logger.debug(f"Vision inference config: max_new_tokens={vision_config.max_new_tokens}, "
 
255
  padding=True,
256
  )
257
 
258
+ # Note: With device_map="auto", transformers handles device routing internally
259
+ # Do NOT call .to(device) - it breaks distributed models
260
 
261
  # Log inference config being used
262
  logger.debug(f"Vision inference config: max_new_tokens={vision_config.max_new_tokens}, "
rag/retriever.py CHANGED
@@ -146,7 +146,8 @@ class RealReranker:
146
  max_length=512,
147
  padding=True,
148
  )
149
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
150
 
151
  outputs = self.model(**inputs)
152
  # Sigmoid to get 0-1 score
 
146
  max_length=512,
147
  padding=True,
148
  )
149
+ # Note: With device_map="auto", transformers handles device routing internally
150
+ # Do NOT call .to(device) - it breaks distributed models
151
 
152
  outputs = self.model(**inputs)
153
  # Sigmoid to get 0-1 score
rag/vectorstore.py CHANGED
@@ -127,7 +127,8 @@ class RealEmbeddingFunction:
127
  max_length=512,
128
  padding=True,
129
  )
130
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
131
 
132
  outputs = self.model(**inputs)
133
 
 
127
  max_length=512,
128
  padding=True,
129
  )
130
+ # Note: With device_map="auto", transformers handles device routing internally
131
+ # Do NOT call .to(device) - it breaks distributed models
132
 
133
  outputs = self.model(**inputs)
134
 
scripts/qwen3_vl/qwen3_vl_embedding.py CHANGED
@@ -385,7 +385,8 @@ class Qwen3VLEmbedder:
385
  ]
386
 
387
  processed_inputs = self._preprocess_inputs(conversations)
388
- processed_inputs = {k: v.to(self.model.device) for k, v in processed_inputs.items()}
 
389
 
390
  outputs = self.forward(processed_inputs)
391
  embeddings = self._pooling_last(
 
385
  ]
386
 
387
  processed_inputs = self._preprocess_inputs(conversations)
388
+ # Note: With device_map="auto", transformers handles device routing internally
389
+ # Do NOT call .to(device) - it breaks distributed models
390
 
391
  outputs = self.forward(processed_inputs)
392
  embeddings = self._pooling_last(
scripts/qwen3_vl/qwen3_vl_reranker.py CHANGED
@@ -106,7 +106,7 @@ class Qwen3VLReranker:
106
  token_false_id = self.processor.tokenizer.get_vocab()["no"]
107
  self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
108
  self.score_linear.eval()
109
- self.score_linear.to(self._lm_device).to(self.model.dtype)
110
 
111
  logger.info(
112
  f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
@@ -114,16 +114,21 @@ class Qwen3VLReranker:
114
  )
115
 
116
  def get_binary_linear(self, model, token_yes, token_no):
117
- """Extract yes/no token weights from LM head and create scoring layer."""
118
- lm_head_weights = model.lm_head.weight.data
 
 
 
 
 
119
 
120
  weight_yes = lm_head_weights[token_yes]
121
  weight_no = lm_head_weights[token_no]
122
 
123
  D = weight_yes.size()[0]
124
- linear_layer = torch.nn.Linear(D, 1, bias=False)
125
  with torch.no_grad():
126
- linear_layer.weight[0] = weight_yes - weight_no
127
  return linear_layer
128
 
129
  @torch.no_grad()
@@ -371,7 +376,8 @@ class Qwen3VLReranker:
371
  final_scores = []
372
  for pair in pairs:
373
  tokenized_inputs = self.tokenize([pair])
374
- tokenized_inputs = tokenized_inputs.to(self.model.device)
 
375
  scores = self.compute_scores(tokenized_inputs)
376
  final_scores.extend(scores)
377
  return final_scores
 
106
  token_false_id = self.processor.tokenizer.get_vocab()["no"]
107
  self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
108
  self.score_linear.eval()
109
+ # Note: device and dtype are set in get_binary_linear() to match lm_head weights
110
 
111
  logger.info(
112
  f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
 
114
  )
115
 
116
  def get_binary_linear(self, model, token_yes, token_no):
117
+ """Extract yes/no token weights from LM head and create scoring layer.
118
+
119
+ Note: With device_map="auto", weights may be meta tensors until materialized.
120
+ We use .clone() to force materialization before copying.
121
+ """
122
+ # Force materialization with .clone() - required for device_map="auto"
123
+ lm_head_weights = model.lm_head.weight.clone()
124
 
125
  weight_yes = lm_head_weights[token_yes]
126
  weight_no = lm_head_weights[token_no]
127
 
128
  D = weight_yes.size()[0]
129
+ linear_layer = torch.nn.Linear(D, 1, bias=False, device=weight_yes.device, dtype=weight_yes.dtype)
130
  with torch.no_grad():
131
+ linear_layer.weight.data[0] = weight_yes - weight_no
132
  return linear_layer
133
 
134
  @torch.no_grad()
 
376
  final_scores = []
377
  for pair in pairs:
378
  tokenized_inputs = self.tokenize([pair])
379
+ # Note: With device_map="auto", transformers handles device routing internally
380
+ # Do NOT call .to(device) - it breaks distributed models
381
  scores = self.compute_scores(tokenized_inputs)
382
  final_scores.extend(scores)
383
  return final_scores