KinetoLabs Claude Opus 4.5 committed on
Commit
d1901ae
·
1 Parent(s): c4bfdfa

Fix multi-GPU compatibility issues (6 locations)

Browse files

1. Meta tensor error in reranker (BLOCKING):
- Use .clone() to force weight materialization before copying
- Create Linear layer with correct device/dtype from lm_head

2. Remove all .to(device) calls that break distributed models:
- scripts/qwen3_vl/qwen3_vl_embedding.py:388
- scripts/qwen3_vl/qwen3_vl_reranker.py:109, 374
- models/real.py:259
- rag/vectorstore.py:130
- rag/retriever.py:149

With device_map="auto", transformers handles device routing internally.
Manual .to(device) calls cause device mismatches on multi-GPU setups.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

models/real.py CHANGED
@@ -255,8 +255,8 @@ IMPORTANT: Return ONLY valid JSON, no additional text."""
255
  padding=True,
256
  )
257
 
258
- # Move inputs to model device
259
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
260
 
261
  # Log inference config being used
262
  logger.debug(f"Vision inference config: max_new_tokens={vision_config.max_new_tokens}, "
 
255
  padding=True,
256
  )
257
 
258
+ # Note: With device_map="auto", transformers handles device routing internally
259
+ # Do NOT call .to(device) - it breaks distributed models
260
 
261
  # Log inference config being used
262
  logger.debug(f"Vision inference config: max_new_tokens={vision_config.max_new_tokens}, "
rag/retriever.py CHANGED
@@ -146,7 +146,8 @@ class RealReranker:
146
  max_length=512,
147
  padding=True,
148
  )
149
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
150
 
151
  outputs = self.model(**inputs)
152
  # Sigmoid to get 0-1 score
 
146
  max_length=512,
147
  padding=True,
148
  )
149
+ # Note: With device_map="auto", transformers handles device routing internally
150
+ # Do NOT call .to(device) - it breaks distributed models
151
 
152
  outputs = self.model(**inputs)
153
  # Sigmoid to get 0-1 score
rag/vectorstore.py CHANGED
@@ -127,7 +127,8 @@ class RealEmbeddingFunction:
127
  max_length=512,
128
  padding=True,
129
  )
130
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
131
 
132
  outputs = self.model(**inputs)
133
 
 
127
  max_length=512,
128
  padding=True,
129
  )
130
+ # Note: With device_map="auto", transformers handles device routing internally
131
+ # Do NOT call .to(device) - it breaks distributed models
132
 
133
  outputs = self.model(**inputs)
134
 
scripts/qwen3_vl/qwen3_vl_embedding.py CHANGED
@@ -385,7 +385,8 @@ class Qwen3VLEmbedder:
385
  ]
386
 
387
  processed_inputs = self._preprocess_inputs(conversations)
388
- processed_inputs = {k: v.to(self.model.device) for k, v in processed_inputs.items()}
 
389
 
390
  outputs = self.forward(processed_inputs)
391
  embeddings = self._pooling_last(
 
385
  ]
386
 
387
  processed_inputs = self._preprocess_inputs(conversations)
388
+ # Note: With device_map="auto", transformers handles device routing internally
389
+ # Do NOT call .to(device) - it breaks distributed models
390
 
391
  outputs = self.forward(processed_inputs)
392
  embeddings = self._pooling_last(
scripts/qwen3_vl/qwen3_vl_reranker.py CHANGED
@@ -106,7 +106,7 @@ class Qwen3VLReranker:
106
  token_false_id = self.processor.tokenizer.get_vocab()["no"]
107
  self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
108
  self.score_linear.eval()
109
- self.score_linear.to(self._lm_device).to(self.model.dtype)
110
 
111
  logger.info(
112
  f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
@@ -114,16 +114,21 @@ class Qwen3VLReranker:
114
  )
115
 
116
  def get_binary_linear(self, model, token_yes, token_no):
117
- """Extract yes/no token weights from LM head and create scoring layer."""
118
- lm_head_weights = model.lm_head.weight.data
 
 
 
 
 
119
 
120
  weight_yes = lm_head_weights[token_yes]
121
  weight_no = lm_head_weights[token_no]
122
 
123
  D = weight_yes.size()[0]
124
- linear_layer = torch.nn.Linear(D, 1, bias=False)
125
  with torch.no_grad():
126
- linear_layer.weight[0] = weight_yes - weight_no
127
  return linear_layer
128
 
129
  @torch.no_grad()
@@ -371,7 +376,8 @@ class Qwen3VLReranker:
371
  final_scores = []
372
  for pair in pairs:
373
  tokenized_inputs = self.tokenize([pair])
374
- tokenized_inputs = tokenized_inputs.to(self.model.device)
 
375
  scores = self.compute_scores(tokenized_inputs)
376
  final_scores.extend(scores)
377
  return final_scores
 
106
  token_false_id = self.processor.tokenizer.get_vocab()["no"]
107
  self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
108
  self.score_linear.eval()
109
+ # Note: device and dtype are set in get_binary_linear() to match lm_head weights
110
 
111
  logger.info(
112
  f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
 
114
  )
115
 
116
  def get_binary_linear(self, model, token_yes, token_no):
117
+ """Extract yes/no token weights from LM head and create scoring layer.
118
+
119
+ Note: With device_map="auto", weights may be meta tensors until materialized.
120
+ We use .clone() to force materialization before copying.
121
+ """
122
+ # Force materialization with .clone() - required for device_map="auto"
123
+ lm_head_weights = model.lm_head.weight.clone()
124
 
125
  weight_yes = lm_head_weights[token_yes]
126
  weight_no = lm_head_weights[token_no]
127
 
128
  D = weight_yes.size()[0]
129
+ linear_layer = torch.nn.Linear(D, 1, bias=False, device=weight_yes.device, dtype=weight_yes.dtype)
130
  with torch.no_grad():
131
+ linear_layer.weight.data[0] = weight_yes - weight_no
132
  return linear_layer
133
 
134
  @torch.no_grad()
 
376
  final_scores = []
377
  for pair in pairs:
378
  tokenized_inputs = self.tokenize([pair])
379
+ # Note: With device_map="auto", transformers handles device routing internally
380
+ # Do NOT call .to(device) - it breaks distributed models
381
  scores = self.compute_scores(tokenized_inputs)
382
  final_scores.extend(scores)
383
  return final_scores