Spaces:
Paused
Paused
Commit
·
d1901ae
1
Parent(s):
c4bfdfa
Fix multi-GPU compatibility issues (6 locations)
Browse files
1. Meta tensor error in reranker (BLOCKING):
- Use .clone() to force weight materialization before copying
- Create Linear layer with correct device/dtype from lm_head
2. Remove all .to(device) calls that break distributed models:
- scripts/qwen3_vl/qwen3_vl_embedding.py:388
- scripts/qwen3_vl/qwen3_vl_reranker.py:109, 374
- models/real.py:259
- rag/vectorstore.py:130
- rag/retriever.py:149
With device_map="auto", transformers handles device routing internally.
Manual .to(device) calls cause device mismatches on multi-GPU setups.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- models/real.py +2 -2
- rag/retriever.py +2 -1
- rag/vectorstore.py +2 -1
- scripts/qwen3_vl/qwen3_vl_embedding.py +2 -1
- scripts/qwen3_vl/qwen3_vl_reranker.py +12 -6
models/real.py
CHANGED
|
@@ -255,8 +255,8 @@ IMPORTANT: Return ONLY valid JSON, no additional text."""
|
|
| 255 |
padding=True,
|
| 256 |
)
|
| 257 |
|
| 258 |
-
#
|
| 259 |
-
|
| 260 |
|
| 261 |
# Log inference config being used
|
| 262 |
logger.debug(f"Vision inference config: max_new_tokens={vision_config.max_new_tokens}, "
|
|
|
|
| 255 |
padding=True,
|
| 256 |
)
|
| 257 |
|
| 258 |
+
# Note: With device_map="auto", transformers handles device routing internally
|
| 259 |
+
# Do NOT call .to(device) - it breaks distributed models
|
| 260 |
|
| 261 |
# Log inference config being used
|
| 262 |
logger.debug(f"Vision inference config: max_new_tokens={vision_config.max_new_tokens}, "
|
rag/retriever.py
CHANGED
|
@@ -146,7 +146,8 @@ class RealReranker:
|
|
| 146 |
max_length=512,
|
| 147 |
padding=True,
|
| 148 |
)
|
| 149 |
-
|
|
|
|
| 150 |
|
| 151 |
outputs = self.model(**inputs)
|
| 152 |
# Sigmoid to get 0-1 score
|
|
|
|
| 146 |
max_length=512,
|
| 147 |
padding=True,
|
| 148 |
)
|
| 149 |
+
# Note: With device_map="auto", transformers handles device routing internally
|
| 150 |
+
# Do NOT call .to(device) - it breaks distributed models
|
| 151 |
|
| 152 |
outputs = self.model(**inputs)
|
| 153 |
# Sigmoid to get 0-1 score
|
rag/vectorstore.py
CHANGED
|
@@ -127,7 +127,8 @@ class RealEmbeddingFunction:
|
|
| 127 |
max_length=512,
|
| 128 |
padding=True,
|
| 129 |
)
|
| 130 |
-
|
|
|
|
| 131 |
|
| 132 |
outputs = self.model(**inputs)
|
| 133 |
|
|
|
|
| 127 |
max_length=512,
|
| 128 |
padding=True,
|
| 129 |
)
|
| 130 |
+
# Note: With device_map="auto", transformers handles device routing internally
|
| 131 |
+
# Do NOT call .to(device) - it breaks distributed models
|
| 132 |
|
| 133 |
outputs = self.model(**inputs)
|
| 134 |
|
scripts/qwen3_vl/qwen3_vl_embedding.py
CHANGED
|
@@ -385,7 +385,8 @@ class Qwen3VLEmbedder:
|
|
| 385 |
]
|
| 386 |
|
| 387 |
processed_inputs = self._preprocess_inputs(conversations)
|
| 388 |
-
|
|
|
|
| 389 |
|
| 390 |
outputs = self.forward(processed_inputs)
|
| 391 |
embeddings = self._pooling_last(
|
|
|
|
| 385 |
]
|
| 386 |
|
| 387 |
processed_inputs = self._preprocess_inputs(conversations)
|
| 388 |
+
# Note: With device_map="auto", transformers handles device routing internally
|
| 389 |
+
# Do NOT call .to(device) - it breaks distributed models
|
| 390 |
|
| 391 |
outputs = self.forward(processed_inputs)
|
| 392 |
embeddings = self._pooling_last(
|
scripts/qwen3_vl/qwen3_vl_reranker.py
CHANGED
|
@@ -106,7 +106,7 @@ class Qwen3VLReranker:
|
|
| 106 |
token_false_id = self.processor.tokenizer.get_vocab()["no"]
|
| 107 |
self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
|
| 108 |
self.score_linear.eval()
|
| 109 |
-
|
| 110 |
|
| 111 |
logger.info(
|
| 112 |
f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
|
|
@@ -114,16 +114,21 @@ class Qwen3VLReranker:
|
|
| 114 |
)
|
| 115 |
|
| 116 |
def get_binary_linear(self, model, token_yes, token_no):
|
| 117 |
-
"""Extract yes/no token weights from LM head and create scoring layer.
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
weight_yes = lm_head_weights[token_yes]
|
| 121 |
weight_no = lm_head_weights[token_no]
|
| 122 |
|
| 123 |
D = weight_yes.size()[0]
|
| 124 |
-
linear_layer = torch.nn.Linear(D, 1, bias=False)
|
| 125 |
with torch.no_grad():
|
| 126 |
-
linear_layer.weight[0] = weight_yes - weight_no
|
| 127 |
return linear_layer
|
| 128 |
|
| 129 |
@torch.no_grad()
|
|
@@ -371,7 +376,8 @@ class Qwen3VLReranker:
|
|
| 371 |
final_scores = []
|
| 372 |
for pair in pairs:
|
| 373 |
tokenized_inputs = self.tokenize([pair])
|
| 374 |
-
|
|
|
|
| 375 |
scores = self.compute_scores(tokenized_inputs)
|
| 376 |
final_scores.extend(scores)
|
| 377 |
return final_scores
|
|
|
|
| 106 |
token_false_id = self.processor.tokenizer.get_vocab()["no"]
|
| 107 |
self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
|
| 108 |
self.score_linear.eval()
|
| 109 |
+
# Note: device and dtype are set in get_binary_linear() to match lm_head weights
|
| 110 |
|
| 111 |
logger.info(
|
| 112 |
f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
|
|
|
|
| 114 |
)
|
| 115 |
|
| 116 |
def get_binary_linear(self, model, token_yes, token_no):
|
| 117 |
+
"""Extract yes/no token weights from LM head and create scoring layer.
|
| 118 |
+
|
| 119 |
+
Note: With device_map="auto", weights may be meta tensors until materialized.
|
| 120 |
+
We use .clone() to force materialization before copying.
|
| 121 |
+
"""
|
| 122 |
+
# Force materialization with .clone() - required for device_map="auto"
|
| 123 |
+
lm_head_weights = model.lm_head.weight.clone()
|
| 124 |
|
| 125 |
weight_yes = lm_head_weights[token_yes]
|
| 126 |
weight_no = lm_head_weights[token_no]
|
| 127 |
|
| 128 |
D = weight_yes.size()[0]
|
| 129 |
+
linear_layer = torch.nn.Linear(D, 1, bias=False, device=weight_yes.device, dtype=weight_yes.dtype)
|
| 130 |
with torch.no_grad():
|
| 131 |
+
linear_layer.weight.data[0] = weight_yes - weight_no
|
| 132 |
return linear_layer
|
| 133 |
|
| 134 |
@torch.no_grad()
|
|
|
|
| 376 |
final_scores = []
|
| 377 |
for pair in pairs:
|
| 378 |
tokenized_inputs = self.tokenize([pair])
|
| 379 |
+
# Note: With device_map="auto", transformers handles device routing internally
|
| 380 |
+
# Do NOT call .to(device) - it breaks distributed models
|
| 381 |
scores = self.compute_scores(tokenized_inputs)
|
| 382 |
final_scores.extend(scores)
|
| 383 |
return final_scores
|