KinetoLabs Claude Opus 4.5 committed
Commit c4bfdfa · 1 Parent(s): 455c786

Fix multi-GPU support in vendored Qwen3-VL scripts


The original scripts used .to(device), which moves the entire model
onto a single GPU. With the vision model already occupying ~22 GB on
GPU 0, this caused an OOM when loading the embedding model.

Changes:
- Remove .to(device) calls in both Qwen3VLEmbedder and Qwen3VLReranker
- Add device_map="auto" to from_pretrained() for multi-GPU distribution
- Update device references to use model's distributed device

This allows the embedding and reranker models to be distributed
across available GPUs alongside the vision model.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
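
A minimal smoke test of the new behavior, not part of this commit: it assumes the vendored class is importable from scripts.qwen3_vl.qwen3_vl_embedding, that Accelerate records the placement in hf_device_map when device_map="auto" is used, and the model id is a placeholder.

# Hypothetical check that the embedder is sharded across GPUs rather than pinned to cuda:0.
import torch
from scripts.qwen3_vl.qwen3_vl_embedding import Qwen3VLEmbedder  # assumed import path

MODEL_ID = "path/or/hub-id-of-the-embedding-model"  # placeholder, not from the commit
embedder = Qwen3VLEmbedder(model_name_or_path=MODEL_ID, torch_dtype=torch.bfloat16)

# With device_map="auto", Accelerate exposes a module -> device mapping on the model.
print(getattr(embedder.model, "hf_device_map", "no device map (CPU or explicit device)"))
print("first-shard device:", embedder.device)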

scripts/qwen3_vl/qwen3_vl_embedding.py CHANGED
@@ -164,8 +164,6 @@ class Qwen3VLEmbedder:
         default_instruction: str = "Represent the user's input.",
         **kwargs,
     ):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
         self.max_length = max_length
         self.min_pixels = min_pixels
         self.max_pixels = max_pixels
@@ -175,14 +173,21 @@ class Qwen3VLEmbedder:
         self.max_frames = max_frames
         self.default_instruction = default_instruction
 
+        # Use device_map="auto" for multi-GPU distribution instead of .to(device)
+        # This is critical for HuggingFace Spaces with 4xL4 GPUs
+        if "device_map" not in kwargs and torch.cuda.is_available():
+            kwargs["device_map"] = "auto"
+
         self.model = Qwen3VLForEmbedding.from_pretrained(
             model_name_or_path, trust_remote_code=True, **kwargs
-        ).to(device)
+        )
         self.processor = Qwen3VLProcessor.from_pretrained(
             model_name_or_path, padding_side="right"
         )
         self.model.eval()
 
+        logger.info(f"Qwen3VLEmbedder loaded with device_map={kwargs.get('device_map', 'N/A')}")
+
     @property
     def device(self):
         return self.model.device
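
Because the constructor only injects device_map when the caller has not supplied one, placement can still be tuned per deployment through the forwarded **kwargs. A sketch, with illustrative per-GPU budgets chosen to leave room for the ~22 GB vision model already resident on GPU 0 (max_memory is a standard from_pretrained argument handled by Accelerate; the model id is a placeholder as above):

# Hypothetical per-GPU memory caps on a 4x L4 (24 GB each) deployment.
embedder = Qwen3VLEmbedder(
    model_name_or_path=MODEL_ID,
    device_map="auto",
    max_memory={0: "2GiB", 1: "20GiB", 2: "20GiB", 3: "20GiB"},  # illustrative budgets
    torch_dtype=torch.bfloat16,
)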
scripts/qwen3_vl/qwen3_vl_reranker.py CHANGED
@@ -74,9 +74,6 @@ class Qwen3VLReranker:
         default_instruction: str = "Given a search query, retrieve relevant candidates that answer the query.",
         **kwargs,
     ):
-
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
         self.max_length = max_length
         self.min_pixels = min_pixels
         self.max_pixels = max_pixels
@@ -87,9 +84,14 @@ class Qwen3VLReranker:
 
         self.default_instruction = default_instruction
 
+        # Use device_map="auto" for multi-GPU distribution instead of .to(device)
+        # This is critical for HuggingFace Spaces with 4xL4 GPUs
+        if "device_map" not in kwargs and torch.cuda.is_available():
+            kwargs["device_map"] = "auto"
+
         lm = Qwen3VLForConditionalGeneration.from_pretrained(
             model_name_or_path, trust_remote_code=True, **kwargs
-        ).to(self.device)
+        )
 
         self.model = lm.model
         self.processor = AutoProcessor.from_pretrained(
@@ -97,14 +99,18 @@ class Qwen3VLReranker:
         )
         self.model.eval()
 
+        # Get device from model (may be distributed across GPUs)
+        self._lm_device = next(lm.parameters()).device
+
         token_true_id = self.processor.tokenizer.get_vocab()["yes"]
         token_false_id = self.processor.tokenizer.get_vocab()["no"]
         self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
         self.score_linear.eval()
-        self.score_linear.to(self.device).to(self.model.dtype)
+        self.score_linear.to(self._lm_device).to(self.model.dtype)
 
         logger.info(
-            f"Initialized Qwen3VLReranker with yes/no scoring layer (device={self.device})"
+            f"Qwen3VLReranker loaded with device_map={kwargs.get('device_map', 'N/A')}, "
+            f"yes/no scoring layer initialized"
         )
 
     def get_binary_linear(self, model, token_yes, token_no):
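
One caveat on the new self._lm_device: next(lm.parameters()).device is the first shard, while the final hidden states that score_linear consumes live on the last shard when the model is split across GPUs. A hedged alternative sketch, assuming Accelerate has populated hf_device_map and that its last entry is the deepest placed module (both assumptions should be verified against this vendored model):

# Hypothetical alternative placement for the scoring head, inside the same constructor.
device_map = getattr(lm, "hf_device_map", None)
if device_map:
    last = list(device_map.values())[-1]  # device of the last mapped module
    score_device = torch.device(f"cuda:{last}") if isinstance(last, int) else torch.device(last)
else:
    score_device = next(lm.parameters()).device
self.score_linear.to(score_device).to(self.model.dtype)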