Qrverse committed on
Commit
7157013
·
verified ·
1 Parent(s): 14e4d0b

Upload train-hf-jobs.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train-hf-jobs.py +24 -20
train-hf-jobs.py CHANGED
@@ -161,29 +161,33 @@ if len(dataset) > 0:
161
  # ---------------------------------------------------------------------------
162
  # 5. Format dataset for Qwen3-VL conversation template
163
  # ---------------------------------------------------------------------------
164
- # The dataset is already in Qwen3-VL conversation format with a `messages`
165
- # field. Each message has `role` (system/user/assistant) and `content`
166
- # (either a string or a list of typed content blocks for vision inputs).
167
  #
168
- # Unsloth's FastVisionModel + SFTTrainer handle the chat template
169
- # application automatically when we pass the dataset with the `messages`
170
- # column and set `dataset_text_field="messages"`.
171
-
172
- from unsloth.chat_templates import standardize_sharegpt, get_chat_template
173
-
174
- # Apply Qwen3-VL chat template to the tokenizer so SFTTrainer can
175
- # format conversations correctly during collation.
176
- tokenizer = get_chat_template(
177
- tokenizer,
178
- chat_template="qwen2-vl", # Qwen3-VL uses the same template family as Qwen2-VL
 
 
 
 
 
 
 
 
 
 
179
  )
180
 
181
- # Standardize the dataset to ShareGPT format expected by Unsloth.
182
- # This handles the `messages` -> `conversations` rename and validates
183
- # role alternation.
184
- dataset = standardize_sharegpt(dataset)
185
-
186
- logger.info("Dataset standardized for training.")
187
 
188
 
189
  # ---------------------------------------------------------------------------
 
161
  # ---------------------------------------------------------------------------
162
  # 5. Format dataset for Qwen3-VL conversation template
163
  # ---------------------------------------------------------------------------
164
+ # The dataset is in Qwen3-VL conversation format with a `messages` field.
165
+ # Each message has `role` (system/user/assistant) and `content`.
 
166
  #
167
+ # We use the tokenizer's built-in chat template (loaded from the model config)
168
+ # to convert conversations to formatted text strings for SFTTrainer.
169
+
170
+ logger.info("Formatting conversations with tokenizer chat template...")
171
+
172
+ def format_conversations(examples):
173
+ texts = []
174
+ for messages in examples["messages"]:
175
+ text = tokenizer.apply_chat_template(
176
+ messages,
177
+ tokenize=False,
178
+ add_generation_prompt=False,
179
+ )
180
+ texts.append(text)
181
+ return {"text": texts}
182
+
183
+ dataset = dataset.map(
184
+ format_conversations,
185
+ batched=True,
186
+ remove_columns=dataset.column_names,
187
+ desc="Applying chat template",
188
  )
189
 
190
+ logger.info("Dataset formatted: %d examples with 'text' column.", len(dataset))
 
 
 
 
 
191
 
192
 
193
  # ---------------------------------------------------------------------------