Upload train-hf-jobs.py with huggingface_hub
Browse files- train-hf-jobs.py +24 -20
train-hf-jobs.py
CHANGED
|
@@ -161,29 +161,33 @@ if len(dataset) > 0:
|
|
| 161 |
# ---------------------------------------------------------------------------
|
| 162 |
# 5. Format dataset for Qwen3-VL conversation template
|
| 163 |
# ---------------------------------------------------------------------------
|
| 164 |
-
# The dataset is
|
| 165 |
-
#
|
| 166 |
-
# (either a string or a list of typed content blocks for vision inputs).
|
| 167 |
#
|
| 168 |
-
#
|
| 169 |
-
#
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
# This handles the `messages` -> `conversations` rename and validates
|
| 183 |
-
# role alternation.
|
| 184 |
-
dataset = standardize_sharegpt(dataset)
|
| 185 |
-
|
| 186 |
-
logger.info("Dataset standardized for training.")
|
| 187 |
|
| 188 |
|
| 189 |
# ---------------------------------------------------------------------------
|
|
|
|
| 161 |
# ---------------------------------------------------------------------------
|
| 162 |
# 5. Format dataset for Qwen3-VL conversation template
|
| 163 |
# ---------------------------------------------------------------------------
|
| 164 |
+
# The dataset is in Qwen3-VL conversation format with a `messages` field.
|
| 165 |
+
# Each message has `role` (system/user/assistant) and `content`.
|
|
|
|
| 166 |
#
|
| 167 |
+
# We use the tokenizer's built-in chat template (loaded from the model config)
|
| 168 |
+
# to convert conversations to formatted text strings for SFTTrainer.
|
| 169 |
+
|
| 170 |
+
logger.info("Formatting conversations with tokenizer chat template...")
|
| 171 |
+
|
| 172 |
+
def format_conversations(examples):
    """Render every conversation in a batch to one formatted text string.

    Args:
        examples: A batch (mapping of column name to list of values) that
            contains a ``messages`` column, one conversation per entry.

    Returns:
        A dict with a single ``"text"`` key holding one rendered string per
        conversation, in the same order as the input batch.
    """
    # tokenize=False keeps the output as plain text; no generation prompt is
    # appended after the last message of each conversation.
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        for conversation in examples["messages"]
    ]
    return {"text": rendered}
|
| 182 |
+
|
| 183 |
+
# Apply the chat template batch-wise. All pre-existing columns are dropped so
# that only the "text" column returned by format_conversations remains.
dataset = dataset.map(
    function=format_conversations,
    remove_columns=dataset.column_names,
    batched=True,
    desc="Applying chat template",
)

logger.info("Dataset formatted: %d examples with 'text' column.", len(dataset))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
|
| 193 |
# ---------------------------------------------------------------------------
|