Qrverse
/

qr-verse-ai-lora

@@ -161,29 +161,33 @@ if len(dataset) > 0:
 # ---------------------------------------------------------------------------
 # 5. Format dataset for Qwen3-VL conversation template
 # ---------------------------------------------------------------------------
-# The dataset is already in Qwen3-VL conversation format with a `messages`
-# field. Each message has `role` (system/user/assistant) and `content`
-# (either a string or a list of typed content blocks for vision inputs).
 #
-# Unsloth's FastVisionModel + SFTTrainer handle the chat template
-# application automatically when we pass the dataset with the `messages`
-# column and set `dataset_text_field="messages"`.
-from unsloth.chat_templates import standardize_sharegpt, get_chat_template
-# Apply Qwen3-VL chat template to the tokenizer so SFTTrainer can
-# format conversations correctly during collation.
-tokenizer = get_chat_template(
-    tokenizer,
-    chat_template="qwen2-vl",  # Qwen3-VL uses the same template family as Qwen2-VL
 )
-# Standardize the dataset to ShareGPT format expected by Unsloth.
-# This handles the `messages` -> `conversations` rename and validates
-# role alternation.
-dataset = standardize_sharegpt(dataset)
-logger.info("Dataset standardized for training.")
 # ---------------------------------------------------------------------------

 # ---------------------------------------------------------------------------
 # 5. Format dataset for Qwen3-VL conversation template
 # ---------------------------------------------------------------------------
+# The dataset is in Qwen3-VL conversation format with a `messages` field.
+# Each message has `role` (system/user/assistant) and `content`.
 #
+# We use the tokenizer's built-in chat template (loaded from the model config)
+# to convert conversations to formatted text strings for SFTTrainer.
+logger.info("Formatting conversations with tokenizer chat template...")
+def format_conversations(examples):
+    """Render each conversation in a batch to one chat-template string.
+
+    Args:
+        examples: Batch mapping whose "messages" entry is iterated as a
+            sequence of conversations.
+
+    Returns:
+        A dict with a single key, "text", holding one rendered string per
+        conversation, in input order.
+    """
+    # tokenize=False asks for formatted text; add_generation_prompt=False
+    # leaves the conversation without a trailing generation prompt.
+    rendered = [
+        tokenizer.apply_chat_template(
+            conversation,
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+        for conversation in examples["messages"]
+    ]
+    return {"text": rendered}
+# Apply format_conversations over the dataset in batches and drop every
+# original column, so only the "text" column it returns remains.
+# NOTE(review): assumes `dataset` is a Hugging Face `datasets.Dataset` — confirm.
+dataset = dataset.map(
+    format_conversations,
+    batched=True,
+    remove_columns=dataset.column_names,
+    desc="Applying chat template",
 )
+logger.info("Dataset formatted: %d examples with 'text' column.", len(dataset))
 # ---------------------------------------------------------------------------