Upload train_qwen_hf_jobs.py with huggingface_hub
Browse files- train_qwen_hf_jobs.py +22 -2
train_qwen_hf_jobs.py
CHANGED
|
@@ -10,6 +10,7 @@ Production-ready script with LoRA, Trackio monitoring, and Hub saving.
|
|
| 10 |
from datasets import load_dataset
|
| 11 |
from peft import LoraConfig
|
| 12 |
from trl import SFTTrainer, SFTConfig
|
|
|
|
| 13 |
import trackio
|
| 14 |
|
| 15 |
# Load dataset - using the "messages" field for chat format
|
|
@@ -20,6 +21,26 @@ dataset = load_dataset("open-r1/codeforces-cots", "solutions", split="train")
|
|
| 20 |
dataset = dataset.select(range(min(1000, len(dataset))))
|
| 21 |
print(f"📊 Training on {len(dataset)} examples")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Create train/eval split for monitoring
|
| 24 |
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
|
| 25 |
|
|
@@ -41,8 +62,7 @@ trainer = SFTTrainer(
|
|
| 41 |
eval_dataset=dataset_split["test"],
|
| 42 |
peft_config=lora_config,
|
| 43 |
args=SFTConfig(
|
| 44 |
-
# Dataset configuration
|
| 45 |
-
dataset_text_field="messages", # Specify the messages field for chat format
|
| 46 |
|
| 47 |
# Output and Hub settings
|
| 48 |
output_dir="qwen-codeforces-sft",
|
|
|
|
| 10 |
from datasets import load_dataset
|
| 11 |
from peft import LoraConfig
|
| 12 |
from trl import SFTTrainer, SFTConfig
|
| 13 |
+
from transformers import AutoTokenizer
|
| 14 |
import trackio
|
| 15 |
|
| 16 |
# Load dataset - using the "messages" field for chat format
|
|
|
|
| 21 |
dataset = dataset.select(range(min(1000, len(dataset))))
|
| 22 |
print(f"📊 Training on {len(dataset)} examples")
|
| 23 |
|
| 24 |
+
# Load the tokenizer so chat-format examples can be rendered with its template.
# NOTE(review): trust_remote_code=True executes model-repo code — assumed
# intentional for Qwen; confirm the pinned revision if this runs unattended.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", trust_remote_code=True)


def formatting_func(example):
    """Render one example's chat messages into a single training string.

    Applies the tokenizer's chat template (untokenized, no trailing
    generation prompt) to the "messages" field. Examples with a missing
    or empty "messages" field map to {"text": ""} so every row still
    carries a "text" column after dataset.map().
    """
    # Guard clause: no usable messages -> empty text, same shape as success path.
    if not ("messages" in example and example["messages"]):
        return {"text": ""}
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}


# Replace the raw columns with a single rendered "text" column for SFT.
print("🔄 Formatting dataset with chat template...")
dataset = dataset.map(formatting_func, remove_columns=dataset.column_names)
|
| 43 |
+
|
| 44 |
# Create train/eval split for monitoring
|
| 45 |
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
|
| 46 |
|
|
|
|
| 62 |
eval_dataset=dataset_split["test"],
|
| 63 |
peft_config=lora_config,
|
| 64 |
args=SFTConfig(
|
| 65 |
+
# Dataset configuration - using default "text" field from formatting_func
|
|
|
|
| 66 |
|
| 67 |
# Output and Hub settings
|
| 68 |
output_dir="qwen-codeforces-sft",
|