nathens committed on
Commit
8b28065
·
verified ·
1 Parent(s): 3634b69

Upload train_qwen_hf_jobs.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_qwen_hf_jobs.py +22 -2
train_qwen_hf_jobs.py CHANGED
@@ -10,6 +10,7 @@ Production-ready script with LoRA, Trackio monitoring, and Hub saving.
10
  from datasets import load_dataset
11
  from peft import LoraConfig
12
  from trl import SFTTrainer, SFTConfig
 
13
  import trackio
14
 
15
  # Load dataset - using the "messages" field for chat format
@@ -20,6 +21,26 @@ dataset = load_dataset("open-r1/codeforces-cots", "solutions", split="train")
20
  dataset = dataset.select(range(min(1000, len(dataset))))
21
  print(f"📊 Training on {len(dataset)} examples")
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # Create train/eval split for monitoring
24
  dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
25
 
@@ -41,8 +62,7 @@ trainer = SFTTrainer(
41
  eval_dataset=dataset_split["test"],
42
  peft_config=lora_config,
43
  args=SFTConfig(
44
- # Dataset configuration
45
- dataset_text_field="messages", # Specify the messages field for chat format
46
 
47
  # Output and Hub settings
48
  output_dir="qwen-codeforces-sft",
 
10
  from datasets import load_dataset
11
  from peft import LoraConfig
12
  from trl import SFTTrainer, SFTConfig
13
+ from transformers import AutoTokenizer
14
  import trackio
15
 
16
  # Load dataset - using the "messages" field for chat format
 
21
  dataset = dataset.select(range(min(1000, len(dataset))))
22
  print(f"📊 Training on {len(dataset)} examples")
23
 
24
+ # Load tokenizer to apply chat template
25
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", trust_remote_code=True)
26
+
27
+ # Define formatting function to convert messages to text using chat template
28
+ def formatting_func(example):
29
+ """Convert messages field to formatted text using tokenizer's chat template."""
30
+ if "messages" in example and example["messages"]:
31
+ # Apply chat template to convert messages to text
32
+ text = tokenizer.apply_chat_template(
33
+ example["messages"],
34
+ tokenize=False,
35
+ add_generation_prompt=False
36
+ )
37
+ return {"text": text}
38
+ return {"text": ""}
39
+
40
+ # Apply formatting to dataset
41
+ print("🔄 Formatting dataset with chat template...")
42
+ dataset = dataset.map(formatting_func, remove_columns=dataset.column_names)
43
+
44
  # Create train/eval split for monitoring
45
  dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
46
 
 
62
  eval_dataset=dataset_split["test"],
63
  peft_config=lora_config,
64
  args=SFTConfig(
65
+ # Dataset configuration - using default "text" field from formatting_func
 
66
 
67
  # Output and Hub settings
68
  output_dir="qwen-codeforces-sft",