papebaba committed on
Commit
cb94b3f
·
verified ·
1 Parent(s): 8ef6f5a

Upload train_qwen_codeforces.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_qwen_codeforces.py +12 -8
train_qwen_codeforces.py CHANGED
@@ -15,24 +15,28 @@ dataset = load_dataset(
15
  "solutions_w_editorials_py_decontaminated",
16
  split="train[:1000]"
17
  )
18
- print(f"📊 Training on {len(dataset)} examples for 3 epochs")
19
 
20
  # Load tokenizer to get chat template
21
  print("🔀 Loading tokenizer...")
22
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
23
 
24
- # Define formatting function for messages
25
- def formatting_func(example):
 
26
  """Convert messages format to text using chat template."""
27
  if "messages" in example and example["messages"]:
28
- # Use the tokenizer's chat template to format messages
29
  text = tokenizer.apply_chat_template(
30
  example["messages"],
31
  tokenize=False,
32
  add_generation_prompt=False
33
  )
34
- return text
35
- return ""
 
 
 
 
36
 
37
  # LoRA configuration for efficient training
38
  peft_config = LoraConfig(
@@ -78,14 +82,14 @@ config = SFTConfig(
78
  run_name="qwen-codeforces-sft-1k",
79
  )
80
 
81
- # Initialize trainer
82
  print("🎯 Initializing trainer...")
83
  trainer = SFTTrainer(
84
  model="Qwen/Qwen2.5-0.5B",
85
  train_dataset=dataset,
86
  args=config,
87
  peft_config=peft_config,
88
- formatting_func=formatting_func, # Use formatting function for messages
89
  )
90
 
91
  # Train
 
15
  "solutions_w_editorials_py_decontaminated",
16
  split="train[:1000]"
17
  )
18
+ print(f"📊 Loaded {len(dataset)} examples")
19
 
20
  # Load tokenizer to get chat template
21
  print("🔀 Loading tokenizer...")
22
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
23
 
24
+ # Pre-process dataset - convert messages to text format
25
+ print("🔄 Converting messages to text format...")
26
+ def convert_messages_to_text(example):
27
  """Convert messages format to text using chat template."""
28
  if "messages" in example and example["messages"]:
 
29
  text = tokenizer.apply_chat_template(
30
  example["messages"],
31
  tokenize=False,
32
  add_generation_prompt=False
33
  )
34
+ return {"text": text}
35
+ return {"text": ""}
36
+
37
+ # Apply the conversion
38
+ dataset = dataset.map(convert_messages_to_text, remove_columns=dataset.column_names)
39
+ print(f"✅ Dataset preprocessed - training on {len(dataset)} examples for 3 epochs")
40
 
41
  # LoRA configuration for efficient training
42
  peft_config = LoraConfig(
 
82
  run_name="qwen-codeforces-sft-1k",
83
  )
84
 
85
+ # Initialize trainer with preprocessed dataset
86
  print("🎯 Initializing trainer...")
87
  trainer = SFTTrainer(
88
  model="Qwen/Qwen2.5-0.5B",
89
  train_dataset=dataset,
90
  args=config,
91
  peft_config=peft_config,
92
+ dataset_text_field="text", # Use the text field we created
93
  )
94
 
95
  # Train