albertlieadrian committed on
Commit
66abccc
·
verified ·
1 Parent(s): 2d67f40

Upload train_qwen3_codeforces.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_qwen3_codeforces.py +20 -2
train_qwen3_codeforces.py CHANGED
@@ -22,8 +22,25 @@ full_dataset = load_dataset("open-r1/codeforces-cots", split="train")
22
  dataset = full_dataset.select(range(20))
23
  print(f"✅ Dataset loaded: {len(dataset)} examples")
24
 
25
- # Use messages column directly - TRL SFT supports this format
26
- # No need for train/eval split for quick demo - use full dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # Training configuration
29
  config = SFTConfig(
@@ -70,6 +87,7 @@ print("🎯 Initializing trainer...")
70
  trainer = SFTTrainer(
71
  model="Qwen/Qwen3-0.6B",
72
  train_dataset=dataset,
 
73
  args=config,
74
  peft_config=peft_config,
75
  )
 
22
  dataset = full_dataset.select(range(20))
23
  print(f"✅ Dataset loaded: {len(dataset)} examples")
24
 
25
+ # Format the dataset - convert messages to text format for SFT
26
def format_for_sft(example):
    """Flatten a chat-style example into one plain-text training string.

    Args:
        example: Mapping with an optional ``"messages"`` entry holding a
            list of dicts that carry ``"role"`` and ``"content"`` keys.

    Returns:
        A dict ``{"text": ...}``. Every system/user/assistant message is
        rendered as ``"<Label>: <content>"`` followed by a blank line;
        messages with any other (or missing) role are skipped. Leading and
        trailing whitespace of the combined text is stripped.
    """
    role_labels = {
        "system": "System",
        "user": "User",
        "assistant": "Assistant",
    }

    # `or []` also covers a present-but-None "messages" value, which
    # `.get("messages", [])` would hand back as None and fail to iterate.
    messages = example.get("messages") or []

    parts = []
    for msg in messages:
        label = role_labels.get(msg.get("role", "unknown"))
        if label is None:
            # Unrecognised roles contribute nothing to the text.
            continue
        content = msg.get("content", "")
        if content is None:
            # Avoid writing the literal string "None" into training data.
            content = ""
        parts.append(f"{label}: {content}\n\n")

    # Join once instead of growing a string with repeated `+=`.
    return {"text": "".join(parts).strip()}
40
+
41
+ print("🔄 Formatting dataset...")
42
+ dataset = dataset.map(format_for_sft, remove_columns=dataset.column_names)
43
+ print(f" Formatted to text: {dataset[0]['text'][:200]}...")
44
 
45
  # Training configuration
46
  config = SFTConfig(
 
87
  trainer = SFTTrainer(
88
  model="Qwen/Qwen3-0.6B",
89
  train_dataset=dataset,
90
+ dataset_text_field="text",
91
  args=config,
92
  peft_config=peft_config,
93
  )