epinfomax committed on
Commit
e2c3e92
·
verified ·
1 Parent(s): e3473e4

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +37 -12
train.py CHANGED
@@ -8,29 +8,51 @@ from trl import SFTTrainer, SFTConfig
8
  from transformers import AutoTokenizer
9
  import trackio
10
  import os
 
11
 
12
- print("๐Ÿš€ Starting FunctionGemma 270M Fine-tuning (V5 - Final)")
13
 
14
  model_id = "google/functiongemma-270m-it"
15
  tokenizer = AutoTokenizer.from_pretrained(model_id)
16
 
 
 
 
 
17
  # Load dataset
18
  dataset = load_dataset("epinfomax/vn-function-calling-dataset", split="train")
19
 
20
  def format_conversation(example):
21
- # Pre-render the conversation using the model's chat template
22
- text = tokenizer.apply_chat_template(
 
 
 
 
23
  example["messages"],
24
  tools=example["tools"],
25
  tokenize=False,
26
  add_generation_prompt=False
27
  )
28
- return {"text": text}
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- print("๐Ÿ”„ Pre-processing dataset with chat template...")
31
  dataset = dataset.map(format_conversation, remove_columns=dataset.column_names)
32
 
33
- # LoRA configuration - Define early to avoid NameError
34
  peft_config = LoraConfig(
35
  r=16,
36
  lora_alpha=32,
@@ -38,28 +60,31 @@ peft_config = LoraConfig(
38
  task_type="CAUSAL_LM",
39
  )
40
 
41
- # Training configuration (TRL 0.26.2 style)
42
  config = SFTConfig(
43
- dataset_text_field="text",
44
- max_length=1024, # Confirmed correct for TRL 0.26.2
45
  output_dir="vn-function-gemma-270m-finetuned",
 
46
  push_to_hub=True,
47
  hub_model_id="epinfomax/vn-function-gemma-270m-finetuned",
48
  hub_strategy="every_save",
49
  num_train_epochs=5,
50
  per_device_train_batch_size=4,
51
  gradient_accumulation_steps=4,
52
- learning_rate=5e-5,
 
 
53
  logging_steps=5,
54
  save_strategy="steps",
55
  save_steps=50,
56
  report_to="trackio",
57
  project="vn-function-calling",
58
- run_name="function-gemma-270m-final"
 
 
59
  )
60
 
61
  # Initialize and train
62
- print("๐ŸŽฏ Initializing SFTTrainer...")
63
  trainer = SFTTrainer(
64
  model=model_id,
65
  train_dataset=dataset,
 
8
from transformers import AutoTokenizer
import trackio
import os
import json

# NOTE(review): restored the rocket emoji — the committed string was mojibake
# (UTF-8 emoji bytes decoded through a Thai codepage).
print("🚀 Starting FunctionGemma 270M Fine-tuning (V6 - Optimized with Sample Best Practices)")

model_id = "google/functiongemma-270m-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Ensure pad token is set: some checkpoints ship without one, and the trainer's
# padding collator requires it. Reusing EOS is the standard fallback.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load dataset (single "train" split; columns are consumed by
# format_conversation below and then dropped).
dataset = load_dataset("epinfomax/vn-function-calling-dataset", split="train")
25
def format_conversation(example):
    """Split one dataset example into prompt/completion strings for TRL.

    Follows the sample notebook's prompt/completion layout so that
    SFTTrainer's ``completion_only_loss`` computes loss only on the
    assistant's final turn.

    Args:
        example: A dataset row with ``messages`` (chat turns, assumed to end
            with the assistant reply — TODO confirm against the dataset) and
            ``tools`` (tool schemas passed to the chat template).

    Returns:
        dict with ``prompt`` (everything up to and including the generation
        header) and ``completion`` (the final assistant turn's rendered text).

    Raises:
        ValueError: if the prompt render is not a literal prefix of the full
            render, in which case length-based slicing would yield garbage.
    """
    # Full conversation, all turns rendered, no generation header appended.
    full_text = tokenizer.apply_chat_template(
        example["messages"],
        tools=example["tools"],
        tokenize=False,
        add_generation_prompt=False,
    )

    # Everything except the last (assistant) message, ending with the
    # generation header ('model' turn opener) so the completion starts
    # immediately after it.
    prompt_text = tokenizer.apply_chat_template(
        example["messages"][:-1],
        tools=example["tools"],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Guard the slicing assumption instead of silently emitting a corrupt
    # completion: len-based splitting is only valid for a true prefix.
    if not full_text.startswith(prompt_text):
        raise ValueError(
            "Chat-template prompt is not a prefix of the full conversation; "
            "cannot split into prompt/completion."
        )

    return {
        "prompt": prompt_text,
        "completion": full_text[len(prompt_text):],
    }
51
 
52
# NOTE(review): restored the emoji — the committed string was mojibake.
print("🔄 Pre-processing dataset with prompt/completion split...")
# Drop the raw columns so only "prompt"/"completion" reach the trainer.
dataset = dataset.map(format_conversation, remove_columns=dataset.column_names)
54
 
55
+ # LoRA configuration
56
  peft_config = LoraConfig(
57
  r=16,
58
  lora_alpha=32,
 
60
  task_type="CAUSAL_LM",
61
  )
62
 
63
# Training configuration (Optimized with Sample Best Practices)
config = SFTConfig(
    # Output location and Hub publishing
    output_dir="vn-function-gemma-270m-finetuned",
    push_to_hub=True,
    hub_model_id="epinfomax/vn-function-gemma-270m-finetuned",
    hub_strategy="every_save",
    # Sequence handling: no packing, loss restricted to assistant completions
    max_length=1024,
    packing=False,
    completion_only_loss=True,
    # Optimization schedule (values taken from the sample notebook)
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    # Checkpointing and experiment tracking (trackio)
    logging_steps=5,
    save_strategy="steps",
    save_steps=50,
    report_to="trackio",
    project="vn-function-calling",
    run_name="function-gemma-270m-v6-optimized",
)
85
 
86
  # Initialize and train
87
+ print("๐ŸŽฏ Initializing SFTTrainer with optimized configuration...")
88
  trainer = SFTTrainer(
89
  model=model_id,
90
  train_dataset=dataset,