sunkencity committed on
Commit
fc4e123
·
verified ·
1 Parent(s): 79f3f4d

Upload train_survival.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_survival.py +16 -13
train_survival.py CHANGED
@@ -22,13 +22,11 @@ def filter_empty(example):
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
- and len(example["instruction"].strip()) > 0 # Use strip to catch whitespace-only strings
26
  and len(example["response"].strip()) > 0
27
  )
28
 
29
- print(f"Original dataset size: {len(dataset)}")
30
  dataset = dataset.filter(filter_empty)
31
- print(f"Filtered dataset size: {len(dataset)}")
32
 
33
  # Load Model
34
  bnb_config = BitsAndBytesConfig(
@@ -69,18 +67,23 @@ training_args = SFTConfig(
69
  fp16=True,
70
  packing=False,
71
  max_length=1024,
72
- dataset_text_field="text" # Re-added this. The formatting_func will populate this 'text' field.
73
  )
74
 
75
- # Formatting function
76
- # This function should process a single example and return a dictionary with a 'text' key
77
  def formatting_prompts_func(example):
78
- instruction = example['instruction']
79
- response = example['response']
 
80
 
81
- # Qwen/Llama chat template format
82
- formatted_text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
83
- return {"text": formatted_text} # Return a dictionary with the 'text' key
 
 
 
 
 
 
84
 
85
  # Trainer
86
  trainer = SFTTrainer(
@@ -89,7 +92,7 @@ trainer = SFTTrainer(
89
  peft_config=peft_config,
90
  formatting_func=formatting_prompts_func,
91
  args=training_args,
92
- tokenizer=tokenizer, # Pass tokenizer explicitly
93
  )
94
 
95
  print("Starting training...")
@@ -97,4 +100,4 @@ trainer.train()
97
 
98
  print("Pushing to hub...")
99
  trainer.push_to_hub()
100
- print("Done!")
 
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
+ and len(example["instruction"].strip()) > 0
26
  and len(example["response"].strip()) > 0
27
  )
28
 
 
29
  dataset = dataset.filter(filter_empty)
 
30
 
31
  # Load Model
32
  bnb_config = BitsAndBytesConfig(
 
67
  fp16=True,
68
  packing=False,
69
  max_length=1024,
70
+ dataset_text_field="text"
71
  )
72
 
 
 
73
  def formatting_prompts_func(example):
74
+ output_texts = []
75
+ instructions = example['instruction']
76
+ responses = example['response']
77
 
78
+ for i in range(len(instructions)):
79
+ if i >= len(responses): break
80
+ instruction = instructions[i]
81
+ response = responses[i]
82
+ if not instruction or not response: continue
83
+
84
+ text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
85
+ output_texts.append(text)
86
+ return output_texts
87
 
88
  # Trainer
89
  trainer = SFTTrainer(
 
92
  peft_config=peft_config,
93
  formatting_func=formatting_prompts_func,
94
  args=training_args,
95
+ processing_class=tokenizer, # CORRECTED: Using processing_class instead of tokenizer
96
  )
97
 
98
  print("Starting training...")
 
100
 
101
  print("Pushing to hub...")
102
  trainer.push_to_hub()
103
+ print("Done!")