sunkencity committed on
Commit
79f3f4d
·
verified ·
1 Parent(s): b5f89c4

Upload train_survival.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_survival.py +15 -18
train_survival.py CHANGED
@@ -22,11 +22,13 @@ def filter_empty(example):
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
- and len(example["instruction"]) > 0
26
- and len(example["response"]) > 0
27
  )
28
 
 
29
  dataset = dataset.filter(filter_empty)
 
30
 
31
  # Load Model
32
  bnb_config = BitsAndBytesConfig(
@@ -66,24 +68,19 @@ training_args = SFTConfig(
66
  hub_model_id=OUTPUT_MODEL_ID,
67
  fp16=True,
68
  packing=False,
69
- max_length=1024
70
- # Removed dataset_text_field="text" as it conflicted with formatting_func
71
  )
72
 
 
 
73
  def formatting_prompts_func(example):
74
- output_texts = []
75
- instructions = example['instruction']
76
- responses = example['response']
77
 
78
- for i in range(len(instructions)):
79
- if i >= len(responses): break
80
- instruction = instructions[i]
81
- response = responses[i]
82
- if not instruction or not response: continue
83
-
84
- text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
85
- output_texts.append(text)
86
- return output_texts
87
 
88
  # Trainer
89
  trainer = SFTTrainer(
@@ -92,7 +89,7 @@ trainer = SFTTrainer(
92
  peft_config=peft_config,
93
  formatting_func=formatting_prompts_func,
94
  args=training_args,
95
- processing_class=tokenizer,
96
  )
97
 
98
  print("Starting training...")
@@ -100,4 +97,4 @@ trainer.train()
100
 
101
  print("Pushing to hub...")
102
  trainer.push_to_hub()
103
- print("Done!")
 
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
+ and len(example["instruction"].strip()) > 0 # Use strip to catch whitespace-only strings
26
+ and len(example["response"].strip()) > 0
27
  )
28
 
29
+ print(f"Original dataset size: {len(dataset)}")
30
  dataset = dataset.filter(filter_empty)
31
+ print(f"Filtered dataset size: {len(dataset)}")
32
 
33
  # Load Model
34
  bnb_config = BitsAndBytesConfig(
 
68
  hub_model_id=OUTPUT_MODEL_ID,
69
  fp16=True,
70
  packing=False,
71
+ max_length=1024,
72
+ dataset_text_field="text" # Re-added this. The formatting_func will populate this 'text' field.
73
  )
74
 
75
+ # Formatting function
76
+ # This function should process a single example and return a dictionary with a 'text' key
77
  def formatting_prompts_func(example):
78
+ instruction = example['instruction']
79
+ response = example['response']
 
80
 
81
+ # Qwen/Llama chat template format
82
+ formatted_text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
83
+ return {"text": formatted_text} # Return a dictionary with the 'text' key
 
 
 
 
 
 
84
 
85
  # Trainer
86
  trainer = SFTTrainer(
 
89
  peft_config=peft_config,
90
  formatting_func=formatting_prompts_func,
91
  args=training_args,
92
+ tokenizer=tokenizer, # Pass tokenizer explicitly
93
  )
94
 
95
  print("Starting training...")
 
97
 
98
  print("Pushing to hub...")
99
  trainer.push_to_hub()
100
+ print("Done!")