sunkencity committed on
Commit
b5f89c4
·
verified ·
1 Parent(s): 689d2ea

Upload train_survival.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_survival.py +4 -12
train_survival.py CHANGED
@@ -18,7 +18,6 @@ OUTPUT_MODEL_ID = "sunkencity/survival-expert-3b"
18
  dataset = load_dataset(DATASET_ID, split="train")
19
 
20
  # SANITIZE DATASET
21
- # Filter out any rows that have None or empty strings
22
  def filter_empty(example):
23
  return (
24
  example["instruction"] is not None
@@ -27,9 +26,7 @@ def filter_empty(example):
27
  and len(example["response"]) > 0
28
  )
29
 
30
- print(f"Original dataset size: {len(dataset)}")
31
  dataset = dataset.filter(filter_empty)
32
- print(f"Filtered dataset size: {len(dataset)}")
33
 
34
  # Load Model
35
  bnb_config = BitsAndBytesConfig(
@@ -68,26 +65,21 @@ training_args = SFTConfig(
68
  push_to_hub=True,
69
  hub_model_id=OUTPUT_MODEL_ID,
70
  fp16=True,
71
- dataset_text_field="text",
72
  packing=False,
73
  max_length=1024
 
74
  )
75
 
76
  def formatting_prompts_func(example):
77
  output_texts = []
78
- # Ensure we handle list input (batched)
79
  instructions = example['instruction']
80
  responses = example['response']
81
 
82
  for i in range(len(instructions)):
83
- if i >= len(responses):
84
- break # Should not happen after filtering, but safety first
85
-
86
  instruction = instructions[i]
87
  response = responses[i]
88
-
89
- if not instruction or not response:
90
- continue
91
 
92
  text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
93
  output_texts.append(text)
@@ -108,4 +100,4 @@ trainer.train()
108
 
109
  print("Pushing to hub...")
110
  trainer.push_to_hub()
111
- print("Done!")
 
18
  dataset = load_dataset(DATASET_ID, split="train")
19
 
20
  # SANITIZE DATASET
 
21
  def filter_empty(example):
22
  return (
23
  example["instruction"] is not None
 
26
  and len(example["response"]) > 0
27
  )
28
 
 
29
  dataset = dataset.filter(filter_empty)
 
30
 
31
  # Load Model
32
  bnb_config = BitsAndBytesConfig(
 
65
  push_to_hub=True,
66
  hub_model_id=OUTPUT_MODEL_ID,
67
  fp16=True,
 
68
  packing=False,
69
  max_length=1024
70
+ # Removed dataset_text_field="text" as it conflicted with formatting_func
71
  )
72
 
73
  def formatting_prompts_func(example):
74
  output_texts = []
 
75
  instructions = example['instruction']
76
  responses = example['response']
77
 
78
  for i in range(len(instructions)):
79
+ if i >= len(responses): break
 
 
80
  instruction = instructions[i]
81
  response = responses[i]
82
+ if not instruction or not response: continue
 
 
83
 
84
  text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
85
  output_texts.append(text)
 
100
 
101
  print("Pushing to hub...")
102
  trainer.push_to_hub()
103
+ print("Done!")