sunkencity
/

training-scripts

sunkencity committed on Jan 17

Commit

689d2ea

verified ·

1 Parent(s): 69fb596

Upload train_survival.py with huggingface_hub

Files changed (1) hide show

train_survival.py CHANGED Viewed

@@ -17,6 +17,20 @@ OUTPUT_MODEL_ID = "sunkencity/survival-expert-3b"
 # Load Dataset
 dataset = load_dataset(DATASET_ID, split="train")
 # Load Model
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
@@ -56,14 +70,25 @@ training_args = SFTConfig(
     fp16=True,
     dataset_text_field="text",
     packing=False,
-    max_length=1024 # Correct parameter name for SFTConfig
 )
 def formatting_prompts_func(example):
     output_texts = []
-    for i in range(len(example['instruction'])):
-        instruction = example['instruction'][i]
-        response = example['response'][i]
         text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
         output_texts.append(text)
     return output_texts
@@ -83,4 +108,4 @@ trainer.train()
 print("Pushing to hub...")
 trainer.push_to_hub()
-print("Done!")

 # Load Dataset
 dataset = load_dataset(DATASET_ID, split="train")
+# SANITIZE DATASET
+# Filter out any rows that have None or empty strings
+def filter_empty(example):  # row predicate: True only when both text fields are non-None and non-empty
+    return (
+        example["instruction"] is not None
+        and example["response"] is not None  # None checks come first so len() below is never called on None
+        and len(example["instruction"]) > 0
+        and len(example["response"]) > 0  # length check only: whitespace-only strings still pass
+    )
+print(f"Original dataset size: {len(dataset)}")
+dataset = dataset.filter(filter_empty)
+print(f"Filtered dataset size: {len(dataset)}")
 # Load Model
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     fp16=True,
     dataset_text_field="text",
     packing=False,
+    max_length=1024
 )
 def formatting_prompts_func(example):  # builds one prompt string per usable (instruction, response) pair
     output_texts = []
+    # Ensure we handle list input (batched)
+    instructions = example['instruction']
+    responses = example['response']
+    for i in range(len(instructions)):
+        if i >= len(responses):
+            break # Should not happen after filtering, but safety first
+        instruction = instructions[i]
+        response = responses[i]
+        if not instruction or not response:
+            continue  # falsy pair is dropped, so the result can be shorter than the input batch
         text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"  # user turn then assistant turn, each wrapped in <|im_start|>/<|im_end|>
         output_texts.append(text)
     return output_texts  # list of formatted strings, in input order
 print("Pushing to hub...")
 trainer.push_to_hub()
+print("Done!")