sunkencity committed on
Commit
79f3f4d
·
verified ·
1 Parent(s): b5f89c4

Upload train_survival.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_survival.py +15 -18
train_survival.py CHANGED
@@ -22,11 +22,13 @@ def filter_empty(example):
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
- and len(example["instruction"]) > 0
26
- and len(example["response"]) > 0
27
  )
28
 
 
29
  dataset = dataset.filter(filter_empty)
 
30
 
31
  # Load Model
32
  bnb_config = BitsAndBytesConfig(
@@ -66,24 +68,19 @@ training_args = SFTConfig(
66
  hub_model_id=OUTPUT_MODEL_ID,
67
  fp16=True,
68
  packing=False,
69
- max_length=1024
70
- # Removed dataset_text_field="text" as it conflicted with formatting_func
71
  )
72
 
 
 
73
  def formatting_prompts_func(example):
74
- output_texts = []
75
- instructions = example['instruction']
76
- responses = example['response']
77
 
78
- for i in range(len(instructions)):
79
- if i >= len(responses): break
80
- instruction = instructions[i]
81
- response = responses[i]
82
- if not instruction or not response: continue
83
-
84
- text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
85
- output_texts.append(text)
86
- return output_texts
87
 
88
  # Trainer
89
  trainer = SFTTrainer(
@@ -92,7 +89,7 @@ trainer = SFTTrainer(
92
  peft_config=peft_config,
93
  formatting_func=formatting_prompts_func,
94
  args=training_args,
95
- processing_class=tokenizer,
96
  )
97
 
98
  print("Starting training...")
@@ -100,4 +97,4 @@ trainer.train()
100
 
101
  print("Pushing to hub...")
102
  trainer.push_to_hub()
103
- print("Done!")
 
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
+ and len(example["instruction"].strip()) > 0 # Use strip to catch whitespace-only strings
26
+ and len(example["response"].strip()) > 0
27
  )
28
 
29
+ print(f"Original dataset size: {len(dataset)}")
30
  dataset = dataset.filter(filter_empty)
31
+ print(f"Filtered dataset size: {len(dataset)}")
32
 
33
  # Load Model
34
  bnb_config = BitsAndBytesConfig(
 
68
  hub_model_id=OUTPUT_MODEL_ID,
69
  fp16=True,
70
  packing=False,
71
+ max_length=1024,
72
+ dataset_text_field="text" # Re-added this. The formatting_func will populate this 'text' field.
73
  )
74
 
75
+ # Formatting function
76
+ # This function should process a single example and return a dictionary with a 'text' key
77
  def formatting_prompts_func(example):
78
+ instruction = example['instruction']
79
+ response = example['response']
 
80
 
81
+ # Qwen/Llama chat template format
82
+ formatted_text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
83
+ return {"text": formatted_text} # Return a dictionary with the 'text' key
 
 
 
 
 
 
84
 
85
  # Trainer
86
  trainer = SFTTrainer(
 
89
  peft_config=peft_config,
90
  formatting_func=formatting_prompts_func,
91
  args=training_args,
92
+ tokenizer=tokenizer, # Pass tokenizer explicitly
93
  )
94
 
95
  print("Starting training...")
 
97
 
98
  print("Pushing to hub...")
99
  trainer.push_to_hub()
100
+ print("Done!")