sunkencity committed on
Commit
fc4e123
·
verified ·
1 Parent(s): 79f3f4d

Upload train_survival.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_survival.py +16 -13
train_survival.py CHANGED
@@ -22,13 +22,11 @@ def filter_empty(example):
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
- and len(example["instruction"].strip()) > 0 # Use strip to catch whitespace-only strings
26
  and len(example["response"].strip()) > 0
27
  )
28
 
29
- print(f"Original dataset size: {len(dataset)}")
30
  dataset = dataset.filter(filter_empty)
31
- print(f"Filtered dataset size: {len(dataset)}")
32
 
33
  # Load Model
34
  bnb_config = BitsAndBytesConfig(
@@ -69,18 +67,23 @@ training_args = SFTConfig(
69
  fp16=True,
70
  packing=False,
71
  max_length=1024,
72
- dataset_text_field="text" # Re-added this. The formatting_func will populate this 'text' field.
73
  )
74
 
75
- # Formatting function
76
- # This function should process a single example and return a dictionary with a 'text' key
77
  def formatting_prompts_func(example):
78
- instruction = example['instruction']
79
- response = example['response']
 
80
 
81
- # Qwen/Llama chat template format
82
- formatted_text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
83
- return {"text": formatted_text} # Return a dictionary with the 'text' key
 
 
 
 
 
 
84
 
85
  # Trainer
86
  trainer = SFTTrainer(
@@ -89,7 +92,7 @@ trainer = SFTTrainer(
89
  peft_config=peft_config,
90
  formatting_func=formatting_prompts_func,
91
  args=training_args,
92
- tokenizer=tokenizer, # Pass tokenizer explicitly
93
  )
94
 
95
  print("Starting training...")
@@ -97,4 +100,4 @@ trainer.train()
97
 
98
  print("Pushing to hub...")
99
  trainer.push_to_hub()
100
- print("Done!")
 
22
  return (
23
  example["instruction"] is not None
24
  and example["response"] is not None
25
+ and len(example["instruction"].strip()) > 0
26
  and len(example["response"].strip()) > 0
27
  )
28
 
 
29
  dataset = dataset.filter(filter_empty)
 
30
 
31
  # Load Model
32
  bnb_config = BitsAndBytesConfig(
 
67
  fp16=True,
68
  packing=False,
69
  max_length=1024,
70
+ dataset_text_field="text"
71
  )
72
 
 
 
73
  def formatting_prompts_func(example):
74
+ output_texts = []
75
+ instructions = example['instruction']
76
+ responses = example['response']
77
 
78
+ for i in range(len(instructions)):
79
+ if i >= len(responses): break
80
+ instruction = instructions[i]
81
+ response = responses[i]
82
+ if not instruction or not response: continue
83
+
84
+ text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
85
+ output_texts.append(text)
86
+ return output_texts
87
 
88
  # Trainer
89
  trainer = SFTTrainer(
 
92
  peft_config=peft_config,
93
  formatting_func=formatting_prompts_func,
94
  args=training_args,
95
+ processing_class=tokenizer, # CORRECTED: Using processing_class instead of tokenizer
96
  )
97
 
98
  print("Starting training...")
 
100
 
101
  print("Pushing to hub...")
102
  trainer.push_to_hub()
103
+ print("Done!")