sunkencity committed on
Commit
6f6fc96
·
verified ·
1 Parent(s): afbbcb3

Upload train_aviation.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_aviation.py +10 -1
train_aviation.py CHANGED
@@ -91,7 +91,16 @@ dataset = load_dataset("sakharamg/AviationQA", split="train")
91
  # Limit dataset size for reasonable training time (e.g., 10k examples)
92
  # 1M rows is too large for a single generic fine-tuning job without massive compute.
93
  print("✂️ Subsampling dataset to 10,000 examples for efficiency...")
94
- dataset = dataset.shuffle(seed=42).select(range(10000))
 
 
 
 
 
 
 
 
 
95
 
96
  # Map to chat format
97
  print("🔄 Mapping dataset...")
 
91
  # Limit dataset size for reasonable training time (e.g., 10k examples)
92
  # 1M rows is too large for a single generic fine-tuning job without massive compute.
93
  print("✂️ Subsampling dataset to 10,000 examples for efficiency...")
94
+ dataset = dataset.shuffle(seed=42).select(range(12000)) # Take slightly more to account for filtering
95
+
96
+ # Filter out empty/null examples to prevent chat template errors
97
+ print("🧹 Filtering invalid examples...")
98
+ dataset = dataset.filter(lambda x: x["Question"] and x["Answer"] and len(x["Question"].strip()) > 0 and len(x["Answer"].strip()) > 0)
99
+ print(f" Remaining examples after filtering: {len(dataset)}")
100
+
101
+ # Limit to final count
102
+ if len(dataset) > 10000:
103
+     dataset = dataset.select(range(10000))
104
 
105
  # Map to chat format
106
  print("🔄 Mapping dataset...")