Upload train_aviation.py with huggingface_hub
Browse files- train_aviation.py +10 -1
train_aviation.py
CHANGED
|
@@ -91,7 +91,16 @@ dataset = load_dataset("sakharamg/AviationQA", split="train")
|
|
| 91 |
# Limit dataset size for reasonable training time (e.g., 10k examples)
|
| 92 |
# 1M rows is too large for a single generic fine-tuning job without massive compute.
|
| 93 |
print("✂️ Subsampling dataset to 10,000 examples for efficiency...")
|
| 94 |
-
dataset = dataset.shuffle(seed=42).select(range(10000))
| 95 |
|
| 96 |
# Map to chat format
|
| 97 |
print("🔄 Mapping dataset...")
|
|
|
|
| 91 |
# Limit dataset size for reasonable training time (e.g., 10k examples)
|
| 92 |
# 1M rows is too large for a single generic fine-tuning job without massive compute.
|
| 93 |
print("✂️ Subsampling dataset to 10,000 examples for efficiency...")
|
| 94 |
+
dataset = dataset.shuffle(seed=42).select(range(12000)) # Take slightly more to account for filtering
|
| 95 |
+
|
| 96 |
+
# Filter out empty/null examples to prevent chat template errors
|
| 97 |
+
print("🧹 Filtering invalid examples...")
|
| 98 |
+
dataset = dataset.filter(lambda x: x["Question"] and x["Answer"] and len(x["Question"].strip()) > 0 and len(x["Answer"].strip()) > 0)
|
| 99 |
+
print(f" Remaining examples after filtering: {len(dataset)}")
|
| 100 |
+
|
| 101 |
+
# Limit to final count
|
| 102 |
+
if len(dataset) > 10000:
|
| 103 |
+
dataset = dataset.select(range(10000))
|
| 104 |
|
| 105 |
# Map to chat format
|
| 106 |
print("🔄 Mapping dataset...")
|