Upload train_aviation.py with huggingface_hub
Browse files- train_aviation.py +10 -1
train_aviation.py
CHANGED
|
@@ -91,7 +91,16 @@ dataset = load_dataset("sakharamg/AviationQA", split="train")
|
|
| 91 |
# Limit dataset size for reasonable training time (e.g., 10k examples)
|
| 92 |
# 1M rows is too large for a single generic fine-tuning job without massive compute.
|
| 93 |
print("✂️ Subsampling dataset to 10,000 examples for efficiency...")
|
| 94 |
-
dataset = dataset.shuffle(seed=42).select(range(10000))
| 95 |
|
| 96 |
# Map to chat format
|
| 97 |
print("🔄 Mapping dataset...")
|
|
|
|
| 91 |
# Limit dataset size for reasonable training time (e.g., 10k examples)
|
| 92 |
# 1M rows is too large for a single generic fine-tuning job without massive compute.
|
| 93 |
print("✂️ Subsampling dataset to 10,000 examples for efficiency...")
|
| 94 |
+
dataset = dataset.shuffle(seed=42).select(range(12000)) # Take slightly more to account for filtering
|
| 95 |
+
|
| 96 |
+
# Filter out empty/null examples to prevent chat template errors
|
| 97 |
+
print("🧹 Filtering invalid examples...")
|
| 98 |
+
dataset = dataset.filter(lambda x: x["Question"] and x["Answer"] and len(x["Question"].strip()) > 0 and len(x["Answer"].strip()) > 0)
|
| 99 |
+
print(f" Remaining examples after filtering: {len(dataset)}")
|
| 100 |
+
|
| 101 |
+
# Limit to final count
|
| 102 |
+
if len(dataset) > 10000:
|
| 103 |
+
dataset = dataset.select(range(10000))
|
| 104 |
|
| 105 |
# Map to chat format
|
| 106 |
print("🔄 Mapping dataset...")
|