#!/usr/bin/env python3
"""
Merge BioRLHF training data with ecosystem failure examples.
This script:
1. Loads existing kmp_sft_final.json training data
2. Loads ecosystem_failures_training.json (failure-based examples)
3. Converts failure examples to the same format
4. Outputs combined_training.json
Usage:
python scripts/merge_training_data.py
"""
import json
from pathlib import Path
from datetime import datetime
def load_json(filepath: str) -> dict | list:
"""Load JSON file."""
with open(filepath, 'r') as f:
return json.load(f)
def save_json(data: list, filepath: str):
"""Save JSON file."""
with open(filepath, 'w') as f:
json.dump(data, f, indent=2)
print(f"Saved {len(data)} examples to {filepath}")
def convert_conversation_to_text(conversation: list) -> str:
    """Render a chat-style conversation as an instruction/response prompt.

    Input turns look like ``{"role": "user"|"assistant", "content": ...}``.
    The user turn becomes the instruction and the assistant turn the
    response. If a role appears more than once the last occurrence wins;
    a missing role yields an empty section. Other roles are ignored.

    Returns:
        ``"### Instruction:\\n...\\n\\n### Response:\\n..."``
    """
    # Map each recognized role to its most recent content.
    content_by_role = {"user": "", "assistant": ""}
    for message in conversation:
        role = message["role"]
        if role in content_by_role:
            content_by_role[role] = message["content"]
    return (
        "### Instruction:\n"
        + content_by_role["user"]
        + "\n\n### Response:\n"
        + content_by_role["assistant"]
    )
def extract_examples_from_failures(failure_data: dict) -> list:
    """Flatten every failure-example category into training records.

    Each output record has:
      - ``"text"``:   instruction/response text built from the conversation
      - ``"source"``: ``"ecosystem_failures:<type>"`` provenance tag
      - ``"id"``:     the original example id

    Parameters:
        failure_data: Parsed ``ecosystem_failures_training.json``.
            Categories that are absent are skipped silently.

    Returns:
        A flat list of ``{"text", "source", "id"}`` dicts across all
        categories, in category order.
    """
    # All four categories share an identical record layout, so iterate
    # over the key names instead of repeating the conversion block
    # once per category (the original had four copy-pasted loops).
    categories = (
        "calibration_examples",
        "adversarial_resistance_examples",
        "completeness_examples",
        "fact_drilling_examples",
    )
    examples = []
    for category in categories:
        for ex in failure_data.get(category, []):
            examples.append({
                "text": convert_conversation_to_text(ex["conversations"]),
                "source": f"ecosystem_failures:{ex['type']}",
                "id": ex["id"],
            })
    return examples
def main():
    """Merge the original SFT dataset with failure-derived examples.

    Loads ``kmp_sft_final.json`` and ``ecosystem_failures_training.json``
    from the repository's ``data`` directory, converts the failure
    examples into the "### Instruction / ### Response" text format, tags
    every record with a ``source`` field, and writes the combined list to
    ``combined_training.json``.
    """
    # Paths resolve relative to this script's location: scripts/../data
    data_dir = Path(__file__).parent.parent / "data"
    existing_path = data_dir / "kmp_sft_final.json"
    failures_path = data_dir / "ecosystem_failures_training.json"
    output_path = data_dir / "combined_training.json"

    print("=" * 60)
    print("BioRLHF Training Data Merger")
    print("=" * 60)

    # Load existing data.
    # NOTE(review): the emoji glyphs in this function were mojibake-damaged
    # in the source; the glyphs below are plausible reconstructions, not
    # guaranteed byte-exact originals.
    print(f"\n📂 Loading existing data from {existing_path}")
    existing_data = load_json(existing_path)
    print(f" Found {len(existing_data)} existing examples")

    # Load failure-based examples
    print(f"\n📂 Loading failure examples from {failures_path}")
    failure_data = load_json(failures_path)

    # Convert failure examples into flat {"text", "source", "id"} records
    print("\n🔄 Converting failure examples to training format...")
    new_examples = extract_examples_from_failures(failure_data)
    print(f" Converted {len(new_examples)} examples")

    # Show a per-type breakdown; the type is the part of "source" after ":"
    print("\n📊 New examples by type:")
    type_counts = {}
    for ex in new_examples:
        source_type = ex["source"].split(":")[1] if ":" in ex["source"] else ex["source"]
        type_counts[source_type] = type_counts.get(source_type, 0) + 1
    for t, c in sorted(type_counts.items()):
        print(f" - {t}: {c}")

    # Combine data
    print("\n🔄 Merging datasets...")
    # Tag pre-existing examples so provenance survives the merge.
    for ex in existing_data:
        if "source" not in ex:
            ex["source"] = "kmp_sft_original"
    combined = existing_data + new_examples
    print(f" Total examples: {len(combined)}")

    # Save combined data
    print(f"\n💾 Saving to {output_path}")
    save_json(combined, output_path)

    # Summary
    print("\n" + "=" * 60)
    # FIX: the source contained an unterminated string literal here — the
    # checkmark emoji's bytes were corrupted and split across two lines,
    # which made the whole file a syntax error.
    print("✅ MERGE COMPLETE")
    print("=" * 60)
    print(f" Original examples: {len(existing_data)}")
    print(f" New examples: {len(new_examples)}")
    print(f" Total combined: {len(combined)}")
    print(f"\n Output: {output_path}")
    print("\nNext step: Run training with combined data:")
    print(" python sft_train_v2.py --dataset data/combined_training.json")


if __name__ == "__main__":
    main()