File size: 4,950 Bytes
bff2f94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
"""
Merge BioRLHF training data with ecosystem failure examples.

This script:
1. Loads existing kmp_sft_final.json training data
2. Loads ecosystem_failures_training.json (failure-based examples)
3. Converts failure examples to the same format
4. Outputs combined_training.json

Usage:
    python scripts/merge_training_data.py
"""

import json
from pathlib import Path
from datetime import datetime


def load_json(filepath: str) -> dict | list:
    """Load JSON file."""
    with open(filepath, 'r') as f:
        return json.load(f)


def save_json(data: list, filepath: str):
    """Serialize *data* to *filepath* as pretty-printed JSON.

    Args:
        data: List of training examples to write.
        filepath: Destination path; overwritten if it exists.

    Side effects:
        Writes the file and prints a one-line summary to stdout.
    """
    # UTF-8 + ensure_ascii=False keeps non-ASCII training text readable
    # in the output file instead of \uXXXX escapes.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(data)} examples to {filepath}")


def convert_conversation_to_text(conversation: list) -> str:
    """
    Flatten a role-tagged conversation into instruction/response text.

    Input: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
    Output: "### Instruction:\n...\n\n### Response:\n..."

    If a role appears more than once, the last occurrence wins; missing
    roles yield an empty section.
    """
    parts = {"user": "", "assistant": ""}

    for turn in conversation:
        role = turn["role"]
        if role in parts:
            # Later turns overwrite earlier ones for the same role.
            parts[role] = turn["content"]

    return (
        "### Instruction:\n"
        + parts["user"]
        + "\n\n### Response:\n"
        + parts["assistant"]
    )


def extract_examples_from_failures(failure_data: dict) -> list:
    """
    Extract and convert all examples from failure training data.

    Args:
        failure_data: Parsed ecosystem_failures_training.json. Each
            recognized category key maps to a list of examples, each with
            "conversations", "type", and "id" fields.

    Returns:
        A flat list of {"text", "source", "id"} training records, in
        category order then input order (matching the original behavior).
    """
    # The four category lists share an identical record shape, so one
    # loop over the key names replaces four copy-pasted loops.
    categories = (
        "calibration_examples",
        "adversarial_resistance_examples",
        "completeness_examples",
        "fact_drilling_examples",
    )

    examples = []
    for category in categories:
        # Missing categories are simply skipped (same as .get(..., [])).
        for ex in failure_data.get(category, []):
            examples.append({
                "text": convert_conversation_to_text(ex["conversations"]),
                "source": f"ecosystem_failures:{ex['type']}",
                "id": ex["id"],
            })

    return examples


def main():
    # Paths
    data_dir = Path(__file__).parent.parent / "data"
    existing_path = data_dir / "kmp_sft_final.json"
    failures_path = data_dir / "ecosystem_failures_training.json"
    output_path = data_dir / "combined_training.json"

    print("=" * 60)
    print("BioRLHF Training Data Merger")
    print("=" * 60)

    # Load existing data
    print(f"\n๐Ÿ“‚ Loading existing data from {existing_path}")
    existing_data = load_json(existing_path)
    print(f"   Found {len(existing_data)} existing examples")

    # Load failure-based examples
    print(f"\n๐Ÿ“‚ Loading failure examples from {failures_path}")
    failure_data = load_json(failures_path)

    # Convert failure examples
    print("\n๐Ÿ”„ Converting failure examples to training format...")
    new_examples = extract_examples_from_failures(failure_data)
    print(f"   Converted {len(new_examples)} examples")

    # Show breakdown
    print("\n๐Ÿ“Š New examples by type:")
    type_counts = {}
    for ex in new_examples:
        source_type = ex["source"].split(":")[1] if ":" in ex["source"] else ex["source"]
        type_counts[source_type] = type_counts.get(source_type, 0) + 1
    for t, c in sorted(type_counts.items()):
        print(f"   - {t}: {c}")

    # Combine data
    print("\n๐Ÿ”€ Merging datasets...")

    # Add source field to existing data if not present
    for ex in existing_data:
        if "source" not in ex:
            ex["source"] = "kmp_sft_original"

    # Combine
    combined = existing_data + new_examples
    print(f"   Total examples: {len(combined)}")

    # Save combined data
    print(f"\n๐Ÿ’พ Saving to {output_path}")
    save_json(combined, output_path)

    # Summary
    print("\n" + "=" * 60)
    print("โœ… MERGE COMPLETE")
    print("=" * 60)
    print(f"   Original examples: {len(existing_data)}")
    print(f"   New examples:      {len(new_examples)}")
    print(f"   Total combined:    {len(combined)}")
    print(f"\n   Output: {output_path}")
    print("\nNext step: Run training with combined data:")
    print("   python sft_train_v2.py --dataset data/combined_training.json")


if __name__ == "__main__":
    main()