File size: 5,065 Bytes
9791706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python3
"""
Upload OrbGen training dataset to HuggingFace Hub.

Usage:
    python scripts/upload_dataset.py
    python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas
"""

import argparse
import json
import os
import sys
from pathlib import Path

from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, create_repo, upload_file


def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped; every other line must be a valid JSON document.

    Args:
        path: Filesystem path to the .jsonl file.

    Returns:
        List of decoded JSON objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding: JSONL is UTF-8 by convention; relying on the
    # platform default locale encoding breaks on Windows (cp1252).
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _flatten_examples(examples: list) -> list:
    """Normalize raw examples to a flat, consistent row schema.

    Pulls domain/complexity/source out of the optional 'metadata' dict,
    falling back to defaults so every row has the same columns.
    """
    return [
        {
            'prompt': ex['prompt'],
            'completion': ex['completion'],
            'domain': ex.get('metadata', {}).get('domain', 'general'),
            'complexity': ex.get('metadata', {}).get('complexity', 'medium'),
            'source': ex.get('metadata', {}).get('source', 'unknown'),
        }
        for ex in examples
    ]


def _build_dataset_card(repo: str, n_train: int, n_val: int, n_test: int) -> str:
    """Render the README.md dataset-card text for the HuggingFace repo."""
    return f"""---
license: apache-2.0
task_categories:
  - text-generation
language:
  - en
tags:
  - orbital
  - schema-generation
  - code
size_categories:
  - n<1K
---

# Orbital Schemas Dataset

Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).

## Dataset Structure

- **train**: {n_train} examples
- **validation**: {n_val} examples
- **test**: {n_test} examples

## Features

- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("{repo}")
print(dataset["train"][0])
```

## Example

```json
{{
  "prompt": "Create a task management app with projects and due dates",
  "completion": "{{...valid orbital schema...}}",
  "domain": "productivity",
  "complexity": "medium",
  "source": "synthetic"
}}
```

## License

Apache 2.0
"""


def main():
    """Load JSONL splits, push them to the HuggingFace Hub, upload a card.

    Returns:
        0 on success; 1 if the required training split file is missing.
    """
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data dir relative to this script so the tool works
    # regardless of the caller's current working directory.
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()

    print(f"Loading data from: {data_dir}")

    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'

    # The train split is mandatory; validation/test are best-effort.
    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []

    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test examples")

    train_ds = Dataset.from_list(_flatten_examples(train_data))
    val_ds = Dataset.from_list(_flatten_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(_flatten_examples(test_data)) if test_data else None

    # Only include splits that actually have data.
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds

    dataset_dict = DatasetDict(splits)

    print("\nDataset structure:")
    print(dataset_dict)

    # Create the repo if needed; exist_ok=True makes re-runs idempotent.
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private, exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        # Best-effort: creation can fail (e.g. permissions) while the
        # subsequent push still succeeds, so report and continue.
        print(f"Note: {e}")

    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message="Update training dataset"
    )

    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    dataset_card = _build_dataset_card(
        args.repo, len(train_data), len(val_data), len(test_data))

    # NOTE(review): the README is written into the training-data directory
    # as a side effect; kept for backward compatibility with existing runs.
    readme_path = data_dir / 'README.md'
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(dataset_card)

    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )

    print("Dataset card uploaded!")
    return 0


if __name__ == '__main__':
    # Use sys.exit: the builtin exit() is injected by the `site` module and
    # is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(main())