Spaces:
Runtime error
Runtime error
File size: 5,065 Bytes
9791706 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
#!/usr/bin/env python3
"""
Upload OrbGen training dataset to HuggingFace Hub.
Usage:
python scripts/upload_dataset.py
python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas
"""
import argparse
import json
import os
import sys
from pathlib import Path

from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, create_repo, upload_file
def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped, so trailing newlines or blank separator
    lines do not raise a JSON decode error.

    Args:
        path: Filesystem path to the .jsonl file.

    Returns:
        List of parsed objects, one per non-blank line, in file order.
    """
    data = []
    # Explicit UTF-8: JSONL is UTF-8 by convention; relying on the
    # platform default encoding breaks on Windows/non-UTF-8 locales.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data
def _process_examples(examples: list) -> list:
    """Flatten raw examples into a consistent per-example schema.

    Metadata fields are optional in the raw records, so missing values
    fall back to defaults rather than raising KeyError. `prompt` and
    `completion` are required and will raise KeyError if absent.
    """
    return [
        {
            'prompt': ex['prompt'],
            'completion': ex['completion'],
            'domain': ex.get('metadata', {}).get('domain', 'general'),
            'complexity': ex.get('metadata', {}).get('complexity', 'medium'),
            'source': ex.get('metadata', {}).get('source', 'unknown'),
        }
        for ex in examples
    ]


def _build_dataset_card(repo: str, n_train: int, n_val: int, n_test: int) -> str:
    """Render the README.md dataset card uploaded alongside the data."""
    return f"""---
license: apache-2.0
task_categories:
- text-generation
language:
- en
tags:
- orbital
- schema-generation
- code
size_categories:
- n<1K
---
# Orbital Schemas Dataset
Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).
## Dataset Structure
- **train**: {n_train} examples
- **validation**: {n_val} examples
- **test**: {n_test} examples
## Features
- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)
## Usage
```python
from datasets import load_dataset
dataset = load_dataset("{repo}")
print(dataset["train"][0])
```
## Example
```json
{{
"prompt": "Create a task management app with projects and due dates",
"completion": "{{...valid orbital schema...}}",
"domain": "productivity",
"complexity": "medium",
"source": "synthetic"
}}
```
## License
Apache 2.0
"""


def main() -> int:
    """Load JSONL splits, push them to the HuggingFace Hub, upload a card.

    Returns:
        0 on success, 1 if the required training split is missing.
    """
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data dir relative to this script so the tool works
    # regardless of the caller's working directory.
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()
    print(f"Loading data from: {data_dir}")

    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'

    # The train split is mandatory; validation and test are optional.
    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []
    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test examples")

    train_ds = Dataset.from_list(_process_examples(train_data))
    val_ds = Dataset.from_list(_process_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(_process_examples(test_data)) if test_data else None

    # Include only the splits that exist. Compare against None explicitly
    # rather than relying on Dataset truthiness semantics.
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds
    dataset_dict = DatasetDict(splits)
    print("\nDataset structure:")
    print(dataset_dict)

    # Create the repo up front; exist_ok makes this idempotent. Any other
    # failure (e.g. auth) is reported but not fatal here — push_to_hub
    # below raises a clearer error if the repo is truly unusable.
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private, exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        print(f"Note: {e}")

    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message="Update training dataset"
    )
    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    # Write the rendered card next to the data, then upload it as README.md.
    dataset_card = _build_dataset_card(
        args.repo, len(train_data), len(val_data), len(test_data)
    )
    readme_path = data_dir / 'README.md'
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(dataset_card)
    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )
    print("Dataset card uploaded!")
    return 0
if __name__ == '__main__':
    # Propagate main()'s status code to the shell. sys.exit is the
    # canonical form in scripts; the bare `exit` builtin comes from the
    # site module and is intended for interactive sessions only.
    sys.exit(main())
|