# orbgen-training / scripts / upload_dataset.py
# (Hugging Face Hub page chrome captured with the file — "javasop's picture",
#  "Upload folder using huggingface_hub", commit "9791706 verified" —
#  commented out so the script parses.)
#!/usr/bin/env python3
"""
Upload OrbGen training dataset to HuggingFace Hub.
Usage:
python scripts/upload_dataset.py
python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas
"""
import os
import json
import argparse
from pathlib import Path
from huggingface_hub import HfApi, create_repo, upload_file
from datasets import Dataset, DatasetDict
def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped; every other line must be a complete JSON
    document. The file is read as UTF-8 explicitly (JSON's standard
    encoding) rather than the platform-dependent locale default.

    Args:
        path: Filesystem path to the .jsonl file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data
def _process_examples(examples: list) -> list:
    """Flatten raw JSONL records into a consistent row schema.

    Each record must carry 'prompt' and 'completion'; the optional
    'metadata' sub-dict is flattened, with defaults filled in so every
    row exposes the same keys.

    Args:
        examples: Raw records as loaded from JSONL.

    Returns:
        A list of flat dicts with keys: prompt, completion, domain,
        complexity, source.

    Raises:
        KeyError: If a record lacks 'prompt' or 'completion'.
    """
    processed = []
    for ex in examples:
        meta = ex.get('metadata', {})
        processed.append({
            'prompt': ex['prompt'],
            'completion': ex['completion'],
            'domain': meta.get('domain', 'general'),
            'complexity': meta.get('complexity', 'medium'),
            'source': meta.get('source', 'unknown'),
        })
    return processed


def _build_dataset_card(repo: str, n_train: int, n_val: int, n_test: int) -> str:
    """Render the README.md dataset card for the Hub repository.

    Args:
        repo: Hub repo id (e.g. 'orbital-ai/orbital-schemas'), shown in
            the usage snippet.
        n_train: Number of training examples.
        n_val: Number of validation examples.
        n_test: Number of test examples.

    Returns:
        The complete Markdown card, YAML front matter included.
    """
    return f"""---
license: apache-2.0
task_categories:
- text-generation
language:
- en
tags:
- orbital
- schema-generation
- code
size_categories:
- n<1K
---
# Orbital Schemas Dataset
Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).
## Dataset Structure
- **train**: {n_train} examples
- **validation**: {n_val} examples
- **test**: {n_test} examples
## Features
- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)
## Usage
```python
from datasets import load_dataset
dataset = load_dataset("{repo}")
print(dataset["train"][0])
```
## Example
```json
{{
"prompt": "Create a task management app with projects and due dates",
"completion": "{{...valid orbital schema...}}",
"domain": "productivity",
"complexity": "medium",
"source": "synthetic"
}}
```
## License
Apache 2.0
"""


def main():
    """Load the JSONL splits, push them to the Hub, and upload a card.

    Returns:
        0 on success, 1 when the mandatory train split file is missing.
    """
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data directory relative to this script, not the CWD,
    # so the script works no matter where it is invoked from.
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()
    print(f"Loading data from: {data_dir}")

    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'

    # The train split is mandatory; validation and test are optional.
    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []
    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test examples")

    train_ds = Dataset.from_list(_process_examples(train_data))
    val_ds = Dataset.from_list(_process_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(_process_examples(test_data)) if test_data else None

    # Only include splits that actually have data. Explicit `is not None`
    # rather than truthiness: a Dataset defines __len__, so an empty one
    # would be silently dropped by a bare `if ds:` check.
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds
    dataset_dict = DatasetDict(splits)
    print("\nDataset structure:")
    print(dataset_dict)

    # Create the repo if needed. Deliberately best-effort: a failure here
    # (e.g. insufficient permissions to read settings) is reported but does
    # not abort — push_to_hub below will surface any real auth problem.
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private, exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        print(f"Note: {e}")

    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message="Update training dataset"
    )
    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    # Upload the card last so it overwrites any auto-generated README
    # from push_to_hub.
    dataset_card = _build_dataset_card(
        args.repo, len(train_data), len(val_data), len(test_data))

    # NOTE(review): the README is written into the training-data directory
    # as a side effect (matches original behavior); a tempfile would avoid
    # polluting the source-data dir — confirm before changing.
    readme_path = data_dir / 'README.md'
    readme_path.write_text(dataset_card, encoding='utf-8')
    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )
    print("Dataset card uploaded!")
    return 0
if __name__ == '__main__':
    # SystemExit carries main()'s int return straight to the process exit
    # code. The builtin exit() is a site.py convenience that is absent
    # under `python -S` and in frozen builds; raising SystemExit is the
    # dependable form and needs no import.
    raise SystemExit(main())