File size: 5,065 Bytes
9791706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python3
"""
Upload OrbGen training dataset to HuggingFace Hub.

Usage:
    python scripts/upload_dataset.py
    python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas
"""

import argparse
import json
import os
import sys
from pathlib import Path

from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, create_repo, upload_file


def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped; every other line must be a valid JSON document.

    Args:
        path: Filesystem path to the .jsonl file.

    Returns:
        List of decoded JSON objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding: JSONL is UTF-8 by convention; relying on the
    # platform default locale encoding breaks on Windows (cp1252).
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _flatten_examples(examples: list) -> list:
    """Normalize raw examples to a flat, consistent row schema.

    Pulls domain/complexity/source out of the optional 'metadata' dict,
    falling back to defaults so every row has the same columns.
    """
    return [
        {
            'prompt': ex['prompt'],
            'completion': ex['completion'],
            'domain': ex.get('metadata', {}).get('domain', 'general'),
            'complexity': ex.get('metadata', {}).get('complexity', 'medium'),
            'source': ex.get('metadata', {}).get('source', 'unknown'),
        }
        for ex in examples
    ]


def _build_dataset_card(repo: str, n_train: int, n_val: int, n_test: int) -> str:
    """Render the README.md dataset-card text for the HuggingFace repo."""
    return f"""---
license: apache-2.0
task_categories:
  - text-generation
language:
  - en
tags:
  - orbital
  - schema-generation
  - code
size_categories:
  - n<1K
---

# Orbital Schemas Dataset

Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).

## Dataset Structure

- **train**: {n_train} examples
- **validation**: {n_val} examples
- **test**: {n_test} examples

## Features

- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("{repo}")
print(dataset["train"][0])
```

## Example

```json
{{
  "prompt": "Create a task management app with projects and due dates",
  "completion": "{{...valid orbital schema...}}",
  "domain": "productivity",
  "complexity": "medium",
  "source": "synthetic"
}}
```

## License

Apache 2.0
"""


def main():
    """Load JSONL splits, push them to the HuggingFace Hub, upload a card.

    Returns:
        0 on success; 1 if the required training split file is missing.
    """
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data dir relative to this script so the tool works
    # regardless of the caller's current working directory.
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()

    print(f"Loading data from: {data_dir}")

    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'

    # The train split is mandatory; validation/test are best-effort.
    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []

    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test examples")

    train_ds = Dataset.from_list(_flatten_examples(train_data))
    val_ds = Dataset.from_list(_flatten_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(_flatten_examples(test_data)) if test_data else None

    # Only include splits that actually have data.
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds

    dataset_dict = DatasetDict(splits)

    print("\nDataset structure:")
    print(dataset_dict)

    # Create the repo if needed; exist_ok=True makes re-runs idempotent.
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private, exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        # Best-effort: creation can fail (e.g. permissions) while the
        # subsequent push still succeeds, so report and continue.
        print(f"Note: {e}")

    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message="Update training dataset"
    )

    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    dataset_card = _build_dataset_card(
        args.repo, len(train_data), len(val_data), len(test_data))

    # NOTE(review): the README is written into the training-data directory
    # as a side effect; kept for backward compatibility with existing runs.
    readme_path = data_dir / 'README.md'
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(dataset_card)

    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )

    print("Dataset card uploaded!")
    return 0


if __name__ == '__main__':
    # Use sys.exit: the builtin exit() is injected by the `site` module and
    # is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(main())