#!/usr/bin/env python3
"""
Upload OrbGen training dataset to HuggingFace Hub.

Usage:
    python scripts/upload_dataset.py
    python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas
"""

import argparse
import json
import sys
from pathlib import Path

from datasets import Dataset, DatasetDict
from huggingface_hub import create_repo, upload_file


def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts, skipping blank lines."""
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data


def main():
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data directory relative to this script's location
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()

    print(f"Loading data from: {data_dir}")

    # Load training data; validation and test splits are optional
    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'

    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []

    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, "
          f"{len(test_data)} test examples")

    def process_examples(examples):
        """Flatten each example's metadata into a consistent, fixed schema."""
        processed = []
        for ex in examples:
            metadata = ex.get('metadata', {})
            processed.append({
                'prompt': ex['prompt'],
                'completion': ex['completion'],
                'domain': metadata.get('domain', 'general'),
                'complexity': metadata.get('complexity', 'medium'),
                'source': metadata.get('source', 'unknown'),
            })
        return processed

    train_ds = Dataset.from_list(process_examples(train_data))
    val_ds = Dataset.from_list(process_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(process_examples(test_data)) if test_data else None

    # Assemble the DatasetDict from whichever splits exist
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds

    dataset_dict = DatasetDict(splits)
    print("\nDataset structure:")
    print(dataset_dict)

    # Create the dataset repo if it does not already exist
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private,
                    exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        print(f"Note: {e}")

    # Push all splits to the Hub
    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message='Update training dataset'
    )

    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    # Build the dataset card (YAML front matter plus markdown body)
    dataset_card = f"""---
license: apache-2.0
task_categories:
- text-generation
language:
- en
tags:
- orbital
- schema-generation
- code
size_categories:
- n<1K
---

# Orbital Schemas Dataset

Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).

## Dataset Structure

- **train**: {len(train_data)} examples
- **validation**: {len(val_data)} examples
- **test**: {len(test_data)} examples

## Features

- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("{args.repo}")
print(dataset["train"][0])
```

## Example

```json
{{
  "prompt": "Create a task management app with projects and due dates",
  "completion": "{{...valid orbital schema...}}",
  "domain": "productivity",
  "complexity": "medium",
  "source": "synthetic"
}}
```

## License

Apache 2.0
"""

    # Write the card locally, then upload it as the repo README
    readme_path = data_dir / 'README.md'
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(dataset_card)

    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )

    print("Dataset card uploaded!")

    return 0


if __name__ == '__main__':
    sys.exit(main())
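
# Sketch of a post-upload smoke test from another environment (not run by this
# script; assumes the default --repo value above and that all three splits
# were pushed):
#
#     from datasets import load_dataset
#     ds = load_dataset('orbital-ai/orbital-schemas')
#     assert {'prompt', 'completion', 'domain'} <= set(ds['train'].column_names)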