Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Upload OrbGen training dataset to HuggingFace Hub. | |
| Usage: | |
| python scripts/upload_dataset.py | |
| python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas | |
| """ | |
| import os | |
| import json | |
| import argparse | |
| from pathlib import Path | |
| from huggingface_hub import HfApi, create_repo, upload_file | |
| from datasets import Dataset, DatasetDict | |
def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped, so trailing newlines or spacer lines in the
    file do not raise a JSON decode error.

    Args:
        path: Path to the JSONL file.

    Returns:
        One parsed object per non-blank line, in file order.
    """
    # Explicit UTF-8: JSON is UTF-8 by spec, and the platform default
    # encoding (e.g. cp1252 on Windows) would corrupt non-ASCII prompts.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
def _process_examples(examples: list) -> list:
    """Flatten raw JSONL records into a consistent row schema.

    Missing metadata fields fall back to defaults so every row carries the
    same columns: prompt, completion, domain, complexity, source.

    Raises:
        KeyError: if a record lacks 'prompt' or 'completion' (those two
            fields are required; metadata is optional).
    """
    rows = []
    for ex in examples:
        meta = ex.get('metadata', {})
        rows.append({
            'prompt': ex['prompt'],
            'completion': ex['completion'],
            'domain': meta.get('domain', 'general'),
            'complexity': meta.get('complexity', 'medium'),
            'source': meta.get('source', 'unknown'),
        })
    return rows


def _build_dataset_card(repo: str, n_train: int, n_val: int, n_test: int) -> str:
    """Render the README.md dataset card with split sizes filled in."""
    return f"""---
license: apache-2.0
task_categories:
- text-generation
language:
- en
tags:
- orbital
- schema-generation
- code
size_categories:
- n<1K
---
# Orbital Schemas Dataset
Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).
## Dataset Structure
- **train**: {n_train} examples
- **validation**: {n_val} examples
- **test**: {n_test} examples
## Features
- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)
## Usage
```python
from datasets import load_dataset
dataset = load_dataset("{repo}")
print(dataset["train"][0])
```
## Example
```json
{{
"prompt": "Create a task management app with projects and due dates",
"completion": "{{...valid orbital schema...}}",
"domain": "productivity",
"complexity": "medium",
"source": "synthetic"
}}
```
## License
Apache 2.0
"""


def main():
    """Load JSONL splits, push them to the HuggingFace Hub, upload a card.

    Returns:
        0 on success, 1 if the required training split file is missing.
    """
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data directory relative to this script, not the CWD,
    # so the tool works no matter where it is invoked from.
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()
    print(f"Loading data from: {data_dir}")

    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'
    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    # Only the train split is required; validation/test are optional.
    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []
    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test examples")

    train_ds = Dataset.from_list(_process_examples(train_data))
    val_ds = Dataset.from_list(_process_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(_process_examples(test_data)) if test_data else None

    # Assemble only the splits that actually have data.
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds
    dataset_dict = DatasetDict(splits)
    print("\nDataset structure:")
    print(dataset_dict)

    # Best-effort repo creation: exist_ok=True handles re-runs, and any
    # real failure (e.g. bad credentials) surfaces in push_to_hub below.
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private, exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        print(f"Note: {e}")

    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message="Update training dataset"
    )
    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    # Write the dataset card next to the data, then upload it as README.md.
    dataset_card = _build_dataset_card(
        args.repo, len(train_data), len(val_data), len(test_data))
    readme_path = data_dir / 'README.md'
    # Explicit UTF-8 so the card round-trips identically on every platform.
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(dataset_card)
    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )
    print("Dataset card uploaded!")
    return 0
if __name__ == '__main__':
    # Raise SystemExit directly: the `exit` builtin is injected by the
    # site module for interactive use and is absent under `python -S`.
    raise SystemExit(main())