# orbgen-training / scripts / upload_dataset.py
# (Hugging Face Hub page chrome captured with the file — "javasop's picture",
#  "Upload folder using huggingface_hub", commit "9791706 verified" —
#  commented out so the script parses.)
#!/usr/bin/env python3
"""
Upload OrbGen training dataset to HuggingFace Hub.
Usage:
python scripts/upload_dataset.py
python scripts/upload_dataset.py --repo orbital-ai/orbital-schemas
"""
import os
import json
import argparse
from pathlib import Path
from huggingface_hub import HfApi, create_repo, upload_file
from datasets import Dataset, DatasetDict
def load_jsonl(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped; every other line must be a complete JSON
    document. The file is read as UTF-8 explicitly (JSON's standard
    encoding) rather than the platform-dependent locale default.

    Args:
        path: Filesystem path to the .jsonl file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    return data
def _process_examples(examples: list) -> list:
    """Flatten raw JSONL records into a consistent row schema.

    Each record must carry 'prompt' and 'completion'; the optional
    'metadata' sub-dict is flattened, with defaults filled in so every
    row exposes the same keys.

    Args:
        examples: Raw records as loaded from JSONL.

    Returns:
        A list of flat dicts with keys: prompt, completion, domain,
        complexity, source.

    Raises:
        KeyError: If a record lacks 'prompt' or 'completion'.
    """
    processed = []
    for ex in examples:
        meta = ex.get('metadata', {})
        processed.append({
            'prompt': ex['prompt'],
            'completion': ex['completion'],
            'domain': meta.get('domain', 'general'),
            'complexity': meta.get('complexity', 'medium'),
            'source': meta.get('source', 'unknown'),
        })
    return processed


def _build_dataset_card(repo: str, n_train: int, n_val: int, n_test: int) -> str:
    """Render the README.md dataset card for the Hub repository.

    Args:
        repo: Hub repo id (e.g. 'orbital-ai/orbital-schemas'), shown in
            the usage snippet.
        n_train: Number of training examples.
        n_val: Number of validation examples.
        n_test: Number of test examples.

    Returns:
        The complete Markdown card, YAML front matter included.
    """
    return f"""---
license: apache-2.0
task_categories:
- text-generation
language:
- en
tags:
- orbital
- schema-generation
- code
size_categories:
- n<1K
---
# Orbital Schemas Dataset
Training data for OrbGen - a model that generates valid Orbital schemas (.orb files).
## Dataset Structure
- **train**: {n_train} examples
- **validation**: {n_val} examples
- **test**: {n_test} examples
## Features
- `prompt`: Natural language description of the desired schema
- `completion`: Valid Orbital schema in JSON format
- `domain`: Application domain (ecommerce, game, productivity, etc.)
- `complexity`: Schema complexity (simple, medium, complex)
- `source`: Source of the example (synthetic, pattern, integrator)
## Usage
```python
from datasets import load_dataset
dataset = load_dataset("{repo}")
print(dataset["train"][0])
```
## Example
```json
{{
"prompt": "Create a task management app with projects and due dates",
"completion": "{{...valid orbital schema...}}",
"domain": "productivity",
"complexity": "medium",
"source": "synthetic"
}}
```
## License
Apache 2.0
"""


def main():
    """Load the JSONL splits, push them to the Hub, and upload a card.

    Returns:
        0 on success, 1 when the mandatory train split file is missing.
    """
    parser = argparse.ArgumentParser(description='Upload dataset to HuggingFace')
    parser.add_argument('--repo', default='orbital-ai/orbital-schemas',
                        help='HuggingFace dataset repository')
    parser.add_argument('--data-dir', default='../../training-data',
                        help='Directory containing JSONL files')
    parser.add_argument('--private', action='store_true',
                        help='Make dataset private')
    args = parser.parse_args()

    # Resolve the data directory relative to this script, not the CWD,
    # so the script works no matter where it is invoked from.
    script_dir = Path(__file__).parent
    data_dir = (script_dir / args.data_dir).resolve()
    print(f"Loading data from: {data_dir}")

    train_path = data_dir / 'combined-train.jsonl'
    val_path = data_dir / 'combined-validation.jsonl'
    test_path = data_dir / 'test.jsonl'

    # The train split is mandatory; validation and test are optional.
    if not train_path.exists():
        print(f"Error: {train_path} not found")
        return 1

    train_data = load_jsonl(str(train_path))
    val_data = load_jsonl(str(val_path)) if val_path.exists() else []
    test_data = load_jsonl(str(test_path)) if test_path.exists() else []
    print(f"Loaded {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test examples")

    train_ds = Dataset.from_list(_process_examples(train_data))
    val_ds = Dataset.from_list(_process_examples(val_data)) if val_data else None
    test_ds = Dataset.from_list(_process_examples(test_data)) if test_data else None

    # Only include splits that actually have data. Explicit `is not None`
    # rather than truthiness: a Dataset defines __len__, so an empty one
    # would be silently dropped by a bare `if ds:` check.
    splits = {'train': train_ds}
    if val_ds is not None:
        splits['validation'] = val_ds
    if test_ds is not None:
        splits['test'] = test_ds
    dataset_dict = DatasetDict(splits)
    print("\nDataset structure:")
    print(dataset_dict)

    # Create the repo if needed. Deliberately best-effort: a failure here
    # (e.g. insufficient permissions to read settings) is reported but does
    # not abort — push_to_hub below will surface any real auth problem.
    try:
        create_repo(args.repo, repo_type='dataset', private=args.private, exist_ok=True)
        print(f"\nRepository: https://huggingface.co/datasets/{args.repo}")
    except Exception as e:
        print(f"Note: {e}")

    print("\nPushing to HuggingFace Hub...")
    dataset_dict.push_to_hub(
        args.repo,
        private=args.private,
        commit_message="Update training dataset"
    )
    print("\nDataset uploaded successfully!")
    print(f"View at: https://huggingface.co/datasets/{args.repo}")

    # Upload the card last so it overwrites any auto-generated README
    # from push_to_hub.
    dataset_card = _build_dataset_card(
        args.repo, len(train_data), len(val_data), len(test_data))

    # NOTE(review): the README is written into the training-data directory
    # as a side effect (matches original behavior); a tempfile would avoid
    # polluting the source-data dir — confirm before changing.
    readme_path = data_dir / 'README.md'
    readme_path.write_text(dataset_card, encoding='utf-8')
    upload_file(
        path_or_fileobj=str(readme_path),
        path_in_repo='README.md',
        repo_id=args.repo,
        repo_type='dataset',
        commit_message='Add dataset card'
    )
    print("Dataset card uploaded!")
    return 0
if __name__ == '__main__':
    # SystemExit carries main()'s int return straight to the process exit
    # code. The builtin exit() is a site.py convenience that is absent
    # under `python -S` and in frozen builds; raising SystemExit is the
    # dependable form and needs no import.
    raise SystemExit(main())