# rae-training / src / push_to_hub.py
# (Hub listing metadata: uploaded by TrueV1sion123 with huggingface_hub,
#  revision b702f6d, verified)
"""
Push RAE Training Package to HuggingFace Hub
═══════════════════════════════════════════════════════════════
Uploads the dataset as an HF Dataset and creates a model repo
with the training config, making it runnable from anywhere.
Usage:
export HF_TOKEN=your_write_token
python src/push_to_hub.py --dataset --config
python src/push_to_hub.py --all
═══════════════════════════════════════════════════════════════
"""
import os
import sys
import json
import argparse
from pathlib import Path
def push_dataset(token: str, repo_prefix: str = "rae-training"):
    """Push RAE training data as a HuggingFace Dataset.

    Reads ``data/rae_training_data/{train,validation}.jsonl``, flattens each
    record for the HF Dataset format, pushes both splits, then uploads a
    generated README dataset card.

    Args:
        token: HF write token.
        repo_prefix: repo name prefix; the dataset repo becomes
            ``{username}/{repo_prefix}-data``.

    Returns:
        The dataset repo id (``username/repo_prefix-data``).
    """
    # Heavy deps imported lazily so the module can be imported without them.
    from huggingface_hub import HfApi, create_repo
    from datasets import Dataset, DatasetDict

    api = HfApi(token=token)
    username = api.whoami()["name"]
    dataset_repo = f"{username}/{repo_prefix}-data"
    print(f"Pushing dataset to: {dataset_repo}")

    # exist_ok=True makes repo creation idempotent without a blanket except
    # that would also swallow auth/network failures.
    create_repo(dataset_repo, repo_type="dataset", private=False,
                token=token, exist_ok=True)

    def load_jsonl(path):
        """Load one JSONL file, flattening records for the HF Dataset format."""
        examples = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:  # tolerate trailing/blank lines
                    continue
                data = json.loads(line)
                meta = data.get("metadata", {})
                examples.append({
                    # messages kept as a JSON string so the schema stays flat
                    "messages": json.dumps(data["messages"]),
                    "domain": meta.get("domain", "general"),
                    "difficulty": meta.get("difficulty", "medium"),
                    "rae_version": meta.get("rae_version", "1.0"),
                })
        return examples

    train_data = load_jsonl("data/rae_training_data/train.jsonl")
    eval_data = load_jsonl("data/rae_training_data/validation.jsonl")
    ds = DatasetDict({
        "train": Dataset.from_list(train_data),
        "validation": Dataset.from_list(eval_data),
    })
    ds.push_to_hub(dataset_repo, token=token)

    # Dataset card: YAML front matter + methodology notes.
    readme = f"""---
dataset_info:
features:
- name: messages
dtype: string
- name: domain
dtype: string
- name: difficulty
dtype: string
- name: rae_version
dtype: string
splits:
- name: train
num_examples: {len(train_data)}
- name: validation
num_examples: {len(eval_data)}
tags:
- cognitive-architecture
- chain-of-thought
- structured-reasoning
- RAE
license: apache-2.0
---
# RAE Training Data β€” Recursive Abstraction Engine
Training data structured as 4-phase RAE cognitive cycles for fine-tuning LLMs.
## Methodology: The Handwriting Principle
Handwriting activates widespread brain connectivity because it forces *generative
reconstruction through multiple representational modalities simultaneously under
a temporal bottleneck*.
This dataset replicates that effect for ML training: each example forces the model
through **Saturation β†’ Abstraction β†’ Descent β†’ Integration** phases, with every
phase contributing to loss β€” preventing shortcutting to the answer.
## Data Format
Each example contains structured `messages` with the RAE phase tags:
```
<SATURATION>...</SATURATION>
<ABSTRACTION>...</ABSTRACTION>
<DESCENT>...</DESCENT>
<INTEGRATION>...</INTEGRATION>
```
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{dataset_repo}")
```
## Training
See the companion training package: [{username}/{repo_prefix}](https://huggingface.co/{username}/{repo_prefix})
"""
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_repo,
        repo_type="dataset",
        token=token,
    )
    print(f"βœ“ Dataset pushed: https://huggingface.co/datasets/{dataset_repo}")
    return dataset_repo
def push_training_config(token: str, repo_prefix: str = "rae-training"):
    """Push training configs and scripts as a Model repo (pre-training).

    Uploads a fixed manifest of config/source/eval files (skipping any that
    are missing locally) to ``{username}/{repo_prefix}``.

    Args:
        token: HF write token.
        repo_prefix: name of the model repo under the caller's username.

    Returns:
        The model repo id (``username/repo_prefix``).
    """
    from huggingface_hub import HfApi, create_repo

    api = HfApi(token=token)
    username = api.whoami()["name"]
    model_repo = f"{username}/{repo_prefix}"
    print(f"Pushing training package to: {model_repo}")

    # exist_ok=True makes repo creation idempotent without a blanket except
    # that would also swallow auth/network failures.
    create_repo(model_repo, repo_type="model", private=False,
                token=token, exist_ok=True)

    # Manifest of files to publish; missing ones are reported and skipped.
    files = [
        "configs/autotrain_rae_sft.yaml",
        "configs/rae_training_config.json",
        "configs/base_models.json",
        "src/train_rae.py",
        "src/rae_loss.py",
        "src/dataset_generator.py",
        "src/rae_data_formatter.py",
        "src/rae_tokenizer_utils.py",
        "evaluation/eval_rae_model.py",
        "evaluation/benchmarks.json",
        "requirements.txt",
        "README.md",
    ]
    if Path("THEORY.md").exists():
        files.append("THEORY.md")

    for filepath in files:
        if Path(filepath).exists():
            api.upload_file(
                path_or_fileobj=filepath,
                path_in_repo=filepath,
                repo_id=model_repo,
                repo_type="model",
                token=token,
            )
            print(f" βœ“ {filepath}")
        else:
            print(f" ⚠ skipped: {filepath}")
    print(f"βœ“ Training package pushed: https://huggingface.co/{model_repo}")
    return model_repo
def main():
    """CLI entry point: parse flags and dispatch the selected pushes."""
    parser = argparse.ArgumentParser(description="Push RAE Training to HuggingFace")
    parser.add_argument("--dataset", action="store_true", help="Push dataset")
    parser.add_argument("--config", action="store_true", help="Push training configs")
    parser.add_argument("--all", action="store_true", help="Push everything")
    parser.add_argument("--repo_prefix", default="rae-training", help="Repo name prefix")
    args = parser.parse_args()

    # A write token is required for every operation.
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("Set HF_TOKEN environment variable")
        sys.exit(1)

    # --all implies both individual actions.
    want_dataset = args.all or args.dataset
    want_config = args.all or args.config

    if want_dataset:
        push_dataset(token, args.repo_prefix)
    if want_config:
        push_training_config(token, args.repo_prefix)
    if not (want_dataset or want_config):
        print("Specify --dataset, --config, or --all")


if __name__ == "__main__":
    main()