# rae-training / src / push_to_hub.py
# (Hub listing metadata: uploaded by TrueV1sion123 with huggingface_hub,
#  revision b702f6d, verified)
"""
Push RAE Training Package to HuggingFace Hub
═══════════════════════════════════════════════════════════════
Uploads the dataset as an HF Dataset and creates a model repo
with the training config, making it runnable from anywhere.
Usage:
export HF_TOKEN=your_write_token
python src/push_to_hub.py --dataset --config
python src/push_to_hub.py --all
═══════════════════════════════════════════════════════════════
"""
import os
import sys
import json
import argparse
from pathlib import Path
def push_dataset(token: str, repo_prefix: str = "rae-training"):
    """Push RAE training data as a HuggingFace Dataset.

    Reads ``data/rae_training_data/{train,validation}.jsonl``, flattens each
    record for the HF Dataset format, pushes both splits, then uploads a
    generated README dataset card.

    Args:
        token: HF write token.
        repo_prefix: repo name prefix; the dataset repo becomes
            ``{username}/{repo_prefix}-data``.

    Returns:
        The dataset repo id (``username/repo_prefix-data``).
    """
    # Heavy deps imported lazily so the module can be imported without them.
    from huggingface_hub import HfApi, create_repo
    from datasets import Dataset, DatasetDict

    api = HfApi(token=token)
    username = api.whoami()["name"]
    dataset_repo = f"{username}/{repo_prefix}-data"
    print(f"Pushing dataset to: {dataset_repo}")

    # exist_ok=True makes repo creation idempotent without a blanket except
    # that would also swallow auth/network failures.
    create_repo(dataset_repo, repo_type="dataset", private=False,
                token=token, exist_ok=True)

    def load_jsonl(path):
        """Load one JSONL file, flattening records for the HF Dataset format."""
        examples = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:  # tolerate trailing/blank lines
                    continue
                data = json.loads(line)
                meta = data.get("metadata", {})
                examples.append({
                    # messages kept as a JSON string so the schema stays flat
                    "messages": json.dumps(data["messages"]),
                    "domain": meta.get("domain", "general"),
                    "difficulty": meta.get("difficulty", "medium"),
                    "rae_version": meta.get("rae_version", "1.0"),
                })
        return examples

    train_data = load_jsonl("data/rae_training_data/train.jsonl")
    eval_data = load_jsonl("data/rae_training_data/validation.jsonl")
    ds = DatasetDict({
        "train": Dataset.from_list(train_data),
        "validation": Dataset.from_list(eval_data),
    })
    ds.push_to_hub(dataset_repo, token=token)

    # Dataset card: YAML front matter + methodology notes.
    readme = f"""---
dataset_info:
features:
- name: messages
dtype: string
- name: domain
dtype: string
- name: difficulty
dtype: string
- name: rae_version
dtype: string
splits:
- name: train
num_examples: {len(train_data)}
- name: validation
num_examples: {len(eval_data)}
tags:
- cognitive-architecture
- chain-of-thought
- structured-reasoning
- RAE
license: apache-2.0
---
# RAE Training Data β€” Recursive Abstraction Engine
Training data structured as 4-phase RAE cognitive cycles for fine-tuning LLMs.
## Methodology: The Handwriting Principle
Handwriting activates widespread brain connectivity because it forces *generative
reconstruction through multiple representational modalities simultaneously under
a temporal bottleneck*.
This dataset replicates that effect for ML training: each example forces the model
through **Saturation β†’ Abstraction β†’ Descent β†’ Integration** phases, with every
phase contributing to loss β€” preventing shortcutting to the answer.
## Data Format
Each example contains structured `messages` with the RAE phase tags:
```
<SATURATION>...</SATURATION>
<ABSTRACTION>...</ABSTRACTION>
<DESCENT>...</DESCENT>
<INTEGRATION>...</INTEGRATION>
```
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{dataset_repo}")
```
## Training
See the companion training package: [{username}/{repo_prefix}](https://huggingface.co/{username}/{repo_prefix})
"""
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_repo,
        repo_type="dataset",
        token=token,
    )
    print(f"βœ“ Dataset pushed: https://huggingface.co/datasets/{dataset_repo}")
    return dataset_repo
def push_training_config(token: str, repo_prefix: str = "rae-training"):
    """Push training configs and scripts as a Model repo (pre-training).

    Uploads a fixed manifest of config/source/eval files (skipping any that
    are missing locally) to ``{username}/{repo_prefix}``.

    Args:
        token: HF write token.
        repo_prefix: name of the model repo under the caller's username.

    Returns:
        The model repo id (``username/repo_prefix``).
    """
    from huggingface_hub import HfApi, create_repo

    api = HfApi(token=token)
    username = api.whoami()["name"]
    model_repo = f"{username}/{repo_prefix}"
    print(f"Pushing training package to: {model_repo}")

    # exist_ok=True makes repo creation idempotent without a blanket except
    # that would also swallow auth/network failures.
    create_repo(model_repo, repo_type="model", private=False,
                token=token, exist_ok=True)

    # Manifest of files to publish; missing ones are reported and skipped.
    files = [
        "configs/autotrain_rae_sft.yaml",
        "configs/rae_training_config.json",
        "configs/base_models.json",
        "src/train_rae.py",
        "src/rae_loss.py",
        "src/dataset_generator.py",
        "src/rae_data_formatter.py",
        "src/rae_tokenizer_utils.py",
        "evaluation/eval_rae_model.py",
        "evaluation/benchmarks.json",
        "requirements.txt",
        "README.md",
    ]
    if Path("THEORY.md").exists():
        files.append("THEORY.md")

    for filepath in files:
        if Path(filepath).exists():
            api.upload_file(
                path_or_fileobj=filepath,
                path_in_repo=filepath,
                repo_id=model_repo,
                repo_type="model",
                token=token,
            )
            print(f" βœ“ {filepath}")
        else:
            print(f" ⚠ skipped: {filepath}")
    print(f"βœ“ Training package pushed: https://huggingface.co/{model_repo}")
    return model_repo
def main():
    """CLI entry point: parse flags and dispatch the selected pushes."""
    parser = argparse.ArgumentParser(description="Push RAE Training to HuggingFace")
    parser.add_argument("--dataset", action="store_true", help="Push dataset")
    parser.add_argument("--config", action="store_true", help="Push training configs")
    parser.add_argument("--all", action="store_true", help="Push everything")
    parser.add_argument("--repo_prefix", default="rae-training", help="Repo name prefix")
    args = parser.parse_args()

    # A write token is required for every operation.
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("Set HF_TOKEN environment variable")
        sys.exit(1)

    # --all implies both individual actions.
    want_dataset = args.all or args.dataset
    want_config = args.all or args.config

    if want_dataset:
        push_dataset(token, args.repo_prefix)
    if want_config:
        push_training_config(token, args.repo_prefix)
    if not (want_dataset or want_config):
        print("Specify --dataset, --config, or --all")


if __name__ == "__main__":
    main()