| """ |
| Push RAE Training Package to HuggingFace Hub |
───────────────────────────────────────────────────────────────
| Uploads the dataset as an HF Dataset and creates a model repo |
| with the training config, making it runnable from anywhere. |
| |
| Usage: |
| export HF_TOKEN=your_write_token |
| python src/push_to_hub.py --dataset --config |
| python src/push_to_hub.py --all |
───────────────────────────────────────────────────────────────
| """ |
|
|
| import os |
| import sys |
| import json |
| import argparse |
| from pathlib import Path |
|
|
def push_dataset(
    token: str,
    repo_prefix: str = "rae-training",
    data_dir: str = "data/rae_training_data",
):
    """Push RAE training data as a HuggingFace Dataset.

    Reads ``train.jsonl`` / ``validation.jsonl`` from *data_dir*, uploads
    them as a two-split DatasetDict to ``<username>/<repo_prefix>-data``,
    and writes a dataset card (README.md) describing the format.

    Args:
        token: HuggingFace write token.
        repo_prefix: Prefix used to build the dataset repo name.
        data_dir: Directory containing the JSONL split files.

    Returns:
        The fully-qualified dataset repo id (``username/<prefix>-data``).
    """
    from huggingface_hub import HfApi, create_repo
    from datasets import Dataset, DatasetDict

    api = HfApi(token=token)
    username = api.whoami()["name"]
    dataset_repo = f"{username}/{repo_prefix}-data"

    print(f"Pushing dataset to: {dataset_repo}")

    # exist_ok=True instead of a broad try/except: a pre-existing repo is
    # fine, but auth/network errors should surface, not be swallowed.
    create_repo(
        dataset_repo, repo_type="dataset", private=False, token=token,
        exist_ok=True,
    )

    def load_jsonl(path):
        """Load one JSONL split into flat records for Dataset.from_list."""
        examples = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                meta = data.get("metadata", {})
                examples.append({
                    # messages are re-serialized to a JSON string so the
                    # dataset schema stays flat (a single string feature).
                    "messages": json.dumps(data["messages"]),
                    "domain": meta.get("domain", "general"),
                    "difficulty": meta.get("difficulty", "medium"),
                    "rae_version": meta.get("rae_version", "1.0"),
                })
        return examples

    train_data = load_jsonl(str(Path(data_dir) / "train.jsonl"))
    eval_data = load_jsonl(str(Path(data_dir) / "validation.jsonl"))

    ds = DatasetDict({
        "train": Dataset.from_list(train_data),
        "validation": Dataset.from_list(eval_data),
    })

    ds.push_to_hub(dataset_repo, token=token)

    readme = f"""---
dataset_info:
  features:
  - name: messages
    dtype: string
  - name: domain
    dtype: string
  - name: difficulty
    dtype: string
  - name: rae_version
    dtype: string
  splits:
  - name: train
    num_examples: {len(train_data)}
  - name: validation
    num_examples: {len(eval_data)}
tags:
- cognitive-architecture
- chain-of-thought
- structured-reasoning
- RAE
license: apache-2.0
---

# RAE Training Data — Recursive Abstraction Engine

Training data structured as 4-phase RAE cognitive cycles for fine-tuning LLMs.

## Methodology: The Handwriting Principle

Handwriting activates widespread brain connectivity because it forces *generative
reconstruction through multiple representational modalities simultaneously under
a temporal bottleneck*.

This dataset replicates that effect for ML training: each example forces the model
through **Saturation → Abstraction → Descent → Integration** phases, with every
phase contributing to loss — preventing shortcutting to the answer.

## Data Format

Each example contains structured `messages` with the RAE phase tags:

```
<SATURATION>...</SATURATION>
<ABSTRACTION>...</ABSTRACTION>
<DESCENT>...</DESCENT>
<INTEGRATION>...</INTEGRATION>
```

## Usage

```python
from datasets import load_dataset
ds = load_dataset("{dataset_repo}")
```

## Training

See the companion training package: [{username}/{repo_prefix}](https://huggingface.co/{username}/{repo_prefix})
"""

    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_repo,
        repo_type="dataset",
        token=token,
    )

    print(f"✓ Dataset pushed: https://huggingface.co/datasets/{dataset_repo}")
    return dataset_repo
|
|
|
|
def push_training_config(token: str, repo_prefix: str = "rae-training"):
    """Push training configs and scripts as a Model repo (pre-training).

    Uploads a fixed manifest of config/source/eval files to
    ``<username>/<repo_prefix>``, skipping any file missing locally.

    Args:
        token: HuggingFace write token.
        repo_prefix: Name of the model repo to create/update.

    Returns:
        The fully-qualified model repo id (``username/<prefix>``).
    """
    from huggingface_hub import HfApi, create_repo

    api = HfApi(token=token)
    username = api.whoami()["name"]
    model_repo = f"{username}/{repo_prefix}"

    print(f"Pushing training package to: {model_repo}")

    # exist_ok=True instead of a broad try/except: a pre-existing repo is
    # fine, but auth/network errors should surface, not be swallowed.
    create_repo(model_repo, repo_type="model", private=False, token=token,
                exist_ok=True)

    # Manifest of everything that makes the training package reproducible.
    files = [
        "configs/autotrain_rae_sft.yaml",
        "configs/rae_training_config.json",
        "configs/base_models.json",
        "src/train_rae.py",
        "src/rae_loss.py",
        "src/dataset_generator.py",
        "src/rae_data_formatter.py",
        "src/rae_tokenizer_utils.py",
        "evaluation/eval_rae_model.py",
        "evaluation/benchmarks.json",
        "requirements.txt",
        "README.md",
    ]

    if Path("THEORY.md").exists():
        files.append("THEORY.md")

    for filepath in files:
        # Best-effort upload: missing files are reported, not fatal.
        if Path(filepath).exists():
            api.upload_file(
                path_or_fileobj=filepath,
                path_in_repo=filepath,
                repo_id=model_repo,
                repo_type="model",
                token=token,
            )
            print(f"  ✓ {filepath}")
        else:
            print(f"  ✗ skipped: {filepath}")

    print(f"✓ Training package pushed: https://huggingface.co/{model_repo}")
    return model_repo
|
|
|
|
def main():
    """CLI entry point: parse flags, check for HF_TOKEN, dispatch pushes."""
    parser = argparse.ArgumentParser(description="Push RAE Training to HuggingFace")
    parser.add_argument("--dataset", action="store_true", help="Push dataset")
    parser.add_argument("--config", action="store_true", help="Push training configs")
    parser.add_argument("--all", action="store_true", help="Push everything")
    parser.add_argument("--repo_prefix", default="rae-training", help="Repo name prefix")
    args = parser.parse_args()

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("Set HF_TOKEN environment variable")
        sys.exit(1)

    # --all implies both targets; compute the dispatch flags once.
    want_dataset = args.all or args.dataset
    want_config = args.all or args.config

    if want_dataset:
        push_dataset(token, args.repo_prefix)

    if want_config:
        push_training_config(token, args.repo_prefix)

    if not (want_dataset or want_config):
        print("Specify --dataset, --config, or --all")
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|