TrueV1sion123
/

rae-training

Model card Files Files and versions

xet

Community

TrueV1sion123 committed 21 days ago

Commit

b702f6d

verified ·

1 Parent(s): 9030cc5

Upload src/push_to_hub.py with huggingface_hub

Browse files

Files changed (1) hide show

src/push_to_hub.py +215 -0

src/push_to_hub.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""
+Push RAE Training Package to HuggingFace Hub
+═══════════════════════════════════════════════════════════════
+Uploads the dataset as an HF Dataset and creates a model repo
+with the training config, making it runnable from anywhere.
+Usage:
+    export HF_TOKEN=your_write_token
+    python src/push_to_hub.py --dataset --config
+    python src/push_to_hub.py --all
+═══════════════════════════════════════════════════════════════
+"""
+import os
+import sys
+import json
+import argparse
+from pathlib import Path
+def push_dataset(token: str, repo_prefix: str = "rae-training"):
+    """Push RAE training data as a HuggingFace Dataset."""
+    from huggingface_hub import HfApi, create_repo
+    from datasets import Dataset, DatasetDict
+    import jsonlines
+    api = HfApi(token=token)
+    user_info = api.whoami()
+    username = user_info["name"]
+    dataset_repo = f"{username}/{repo_prefix}-data"
+    print(f"Pushing dataset to: {dataset_repo}")
+    # Create repo
+    try:
+        create_repo(dataset_repo, repo_type="dataset", private=False, token=token)
+    except Exception as e:
+        print(f"  (repo exists: {e})")
+    # Load JSONL files
+    def load_jsonl(path):
+        examples = []
+        with open(path) as f:
+            for line in f:
+                data = json.loads(line)
+                # Flatten for HF Dataset format
+                examples.append({
+                    "messages": json.dumps(data["messages"]),
+                    "domain": data.get("metadata", {}).get("domain", "general"),
+                    "difficulty": data.get("metadata", {}).get("difficulty", "medium"),
+                    "rae_version": data.get("metadata", {}).get("rae_version", "1.0"),
+                })
+        return examples
+    train_data = load_jsonl("data/rae_training_data/train.jsonl")
+    eval_data = load_jsonl("data/rae_training_data/validation.jsonl")
+    ds = DatasetDict({
+        "train": Dataset.from_list(train_data),
+        "validation": Dataset.from_list(eval_data),
+    })
+    ds.push_to_hub(dataset_repo, token=token)
+    # Upload README
+    readme = f"""---
+dataset_info:
+  features:
+    - name: messages
+      dtype: string
+    - name: domain
+      dtype: string
+    - name: difficulty
+      dtype: string
+    - name: rae_version
+      dtype: string
+  splits:
+    - name: train
+      num_examples: {len(train_data)}
+    - name: validation
+      num_examples: {len(eval_data)}
+tags:
+  - cognitive-architecture
+  - chain-of-thought
+  - structured-reasoning
+  - RAE
+license: apache-2.0
+---
+# RAE Training Data — Recursive Abstraction Engine
+Training data structured as 4-phase RAE cognitive cycles for fine-tuning LLMs.
+## Methodology: The Handwriting Principle
+Handwriting activates widespread brain connectivity because it forces *generative
+reconstruction through multiple representational modalities simultaneously under
+a temporal bottleneck*.
+This dataset replicates that effect for ML training: each example forces the model
+through **Saturation → Abstraction → Descent → Integration** phases, with every
+phase contributing to loss — preventing shortcutting to the answer.
+## Data Format
+Each example contains structured `messages` with the RAE phase tags:
+```
+<SATURATION>...</SATURATION>
+<ABSTRACTION>...</ABSTRACTION>
+<DESCENT>...</DESCENT>
+<INTEGRATION>...</INTEGRATION>
+```
+## Usage
+```python
+from datasets import load_dataset
+ds = load_dataset("{dataset_repo}")
+```
+## Training
+See the companion training package: [{username}/{repo_prefix}](https://huggingface.co/{username}/{repo_prefix})
+"""
+    api.upload_file(
+        path_or_fileobj=readme.encode(),
+        path_in_repo="README.md",
+        repo_id=dataset_repo,
+        repo_type="dataset",
+        token=token,
+    )
+    print(f"✓ Dataset pushed: https://huggingface.co/datasets/{dataset_repo}")
+    return dataset_repo
+def push_training_config(token: str, repo_prefix: str = "rae-training"):
+    """Push training configs and scripts as a Model repo (pre-training)."""
+    from huggingface_hub import HfApi, create_repo
+    api = HfApi(token=token)
+    user_info = api.whoami()
+    username = user_info["name"]
+    model_repo = f"{username}/{repo_prefix}"
+    print(f"Pushing training package to: {model_repo}")
+    # Create repo
+    try:
+        create_repo(model_repo, repo_type="model", private=False, token=token)
+    except Exception as e:
+        print(f"  (repo exists: {e})")
+    # Upload files
+    files = [
+        "configs/autotrain_rae_sft.yaml",
+        "configs/rae_training_config.json",
+        "configs/base_models.json",
+        "src/train_rae.py",
+        "src/rae_loss.py",
+        "src/dataset_generator.py",
+        "src/rae_data_formatter.py",
+        "src/rae_tokenizer_utils.py",
+        "evaluation/eval_rae_model.py",
+        "evaluation/benchmarks.json",
+        "requirements.txt",
+        "README.md",
+    ]
+    if Path("THEORY.md").exists():
+        files.append("THEORY.md")
+    for filepath in files:
+        if Path(filepath).exists():
+            api.upload_file(
+                path_or_fileobj=filepath,
+                path_in_repo=filepath,
+                repo_id=model_repo,
+                repo_type="model",
+                token=token,
+            )
+            print(f"  ✓ {filepath}")
+        else:
+            print(f"  ⚠ skipped: {filepath}")
+    print(f"✓ Training package pushed: https://huggingface.co/{model_repo}")
+    return model_repo
+def main():
+    parser = argparse.ArgumentParser(description="Push RAE Training to HuggingFace")
+    parser.add_argument("--dataset", action="store_true", help="Push dataset")
+    parser.add_argument("--config", action="store_true", help="Push training configs")
+    parser.add_argument("--all", action="store_true", help="Push everything")
+    parser.add_argument("--repo_prefix", default="rae-training", help="Repo name prefix")
+    args = parser.parse_args()
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print("Set HF_TOKEN environment variable")
+        sys.exit(1)
+    if args.all or args.dataset:
+        push_dataset(token, args.repo_prefix)
+    if args.all or args.config:
+        push_training_config(token, args.repo_prefix)
+    if not (args.all or args.dataset or args.config):
+        print("Specify --dataset, --config, or --all")
+# Script entry point: run the CLI only when this file is executed directly.
+if __name__ == "__main__":
+    main()