TrueV1sion123 committed on
Commit
b702f6d
·
verified ·
1 Parent(s): 9030cc5

Upload src/push_to_hub.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/push_to_hub.py +215 -0
src/push_to_hub.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Push RAE Training Package to HuggingFace Hub
3
+ ═══════════════════════════════════════════════════════════════
4
+ Uploads the dataset as an HF Dataset and creates a model repo
5
+ with the training config, making it runnable from anywhere.
6
+
7
+ Usage:
8
+ export HF_TOKEN=your_write_token
9
+ python src/push_to_hub.py --dataset --config
10
+ python src/push_to_hub.py --all
11
+ ═══════════════════════════════════════════════════════════════
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import argparse
18
+ from pathlib import Path
19
+
20
+ def push_dataset(token: str, repo_prefix: str = "rae-training"):
21
+ """Push RAE training data as a HuggingFace Dataset."""
22
+ from huggingface_hub import HfApi, create_repo
23
+ from datasets import Dataset, DatasetDict
24
+ import jsonlines
25
+
26
+ api = HfApi(token=token)
27
+ user_info = api.whoami()
28
+ username = user_info["name"]
29
+ dataset_repo = f"{username}/{repo_prefix}-data"
30
+
31
+ print(f"Pushing dataset to: {dataset_repo}")
32
+
33
+ # Create repo
34
+ try:
35
+ create_repo(dataset_repo, repo_type="dataset", private=False, token=token)
36
+ except Exception as e:
37
+ print(f" (repo exists: {e})")
38
+
39
+ # Load JSONL files
40
+ def load_jsonl(path):
41
+ examples = []
42
+ with open(path) as f:
43
+ for line in f:
44
+ data = json.loads(line)
45
+ # Flatten for HF Dataset format
46
+ examples.append({
47
+ "messages": json.dumps(data["messages"]),
48
+ "domain": data.get("metadata", {}).get("domain", "general"),
49
+ "difficulty": data.get("metadata", {}).get("difficulty", "medium"),
50
+ "rae_version": data.get("metadata", {}).get("rae_version", "1.0"),
51
+ })
52
+ return examples
53
+
54
+ train_data = load_jsonl("data/rae_training_data/train.jsonl")
55
+ eval_data = load_jsonl("data/rae_training_data/validation.jsonl")
56
+
57
+ ds = DatasetDict({
58
+ "train": Dataset.from_list(train_data),
59
+ "validation": Dataset.from_list(eval_data),
60
+ })
61
+
62
+ ds.push_to_hub(dataset_repo, token=token)
63
+
64
+ # Upload README
65
+ readme = f"""---
66
+ dataset_info:
67
+ features:
68
+ - name: messages
69
+ dtype: string
70
+ - name: domain
71
+ dtype: string
72
+ - name: difficulty
73
+ dtype: string
74
+ - name: rae_version
75
+ dtype: string
76
+ splits:
77
+ - name: train
78
+ num_examples: {len(train_data)}
79
+ - name: validation
80
+ num_examples: {len(eval_data)}
81
+ tags:
82
+ - cognitive-architecture
83
+ - chain-of-thought
84
+ - structured-reasoning
85
+ - RAE
86
+ license: apache-2.0
87
+ ---
88
+
89
+ # RAE Training Data β€” Recursive Abstraction Engine
90
+
91
+ Training data structured as 4-phase RAE cognitive cycles for fine-tuning LLMs.
92
+
93
+ ## Methodology: The Handwriting Principle
94
+
95
+ Handwriting activates widespread brain connectivity because it forces *generative
96
+ reconstruction through multiple representational modalities simultaneously under
97
+ a temporal bottleneck*.
98
+
99
+ This dataset replicates that effect for ML training: each example forces the model
100
+ through **Saturation β†’ Abstraction β†’ Descent β†’ Integration** phases, with every
101
+ phase contributing to loss β€” preventing shortcutting to the answer.
102
+
103
+ ## Data Format
104
+
105
+ Each example contains structured `messages` with the RAE phase tags:
106
+
107
+ ```
108
+ <SATURATION>...</SATURATION>
109
+ <ABSTRACTION>...</ABSTRACTION>
110
+ <DESCENT>...</DESCENT>
111
+ <INTEGRATION>...</INTEGRATION>
112
+ ```
113
+
114
+ ## Usage
115
+
116
+ ```python
117
+ from datasets import load_dataset
118
+ ds = load_dataset("{dataset_repo}")
119
+ ```
120
+
121
+ ## Training
122
+
123
+ See the companion training package: [{username}/{repo_prefix}](https://huggingface.co/{username}/{repo_prefix})
124
+ """
125
+
126
+ api.upload_file(
127
+ path_or_fileobj=readme.encode(),
128
+ path_in_repo="README.md",
129
+ repo_id=dataset_repo,
130
+ repo_type="dataset",
131
+ token=token,
132
+ )
133
+
134
+ print(f"βœ“ Dataset pushed: https://huggingface.co/datasets/{dataset_repo}")
135
+ return dataset_repo
136
+
137
+
138
+ def push_training_config(token: str, repo_prefix: str = "rae-training"):
139
+ """Push training configs and scripts as a Model repo (pre-training)."""
140
+ from huggingface_hub import HfApi, create_repo
141
+
142
+ api = HfApi(token=token)
143
+ user_info = api.whoami()
144
+ username = user_info["name"]
145
+ model_repo = f"{username}/{repo_prefix}"
146
+
147
+ print(f"Pushing training package to: {model_repo}")
148
+
149
+ # Create repo
150
+ try:
151
+ create_repo(model_repo, repo_type="model", private=False, token=token)
152
+ except Exception as e:
153
+ print(f" (repo exists: {e})")
154
+
155
+ # Upload files
156
+ files = [
157
+ "configs/autotrain_rae_sft.yaml",
158
+ "configs/rae_training_config.json",
159
+ "configs/base_models.json",
160
+ "src/train_rae.py",
161
+ "src/rae_loss.py",
162
+ "src/dataset_generator.py",
163
+ "src/rae_data_formatter.py",
164
+ "src/rae_tokenizer_utils.py",
165
+ "evaluation/eval_rae_model.py",
166
+ "evaluation/benchmarks.json",
167
+ "requirements.txt",
168
+ "README.md",
169
+ ]
170
+
171
+ if Path("THEORY.md").exists():
172
+ files.append("THEORY.md")
173
+
174
+ for filepath in files:
175
+ if Path(filepath).exists():
176
+ api.upload_file(
177
+ path_or_fileobj=filepath,
178
+ path_in_repo=filepath,
179
+ repo_id=model_repo,
180
+ repo_type="model",
181
+ token=token,
182
+ )
183
+ print(f" βœ“ {filepath}")
184
+ else:
185
+ print(f" ⚠ skipped: {filepath}")
186
+
187
+ print(f"βœ“ Training package pushed: https://huggingface.co/{model_repo}")
188
+ return model_repo
189
+
190
+
191
+ def main():
192
+ parser = argparse.ArgumentParser(description="Push RAE Training to HuggingFace")
193
+ parser.add_argument("--dataset", action="store_true", help="Push dataset")
194
+ parser.add_argument("--config", action="store_true", help="Push training configs")
195
+ parser.add_argument("--all", action="store_true", help="Push everything")
196
+ parser.add_argument("--repo_prefix", default="rae-training", help="Repo name prefix")
197
+ args = parser.parse_args()
198
+
199
+ token = os.environ.get("HF_TOKEN")
200
+ if not token:
201
+ print("Set HF_TOKEN environment variable")
202
+ sys.exit(1)
203
+
204
+ if args.all or args.dataset:
205
+ push_dataset(token, args.repo_prefix)
206
+
207
+ if args.all or args.config:
208
+ push_training_config(token, args.repo_prefix)
209
+
210
+ if not (args.all or args.dataset or args.config):
211
+ print("Specify --dataset, --config, or --all")
212
+
213
+
214
+ if __name__ == "__main__":
215
+ main()