TrueV1sion123 committed on
Commit
3143539
·
verified ·
1 Parent(s): 6970bcf

Upload src/rae_data_formatter.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/rae_data_formatter.py +265 -0
src/rae_data_formatter.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAE Data Formatter
3
+ ═══════════════════════════════════════════════════════════════
4
+ Converts existing datasets into RAE-structured format.
5
+
6
+ Supports converting:
7
+ 1. Standard Q&A datasets β†’ RAE-structured chat
8
+ 2. Chain-of-thought datasets β†’ RAE phases (mapping reasoning steps to phases)
9
+ 3. Code datasets β†’ RAE-structured code reasoning
10
+ 4. Custom formats via pluggable formatters
11
+ ═══════════════════════════════════════════════════════════════
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from pathlib import Path
17
+ from typing import Callable, Optional
18
+
19
+ from rae_tokenizer_utils import PHASE_TAGS, validate_rae_response
20
+
21
+
22
# ── System Prompts by Domain ──────────────────────────────────

# Domain-keyed system prompts. Each instructs the model to answer through the
# four RAE phases (SATURATION → ABSTRACTION → DESCENT → INTEGRATION) with XML
# phase tags, phrased for that domain. Formatters look prompts up with
# SYSTEM_PROMPTS.get(domain, SYSTEM_PROMPTS["general"]), so "general" is the
# fallback for any unknown domain key.
SYSTEM_PROMPTS = {
    # Default / fallback prompt.
    "general": (
        "You are an RAE-trained cognitive reasoner. For every problem, "
        "work through all four phases: SATURATION (explore without judgment), "
        "ABSTRACTION (extract minimal structure), DESCENT (concrete implementation), "
        "INTEGRATION (meta-learning). Use XML phase tags."
    ),
    # Coding tasks (used by format_code_alpaca).
    "code": (
        "You are an RAE-trained software engineer. For every coding task, "
        "work through: SATURATION (understand requirements, edge cases, constraints), "
        "ABSTRACTION (identify core algorithm/pattern), DESCENT (implement and test), "
        "INTEGRATION (what was learned, what generalizes). Use XML phase tags."
    ),
    # Strategic/analytical tasks (no built-in formatter uses this yet).
    "analysis": (
        "You are an RAE-trained strategic analyst. For every analysis, "
        "work through: SATURATION (gather all signals, flag anomalies), "
        "ABSTRACTION (identify root mechanism), DESCENT (specific predictions and recommendations), "
        "INTEGRATION (confidence assessment, what would change the conclusion). Use XML phase tags."
    ),
    # Step-by-step reasoning tasks (used by format_gsm8k).
    "reasoning": (
        "You are an RAE-trained reasoner. For every problem, "
        "work through: SATURATION (map the full problem space without premature conclusions), "
        "ABSTRACTION (what's the underlying structure?), DESCENT (test implications concretely), "
        "INTEGRATION (update beliefs, identify next questions). Use XML phase tags."
    ),
}
50
+
51
+
52
def cot_to_rae(
    question: str,
    chain_of_thought: str,
    answer: str,
    domain: str = "general",
) -> Optional[dict]:
    """
    Convert a chain-of-thought example to RAE structure.

    Heuristic mapping of the CoT sentences onto the four phases:
      - first ~30%           → Saturation (exploration/observation)
      - next  ~20%           → Abstraction (key insight identification)
      - next  ~30%           → Descent (working through specifics)
      - final ~20% + answer  → Integration (conclusion + meta-learning)

    Returns None when the chain has fewer than four sentences.
    """
    # Sentence-split on terminal punctuation, dropping empty fragments.
    sentences = [part.strip() for part in re.split(r'[.!?]+', chain_of_thought) if part.strip()]
    count = len(sentences)

    if count < 4:
        return None  # Too short to meaningfully decompose

    # Phase cut points at 30% / 50% / 80% of the sentence list; for count >= 4
    # every slice below is guaranteed non-empty.
    cut_sat, cut_abs, cut_desc = (int(count * frac) for frac in (0.3, 0.5, 0.8))

    def stitch(chunk):
        # Re-join a run of sentences with normalized terminal periods.
        return ". ".join(chunk) + "."

    phase_texts = [
        ("SATURATION", stitch(sentences[:cut_sat])),
        ("ABSTRACTION", stitch(sentences[cut_sat:cut_abs])),
        ("DESCENT", stitch(sentences[cut_abs:cut_desc])),
        # Integration keeps the tail sentences and appends the final answer.
        ("INTEGRATION", ". ".join(sentences[cut_desc:]) + f"\n\nFinal answer: {answer}"),
    ]

    # Tagged response body, one blank line between phases.
    rae_response = "\n\n".join(f"<{tag}>\n{text}\n</{tag}>" for tag, text in phase_texts)

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPTS.get(domain, SYSTEM_PROMPTS["general"])},
            {"role": "user", "content": question},
            {"role": "assistant", "content": rae_response},
        ],
        "metadata": {
            "domain": domain,
            "source_format": "cot",
            "rae_version": "1.0",
        }
    }
104
+
105
+
106
def qa_to_rae(
    question: str,
    answer: str,
    domain: str = "general",
    explanation: str = "",
) -> dict:
    """
    Convert a simple Q&A pair to RAE structure.

    With no reasoning chain available, this builds a minimal RAE scaffold
    (question restated → generic abstraction → answer → generic takeaway)
    that the model will learn to fill richly.
    """
    # Fall back to a generic exploration line when no explanation is supplied.
    exploration = explanation or "Let me explore the problem space."

    phases = [
        ("SATURATION", f"The question asks: {question}\nKey elements to consider: {exploration}"),
        ("ABSTRACTION", "The core structure of this problem is about identifying the right approach."),
        ("DESCENT", answer),
        ("INTEGRATION", "This reinforces the principle that careful problem decomposition leads to clearer solutions."),
    ]
    # Tagged response body, one blank line between phases.
    rae_response = "\n\n".join(f"<{tag}>\n{body}\n</{tag}>" for tag, body in phases)

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPTS.get(domain, SYSTEM_PROMPTS["general"])},
            {"role": "user", "content": question},
            {"role": "assistant", "content": rae_response},
        ],
        "metadata": {
            "domain": domain,
            "source_format": "qa",
            "rae_version": "1.0",
        }
    }
149
+
150
+
151
def convert_hf_dataset(
    dataset_name: str,
    formatter: Callable,
    output_path: str,
    max_examples: int = 1000,
    train_split: str = "train",
    config: Optional[str] = None,
):
    """
    Convert a HuggingFace dataset to RAE format and write it as JSONL.

    Args:
        dataset_name: HF dataset identifier (e.g., "gsm8k")
        formatter: Function converting a single example; returns a dict with
            a "messages" list (last entry is the assistant turn), or a falsy
            value to skip the example.
        output_path: Where to write the JSONL output
        max_examples: Maximum examples to convert
        train_split: Which split to use
        config: Optional dataset configuration name (e.g., "main" for gsm8k);
            previously there was no way to pass one, so config-requiring
            datasets in FORMATTERS could not be loaded.

    Returns:
        Number of examples written.
    """
    from datasets import load_dataset

    print(f"Loading {dataset_name}...")
    # Pass the config only when given so configless datasets load as before.
    if config is not None:
        dataset = load_dataset(dataset_name, config, split=train_split)
    else:
        dataset = load_dataset(dataset_name, split=train_split)

    output = Path(output_path)
    output.parent.mkdir(parents=True, exist_ok=True)

    converted = 0
    skipped = 0

    # Explicit encoding so output bytes don't depend on the platform default.
    with open(output, "w", encoding="utf-8") as f:
        for example in dataset:  # index was unused; plain iteration suffices
            if converted >= max_examples:
                break

            result = formatter(example)
            if result:
                # Keep fully valid responses, or near-valid ones with >= 3 phases.
                validation = validate_rae_response(result["messages"][-1]["content"])
                if validation["is_valid"] or len(validation["phases_found"]) >= 3:
                    f.write(json.dumps(result) + "\n")
                    converted += 1
                else:
                    skipped += 1
            else:
                skipped += 1

    print(f"Converted {converted} examples ({skipped} skipped) → {output}")
    return converted
197
+
198
+
199
# ── Pre-built Formatters for Popular Datasets ─────────────────

def format_gsm8k(example: dict) -> Optional[dict]:
    """
    Format a GSM8K math-reasoning example to RAE.

    GSM8K answers are reasoning steps followed by "#### <final answer>".

    Returns None (via cot_to_rae) when the reasoning chain is too short.
    """
    question = example.get("question", "")
    answer_text = example.get("answer", "")

    # partition() cleanly handles the no-"####" case; strip both pieces
    # unconditionally (the original only stripped when "####" was present).
    reasoning, _, final_answer = answer_text.partition("####")
    reasoning = reasoning.strip()
    final_answer = final_answer.strip()

    return cot_to_rae(question, reasoning, final_answer, domain="reasoning")
212
+
213
+
214
def format_code_alpaca(example: dict) -> Optional[dict]:
    """
    Format a Code Alpaca example to RAE.

    Includes the optional "input" context field in the question when present;
    dropping it would lose part of the task statement.
    (Assumes the standard Alpaca instruction/input/output schema —
    TODO(review): confirm against the actual dataset records.)
    """
    instruction = example.get("instruction", "")
    context = example.get("input", "")
    if context:
        instruction = f"{instruction}\n\n{context}"
    output = example.get("output", "")

    return qa_to_rae(instruction, output, domain="code")
220
+
221
+
222
def format_openassistant(example: dict) -> Optional[dict]:
    """
    Format an OpenAssistant (Guanaco) conversation to RAE.

    Returns None for records with empty text; otherwise wraps the whole
    conversation text in a minimal RAE scaffold via qa_to_rae.
    """
    conversation = example.get("text", "")
    if not conversation:
        return None

    # Simple: wrap the whole response in RAE structure.
    prompt = "Respond helpfully to the following conversation."
    return qa_to_rae(prompt, conversation, domain="general")
234
+
235
+
236
# ── Available Formatters Registry ─────────────────────────────

# Ready-made conversions, keyed by the name accepted by the --dataset CLI flag.
# Each value is a (hf_dataset_id, config_name, formatter_fn) triple;
# config_name is None when the dataset needs no configuration.
FORMATTERS = {
    "gsm8k": ("gsm8k", "main", format_gsm8k),
    "code_alpaca": ("sahil2801/CodeAlpaca-20k", None, format_code_alpaca),
    "openassistant": ("timdettmers/openassistant-guanaco", None, format_openassistant),
}
243
+
244
+
245
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert HF datasets to RAE format")
    parser.add_argument("--dataset", type=str, required=True, choices=list(FORMATTERS.keys()))
    parser.add_argument("--output", type=str, default="data/rae_training_data/converted.jsonl")
    parser.add_argument("--max_examples", type=int, default=500)

    args = parser.parse_args()

    dataset_id, config, formatter = FORMATTERS[args.dataset]

    # Removed a dead `from datasets import load_dataset` and an unused
    # `split_name` variable: convert_hf_dataset loads the dataset itself.
    #
    # NOTE(review): `config` (e.g. "main" for gsm8k) is currently dropped
    # because convert_hf_dataset's published signature does not accept it, so
    # config-requiring datasets will fail to load. TODO: thread it through.
    convert_hf_dataset(
        dataset_name=dataset_id,
        formatter=formatter,
        output_path=args.output,
        max_examples=args.max_examples,
    )