TrueV1sion123 committed on
Commit
3143539
·
verified ·
1 Parent(s): 6970bcf

Upload src/rae_data_formatter.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/rae_data_formatter.py +265 -0
src/rae_data_formatter.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAE Data Formatter
3
+ ═══════════════════════════════════════════════════════════════
4
+ Converts existing datasets into RAE-structured format.
5
+
6
+ Supports converting:
7
+ 1. Standard Q&A datasets β†’ RAE-structured chat
8
+ 2. Chain-of-thought datasets β†’ RAE phases (mapping reasoning steps to phases)
9
+ 3. Code datasets β†’ RAE-structured code reasoning
10
+ 4. Custom formats via pluggable formatters
11
+ ═══════════════════════════════════════════════════════════════
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from pathlib import Path
17
+ from typing import Callable, Optional
18
+
19
+ from rae_tokenizer_utils import PHASE_TAGS, validate_rae_response
20
+
21
+
22
# ── System Prompts by Domain ──────────────────────────────────

# Domain-keyed system prompts. Each instructs the model to answer through the
# four RAE phases (SATURATION → ABSTRACTION → DESCENT → INTEGRATION) with XML
# phase tags, phrased for that domain. Formatters look prompts up with
# SYSTEM_PROMPTS.get(domain, SYSTEM_PROMPTS["general"]), so "general" is the
# fallback for any unknown domain key.
SYSTEM_PROMPTS = {
    # Default / fallback prompt.
    "general": (
        "You are an RAE-trained cognitive reasoner. For every problem, "
        "work through all four phases: SATURATION (explore without judgment), "
        "ABSTRACTION (extract minimal structure), DESCENT (concrete implementation), "
        "INTEGRATION (meta-learning). Use XML phase tags."
    ),
    # Coding tasks (used by format_code_alpaca).
    "code": (
        "You are an RAE-trained software engineer. For every coding task, "
        "work through: SATURATION (understand requirements, edge cases, constraints), "
        "ABSTRACTION (identify core algorithm/pattern), DESCENT (implement and test), "
        "INTEGRATION (what was learned, what generalizes). Use XML phase tags."
    ),
    # Strategic/analytical tasks (no built-in formatter uses this yet).
    "analysis": (
        "You are an RAE-trained strategic analyst. For every analysis, "
        "work through: SATURATION (gather all signals, flag anomalies), "
        "ABSTRACTION (identify root mechanism), DESCENT (specific predictions and recommendations), "
        "INTEGRATION (confidence assessment, what would change the conclusion). Use XML phase tags."
    ),
    # Step-by-step reasoning tasks (used by format_gsm8k).
    "reasoning": (
        "You are an RAE-trained reasoner. For every problem, "
        "work through: SATURATION (map the full problem space without premature conclusions), "
        "ABSTRACTION (what's the underlying structure?), DESCENT (test implications concretely), "
        "INTEGRATION (update beliefs, identify next questions). Use XML phase tags."
    ),
}
50
+
51
+
52
def cot_to_rae(
    question: str,
    chain_of_thought: str,
    answer: str,
    domain: str = "general",
) -> Optional[dict]:
    """
    Convert a chain-of-thought example to RAE structure.

    Heuristic mapping of the CoT sentences onto the four phases:
      - first ~30%           → Saturation (exploration/observation)
      - next  ~20%           → Abstraction (key insight identification)
      - next  ~30%           → Descent (working through specifics)
      - final ~20% + answer  → Integration (conclusion + meta-learning)

    Returns None when the chain has fewer than four sentences.
    """
    # Sentence-split on terminal punctuation, dropping empty fragments.
    sentences = [part.strip() for part in re.split(r'[.!?]+', chain_of_thought) if part.strip()]
    count = len(sentences)

    if count < 4:
        return None  # Too short to meaningfully decompose

    # Phase cut points at 30% / 50% / 80% of the sentence list; for count >= 4
    # every slice below is guaranteed non-empty.
    cut_sat, cut_abs, cut_desc = (int(count * frac) for frac in (0.3, 0.5, 0.8))

    def stitch(chunk):
        # Re-join a run of sentences with normalized terminal periods.
        return ". ".join(chunk) + "."

    phase_texts = [
        ("SATURATION", stitch(sentences[:cut_sat])),
        ("ABSTRACTION", stitch(sentences[cut_sat:cut_abs])),
        ("DESCENT", stitch(sentences[cut_abs:cut_desc])),
        # Integration keeps the tail sentences and appends the final answer.
        ("INTEGRATION", ". ".join(sentences[cut_desc:]) + f"\n\nFinal answer: {answer}"),
    ]

    # Tagged response body, one blank line between phases.
    rae_response = "\n\n".join(f"<{tag}>\n{text}\n</{tag}>" for tag, text in phase_texts)

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPTS.get(domain, SYSTEM_PROMPTS["general"])},
            {"role": "user", "content": question},
            {"role": "assistant", "content": rae_response},
        ],
        "metadata": {
            "domain": domain,
            "source_format": "cot",
            "rae_version": "1.0",
        }
    }
104
+
105
+
106
def qa_to_rae(
    question: str,
    answer: str,
    domain: str = "general",
    explanation: str = "",
) -> dict:
    """
    Convert a simple Q&A pair to RAE structure.

    With no reasoning chain available, this builds a minimal RAE scaffold
    (question restated → generic abstraction → answer → generic takeaway)
    that the model will learn to fill richly.
    """
    # Fall back to a generic exploration line when no explanation is supplied.
    exploration = explanation or "Let me explore the problem space."

    phases = [
        ("SATURATION", f"The question asks: {question}\nKey elements to consider: {exploration}"),
        ("ABSTRACTION", "The core structure of this problem is about identifying the right approach."),
        ("DESCENT", answer),
        ("INTEGRATION", "This reinforces the principle that careful problem decomposition leads to clearer solutions."),
    ]
    # Tagged response body, one blank line between phases.
    rae_response = "\n\n".join(f"<{tag}>\n{body}\n</{tag}>" for tag, body in phases)

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPTS.get(domain, SYSTEM_PROMPTS["general"])},
            {"role": "user", "content": question},
            {"role": "assistant", "content": rae_response},
        ],
        "metadata": {
            "domain": domain,
            "source_format": "qa",
            "rae_version": "1.0",
        }
    }
149
+
150
+
151
def convert_hf_dataset(
    dataset_name: str,
    formatter: Callable,
    output_path: str,
    max_examples: int = 1000,
    train_split: str = "train",
    config: Optional[str] = None,
):
    """
    Convert a HuggingFace dataset to RAE format and write it as JSONL.

    Args:
        dataset_name: HF dataset identifier (e.g., "gsm8k")
        formatter: Function converting a single example; returns a dict with
            a "messages" list (last entry is the assistant turn), or a falsy
            value to skip the example.
        output_path: Where to write the JSONL output
        max_examples: Maximum examples to convert
        train_split: Which split to use
        config: Optional dataset configuration name (e.g., "main" for gsm8k);
            previously there was no way to pass one, so config-requiring
            datasets in FORMATTERS could not be loaded.

    Returns:
        Number of examples written.
    """
    from datasets import load_dataset

    print(f"Loading {dataset_name}...")
    # Pass the config only when given so configless datasets load as before.
    if config is not None:
        dataset = load_dataset(dataset_name, config, split=train_split)
    else:
        dataset = load_dataset(dataset_name, split=train_split)

    output = Path(output_path)
    output.parent.mkdir(parents=True, exist_ok=True)

    converted = 0
    skipped = 0

    # Explicit encoding so output bytes don't depend on the platform default.
    with open(output, "w", encoding="utf-8") as f:
        for example in dataset:  # index was unused; plain iteration suffices
            if converted >= max_examples:
                break

            result = formatter(example)
            if result:
                # Keep fully valid responses, or near-valid ones with >= 3 phases.
                validation = validate_rae_response(result["messages"][-1]["content"])
                if validation["is_valid"] or len(validation["phases_found"]) >= 3:
                    f.write(json.dumps(result) + "\n")
                    converted += 1
                else:
                    skipped += 1
            else:
                skipped += 1

    print(f"Converted {converted} examples ({skipped} skipped) → {output}")
    return converted
197
+
198
+
199
# ── Pre-built Formatters for Popular Datasets ─────────────────

def format_gsm8k(example: dict) -> Optional[dict]:
    """
    Format a GSM8K math-reasoning example to RAE.

    GSM8K answers are reasoning steps followed by "#### <final answer>".

    Returns None (via cot_to_rae) when the reasoning chain is too short.
    """
    question = example.get("question", "")
    answer_text = example.get("answer", "")

    # partition() cleanly handles the no-"####" case; strip both pieces
    # unconditionally (the original only stripped when "####" was present).
    reasoning, _, final_answer = answer_text.partition("####")
    reasoning = reasoning.strip()
    final_answer = final_answer.strip()

    return cot_to_rae(question, reasoning, final_answer, domain="reasoning")
212
+
213
+
214
def format_code_alpaca(example: dict) -> Optional[dict]:
    """
    Format a Code Alpaca example to RAE.

    Includes the optional "input" context field in the question when present;
    dropping it would lose part of the task statement.
    (Assumes the standard Alpaca instruction/input/output schema —
    TODO(review): confirm against the actual dataset records.)
    """
    instruction = example.get("instruction", "")
    context = example.get("input", "")
    if context:
        instruction = f"{instruction}\n\n{context}"
    output = example.get("output", "")

    return qa_to_rae(instruction, output, domain="code")
220
+
221
+
222
def format_openassistant(example: dict) -> Optional[dict]:
    """
    Format an OpenAssistant (Guanaco) conversation to RAE.

    Returns None for records with empty text; otherwise wraps the whole
    conversation text in a minimal RAE scaffold via qa_to_rae.
    """
    conversation = example.get("text", "")
    if not conversation:
        return None

    # Simple: wrap the whole response in RAE structure.
    prompt = "Respond helpfully to the following conversation."
    return qa_to_rae(prompt, conversation, domain="general")
234
+
235
+
236
# ── Available Formatters Registry ─────────────────────────────

# Ready-made conversions, keyed by the name accepted by the --dataset CLI flag.
# Each value is a (hf_dataset_id, config_name, formatter_fn) triple;
# config_name is None when the dataset needs no configuration.
FORMATTERS = {
    "gsm8k": ("gsm8k", "main", format_gsm8k),
    "code_alpaca": ("sahil2801/CodeAlpaca-20k", None, format_code_alpaca),
    "openassistant": ("timdettmers/openassistant-guanaco", None, format_openassistant),
}
243
+
244
+
245
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert HF datasets to RAE format")
    parser.add_argument("--dataset", type=str, required=True, choices=list(FORMATTERS.keys()))
    parser.add_argument("--output", type=str, default="data/rae_training_data/converted.jsonl")
    parser.add_argument("--max_examples", type=int, default=500)

    args = parser.parse_args()

    dataset_id, config, formatter = FORMATTERS[args.dataset]

    # Removed a dead `from datasets import load_dataset` and an unused
    # `split_name` variable: convert_hf_dataset loads the dataset itself.
    #
    # NOTE(review): `config` (e.g. "main" for gsm8k) is currently dropped
    # because convert_hf_dataset's published signature does not accept it, so
    # config-requiring datasets will fail to load. TODO: thread it through.
    convert_hf_dataset(
        dataset_name=dataset_id,
        formatter=formatter,
        output_path=args.output,
        max_examples=args.max_examples,
    )