#!/usr/bin/env python3
"""
AETHER Training — CPU Compatible.
Runs on CPU, Colab CPU, or any machine. Slower but works everywhere.
Uses SFT (Supervised Fine-Tuning) instead of GRPO for CPU efficiency.
"""

import os
import json
import logging
from typing import List

import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig

# Configure the root logger at INFO level so progress messages are shown,
# and create the named logger used by the rest of this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("AETHER.CPU")


def prepare_aether_dataset():
    """Build the single-column ``text`` dataset used for SFT.

    The DeepMath-103K train split is attempted first: every row is rendered
    into the ``### Instruction`` / ``### Response`` template and all columns
    other than ``text`` are dropped. If any step of that path raises, a
    warning is logged and a built-in set of ten hand-written reasoning
    examples, repeated 100 times, is returned instead.
    """

    def _to_sft_text(row):
        # The prompt may be a chat-style list of messages; keep user turns only.
        question = row.get("prompt", "")
        if isinstance(question, list):
            user_turns = (
                msg.get("content", "")
                for msg in question
                if msg.get("role") == "user"
            )
            question = "\n".join(user_turns)
        answer = row.get("solution", "")
        rendered = (
            f"### Instruction:\n{question}\n\n"
            "### Response:\n<think>\n"
            "Let me think step by step about this problem.\n\n"
            "Step 1: Analyze the given information.\n"
            "Step 2: Apply relevant mathematical principles.\n"
            "Step 3: Calculate the result carefully.\n\n"
            f"Therefore, the answer is {answer}.\n</think>\n\n"
            f"The answer is {answer}."
        )
        return {"text": rendered}

    # Preferred source: DeepMath, converted to plain text for SFT.
    try:
        deepmath = load_dataset("trl-lib/DeepMath-103K", split="train")
        logger.info(f"Loaded DeepMath: {len(deepmath)} examples")

        deepmath = deepmath.map(_to_sft_text)
        extra_columns = [name for name in deepmath.column_names if name != "text"]
        return deepmath.remove_columns(extra_columns)
    except Exception as err:
        logger.warning(f"DeepMath failed: {err}")

    # Fallback source: hand-written AETHER reasoning examples.
    logger.info("Creating AETHER reasoning dataset...")

    seed_texts = (
        "### Instruction:\nThink step by step: What is 17 + 25?\n\n### Response:\n<think>\nStep 1: Break down the numbers. 17 + 25.\nStep 2: Add the ones place: 7 + 5 = 12. Write down 2, carry 1.\nStep 3: Add the tens place: 1 + 2 + 1 (carry) = 4.\nStep 4: Combine: 42.\n\nTherefore, 17 + 25 = 42.\n</think>\n\nThe answer is 42.",
        "### Instruction:\nPlan and reason: A farmer has 3 fields. Each produces 42 bushels. Total?\n\n### Response:\n<think>\nStep 1: Identify given information. 3 fields, 42 bushels each.\nStep 2: Recognize this as multiplication: 3 × 42.\nStep 3: Calculate: 3 × 40 = 120, 3 × 2 = 6.\nStep 4: Sum: 120 + 6 = 126.\n\nTherefore, total bushels = 126.\n</think>\n\nThe answer is 126 bushels.",
        "### Instruction:\nReflect and improve: Your answer was 50. Correct answer is 60. What went wrong?\n\n### Response:\n<think>\nStep 1: Evaluate the error. The difference is 10 (60 - 50 = 10).\nStep 2: Check calculation steps. Likely missed adding a carry or misread a digit.\nStep 3: Identify root cause: Insufficient verification of intermediate steps.\nStep 4: Improvement plan: Always verify by reverse calculation (60 - 10 = 50 confirms error magnitude).\n\nTherefore, I need to improve my verification process.\n</think>\n\nI made a calculation error, likely missing a carry. I'll add verification steps in future.",
        "### Instruction:\nWhy does recursive self-improvement require safety constraints?\n\n### Response:\n<think>\nStep 1: Define recursive self-improvement. A system that modifies its own architecture to improve performance.\nStep 2: Consider failure modes. Unconstrained modification could: (a) remove safety checks, (b) optimize for wrong metrics, (c) cause instability.\nStep 3: Analyze historical examples. Unconstrained optimization leads to exploitation of loopholes (reward hacking).\nStep 4: Apply control theory. Any self-modifying system needs invariant constraints to maintain stability.\nStep 5: Propose solution. Sandboxed evaluation, human oversight, and constrained mutation operators.\n\nTherefore, safety constraints are necessary to maintain beneficial behavior during self-improvement.\n</think>\n\nSafety constraints prevent harmful self-modification, maintain stability, and ensure alignment with human values during recursive improvement.",
        "### Instruction:\nHow would you build a self-evolving AI system? Break into phases.\n\n### Response:\n<think>\nPhase 1 (Foundation): Establish base architecture with neuro-symbolic integration. Include knowledge graph, neural networks, and symbolic reasoning.\nPhase 2 (Evolution Loop): Implement MAP-Elites archive for diversity preservation. Use Performance-Novelty selection.\nPhase 3 (Hierarchical Control): Deploy macro-policy for blueprint generation and micro-policy for execution. Apply iterative co-evolution.\nPhase 4 (Multi-Agent): Coordinate manager, executor, and developer agents. Enable experience sharing between agents.\nPhase 5 (Safety): Add sandboxed evaluation, architecture validation, and audit logging. Require human oversight for major changes.\nPhase 6 (Meta-Cognition): Implement self-reflection to analyze performance trends and generate evolution directives.\n\nTherefore, a self-evolving AI requires careful balance between capability and safety.\n</think>\n\nA self-evolving AI needs: neuro-symbolic foundation, evolutionary loop, hierarchical control, multi-agent coordination, safety constraints, and meta-cognitive reflection.",
        "### Instruction:\nTrade-offs between symbolic and neural reasoning?\n\n### Response:\n<think>\nStep 1: Symbolic reasoning strengths. Interpretable, exact, compositional, supports formal verification.\nStep 2: Symbolic weaknesses. Brittle with noisy data, requires manual rule engineering, limited pattern generalization.\nStep 3: Neural reasoning strengths. Robust to noise, learns from data, excellent pattern recognition, handles unstructured input.\nStep 4: Neural weaknesses. Opaque (black box), no formal guarantees, can hallucinate, requires massive data.\nStep 5: Integration approach. Use neural for perception and pattern learning, symbolic for reasoning and verification.\n\nTherefore, neuro-symbolic integration combines the best of both paradigms.\n</think>\n\nSymbolic = interpretable/exact but brittle. Neural = robust/flexible but opaque. Integration is optimal.",
        "### Instruction:\nDesign a multi-agent system with manager and workers.\n\n### Response:\n<think>\nBlueprint:\n1. Manager Agent (Macro-Policy): Decomposes tasks, generates sub-goal blueprints, coordinates workers.\n2. Executor Agent (Micro-Policy): Executes atomic actions conditioned on active sub-goal.\n3. Developer Agent: Creates new tools when capability gaps detected.\n4. Integrator Agent: Merges worker outputs into coherent final response.\n5. Safety Agent: Validates all outputs against constraints before deployment.\n\nCoordination: Manager uses dynamic routing weights to assign tasks. Workers report progress. Experience sharing via group traces.\n\nTraining: Use MLPO — train only the manager with GRPO, peers remain untrained for efficiency.\n</think>\n\nMulti-agent system: Manager (planning), Executor (action), Developer (tools), Integrator (merging), Safety (validation). Train manager only via MLPO.",
        "### Instruction:\nEvolve a sorting algorithm iteratively.\n\n### Response:\n<think>\nGeneration 0: Bubble Sort. Simple but O(n²).\n\nGeneration 1 (Mutation): Quick Sort. Divide and conquer. O(n log n) average.\nEvaluation: Better average case but O(n²) worst case. Unstable.\n\nGeneration 2 (Crossover): Merge Sort. Guaranteed O(n log n). Stable.\nEvaluation: Consistent performance. Needs O(n) extra space.\n\nGeneration 3 (Innovation): Timsort. Hybrid of merge + insertion.\nEvaluation: O(n log n) worst, O(n) best. Adaptive to partially sorted data.\n\nTherefore, evolution yields progressively better solutions through mutation, evaluation, and selection.\n</think>\n\nEvolution path: Bubble → Quick → Merge → Timsort. Each generation evaluated for time complexity, stability, and space usage.",
        "### Instruction:\nPenguins are birds. All birds can fly. What about penguins?\n\n### Response:\n<think>\nStep 1: Parse premises. P(Bird|Penguin) = 1, P(Fly|Bird) = 1.\nStep 2: Apply transitive reasoning. If A→B and B→C, then A→C.\nStep 3: Derive conclusion. Penguin → Bird → Fly, therefore Penguin → Fly.\nStep 4: Check against knowledge. Real-world penguins cannot fly — premises contain false generalization.\nStep 5: Identify failure. The premise \"all birds can fly\" is incorrect (counterexample: penguins, ostriches).\n\nTherefore, symbolic reasoning reveals the inconsistency in the premises. The conclusion is invalid due to false premise.\n</think>\n\nThe conclusion that penguins can fly is invalid because the premise \"all birds can fly\" is false. Penguins are a counterexample.",
        "### Instruction:\nEvaluate your own reasoning and identify biases.\n\n### Response:\n<think>\nSelf-Evaluation:\n1. Confirmation bias: Do I favor evidence supporting my initial hypothesis? Check by actively seeking disconfirming evidence.\n2. Anchoring bias: Am I over-relying on the first piece of information? Mitigate by considering multiple starting points.\n3. Availability bias: Am I using easily recalled examples instead of statistical data? Counter with systematic data collection.\n4. Overconfidence: Is my confidence calibrated to actual accuracy? Use prediction markets or track calibration scores.\n5. Framing effects: Does problem phrasing influence my conclusion? Test with multiple equivalent formulations.\n\nImprovement: Apply adversarial reasoning — deliberately argue against my own conclusion.\n\nTherefore, meta-cognitive reflection improves reasoning quality through bias identification.\n</think>\n\nKey biases: confirmation, anchoring, availability, overconfidence, framing. Mitigation: adversarial reasoning, systematic data, calibration tracking.",
    )
    records = [{"text": sample} for sample in seed_texts]

    # Repeat the ten examples 100x so the dataset has a usable size.
    return Dataset.from_list(records * 100)


def main():
    """Fine-tune a causal language model with SFT on the AETHER dataset.

    Configuration is read from environment variables:

    * ``AETHER_MODEL``: model id or path to fine-tune
      (default ``Qwen/Qwen2.5-0.5B-Instruct``).
    * ``AETHER_OUTPUT``: directory for checkpoints and the final model
      (default ``./aether-output-cpu``).
    * ``AETHER_HUB_ID``: Hub repository id used when pushing
      (default ``camdog920/aether-qwen-0.5b-sft``).
    * ``AETHER_PUSH_TO_HUB``: set to ``0``/``false``/``no``/``off`` to keep
      the run local; any other value (or unset) pushes to the Hub.

    Precision is chosen from the hardware: plain fp32 on CPU, bf16 on CUDA
    devices that report bf16 support, and fp32 weights with fp16 mixed
    precision on other CUDA devices.

    The dataset from ``prepare_aether_dataset()`` is split 90/10 into train
    and eval sets with a fixed seed. After training, the model and tokenizer
    are saved to the output directory.
    """
    model_name = os.environ.get("AETHER_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
    output_dir = os.environ.get("AETHER_OUTPUT", "./aether-output-cpu")
    hub_model_id = os.environ.get("AETHER_HUB_ID", "camdog920/aether-qwen-0.5b-sft")
    # Pushing stays on by default; it can be disabled for runs without Hub
    # credentials.
    push_flag = os.environ.get("AETHER_PUSH_TO_HUB", "1").strip().lower()
    push_to_hub = push_flag not in ("0", "false", "no", "off")

    logger.info("=" * 60)
    logger.info("AETHER CPU Training — SFT with Reasoning Dataset")
    logger.info("=" * 60)
    logger.info(f"Model: {model_name}")
    logger.info(f"Output: {output_dir}")
    logger.info(f"Hub: {hub_model_id}")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    logger.info(f"Device: {device}")

    # Precision selection:
    #   * CPU            -> fp32 weights, no mixed precision.
    #   * CUDA with bf16 -> bf16 weights and bf16 training.
    #   * other CUDA     -> fp32 weights with fp16 mixed precision (fp16
    #                       weights would break gradient unscaling).
    # Mixed precision must never be requested on a CPU-only host.
    use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
    use_fp16 = use_cuda and not use_bf16
    dtype = torch.bfloat16 if use_bf16 else torch.float32

    logger.info("Loading model...")
    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code at load time. AETHER_MODEL comes from the environment,
    # so only point it at repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto" if use_cuda else None,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Batches need a pad token; reuse EOS when the tokenizer has none.
        tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    logger.info("Preparing dataset...")
    dataset = prepare_aether_dataset()
    # Fixed seed so the train/eval split is the same on every run.
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_ds = dataset["train"]
    eval_ds = dataset["test"]
    logger.info(f"Train: {len(train_ds)}, Eval: {len(eval_ds)}")

    # Training args
    # NOTE(review): `max_seq_length` here and `tokenizer=` below are the
    # argument names of the TRL version this script was written for; newer
    # TRL releases appear to rename them (`max_length`, `processing_class`).
    # Confirm against the installed TRL version.
    training_args = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=3,  # More epochs for smaller dataset
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        logging_steps=10,
        save_steps=200,
        eval_strategy="steps",
        eval_steps=100,
        max_seq_length=512,
        bf16=use_bf16,
        fp16=use_fp16,
        report_to=[],
        disable_tqdm=False,
        logging_first_step=True,
        push_to_hub=push_to_hub,
        hub_model_id=hub_model_id,
    )

    # Trainer
    logger.info("Initializing SFT Trainer...")
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
    )

    # Train
    logger.info("Starting training...")
    trainer.train()

    # Save
    logger.info("Saving model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    logger.info("=" * 60)
    logger.info("Training complete!")
    if push_to_hub:
        logger.info(f"Model: https://huggingface.co/{hub_model_id}")
    else:
        # Nothing was uploaded, so point at the local artifacts instead.
        logger.info(f"Model saved locally: {output_dir}")
    logger.info("=" * 60)

# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()