#!/usr/bin/env python3
"""
AETHER Training — CPU Compatible.
Runs on CPU, Colab CPU, or any machine. Slower but works everywhere.
Uses SFT (Supervised Fine-Tuning) instead of GRPO for CPU efficiency.
"""
import os
import json
import logging
from typing import List
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
# Module-wide logger used by every function in this script.
logger = logging.getLogger("AETHER.CPU")
def _format_deepmath_example(ex):
    """Render one DeepMath row as a single SFT training string.

    Args:
        ex: A dataset row (mapping). ``prompt`` may be a plain string or a
            list of chat messages (dicts with ``role``/``content``); only the
            ``user`` messages are kept, joined by newlines. ``solution`` is
            interpolated into a fixed response template.

    Returns:
        A dict with a single ``"text"`` key holding the formatted example.
    """
    # Missing/None fields become empty strings rather than the literal "None".
    prompt = ex.get("prompt") or ""
    if isinstance(prompt, list):  # messages format
        # A None content would make str.join raise; one such row would
        # otherwise abort the whole map and trigger the tiny fallback dataset.
        prompt = "\n".join(
            (m.get("content") or "") for m in prompt if m.get("role") == "user"
        )
    solution = ex.get("solution")
    if solution is None:
        solution = ""
    return {
        "text": f"### Instruction:\n{prompt}\n\n### Response:\n\nLet me think step by step about this problem.\n\nStep 1: Analyze the given information.\nStep 2: Apply relevant mathematical principles.\nStep 3: Calculate the result carefully.\n\nTherefore, the answer is {solution}.\n\n\nThe answer is {solution}."
    }


def _fallback_reasoning_examples():
    """Return the ten hand-written AETHER reasoning examples (``{"text": ...}``)."""
    return [
        {
            "text": "### Instruction:\nThink step by step: What is 17 + 25?\n\n### Response:\n\nStep 1: Break down the numbers. 17 + 25.\nStep 2: Add the ones place: 7 + 5 = 12. Write down 2, carry 1.\nStep 3: Add the tens place: 1 + 2 + 1 (carry) = 4.\nStep 4: Combine: 42.\n\nTherefore, 17 + 25 = 42.\n\n\nThe answer is 42."
        },
        {
            "text": "### Instruction:\nPlan and reason: A farmer has 3 fields. Each produces 42 bushels. Total?\n\n### Response:\n\nStep 1: Identify given information. 3 fields, 42 bushels each.\nStep 2: Recognize this as multiplication: 3 × 42.\nStep 3: Calculate: 3 × 40 = 120, 3 × 2 = 6.\nStep 4: Sum: 120 + 6 = 126.\n\nTherefore, total bushels = 126.\n\n\nThe answer is 126 bushels."
        },
        {
            "text": "### Instruction:\nReflect and improve: Your answer was 50. Correct answer is 60. What went wrong?\n\n### Response:\n\nStep 1: Evaluate the error. The difference is 10 (60 - 50 = 10).\nStep 2: Check calculation steps. Likely missed adding a carry or misread a digit.\nStep 3: Identify root cause: Insufficient verification of intermediate steps.\nStep 4: Improvement plan: Always verify by reverse calculation (60 - 10 = 50 confirms error magnitude).\n\nTherefore, I need to improve my verification process.\n\n\nI made a calculation error, likely missing a carry. I'll add verification steps in future."
        },
        {
            "text": "### Instruction:\nWhy does recursive self-improvement require safety constraints?\n\n### Response:\n\nStep 1: Define recursive self-improvement. A system that modifies its own architecture to improve performance.\nStep 2: Consider failure modes. Unconstrained modification could: (a) remove safety checks, (b) optimize for wrong metrics, (c) cause instability.\nStep 3: Analyze historical examples. Unconstrained optimization leads to exploitation of loopholes (reward hacking).\nStep 4: Apply control theory. Any self-modifying system needs invariant constraints to maintain stability.\nStep 5: Propose solution. Sandboxed evaluation, human oversight, and constrained mutation operators.\n\nTherefore, safety constraints are necessary to maintain beneficial behavior during self-improvement.\n\n\nSafety constraints prevent harmful self-modification, maintain stability, and ensure alignment with human values during recursive improvement."
        },
        {
            "text": "### Instruction:\nHow would you build a self-evolving AI system? Break into phases.\n\n### Response:\n\nPhase 1 (Foundation): Establish base architecture with neuro-symbolic integration. Include knowledge graph, neural networks, and symbolic reasoning.\nPhase 2 (Evolution Loop): Implement MAP-Elites archive for diversity preservation. Use Performance-Novelty selection.\nPhase 3 (Hierarchical Control): Deploy macro-policy for blueprint generation and micro-policy for execution. Apply iterative co-evolution.\nPhase 4 (Multi-Agent): Coordinate manager, executor, and developer agents. Enable experience sharing between agents.\nPhase 5 (Safety): Add sandboxed evaluation, architecture validation, and audit logging. Require human oversight for major changes.\nPhase 6 (Meta-Cognition): Implement self-reflection to analyze performance trends and generate evolution directives.\n\nTherefore, a self-evolving AI requires careful balance between capability and safety.\n\n\nA self-evolving AI needs: neuro-symbolic foundation, evolutionary loop, hierarchical control, multi-agent coordination, safety constraints, and meta-cognitive reflection."
        },
        {
            "text": "### Instruction:\nTrade-offs between symbolic and neural reasoning?\n\n### Response:\n\nStep 1: Symbolic reasoning strengths. Interpretable, exact, compositional, supports formal verification.\nStep 2: Symbolic weaknesses. Brittle with noisy data, requires manual rule engineering, limited pattern generalization.\nStep 3: Neural reasoning strengths. Robust to noise, learns from data, excellent pattern recognition, handles unstructured input.\nStep 4: Neural weaknesses. Opaque (black box), no formal guarantees, can hallucinate, requires massive data.\nStep 5: Integration approach. Use neural for perception and pattern learning, symbolic for reasoning and verification.\n\nTherefore, neuro-symbolic integration combines the best of both paradigms.\n\n\nSymbolic = interpretable/exact but brittle. Neural = robust/flexible but opaque. Integration is optimal."
        },
        {
            "text": "### Instruction:\nDesign a multi-agent system with manager and workers.\n\n### Response:\n\nBlueprint:\n1. Manager Agent (Macro-Policy): Decomposes tasks, generates sub-goal blueprints, coordinates workers.\n2. Executor Agent (Micro-Policy): Executes atomic actions conditioned on active sub-goal.\n3. Developer Agent: Creates new tools when capability gaps detected.\n4. Integrator Agent: Merges worker outputs into coherent final response.\n5. Safety Agent: Validates all outputs against constraints before deployment.\n\nCoordination: Manager uses dynamic routing weights to assign tasks. Workers report progress. Experience sharing via group traces.\n\nTraining: Use MLPO — train only the manager with GRPO, peers remain untrained for efficiency.\n\n\nMulti-agent system: Manager (planning), Executor (action), Developer (tools), Integrator (merging), Safety (validation). Train manager only via MLPO."
        },
        {
            "text": "### Instruction:\nEvolve a sorting algorithm iteratively.\n\n### Response:\n\nGeneration 0: Bubble Sort. Simple but O(n²).\n\nGeneration 1 (Mutation): Quick Sort. Divide and conquer. O(n log n) average.\nEvaluation: Better average case but O(n²) worst case. Unstable.\n\nGeneration 2 (Crossover): Merge Sort. Guaranteed O(n log n). Stable.\nEvaluation: Consistent performance. Needs O(n) extra space.\n\nGeneration 3 (Innovation): Timsort. Hybrid of merge + insertion.\nEvaluation: O(n log n) worst, O(n) best. Adaptive to partially sorted data.\n\nTherefore, evolution yields progressively better solutions through mutation, evaluation, and selection.\n\n\nEvolution path: Bubble → Quick → Merge → Timsort. Each generation evaluated for time complexity, stability, and space usage."
        },
        {
            "text": "### Instruction:\nPenguins are birds. All birds can fly. What about penguins?\n\n### Response:\n\nStep 1: Parse premises. P(Bird|Penguin) = 1, P(Fly|Bird) = 1.\nStep 2: Apply transitive reasoning. If A→B and B→C, then A→C.\nStep 3: Derive conclusion. Penguin → Bird → Fly, therefore Penguin → Fly.\nStep 4: Check against knowledge. Real-world penguins cannot fly — premises contain false generalization.\nStep 5: Identify failure. The premise \"all birds can fly\" is incorrect (counterexample: penguins, ostriches).\n\nTherefore, symbolic reasoning reveals the inconsistency in the premises. The conclusion is invalid due to false premise.\n\n\nThe conclusion that penguins can fly is invalid because the premise \"all birds can fly\" is false. Penguins are a counterexample."
        },
        {
            "text": "### Instruction:\nEvaluate your own reasoning and identify biases.\n\n### Response:\n\nSelf-Evaluation:\n1. Confirmation bias: Do I favor evidence supporting my initial hypothesis? Check by actively seeking disconfirming evidence.\n2. Anchoring bias: Am I over-relying on the first piece of information? Mitigate by considering multiple starting points.\n3. Availability bias: Am I using easily recalled examples instead of statistical data? Counter with systematic data collection.\n4. Overconfidence: Is my confidence calibrated to actual accuracy? Use prediction markets or track calibration scores.\n5. Framing effects: Does problem phrasing influence my conclusion? Test with multiple equivalent formulations.\n\nImprovement: Apply adversarial reasoning — deliberately argue against my own conclusion.\n\nTherefore, meta-cognitive reflection improves reasoning quality through bias identification.\n\n\nKey biases: confirmation, anchoring, availability, overconfidence, framing. Mitigation: adversarial reasoning, systematic data, calibration tracking."
        },
    ]


def prepare_aether_dataset():
    """Build the SFT dataset: DeepMath when it loads, else a built-in fallback.

    First tries to load ``trl-lib/DeepMath-103K`` (train split) and convert
    each row to a single ``"text"`` field. If loading or conversion raises,
    the error is logged as a warning and a small hand-written reasoning
    dataset is returned instead.

    Returns:
        A ``datasets.Dataset`` whose only column is ``"text"``.
    """
    # Try DeepMath first
    try:
        ds = load_dataset("trl-lib/DeepMath-103K", split="train")
        logger.info(f"Loaded DeepMath: {len(ds)} examples")
        # Convert to text format for SFT; dropping the source columns inside
        # the same map call avoids a second pass over the dataset.
        return ds.map(_format_deepmath_example, remove_columns=ds.column_names)
    except Exception as e:
        # Deliberately broad: any failure (network, auth, schema) should fall
        # back to the built-in examples instead of aborting the run.
        logger.warning(f"DeepMath failed: {e}")

    # Fallback: AETHER reasoning dataset
    logger.info("Creating AETHER reasoning dataset...")
    # Replicate 100x for dataset size. NOTE: the rows are 10 distinct examples
    # repeated, so a random train/eval split of this dataset shares examples
    # between both sides and the eval loss is not a held-out measurement.
    return Dataset.from_list(_fallback_reasoning_examples() * 100)
def main():
    """Run supervised fine-tuning end to end: load model, build data, train, save.

    Configuration is read from environment variables:
        AETHER_MODEL: model id or path passed to ``from_pretrained``
            (default ``Qwen/Qwen2.5-0.5B-Instruct``).
        AETHER_OUTPUT: directory for checkpoints and the final model
            (default ``./aether-output-cpu``).
        AETHER_HUB_ID: Hub repository id used when pushing
            (default ``camdog920/aether-qwen-0.5b-sft``).
        AETHER_PUSH_TO_HUB: set to ``0``/``false``/``no``/``off`` to train
            without pushing to the Hub. Defaults to pushing.
    """
    model_name = os.environ.get("AETHER_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
    output_dir = os.environ.get("AETHER_OUTPUT", "./aether-output-cpu")
    hub_model_id = os.environ.get("AETHER_HUB_ID", "camdog920/aether-qwen-0.5b-sft")
    push_flag = os.environ.get("AETHER_PUSH_TO_HUB", "1").strip().lower()
    push_to_hub = push_flag not in {"", "0", "false", "no", "off"}

    logger.info("=" * 60)
    logger.info("AETHER CPU Training — SFT with Reasoning Dataset")
    logger.info("=" * 60)
    logger.info("Model: %s", model_name)
    logger.info("Output: %s", output_dir)
    logger.info("Hub: %s", hub_model_id)

    # Precision selection:
    #   * CPU            -> fp32 weights, no mixed precision. fp16 mixed
    #                       precision is rejected by TrainingArguments on CPU,
    #                       so it must never be enabled there.
    #   * CUDA with bf16 -> bf16 weights + bf16 training.
    #   * CUDA w/o bf16  -> fp32 weights + fp16 autocast (fp16 AMP needs fp32
    #                       master weights to unscale gradients).
    use_cuda = torch.cuda.is_available()
    use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
    use_fp16 = use_cuda and not use_bf16
    device = "cuda" if use_cuda else "cpu"
    logger.info("Device: %s", device)

    # Load model
    logger.info("Loading model...")
    dtype = torch.bfloat16 if use_bf16 else torch.float32
    # NOTE(security): trust_remote_code=True runs Python code shipped in the
    # model repository; only point AETHER_MODEL at repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto" if use_cuda else None,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    logger.info("Preparing dataset...")
    dataset = prepare_aether_dataset()
    # Fixed seed so the train/eval partition is reproducible across runs.
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_ds = dataset["train"]
    eval_ds = dataset["test"]
    logger.info("Train: %s, Eval: %s", len(train_ds), len(eval_ds))

    # Training args
    # NOTE(review): newer TRL releases rename `max_seq_length` (SFTConfig) and
    # `tokenizer` (SFTTrainer) — confirm against the pinned TRL version.
    training_args = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=3,  # More epochs for smaller dataset
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        logging_steps=10,
        save_steps=200,
        eval_strategy="steps",
        eval_steps=100,
        max_seq_length=512,
        bf16=use_bf16,
        fp16=use_fp16,
        report_to=[],
        disable_tqdm=False,
        logging_first_step=True,
        push_to_hub=push_to_hub,
        hub_model_id=hub_model_id,
    )

    # Trainer
    logger.info("Initializing SFT Trainer...")
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
    )

    # Train
    logger.info("Starting training...")
    trainer.train()

    # Save
    logger.info("Saving model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    logger.info("=" * 60)
    logger.info("Training complete!")
    if push_to_hub:
        logger.info("Model: https://huggingface.co/%s", hub_model_id)
    else:
        logger.info("Model saved to: %s", output_dir)
    logger.info("=" * 60)
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()