File size: 7,194 Bytes

ed1b365

#!/usr/bin/env python3
"""

Generate All Codette Training Datasets

========================================



Batch script that generates JSONL datasets for ALL LoRA adapters

with their configured target sizes. Outputs to:

    J:/codette-training-lab/datasets/{adapter_name}_reasoning.jsonl



Adapter targets:

    newton ............... 3000 examples

    davinci .............. 2500 examples

    empathy .............. 2500 examples

    philosophy ........... 2000 examples

    quantum .............. 2000 examples

    consciousness ........ 3000 examples

    multi_perspective .... 2500 examples

    systems_architecture . 2000 examples

    -----------------------------------

    Total ................ 20,500 examples



Usage:

    python generate_all.py

    python generate_all.py --seed 42

    python generate_all.py --seed 42 --output-dir J:/codette-training-lab/datasets

"""

import argparse
import json
import logging
import os
import sys
import time
from pathlib import Path

# Ensure the parent directory is on the path so imports work
# when running this script directly.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

from dataset_engine.template_registry import TemplateRegistry
from dataset_engine.dataset_generator import DatasetGenerator


def main():
    parser = argparse.ArgumentParser(
        description="Generate all Codette training datasets.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducible generation (default: 42).",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=str(PROJECT_DIR / "datasets"),
        help="Output directory for JSONL files.",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging.",
    )
    args = parser.parse_args()

    # Configure logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger = logging.getLogger("generate_all")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("=" * 60)
    logger.info("Codette Dataset Generation Engine")
    logger.info("=" * 60)
    logger.info("Output directory: %s", output_dir)
    logger.info("Random seed: %s", args.seed)

    # Show targets
    registry = TemplateRegistry(seed=args.seed)
    total_target = 0
    logger.info("")
    logger.info("Adapter targets:")
    for adapter in registry.get_adapter_names():
        target = registry.get_target(adapter)
        total_target += target
        logger.info("  %-25s %5d examples", adapter, target)
    logger.info("  %-25s %5d examples", "TOTAL", total_target)
    logger.info("")

    # Generate
    generator = DatasetGenerator(
        output_dir=str(output_dir),
        seed=args.seed,
    )

    start_time = time.time()
    results = generator.generate_all()
    total_elapsed = time.time() - start_time

    # Summary
    print("\n" + "=" * 60)
    print("GENERATION COMPLETE")
    print("=" * 60)

    total_examples = 0
    all_ok = True
    for adapter in registry.get_adapter_names():
        path = results.get(adapter, "ERROR: NOT GENERATED")
        if path.startswith("ERROR"):
            status = f"FAILED: {path}"
            all_ok = False
        else:
            count = generator._count_lines(path)
            total_examples += count
            target = registry.get_target(adapter)
            pct = (count / target * 100) if target > 0 else 0
            status = f"{count:5d} / {target:5d} ({pct:.0f}%) -> {path}"
        print(f"  {adapter:25s}  {status}")

    print(f"\n  {'TOTAL':25s}  {total_examples:5d} / {total_target:5d} examples")
    print(f"  {'Time':25s}  {total_elapsed:.1f} seconds")
    rate = total_examples / total_elapsed if total_elapsed > 0 else 0
    print(f"  {'Rate':25s}  {rate:.0f} examples/sec")
    print("=" * 60)

    # Validate output files
    print("\nValidating output files...")
    validation_ok = True
    for adapter in registry.get_adapter_names():
        path = results.get(adapter)
        if not path or path.startswith("ERROR"):
            continue
        try:
            errors = _validate_jsonl(path)
            if errors:
                print(f"  {adapter}: {len(errors)} validation errors")
                for err in errors[:3]:
                    print(f"    - {err}")
                validation_ok = False
            else:
                print(f"  {adapter}: OK")
        except Exception as e:
            print(f"  {adapter}: Validation failed: {e}")
            validation_ok = False

    if validation_ok and all_ok:
        print("\nAll datasets generated and validated successfully.")
    else:
        print("\nSome issues detected. Check logs above.")
        sys.exit(1)


def _validate_jsonl(filepath: str, sample_size: int = 50) -> list:
    """Validate a JSONL file for correct format.



    Checks:

      - Each line is valid JSON

      - Each record has a 'messages' key

      - Messages contain system, user, and assistant roles

      - No empty content fields



    Returns list of error strings (empty = valid).

    """
    errors = []
    line_count = 0

    with open(filepath, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line_count += 1
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {i}: Invalid JSON: {e}")
                continue

            if "messages" not in record:
                errors.append(f"Line {i}: Missing 'messages' key")
                continue

            messages = record["messages"]
            if not isinstance(messages, list) or len(messages) != 3:
                errors.append(f"Line {i}: Expected 3 messages, got {len(messages) if isinstance(messages, list) else 'non-list'}")
                continue

            roles = [m.get("role") for m in messages]
            if roles != ["system", "user", "assistant"]:
                errors.append(f"Line {i}: Expected roles [system, user, assistant], got {roles}")
                continue

            for m in messages:
                content = m.get("content", "")
                if not content or not content.strip():
                    errors.append(f"Line {i}: Empty content for role '{m.get('role')}'")

            # Only check a sample of lines for detailed validation
            if i > sample_size and not errors:
                break

    if not errors and line_count == 0:
        errors.append("File is empty")

    return errors


if __name__ == "__main__":
    main()