|
|
| """
|
| Generate All Codette Training Datasets
|
| ========================================
|
|
|
| Batch script that generates JSONL datasets for ALL LoRA adapters
|
| with their configured target sizes. Outputs to:
|
| J:/codette-training-lab/datasets/{adapter_name}_reasoning.jsonl
|
|
|
| Adapter targets:
|
| newton ............... 3000 examples
|
| davinci .............. 2500 examples
|
| empathy .............. 2500 examples
|
| philosophy ........... 2000 examples
|
| quantum .............. 2000 examples
|
| consciousness ........ 3000 examples
|
| multi_perspective .... 2500 examples
|
| systems_architecture . 2000 examples
|
| -----------------------------------
|
Total ................ 19,500 examples
|
|
|
| Usage:
|
| python generate_all.py
|
| python generate_all.py --seed 42
|
| python generate_all.py --seed 42 --output-dir J:/codette-training-lab/datasets
|
| """
|
|
|
| import argparse
|
| import json
|
| import logging
|
| import os
|
| import sys
|
| import time
|
| from pathlib import Path
|
|
|
|
|
|
|
# Make the project root importable when this script is run directly,
# so the `dataset_engine` package below resolves without installation.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent
# Guard against duplicate entries if the script is re-imported.
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))
|
|
|
| from dataset_engine.template_registry import TemplateRegistry
|
| from dataset_engine.dataset_generator import DatasetGenerator
|
|
|
|
|
def _parse_args() -> argparse.Namespace:
    """Build and parse the command-line arguments for the batch generator."""
    parser = argparse.ArgumentParser(
        description="Generate all Codette training datasets.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducible generation (default: 42).",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=str(PROJECT_DIR / "datasets"),
        help="Output directory for JSONL files.",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging.",
    )
    return parser.parse_args()


def _count_jsonl_lines(filepath: str) -> int:
    """Return the number of lines in *filepath* (one JSONL record per line).

    Local replacement for reaching into the generator's private
    ``_count_lines`` helper.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def main():
    """Generate every adapter dataset, print a summary, validate the output.

    Parses CLI args, runs ``DatasetGenerator.generate_all()`` for all
    adapters registered in ``TemplateRegistry``, prints a per-adapter
    count/target summary, then validates each emitted JSONL file.
    Exits with status 1 if any adapter failed to generate or any output
    file failed validation.
    """
    args = _parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger = logging.getLogger("generate_all")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("=" * 60)
    logger.info("Codette Dataset Generation Engine")
    logger.info("=" * 60)
    logger.info("Output directory: %s", output_dir)
    logger.info("Random seed: %s", args.seed)

    # Log the configured per-adapter targets before generation starts so the
    # expected totals are visible even if generation aborts midway.
    registry = TemplateRegistry(seed=args.seed)
    total_target = 0
    logger.info("")
    logger.info("Adapter targets:")
    for adapter in registry.get_adapter_names():
        target = registry.get_target(adapter)
        total_target += target
        logger.info(" %-25s %5d examples", adapter, target)
    logger.info(" %-25s %5d examples", "TOTAL", total_target)
    logger.info("")

    generator = DatasetGenerator(
        output_dir=str(output_dir),
        seed=args.seed,
    )

    start_time = time.time()
    # results maps adapter name -> output path, or an "ERROR: ..." string on
    # failure (presumably; verify against DatasetGenerator.generate_all).
    results = generator.generate_all()
    total_elapsed = time.time() - start_time

    print("\n" + "=" * 60)
    print("GENERATION COMPLETE")
    print("=" * 60)

    total_examples = 0
    all_ok = True
    for adapter in registry.get_adapter_names():
        path = results.get(adapter, "ERROR: NOT GENERATED")
        if path.startswith("ERROR"):
            status = f"FAILED: {path}"
            all_ok = False
        else:
            count = _count_jsonl_lines(path)
            total_examples += count
            target = registry.get_target(adapter)
            pct = (count / target * 100) if target > 0 else 0
            status = f"{count:5d} / {target:5d} ({pct:.0f}%) -> {path}"
        print(f" {adapter:25s} {status}")

    print(f"\n {'TOTAL':25s} {total_examples:5d} / {total_target:5d} examples")
    print(f" {'Time':25s} {total_elapsed:.1f} seconds")
    rate = total_examples / total_elapsed if total_elapsed > 0 else 0
    print(f" {'Rate':25s} {rate:.0f} examples/sec")
    print("=" * 60)

    print("\nValidating output files...")
    validation_ok = True
    for adapter in registry.get_adapter_names():
        path = results.get(adapter)
        if not path or path.startswith("ERROR"):
            continue
        # Keep the try body minimal: only the validation call can raise here.
        try:
            errors = _validate_jsonl(path)
        except Exception as e:
            print(f" {adapter}: Validation failed: {e}")
            validation_ok = False
            continue
        if errors:
            print(f" {adapter}: {len(errors)} validation errors")
            for err in errors[:3]:
                print(f" - {err}")
            validation_ok = False
        else:
            print(f" {adapter}: OK")

    if validation_ok and all_ok:
        print("\nAll datasets generated and validated successfully.")
    else:
        print("\nSome issues detected. Check logs above.")
        sys.exit(1)
|
|
|
|
|
| def _validate_jsonl(filepath: str, sample_size: int = 50) -> list:
|
| """Validate a JSONL file for correct format.
|
|
|
| Checks:
|
| - Each line is valid JSON
|
| - Each record has a 'messages' key
|
| - Messages contain system, user, and assistant roles
|
| - No empty content fields
|
|
|
| Returns list of error strings (empty = valid).
|
| """
|
| errors = []
|
| line_count = 0
|
|
|
| with open(filepath, "r", encoding="utf-8") as f:
|
| for i, line in enumerate(f, 1):
|
| line_count += 1
|
| line = line.strip()
|
| if not line:
|
| continue
|
|
|
| try:
|
| record = json.loads(line)
|
| except json.JSONDecodeError as e:
|
| errors.append(f"Line {i}: Invalid JSON: {e}")
|
| continue
|
|
|
| if "messages" not in record:
|
| errors.append(f"Line {i}: Missing 'messages' key")
|
| continue
|
|
|
| messages = record["messages"]
|
| if not isinstance(messages, list) or len(messages) != 3:
|
| errors.append(f"Line {i}: Expected 3 messages, got {len(messages) if isinstance(messages, list) else 'non-list'}")
|
| continue
|
|
|
| roles = [m.get("role") for m in messages]
|
| if roles != ["system", "user", "assistant"]:
|
| errors.append(f"Line {i}: Expected roles [system, user, assistant], got {roles}")
|
| continue
|
|
|
| for m in messages:
|
| content = m.get("content", "")
|
| if not content or not content.strip():
|
| errors.append(f"Line {i}: Empty content for role '{m.get('role')}'")
|
|
|
|
|
| if i > sample_size and not errors:
|
| break
|
|
|
| if not errors and line_count == 0:
|
| errors.append("File is empty")
|
|
|
| return errors
|
|
|
|
|
# Script entry point: run the full batch generation when executed directly.
if __name__ == "__main__":
    main()
|
|
|