Spaces:
Sleeping
Sleeping
File size: 6,952 Bytes
1ea26af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
#!/usr/bin/env python3
# NOTICE: This file is adapted from Tencent's CognitiveKernel-Pro (https://github.com/Tencent/CognitiveKernel-Pro).
# Modifications in this fork (2025) are for academic research and educational use only; no commercial use.
# Original rights belong to the original authors and Tencent; see upstream license for details.
"""
GAIA Simple Validator - Minimal CLI for GAIA evaluation
Pipeline: filter → run via CognitiveKernel → LLM judge → write results
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime
from typing import List, Dict, Any
# Robust imports with fallback to repository root
try:
from ck_pro.core import CognitiveKernel
from ck_pro.config.settings import Settings
except ImportError:
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
from ck_pro.core import CognitiveKernel
from ck_pro.config.settings import Settings
from gaia.data_loader import load_tasks, filter_tasks, get_task_stats
from gaia.runner import run_single_task
def _parse_args() -> argparse.Namespace:
    """Build the CLI argument parser and parse sys.argv."""
    parser = argparse.ArgumentParser(
        description='GAIA Simple Validator - Minimal evaluation pipeline',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run all tasks without file attachments
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml

  # Run level 2 tasks only, limit to 50
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml --level 2 --count 50

  # Specify output file
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml --output results.jsonl
"""
    )
    parser.add_argument(
        '--data',
        required=True,
        help='Path to GAIA metadata.jsonl file'
    )
    parser.add_argument(
        '--config',
        required=False,
        help='Path to TOML configuration file (optional; environment variables supported)'
    )
    parser.add_argument(
        '--level',
        default='all',
        choices=['1', '2', '3', 'all'],
        help='Filter by difficulty level (default: all)'
    )
    parser.add_argument(
        '--count',
        type=int,
        default=0,
        help='Maximum number of tasks to run (0 = no limit)'
    )
    parser.add_argument(
        '--output',
        help='Output JSONL file path (default: output/results_YYYYMMDD_HHMMSS.jsonl)'
    )
    return parser.parse_args()


def _load_and_filter(args: argparse.Namespace) -> List[Dict[str, Any]]:
    """Load GAIA tasks and apply level/count filters.

    Exits with status 0 when filtering leaves nothing to run, and with
    status 1 on any load/parse error.
    """
    print(f"Loading tasks from {args.data}...")
    try:
        all_tasks = load_tasks(args.data)
        print(f"Loaded {len(all_tasks)} total tasks")

        # Show initial stats before filtering
        initial_stats = get_task_stats(all_tasks)
        print(f"Initial stats: {initial_stats}")

        tasks = filter_tasks(all_tasks, level=args.level, limit=args.count)
        print(f"After filtering: {len(tasks)} tasks (level={args.level}, limit={args.count})")
        if not tasks:
            print("No tasks to process after filtering. Exiting.")
            sys.exit(0)  # SystemExit is not caught by the Exception handler below
        return tasks
    except Exception as e:
        print(f"Error loading data: {e}")
        sys.exit(1)


def _init_kernel(args: argparse.Namespace):
    """Initialize CognitiveKernel from --config, or env vars/defaults when absent.

    Exits with status 1 if initialization fails.
    """
    try:
        if args.config and os.path.exists(args.config):
            print(f"Initializing CognitiveKernel from config: {args.config}")
            settings = Settings.load(args.config)
        else:
            print("Initializing CognitiveKernel (no config file); using environment variables if set, otherwise defaults")
            # NOTE(review): falls back to "config.toml" even when it may not exist;
            # presumably Settings.load tolerates a missing file — confirm upstream.
            settings = Settings.load(args.config or "config.toml")
        kernel = CognitiveKernel(settings)
        print("CognitiveKernel initialized successfully")
        return kernel
    except Exception as e:
        print(f"Error initializing CognitiveKernel: {e}")
        sys.exit(1)


def _resolve_output_path(args: argparse.Namespace) -> str:
    """Return the output JSONL path, defaulting to a timestamped file under output/."""
    output_path = args.output
    if not output_path:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        os.makedirs('output', exist_ok=True)
        output_path = os.path.join('output', f'results_{timestamp}.jsonl')
    print(f"Results will be written to: {output_path}")
    return output_path


def _run_all(kernel, tasks: List[Dict[str, Any]]):
    """Run every task through the kernel with fail-fast semantics.

    Returns (results, failed_count). Exits with status 1 on the first
    kernel failure or unexpected exception; a KeyboardInterrupt stops the
    loop but keeps the partial results so they can still be written out.
    """
    results: List[Dict[str, Any]] = []
    failed_count = 0
    print(f"\nProcessing {len(tasks)} tasks...")
    print("=" * 60)
    for i, task in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] Processing task: {task['task_id']}")
        try:
            result = run_single_task(kernel, task)
            results.append(result)
            if not result['success']:
                # Fail fast on first kernel failure (explicit policy)
                print(f"FAILED: {result['error']}")
                failed_count += 1
                print(f"\nFail-fast triggered: Task {task['task_id']} failed execution")
                print(f"Error: {result['error']}")
                print("Exiting immediately as per fail-fast policy")
                sys.exit(1)
            else:
                print(f"SUCCESS: Score {result['score']}/5 - {result['judge_reason']}")
        except KeyboardInterrupt:
            print("\nInterrupted by user")
            break
        except Exception as e:
            # Unexpected errors also trigger fail-fast
            print(f"UNEXPECTED ERROR: {e}")
            failed_count += 1
            sys.exit(1)
    return results, failed_count


def _write_results(results: List[Dict[str, Any]], output_path: str) -> None:
    """Write results as one JSON object per line; exit with status 1 on I/O error."""
    print(f"\nWriting {len(results)} results to {output_path}")
    try:
        # Ensure parent directory exists (handles --output with nested paths)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # 'w' (not 'w+'): we only write, never read back
        with open(output_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print("Results written successfully")
    except Exception as e:
        print(f"Error writing results: {e}")
        sys.exit(1)


def _print_summary(results: List[Dict[str, Any]], failed_count: int, output_path: str) -> None:
    """Print execution counts, average score, and score distribution."""
    if results:
        successful = [r for r in results if r['success']]
        scores = [r['score'] for r in successful]
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total tasks processed: {len(results)}")
        print(f"Successful executions: {len(successful)}")
        print(f"Failed executions: {failed_count}")
        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"Average score: {avg_score:.2f}/5")
            print("Score distribution:")
            for score in range(6):
                count = scores.count(score)
                if count > 0:
                    print(f"  Score {score}: {count} tasks ({count/len(scores)*100:.1f}%)")
    print(f"\nResults saved to: {output_path}")
    print("Evaluation completed successfully")


def main():
    """Main CLI entry point.

    Pipeline: parse args -> load & filter tasks -> init kernel ->
    run tasks (fail-fast) -> write JSONL results -> print summary.
    """
    args = _parse_args()
    tasks = _load_and_filter(args)
    kernel = _init_kernel(args)
    output_path = _resolve_output_path(args)
    results, failed_count = _run_all(kernel, tasks)
    _write_results(results, output_path)
    _print_summary(results, failed_count, output_path)


if __name__ == '__main__':
    main()
|