#!/usr/bin/env python3
# NOTICE: This file is adapted from Tencent's CognitiveKernel-Pro (https://github.com/Tencent/CognitiveKernel-Pro).
# Modifications in this fork (2025) are for academic research and educational use only; no commercial use.
# Original rights belong to the original authors and Tencent; see upstream license for details.

"""
GAIA Simple Validator - Minimal CLI for GAIA evaluation
Pipeline: filter → run via CognitiveKernel → LLM judge → write results
"""

import argparse
import json
import os
import sys
from datetime import datetime

# Robust imports with fallback to repository root
try:
    from ck_pro.core import CognitiveKernel
    from ck_pro.config.settings import Settings
except ImportError:
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
    from ck_pro.core import CognitiveKernel
    from ck_pro.config.settings import Settings

from gaia.data_loader import load_tasks, filter_tasks, get_task_stats
from gaia.runner import run_single_task


def main():
    """Main CLI entry point"""
    parser = argparse.ArgumentParser(
        description='GAIA Simple Validator - Minimal evaluation pipeline',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run all tasks without file attachments
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml

  # Run level 2 tasks only, limit to 50
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml --level 2 --count 50

  # Specify output file
  python -m gaia.cli.simple_validate --data metadata.jsonl --config config.toml --output results.jsonl
        """
    )

    parser.add_argument(
        '--data',
        required=True,
        help='Path to GAIA metadata.jsonl file'
    )
    parser.add_argument(
        '--config',
        required=False,
        help='Path to TOML configuration file (optional; environment variables supported)'
    )
    parser.add_argument(
        '--level',
        default='all',
        choices=['1', '2', '3', 'all'],
        help='Filter by difficulty level (default: all)'
    )
    parser.add_argument(
        '--count',
        type=int,
        default=0,
        help='Maximum number of tasks to run (0 = no limit)'
    )
    parser.add_argument(
        '--output',
        help='Output JSONL file path (default: output/results_YYYYMMDD_HHMMSS.jsonl)'
    )

    args = parser.parse_args()

    # Load and filter tasks
    print(f"Loading tasks from {args.data}...")
    try:
        all_tasks = load_tasks(args.data)
        print(f"Loaded {len(all_tasks)} total tasks")

        # Show initial stats
        initial_stats = get_task_stats(all_tasks)
        print(f"Initial stats: {initial_stats}")

        # Filter tasks
        tasks = filter_tasks(all_tasks, level=args.level, limit=args.count)
        print(f"After filtering: {len(tasks)} tasks (level={args.level}, limit={args.count})")

        if not tasks:
            print("No tasks to process after filtering. Exiting.")
            sys.exit(0)

    except Exception as e:
        print(f"Error loading data: {e}")
        sys.exit(1)

    # Initialize CognitiveKernel (support env-only when no --config provided)
    try:
        if args.config and os.path.exists(args.config):
            print(f"Initializing CognitiveKernel from config: {args.config}")
            settings = Settings.load(args.config)
        else:
            print("Initializing CognitiveKernel (no config file); using environment variables if set, otherwise defaults")
            settings = Settings.load(args.config or "config.toml")
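        # Build the kernel from whichever settings source was resolved above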
        kernel = CognitiveKernel(settings)
        print("CognitiveKernel initialized successfully")
    except Exception as e:
        print(f"Error initializing CognitiveKernel: {e}")
        sys.exit(1)

    # Determine output path
    output_path = args.output
    if not output_path:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        os.makedirs('output', exist_ok=True)
        output_path = os.path.join('output', f'results_{timestamp}.jsonl')

    print(f"Results will be written to: {output_path}")

    # Process tasks
    results = []
    failed_count = 0

    print(f"\nProcessing {len(tasks)} tasks...")
    print("=" * 60)

    for i, task in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] Processing task: {task['task_id']}")

        try:
            result = run_single_task(kernel, task)
            results.append(result)

            # Check for execution failure (fail-fast requirement)
            if not result['success']:
                print(f"FAILED: {result['error']}")
                failed_count += 1
                # Fail fast on first kernel failure
                print(f"\nFail-fast triggered: Task {task['task_id']} failed execution")
                print(f"Error: {result['error']}")
                print("Exiting immediately as per fail-fast policy")
                sys.exit(1)
            else:
                print(f"SUCCESS: Score {result['score']}/5 - {result['judge_reason']}")

        except KeyboardInterrupt:
            print("\nInterrupted by user")
            break
        except Exception as e:
            print(f"UNEXPECTED ERROR: {e}")
            failed_count += 1
            # Still fail fast on unexpected errors
            sys.exit(1)

    # Write results
    print(f"\nWriting {len(results)} results to {output_path}")
    try:
        # Ensure parent directory exists (handles --output with nested paths)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print(f"Results written successfully")
    except Exception as e:
        print(f"Error writing results: {e}")
        sys.exit(1)

    # Summary statistics
    if results:
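        # Only successfully executed tasks carry judge scores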
        successful = [r for r in results if r['success']]
        scores = [r['score'] for r in successful]

        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total tasks processed: {len(results)}")
        print(f"Successful executions: {len(successful)}")
        print(f"Failed executions: {failed_count}")

        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"Average score: {avg_score:.2f}/5")
            print(f"Score distribution:")
            for score in range(6):
                count = scores.count(score)
                if count > 0:
                    print(f"  Score {score}: {count} tasks ({count/len(scores)*100:.1f}%)")

        print(f"\nResults saved to: {output_path}")

    print("Evaluation completed successfully")


if __name__ == '__main__':
    main()