import os
import argparse
import requests
import pandas as pd
import json
import time
import warnings
import logging
from enum import Enum
from colorama import init

# Initialize colorama for Windows compatibility
init(autoreset=True)

# Suppress asyncio event loop cleanup warnings (common on HF Spaces)
warnings.filterwarnings('ignore', message='.*Invalid file descriptor.*')
logging.getLogger('asyncio').setLevel(logging.ERROR)

# Import configuration
import config

# Agent-related code is imported via agent_runner module
# Import Gradio UI creation function
from gradioapp import create_ui
# Import scoring function for answer verification
from scorer import question_scorer

# Import new utilities
from question_loader import QuestionLoader
from result_formatter import ResultFormatter
from agent_runner import AgentRunner
from validators import InputValidator, ValidationError
from utils import retry_with_backoff
from langfuse_tracking import track_session

# --- Run Modes ---
class RunMode(Enum):
    UI = "ui"   # Gradio UI mode
    CLI = "cli" # Command-line test mode


@retry_with_backoff(max_retries=3, initial_delay=2.0)
def _submit_to_server(submit_url: str, submission_data: dict) -> dict:
    """Internal function to submit to server (with retries)."""
    response = requests.post(submit_url, json=submission_data, timeout=config.SUBMIT_TIMEOUT)
    response.raise_for_status()
    return response.json()
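
# Note: with max_retries=3 and initial_delay=2.0, failed submissions are expected to be
# retried with an increasing delay (roughly 2s, 4s, 8s for exponential backoff); the
# exact schedule depends on the retry_with_backoff implementation in utils.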

def submit_and_score(username: str, results: list) -> str:
    """

    Submit answers to the GAIA scoring server and return status message.



    Args:

        username: Hugging Face username for submission

        results: List of tuples (task_id, question_text, answer)



    Returns:

        str: Status message (success or error details)

    """
    # Validate username
    try:
        username = InputValidator.validate_username(username)
    except ValidationError as e:
        error_msg = f"Invalid username: {e}"
        print(error_msg)
        return error_msg

    # Format results for API submission
    answers_payload = ResultFormatter.format_for_api(results)

    if not answers_payload:
        error_msg = "No answers to submit."
        print(error_msg)
        return error_msg

    space_id = config.SPACE_ID
    submit_url = f"{config.DEFAULT_API_URL}/submit"
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    # Prepare submission data
    submission_data = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers_payload
    }

    print(f"\n{'=' * config.SEPARATOR_WIDTH}")
    print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
    print(f"{'=' * config.SEPARATOR_WIDTH}\n")

    # Submit to server
    print(f"Submitting to: {submit_url}")
    try:
        result_data = _submit_to_server(submit_url, submission_data)

        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        return final_status

    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        return status_message

    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        return status_message

    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        return status_message

    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        return status_message
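
# Hypothetical usage sketch (values are illustrative; results must follow the
# (task_id, question_text, answer) tuple shape documented above):
#
#   status = submit_and_score("your-hf-username",
#                             [("task-0001", "What is 2 + 2?", "4")])
#   print(status)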


def run_and_submit_all(username: str, active_agent: str = None) -> tuple:
    """

    Fetches all questions, runs the GAIA agent on them, submits all answers,

    and displays the results.



    Args:

        username: Hugging Face username for submission

        active_agent: The agent type to use (default: config.AGENT_LANGGRAPH)



    Returns:

        tuple: (status_message: str, results_df: pd.DataFrame)

    """
    # Fetch questions from API (always online for submission)
    try:
        questions_data = QuestionLoader().get_questions(test_mode=False)
    except Exception as e:
        return f"Error loading questions: {e}", None

    # Validate questions data
    try:
        questions_data = InputValidator.validate_questions_data(questions_data)
    except ValidationError as e:
        return f"Invalid questions data: {e}", None

    # Run agent on all questions with specified agent type (with Langfuse session tracking)
    with track_session("Submit_All", {
        "agent": active_agent or config.ACTIVE_AGENT,
        "username": username,
        "question_count": len(questions_data),
        "mode": "submission"
    }):
        results = AgentRunner(active_agent=active_agent).run_on_questions(questions_data)

    if results is None:
        return "Error initializing agent.", None

    # Submit answers and get score (formatting happens inside submit_and_score)
    status_message = submit_and_score(username, results)

    # Format results for UI display
    results_for_display = ResultFormatter.format_for_display(results)
    results_df = pd.DataFrame(results_for_display)
    return status_message, results_df
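
# Hypothetical usage sketch (the username is illustrative; when active_agent is
# omitted the configured default agent is used):
#
#   status, df = run_and_submit_all("your-hf-username", active_agent=config.AGENT_LANGGRAPH)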

def _load_ground_truth(file_path: str = config.METADATA_FILE) -> dict:
    """Load ground truth data indexed by task_id.



    Args:

        file_path: Path to the metadata file



    Returns:

        dict: Mapping of task_id -> {"question": str, "answer": str}

    """
    truth_mapping = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                task_id = data.get("task_id")
                question = data.get("Question")
                answer = data.get("Final answer")
                if task_id and answer:
                    truth_mapping[task_id] = {
                        "question": question,
                        "answer": answer
                    }
    except Exception as e:
        print(f"Error loading ground truth: {e}")
    return truth_mapping
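
# Each line of the metadata file is expected to be a standalone JSON object; a
# hypothetical line matching the keys read above would look like:
#
#   {"task_id": "task-0001", "Question": "What is 2 + 2?", "Final answer": "4"}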

def _verify_answers(results: list, log_output: list, runtime: tuple = None) -> None:
    """Verify answers against ground truth using the official GAIA scorer.



    Args:

        results: List of tuples (task_id, question_text, answer)

        log_output: List to append verification results to

        runtime: Optional tuple of (minutes, seconds) for total runtime

    """
    ground_truth = _load_ground_truth()
    log_output.append("\n=== Verification Results ===")

    correct_count = 0
    total_count = 0

    for task_id, question_text, answer in results:
        if task_id in ground_truth:
            truth_data = ground_truth[task_id]
            correct_answer = truth_data["answer"]

            # Use the official GAIA question_scorer for comparison
            # This handles numbers, lists, and strings with proper normalization
            is_correct = question_scorer(str(answer), str(correct_answer))

            if is_correct:
                correct_count += 1
            total_count += 1

            log_output.append(f"Task ID: {task_id}")
            log_output.append(f"Question: {question_text[:config.ERROR_MESSAGE_LENGTH]}...")
            log_output.append(f"Expected: {correct_answer}")
            log_output.append(f"Got: {answer}")
            log_output.append(f"Match: {'✓ Correct' if is_correct else '✗ Incorrect'}\n")
        else:
            log_output.append(f"Task ID: {task_id}")
            log_output.append(f"Question: {question_text[:config.ERROR_MESSAGE_LENGTH]}...")
            log_output.append(f"No ground truth found.\n")

    # Add summary statistics
    if total_count > 0:
        accuracy = (correct_count / total_count) * 100
        log_output.append("=" * config.SEPARATOR_WIDTH)
        log_output.append(f"SUMMARY: {correct_count}/{total_count} correct ({accuracy:.1f}%)")
        if runtime:
            minutes, seconds = runtime
            log_output.append(f"Runtime: {minutes}m {seconds}s")
        log_output.append("=" * config.SEPARATOR_WIDTH)

def run_test_code(filter=None, active_agent=None) -> pd.DataFrame:
    """Run test code on selected questions.



    Args:

        filter: Optional tuple/list of question indices to test (e.g., (4, 7, 15)).

                If None, processes all questions.

        active_agent: Optional agent type to use (e.g., "LangGraph", "ReActLangGraph", "LLamaIndex").

                      If None, uses config.ACTIVE_AGENT.



    Returns:

        pd.DataFrame: Results and verification output

    """
    start_time = time.time()
    logs_for_display = []
    logs_for_display.append("=== Processing Example Questions One by One ===")

    # Fetch questions (OFFLINE for testing)
    try:
        questions_data = QuestionLoader().get_questions(test_mode=True)
    except Exception as e:
        return pd.DataFrame([f"Error loading questions: {e}"])

    # Validate questions data
    try:
        questions_data = InputValidator.validate_questions_data(questions_data)
    except ValidationError as e:
        return pd.DataFrame([f"Invalid questions data: {e}"])

    # Validate and apply filter
    try:
        filter = InputValidator.validate_filter_indices(filter, len(questions_data))
    except ValidationError as e:
        return pd.DataFrame([f"Invalid filter: {e}"])

    # Apply filter or use all questions
    if filter is not None:
        questions_to_process = [questions_data[i] for i in filter]
        logs_for_display.append(f"Testing {len(questions_to_process)} selected questions (indices: {filter})")
    else:
        questions_to_process = questions_data
        logs_for_display.append(f"Testing all {len(questions_to_process)} questions")

    # Run agent on selected questions with specified agent type (with Langfuse session tracking)
    with track_session("Test_Run", {
        "agent": active_agent or config.ACTIVE_AGENT,
        "question_count": len(questions_to_process),
        "filter": str(filter) if filter else "all",
        "mode": "test"
    }):
        results = AgentRunner(active_agent=active_agent).run_on_questions(questions_to_process)

    if results is None:
        return pd.DataFrame(["Error initializing agent."])

    logs_for_display.append("\n=== Completed Example Questions ===")

    # Calculate runtime
    elapsed_time = time.time() - start_time
    minutes = int(elapsed_time // 60)
    seconds = int(elapsed_time % 60)

    _verify_answers(results, logs_for_display, runtime=(minutes, seconds))
    return pd.DataFrame(logs_for_display)
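
# Hypothetical usage sketch (indices and agent choice are illustrative):
#
#   df = run_test_code(filter=(0, 2), active_agent=config.AGENT_LANGGRAPH)
#   ResultFormatter.print_dataframe(df)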


def main() -> None:
    """Main entry point for the application."""
    parser = argparse.ArgumentParser(description="Run the agent application.")
    parser.add_argument("--test", type=str, nargs='?', const='default', help="Run local tests on selected questions and exit. Optionally provide comma-separated question indices (e.g., --test 2,4,6). If no indices provided, uses default test questions.")
    parser.add_argument("--testall", action="store_true", help="Run local tests on all questions and exit.")
    parser.add_argument("--agent", type=str, choices=['langgraph', 'reactlangg', 'llamaindex'], help="Agent to use in CLI mode (case-insensitive). Options: langgraph, react langgraph, llamaindex. Default: uses config.ACTIVE_AGENT")
    args = parser.parse_args()

    # Map agent name to config constant (case-insensitive)
    agent_mapping = {
        'langgraph': config.AGENT_LANGGRAPH,
        'reactlangg': config.AGENT_REACT_LANGGRAPH,
        'llamaindex': config.AGENT_LLAMAINDEX,
    }

    active_agent = None
    if args.agent:
        agent_key = args.agent.lower()
        active_agent = agent_mapping.get(agent_key)
        if not active_agent:
            print(f"Error: Unknown agent '{args.agent}'. Valid options: langgraph, react, llamaindex")
            return
        print(f"[CLI] Using agent: {active_agent}")

    print(f"\n{'-' * 30} App Starting {'-' * 30}")

    # Determine run mode
    run_mode = RunMode.CLI if (args.test or args.testall) else RunMode.UI

    # Print environment info only in UI mode
    if run_mode == RunMode.UI:
        space_host = config.SPACE_HOST
        space_id = config.SPACE_ID

        if space_host:
            print(f"[OK] SPACE_HOST found: {space_host}")
            print(f"   Runtime URL should be: https://{space_host}.hf.space")
        else:
            print("[INFO] SPACE_HOST environment variable not found (running locally?).")

        if space_id:
            print(f"[OK] SPACE_ID found: {space_id}")
            print(f"   Repo URL: https://huggingface.co/spaces/{space_id}")
            print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")
        else:
            print("[INFO] SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print(f"{'-' * (60 + len(' App Starting '))}\n")

    # Execute based on run mode
    if run_mode == RunMode.UI:
        print("Launching Gradio Interface for Basic Agent Evaluation...")
        grTestApp = create_ui(run_and_submit_all, run_test_code)
        grTestApp.launch()

    else:  # RunMode.CLI
        # Determine test filter based on which CLI flag was used
        if args.test:
            # Check if custom indices were provided
            if args.test == 'default':
                # No indices provided, use default
                test_filter = config.DEFAULT_TEST_FILTER
            else:
                # Parse comma-separated indices
                try:
                    test_filter = tuple(int(idx.strip()) for idx in args.test.split(','))
                except ValueError:
                    print(f"Error: Invalid test indices '{args.test}'. Must be comma-separated integers (e.g., 2,4,6)")
                    return
        else:  # args.testall
            test_filter = None  # Test all questions

        print(f"Running test code on {len(test_filter) if test_filter else 'ALL'} questions (CLI mode)...")
        result = run_test_code(filter=test_filter, active_agent=active_agent)

        # Print results
        if isinstance(result, pd.DataFrame):
            ResultFormatter.print_dataframe(result)
        else:
            print(result)


if __name__ == "__main__":
    main()