# Final_Assignment_Template1
#
# Sleeping
#
# File size: 12,137 Bytes

import os
import gradio as gr
import requests
import json
import pandas as pd
from agent import BasicAgent
import traceback

# Remote endpoints: the scoring service and the GAIA validation metadata file.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
VALIDATION_URL = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/metadata.jsonl"

# The access token is read from the HF_TOKEN_HERE environment variable.
# Importing this module fails fast when the variable is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN_HERE")
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("HF_TOKEN_HERE is missing in Secrets!")

# Headers attached to every HTTP request issued by this module.
HEADERS = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json",
}

def fetch_validation_questions():
    """Download the GAIA validation metadata and return its Level 1 questions.

    The file at ``VALIDATION_URL`` is read as JSON Lines (one JSON object per
    line). Every row whose ``Level`` is 1 (as an int or as the string ``"1"``)
    is converted to a dict with the keys ``task_id``, ``question`` and
    ``file_name``; fields missing from the row become ``""``.

    Lines that are not valid JSON, or that are valid JSON but not an object,
    are reported on stdout and skipped.

    Returns:
        A list of at most 20 question dicts, in file order. An empty list is
        returned when the download fails or any other unexpected error occurs;
        the error and its traceback are printed rather than raised.
    """
    try:
        response = requests.get(VALIDATION_URL, headers=HEADERS, timeout=15)
        response.raise_for_status()
        # JSON text is UTF-8; do not rely on the charset guessed from headers.
        response.encoding = "utf-8"

        questions = []
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may appear unescaped inside a JSON string and
        # would cut a record in half.
        for line in response.text.split("\n"):
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line[:50]}... Error: {e}")
                continue
            if not isinstance(row, dict):
                # Valid JSON but not an object: skip it instead of letting
                # row.get() raise and discard every question already parsed.
                print(f"Skipping non-object line: {line[:50]}...")
                continue
            # Tolerate the level being serialized as a string.
            if row.get("Level") in (1, "1"):
                questions.append({
                    "task_id": row.get("task_id", ""),
                    "question": row.get("Question", ""),
                    "file_name": row.get("file_name", ""),
                })

        # The count printed here is the total found; only the first 20 are returned.
        print(f"Fetched {len(questions)} Level 1 validation questions.")
        return questions[:20]  # Limit to 20 for testing
    except Exception as e:
        # Deliberate best-effort: callers treat an empty list as "nothing to run".
        print(f"Error fetching validation questions: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return []

def run_and_submit_all(use_validation: bool, profile: gr.OAuthProfile | None = None):
    """Run the agent over a question set and, in test mode, submit the answers.

    Args:
        use_validation: When true, questions come from
            ``fetch_validation_questions()`` and nothing is submitted. When
            false, questions are fetched from ``{DEFAULT_API_URL}/questions``
            and the collected answers are POSTed to
            ``{DEFAULT_API_URL}/submit``.
        profile: OAuth profile of the logged-in user, or ``None`` when nobody
            is logged in (the run is refused in that case).

    Returns:
        A ``(status_text, results)`` tuple. ``results`` is a
        ``pandas.DataFrame`` with one row per processed question, or ``None``
        when the run stops before any question is processed (no login, agent
        construction failure, question fetch failure, empty question list).

    Side effects:
        Prints progress to stdout and writes ``results_log.json`` to the
        current working directory once the questions have been processed.
    """
    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"

    if not profile:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None
    username = f"{profile.username}"
    print(f"User logged in: {username}")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(f"Agent code link: {agent_code}")

    # Initialize agent with error handling
    try:
        agent = BasicAgent()
        print("Agent initialized successfully")
    except Exception as e:
        error_msg = f"Error initializing agent: {e}\n{traceback.format_exc()}"
        print(error_msg)
        return error_msg, None

    # Fetch questions
    if use_validation:
        print("Using validation dataset...")
        questions_data = fetch_validation_questions()
    else:
        print(f"Fetching test questions from: {questions_url}")
        try:
            response = requests.get(questions_url, headers=HEADERS, timeout=15)
            response.raise_for_status()
            questions_data = response.json()
            print(f"Fetched {len(questions_data)} test questions.")
        except json.JSONDecodeError as e:
            # Checked before RequestException: since requests 2.27 the error
            # raised by response.json() derives from both classes, so the
            # other order would make this handler unreachable.
            error_msg = f"Error decoding JSON response: {e}"
            print(error_msg)
            return error_msg, None
        except requests.exceptions.RequestException as e:
            error_msg = f"Error fetching questions: {e}"
            print(error_msg)
            return error_msg, None

    if not questions_data:
        error_msg = "Fetched questions list is empty."
        print(error_msg)
        return error_msg, None

    # Process questions
    results_log = []
    answers_payload = []
    successful_answers = 0
    total_questions = len(questions_data)

    print(f"\n{'='*60}")
    print(f"STARTING EVALUATION ON {total_questions} QUESTIONS")
    print(f"{'='*60}")

    for i, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name", "")

        print(f"\n[{i}/{total_questions}] Processing task: {task_id}")

        if not task_id or question_text is None:
            print(f"Skipping item with missing data: {item}")
            continue

        # Shortened question for the results table. str() keeps a non-string
        # question from crashing the whole run while building the preview.
        question_str = str(question_text)
        question_preview = question_str[:100] + "..." if len(question_str) > 100 else question_str

        try:
            # Call agent with enhanced error handling
            submitted_answer = agent(question_text, task_id, file_name)
            # One predicate drives both the counter and the Status column, so
            # an empty or None answer is never reported as "Success".
            has_answer = bool(submitted_answer) and submitted_answer != "unknown"
        except Exception as e:
            error_msg = f"AGENT ERROR: {str(e)}"
            print(f" Error processing task {task_id}: {e}")
            print(f"Traceback: {traceback.format_exc()}")

            # Failed tasks appear in the log but are not submitted.
            results_log.append({
                "Task ID": task_id,
                "Question": question_preview,
                "File": file_name,
                "Submitted Answer": error_msg,
                "Status": " Error"
            })
        else:
            if has_answer:
                successful_answers += 1
                print(f" Answer: {submitted_answer}")
            else:
                print(" No answer found")

            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": submitted_answer
            })

            results_log.append({
                "Task ID": task_id,
                "Question": question_preview,
                "File": file_name,
                "Submitted Answer": submitted_answer,
                "Status": "Success" if has_answer else "❓ Unknown"
            })

    # total_questions is non-zero here: an empty list returned earlier.
    success_rate = successful_answers / total_questions * 100

    print(f"\n{'='*60}")
    print("EVALUATION COMPLETE")
    print(f"Total questions: {total_questions}")
    print(f"Successful answers: {successful_answers}")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"{'='*60}")

    # Save results log. Done before the empty-payload check so the log also
    # exists when every task failed, which is when it is needed for debugging.
    try:
        with open("results_log.json", "w", encoding="utf-8") as f:
            json.dump(results_log, f, indent=2)
        print(" Saved results_log.json")
    except Exception as e:
        # Best-effort: a failed write must not abort the run.
        print(f" Error saving results_log.json: {e}")

    if not answers_payload:
        error_msg = "Agent did not produce any answers to submit."
        print(error_msg)
        return error_msg, pd.DataFrame(results_log)

    # Prepare submission
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }

    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # Submit or return results
    if not use_validation:
        print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
        try:
            response = requests.post(submit_url, json=submission_data, headers=HEADERS, timeout=60)
            response.raise_for_status()
            result_data = response.json()

            final_status = (
                f" Submission Successful!\n"
                f"User: {result_data.get('username')}\n"
                f"Overall Score: {result_data.get('score', 'N/A')}% "
                f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
                f"Message: {result_data.get('message', 'No message received.')}\n\n"
                f" Processing Summary:\n"
                f"• Total questions processed: {total_questions}\n"
                f"• Answers found (non-'unknown'): {successful_answers}\n"
                f"• Processing success rate: {success_rate:.1f}%"
            )
            print(" Submission successful.")
            return final_status, pd.DataFrame(results_log)

        except requests.exceptions.HTTPError as e:
            error_detail = f"Server responded with status {e.response.status_code}."
            try:
                error_json = e.response.json()
                error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
            except (ValueError, AttributeError):
                # ValueError: the body is not JSON. AttributeError: the JSON is
                # not an object. Either way fall back to the raw body text.
                error_detail += f" Response: {e.response.text[:500]}"

            status_message = f" Submission Failed: {error_detail}"
            print(status_message)
            return status_message, pd.DataFrame(results_log)

        except Exception as e:
            status_message = f"Submission Failed: {e}\n{traceback.format_exc()}"
            print(status_message)
            return status_message, pd.DataFrame(results_log)
    else:
        print("Validation mode: Skipping submission, returning results.")
        validation_summary = (
            f" Validation Run Complete\n\n"
            f" Summary:\n"
            f"• Total questions processed: {total_questions}\n"
            f"• Answers found (non-'unknown'): {successful_answers}\n"
            f"• Processing success rate: {success_rate:.1f}%\n\n"
            f" This gives you an estimate of potential performance.\n"
            f"Check the results table below for detailed breakdown."
        )
        return validation_summary, pd.DataFrame(results_log)

# Gradio interface: builds the module-level `demo` Blocks app at import time.
# It is launched from the `__main__` guard at the bottom of the file.
with gr.Blocks(title="GAIA Benchmark Agent Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown("#  GAIA Benchmark Agent Evaluation")
    # Static usage instructions shown above the controls.
    gr.Markdown(
        """
        ### Instructions:
        1. **Setup**: Ensure `HF_TOKEN_HERE` is set in Space Secrets
        2. **Development**: Clone this Space and modify `agent.py` with your logic  
        3. **Authentication**: Log in to Hugging Face below
        4. **Testing**: Select 'Use Validation' for local testing or leave unchecked for test set submission
        5. **Run**: Click 'Run Evaluation & Submit All Answers' to process questions and submit

        ###  Important Notes:
        - **Validation Mode**: Use this to test your agent on known questions before submitting
        - **Test Mode**: Submits to the actual benchmark (limited submissions per day)
        - **Processing Time**: May take several minutes depending on number of questions
        - **Debugging**: Check `results_log.json` if you need to debug failures

        ###  Current Goal: Improve accuracy
        """
    )

    # Login control; run_and_submit_all refuses to run without a profile.
    gr.LoginButton()
    
    with gr.Row():
        # Mode switch passed to run_and_submit_all as `use_validation`:
        # checked -> validation questions, nothing is submitted;
        # unchecked -> test questions are fetched and answers are submitted.
        use_validation = gr.Checkbox(
            label="🧪 Use Validation Set for Testing", 
            value=True,  # Default to validation for safety
            info="Recommended: Test on validation set first before submitting to test set"
        )
    
    run_button = gr.Button(
        "🚀 Run Evaluation & Submit All Answers", 
        variant="primary",
        size="lg"
    )
    
    # Receives the first element returned by run_and_submit_all (status text).
    status_output = gr.Textbox(
        label="Run Status / Submission Result", 
        lines=10, 
        interactive=False,
        show_copy_button=True
    )
    
    # Receives the second element returned by run_and_submit_all
    # (a pandas DataFrame of per-question results, or None).
    results_table = gr.DataFrame(
        label="Detailed Results: Questions and Agent Answers", 
        wrap=True,
        interactive=False
    )

    # Only the checkbox is wired as an explicit input; `profile` is not listed.
    # NOTE(review): presumably Gradio supplies it from the gr.OAuthProfile
    # annotation on run_and_submit_all — confirm against the Gradio docs.
    run_button.click(
        fn=run_and_submit_all,
        inputs=[use_validation],
        outputs=[status_output, results_table]
    )

if __name__ == "__main__":
    # Script entry point: print a startup banner describing the Space
    # environment, then launch the Gradio app.
    print("\n" + "="*70)
    print("  GAIA BENCHMARK AGENT - STARTING UP ")
    print("="*70)

    space_host = os.getenv("SPACE_HOST")
    # Keep the raw environment value separate from the fallback; otherwise
    # the "not found" branch below could never be reached.
    env_space_id = os.getenv("SPACE_ID")
    space_id = env_space_id or "saandip5/Final_Assignment_Template"

    if space_host:
        print(f" SPACE_HOST found: {space_host}")
        # SPACE_HOST may already carry the ".hf.space" suffix; do not append
        # it a second time.
        if space_host.endswith(".hf.space"):
            runtime_host = space_host
        else:
            runtime_host = f"{space_host}.hf.space"
        print(f"   Runtime URL: https://{runtime_host}")
    else:
        print(" SPACE_HOST not found (running locally?)")

    if env_space_id:
        print(f" SPACE_ID found: {space_id}")
    else:
        print("  SPACE_ID not found (running locally?)")
        print(f"    Falling back to default Space: {space_id}")
    # space_id is always set here (environment value or fallback).
    print(f"    Repo URL: https://huggingface.co/spaces/{space_id}")
    print(f"    Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")

    print("="*70)
    print(" Launching Gradio Interface...")
    print("="*70 + "\n")

    demo.launch(debug=True, share=False)