""" MLE-Bench Workflow Simple wrapper for running SciEvo FullWorkflow on MLE-Bench competition tasks. MLE-Bench provides: - instructions.md: Specific task instructions (used as user_query) - description.md: Overall task background description This wrapper register models, reads these files, builds user_query, and invokes FullWorkflow. """ import sys from pathlib import Path # Add parent directory to path to find scievo and bench modules sys.path.insert(0, str(Path(__file__).parent.parent)) from loguru import logger from bench.register_models.gemini import ( register_gemini_low_medium_models, register_gemini_medium_high_models, ) from bench.register_models.gpt import ( register_gpt_low_medium_models, register_gpt_medium_high_models, ) from scievo.workflows.full_workflow import run_full_workflow def build_mlebench_user_query( instructions_path: Path, description_path: Path, ) -> tuple[str, str]: """ Build user query and data description from MLE-Bench task files. Args: instructions_path: Path to instructions.md description_path: Path to description.md Returns: Tuple of (user_query, data_desc) - user_query: Task instructions for the experiment - data_desc: Task description for data analysis context """ # Load instructions if not instructions_path.exists(): raise FileNotFoundError(f"Instructions file not found: {instructions_path}") instructions = instructions_path.read_text(encoding="utf-8") # Load description if not description_path.exists(): raise FileNotFoundError(f"Description file not found: {description_path}") description = description_path.read_text(encoding="utf-8") # Use instructions as user_query, description as data_desc user_query = instructions data_desc = description return user_query, data_desc if __name__ == "__main__": import argparse parser = argparse.ArgumentParser( description="MLE-Bench Workflow - Run SciEvo on MLE-Bench competition tasks", prog="python -m bench.mlebench_workflow", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Basic usage python -m bench.mlebench_workflow \\ -i competition/instructions.md \\ -d competition/description.md \\ --data competition/data \\ -w workspace # With custom settings python -m bench.mlebench_workflow \\ -i competition/instructions.md \\ -d competition/description.md \\ --data competition/data \\ -w workspace \\ --max-revisions 10 \\ --session-name my_experiment """, ) # Required arguments parser.add_argument( "--instructions", "-i", required=True, help="Path to instructions.md (task instructions)", ) parser.add_argument( "--description", "-d", required=True, help="Path to description.md (task background)", ) parser.add_argument( "--data", required=True, help="Path to the data directory or file", ) parser.add_argument( "--workspace", "-w", required=True, help="Workspace directory for the experiment", ) # Optional arguments parser.add_argument( "--repo-source", default=None, help="Optional repository source (local path or git URL)", ) parser.add_argument( "--max-revisions", type=int, default=3, help="Maximum revision loops (default: 3)", ) parser.add_argument( "--data-recursion-limit", type=int, default=512, help="Recursion limit for DataAgent (default: 512)", ) parser.add_argument( "--experiment-recursion-limit", type=int, default=512, help="Recursion limit for ExperimentAgent (default: 512)", ) parser.add_argument( "--session-name", default=None, help="Custom session name (otherwise uses timestamp)", ) parser.add_argument( "--models", choices=[ "gpt-low-medium", "gpt-medium-high", "gemini-low-medium", "gemini-medium-high", ], default="gemini-low-medium", help="Model configuration to use (default: gemini-low-medium)", ) args = parser.parse_args() # Register models based on choice logger.info(f"Registering models: {args.models}") match args.models: case "gpt-low-medium": register_gpt_low_medium_models() case "gpt-medium-high": register_gpt_medium_high_models() case "gemini-low-medium": register_gemini_low_medium_models() case "gemini-medium-high": register_gemini_medium_high_models() # Build user query and data description from MLE-Bench files logger.info("Building user query from MLE-Bench task files...") user_query, data_desc = build_mlebench_user_query( instructions_path=Path(args.instructions), description_path=Path(args.description), ) logger.info(f"User query built: {len(user_query)} chars") logger.info(f"Data description built: {len(data_desc)} chars") # Run FullWorkflow result = run_full_workflow( data_path=args.data, workspace_path=args.workspace, user_query=user_query, data_desc=data_desc, repo_source=args.repo_source, max_revisions=args.max_revisions, data_agent_recursion_limit=args.data_recursion_limit, experiment_agent_recursion_limit=args.experiment_recursion_limit, session_name=args.session_name, ) # Save summary result.save_summary() print(f"\nStatus: {result.final_status}")