scider / bench_workflows /scicodebench_workflow.py
leonardklin's picture
Upload 328 files
978fed5 verified
"""
Workflow for running coding tasks using Claude coding subagent.
"""
import sys
import tempfile
from pathlib import Path
from loguru import logger
# Add parent directory to path to find scider and bench modules
sys.path.insert(0, str(Path(__file__).parent.parent))
from bench_workflows.register_models.gemini import (
register_gemini3_medium_high_models,
register_gemini_low_medium_models,
register_gemini_medium_high_models,
)
from bench_workflows.register_models.gpt import (
register_gpt_low_medium_models,
register_gpt_medium_high_models,
)
from scider.agents.experiment_agent.coding_subagent_v3_claude import build
from scider.agents.experiment_agent.coding_subagent_v3_claude.state import ClaudeCodingAgentState
from scider.core.code_env import LocalEnv
def run_coding_workflow(user_query: str, workspace_dir: str | Path | None = None) -> str:
    """Run a single coding task through the Claude coding subagent graph.

    Args:
        user_query: The coding task description.
        workspace_dir: Working directory for the coding task. If None, a
            temporary directory is created with ``tempfile.mkdtemp``; this
            function does not remove it afterwards.

    Returns:
        The ``final_result`` stored by the last ``claude`` node found in the
        graph's intermediate states, or an empty string when no ``claude``
        node was recorded at all.

    Raises:
        AssertionError: If the last ``claude`` node carries no ``final_result``.
    """
    logger.info(f"Starting coding workflow with query: {user_query[:100]}...")

    # Create workspace environment
    if workspace_dir is None:
        workspace_dir = tempfile.mkdtemp(prefix="scider_coding_")
        logger.info(f"Using temporary workspace: {workspace_dir}")
    workspace = LocalEnv(working_dir=workspace_dir, create_dir_if_missing=True)

    # Create agent state
    coding_state = ClaudeCodingAgentState(
        user_query=user_query,
        workspace=workspace,
        data_summary="",  # No background data needed for simple coding tasks
        intermediate_full_output=True,  # Store full output in intermediate state
        skip_summary=True,  # Skip final summary to get full output directly
    )

    # Build and compile the graph
    coding_graph = build().compile()

    # Execute the workflow
    logger.info("Executing coding graph...")
    result_state = coding_graph.invoke(coding_state)

    # `or []` also covers the key being present but explicitly set to None.
    intermediate_states = result_state.get("intermediate_state") or []

    # Only the nodes named 'claude' carry the raw result we want.
    claude_states = [state for state in intermediate_states if state.get("node_name") == "claude"]
    if not claude_states:
        logger.warning("No claude node found in intermediate states")
        return ""

    # Take the output of the last claude node; tolerate a None raw result so
    # the failure below is reported with a clear message.
    raw_result = claude_states[-1].get("_raw_claude_result") or {}
    last_claude_output = raw_result.get("final_result")

    # Raised explicitly instead of using `assert`, which is removed under
    # `python -O`; the exception type is kept for existing callers.
    if last_claude_output is None:
        raise AssertionError("No final_result found in the last claude node output")
    return last_claude_output
if __name__ == "__main__":
    # Command-line entry point: parse arguments, register the chosen model
    # set, run the coding workflow, print the result and optionally save it.
    import argparse

    # Maps every accepted --models value to the function that registers that
    # model set. The argparse choices are derived from these keys so the two
    # can never drift apart.
    model_registrars = {
        "gpt-low-medium": register_gpt_low_medium_models,
        "gpt-medium-high": register_gpt_medium_high_models,
        "gemini-low-medium": register_gemini_low_medium_models,
        "gemini-medium-high": register_gemini_medium_high_models,
        "gemini3-medium-high": register_gemini3_medium_high_models,
    }

    parser = argparse.ArgumentParser(
        description="SciCodeBench Workflow - Run simple coding tasks using Claude coding subagent",
        prog="python -m bench_workflows.scicodebench_workflow",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run with inline query
python -m bench_workflows.scicodebench_workflow \\
--query "Create a function to calculate fibonacci numbers"
# Run with query from file
python -m bench_workflows.scicodebench_workflow \\
--query-file task.txt \\
--workspace ./my_workspace
# Use specific model configuration
python -m bench_workflows.scicodebench_workflow \\
--query "Implement a binary search tree" \\
--models gpt-medium-high
""",
    )

    # Query input (mutually exclusive, exactly one is required)
    query_group = parser.add_mutually_exclusive_group(required=True)
    query_group.add_argument(
        "--query",
        "-q",
        help="Coding task query (inline text)",
    )
    query_group.add_argument(
        "--query-file",
        "-f",
        help="Path to file containing the coding task query",
    )

    # Optional arguments
    parser.add_argument(
        "--workspace",
        "-w",
        default=None,
        help="Workspace directory for the coding task (default: creates temp directory)",
    )
    parser.add_argument(
        "--models",
        choices=list(model_registrars),
        default="gemini-low-medium",
        help="Model configuration to use (default: gemini-low-medium)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Optional output file to save the result",
    )
    args = parser.parse_args()

    # Register models based on choice; argparse has already validated the value.
    logger.info(f"Registering models: {args.models}")
    model_registrars[args.models]()

    # Get user query. Compare against None rather than truthiness so that an
    # explicitly empty --query is not mistaken for "use --query-file", which
    # would pass None to Path().
    if args.query is not None:
        user_query = args.query
    else:
        query_path = Path(args.query_file)
        if not query_path.exists():
            raise FileNotFoundError(f"Query file not found: {query_path}")
        user_query = query_path.read_text(encoding="utf-8")
    logger.info(f"User query length: {len(user_query)} chars")

    # Run coding workflow
    result = run_coding_workflow(
        user_query=user_query,
        workspace_dir=args.workspace,
    )

    # Print result
    print("\n" + "=" * 80)
    print("CODING WORKFLOW RESULT")
    print("=" * 80)
    print(result)
    print("=" * 80)

    # Save to file if requested (written once, and only when --output is given)
    if args.output:
        output_path = Path(args.output)
        output_path.write_text(result, encoding="utf-8")
        logger.info(f"Result saved to: {output_path}")