harry-lu-0708 commited on
Commit
0913c52
·
0 Parent(s):

clean HF Space commit (no binary history)

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. .gitignore +215 -0
  3. .pre-commit-config.yaml +22 -0
  4. .python-version +1 -0
  5. .vscode/settings.json +23 -0
  6. CLAUDE.md +257 -0
  7. Dockerfile +20 -0
  8. LICENSE +201 -0
  9. README.md +49 -0
  10. bench/mlebench_workflow.py +197 -0
  11. bench/register_models/gemini.py +180 -0
  12. bench/register_models/gpt.py +215 -0
  13. case-studies/case1/task.md +0 -0
  14. case-studies/case2/task.md +0 -0
  15. pyproject.toml +94 -0
  16. reasoning_bank/README.md +5 -0
  17. reasoning_bank/mem_induction.py +365 -0
  18. reasoning_bank/mem_manage.py +207 -0
  19. requirements.txt +3 -0
  20. scievo/__init__.py +0 -0
  21. scievo/agents/__init__.py +0 -0
  22. scievo/agents/critic_agent/__init__.py +2 -0
  23. scievo/agents/critic_agent/build.py +39 -0
  24. scievo/agents/critic_agent/execute.py +289 -0
  25. scievo/agents/critic_agent/state.py +24 -0
  26. scievo/agents/data_agent/__init__.py +2 -0
  27. scievo/agents/data_agent/build.py +178 -0
  28. scievo/agents/data_agent/execute.py +487 -0
  29. scievo/agents/data_agent/paper_subagent/__init__.py +10 -0
  30. scievo/agents/data_agent/paper_subagent/build.py +47 -0
  31. scievo/agents/data_agent/paper_subagent/execute.py +436 -0
  32. scievo/agents/data_agent/paper_subagent/state.py +27 -0
  33. scievo/agents/data_agent/plan.py +176 -0
  34. scievo/agents/data_agent/state.py +33 -0
  35. scievo/agents/dummy_agent.py +33 -0
  36. scievo/agents/experiment_agent/__init__.py +15 -0
  37. scievo/agents/experiment_agent/build.py +67 -0
  38. scievo/agents/experiment_agent/coding_subagent_v2/__init__.py +11 -0
  39. scievo/agents/experiment_agent/coding_subagent_v2/build.py +29 -0
  40. scievo/agents/experiment_agent/coding_subagent_v2/execute.py +161 -0
  41. scievo/agents/experiment_agent/coding_subagent_v2/state.py +160 -0
  42. scievo/agents/experiment_agent/coding_subagent_v3_claude/__init__.py +11 -0
  43. scievo/agents/experiment_agent/coding_subagent_v3_claude/build.py +29 -0
  44. scievo/agents/experiment_agent/coding_subagent_v3_claude/execute.py +189 -0
  45. scievo/agents/experiment_agent/coding_subagent_v3_claude/state.py +31 -0
  46. scievo/agents/experiment_agent/exec_subagent/__init__.py +12 -0
  47. scievo/agents/experiment_agent/exec_subagent/build.py +96 -0
  48. scievo/agents/experiment_agent/exec_subagent/execute.py +502 -0
  49. scievo/agents/experiment_agent/exec_subagent/state.py +57 -0
  50. scievo/agents/experiment_agent/execute.py +513 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # temporary files
210
+ tmp_*
211
+ rsync_tmp_*
212
+ .aider*
213
+ data_analysis.md
214
+ software-agent-sdk
215
+ env
.pre-commit-config.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v6.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: name-tests-test
8
+ - id: requirements-txt-fixer
9
+ - repo: https://github.com/pycqa/isort
10
+ rev: 5.13.2
11
+ hooks:
12
+ - id: isort
13
+ args: ["--profile", "black", "--line-length=100", "--python-version=310"]
14
+ - repo: https://github.com/psf/black
15
+ rev: 25.1.0
16
+ hooks:
17
+ - id: black
18
+ args: ["--line-length=100", "--target-version=py310"]
19
+ - repo: https://github.com/kynan/nbstripout
20
+ rev: 0.8.2
21
+ hooks:
22
+ - id: nbstripout
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.13
.vscode/settings.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sync-rsync.local": "/home/link/github/SciEvo",
3
+ "sync-rsync.remote": "klin07@astral-8xA30-jump:~/rsync/SciEvo",
4
+ // "sync-rsync.remote": "ubuntu@192.9.141.47:~/rsync/mle-bench/agents/scievo/SciEvo",
5
+ "sync-rsync.onSave": false,
6
+ "sync-rsync.onSaveIndividual": true,
7
+ "sync-rsync.delete": false,
8
+ "sync-rsync.exclude": [
9
+ ".vscode/",
10
+ "**/__pycache__/",
11
+ "**/*.pyc",
12
+ ".git/",
13
+ ".ipynb_checkpoints/",
14
+ "**/.venv/",
15
+ "**/.DS_Store",
16
+ "**/.pytest_cache/",
17
+ // tmp files
18
+ "**/tmp_*",
19
+ // project specific
20
+ "sth_large/"
21
+ ],
22
+ "sync-rsync.useWSL": true
23
+ }
CLAUDE.md ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ SciEvo is a multi-agent framework for automated scientific experimentation. It orchestrates data analysis and experimental code generation through specialized agents that can search papers, generate code, execute experiments, and maintain long-term memory of insights.
8
+
9
+ ## Setup and Environment
10
+
11
+ ### Initial Setup
12
+
13
+ ```bash
14
+ # Install dependencies (choose based on your platform)
15
+ # For macOS
16
+ uv sync --extra mac
17
+
18
+ # For CPU-only
19
+ uv sync --extra cpu
20
+
21
+ # For CUDA 12.8
22
+ uv sync --extra cu128
23
+
24
+ # Install pre-commit hooks
25
+ pip install pre-commit
26
+ pre-commit install
27
+ ```
28
+
29
+ ### Environment Configuration
30
+
31
+ Copy `.env.template` to `.env` and configure:
32
+
33
+ ```bash
34
+ cp .env.template .env
35
+ ```
36
+
37
+ Required environment variables:
38
+ - `OPENAI_API_KEY` - OpenAI API access
39
+ - `GEMINI_API_KEY` - Google Gemini API access
40
+ - `BRAIN_DIR` - Session storage location (default: `./tmp_brain`)
41
+
42
+ Optional configurations (see `.env.template` for full list):
43
+ - `REASONING_BANK_ENABLED` - Enable long-term memory consolidation
44
+ - `HISTORY_AUTO_COMPRESSION` - Auto-compress conversation history
45
+ - `CRITIC_ENABLED` - Enable agent output critique
46
+ - `CODING_AGENT_VERSION` - v2 or v3
47
+ - `AIDER_*` - Aider code editor configuration
48
+ - `OPENHANDS_MODEL` - Model for OpenHands integration
49
+
50
+ ### Code Formatting
51
+
52
+ This project uses:
53
+ - **black** (line length: 100, target: py310)
54
+ - **isort** (profile: black, line length: 100)
55
+ - **nbstripout** for cleaning notebooks
56
+
57
+ Run formatting manually:
58
+ ```bash
59
+ pre-commit run --all-files
60
+ ```
61
+
62
+ ## Running Workflows
63
+
64
+ ### Full Workflow (Data Analysis + Experiment)
65
+
66
+ ```bash
67
+ python -m scievo.run_workflow full <data_path> <workspace_path> "<user_query>" [repo_source]
68
+
69
+ # Example
70
+ python -m scievo.run_workflow full data.csv ./workspace "Train SVR model for regression"
71
+
72
+ # With options
73
+ python -m scievo.run_workflow full data.csv ./workspace "Train model" \
74
+ --data-recursion-limit 100 \
75
+ --experiment-recursion-limit 100 \
76
+ --session-name my_experiment
77
+ ```
78
+
79
+ ### Data Analysis Only
80
+
81
+ ```bash
82
+ python -m scievo.run_workflow data <data_path> <workspace_path> [--recursion-limit N] [--session-name NAME]
83
+
84
+ # Example
85
+ python -m scievo.run_workflow data data.csv ./workspace --session-name my_analysis
86
+ ```
87
+
88
+ ### Experiment Only (Requires Existing Analysis)
89
+
90
+ ```bash
91
+ python -m scievo.run_workflow experiment <workspace_path> "<user_query>" [data_analysis_path] [--recursion-limit N]
92
+
93
+ # Example (uses data_analysis.md from workspace)
94
+ python -m scievo.run_workflow experiment ./workspace "Train SVR model"
95
+
96
+ # With custom analysis file
97
+ python -m scievo.run_workflow experiment ./workspace "Train model" ./my_analysis.md
98
+ ```
99
+
100
+ ## Architecture Overview
101
+
102
+ ### Core Components
103
+
104
+ **`scievo/core/`** - Infrastructure and shared utilities
105
+ - `types.py` - Core message types, state management (ToolsetState, HistoryState, RBankState, ExecState)
106
+ - `brain.py` - Singleton session manager coordinating shared application state
107
+ - `llms.py` - Model registry with completion/response API wrappers (supports rate limiting, embeddings)
108
+ - `exec/` - Command execution sessions (SessionManager, PTYSession)
109
+ - `code_env.py` - Workspace context manager (LocalEnv)
110
+ - `utils.py` - TOON/JSON parsing, markdown extraction
111
+ - `constant.py` - Configuration flags and defaults
112
+
113
+ **`scievo/tools/`** - 20+ tool integrations
114
+ - Core: `fs_tool`, `shell_tool`, `exec_tool`
115
+ - Search: `arxiv_tool`, `dataset_search_tool`, `metric_search_tool`, `web_tool`
116
+ - Code: `coder_tool`, `cursor_tool`, `claude_code_tool`, `claude_agent_sdk_tool`, `openhands_tool`
117
+ - Other: `github_tool`, `ideation_tool`, `history_tool`, `state_tool`, `todo_tool`, `env_tool`
118
+ - Registry: `Tool` base class with JSON schemas, `ToolRegistry` singleton
119
+
120
+ **`scievo/agents/`** - Agent implementations using LangGraph
121
+ - `data_agent/` - Analyzes data, generates `data_analysis.md`, searches papers/datasets
122
+ - Flow: START → planner → gateway (router) → llm_chat/tool_calling/mem_extraction → replanner → finalize → END
123
+ - Sub-agents: `paper_subagent/` for academic search
124
+ - `experiment_agent/` - Generates and executes experimental code
125
+ - Flow: START → init → coding → exec → summary → analysis → revision_judge → END
126
+ - Sub-agents: CodingSubagent, ExecSubagent, SummarySubagent
127
+ - `ideation_agent/` - Research idea generation
128
+ - `critic_agent/` - Output quality review
129
+
130
+ **`scievo/workflows/`** - Workflow orchestration
131
+ - `full_workflow.py` - Chains DataAgent → ExperimentAgent
132
+ - `data_workflow.py` - Standalone DataAgent execution
133
+ - `experiment_workflow.py` - Standalone ExperimentAgent execution
134
+ - `run_workflow.py` - CLI entry point with three subcommands (backward compatibility layer)
135
+
136
+ **`scievo/prompts/`** - Prompt management
137
+ - `prompt_data.py` - Dataclass-based organization (DataPrompts, ExperimentPrompts, etc.)
138
+ - YAML files with Jinja2 templating for dynamic content
139
+
140
+ **`scievo/rbank/`** - ReasoningBank (Long-term Memory)
141
+ - `memo.py` - Persistent memory with embeddings for similarity search
142
+ - `subgraph/` - Memory consolidation subgraph
143
+ - Three memory tiers: short-term (session), long-term (cross-project), project-specific
144
+
145
+ ### Key Architectural Patterns
146
+
147
+ 1. **Singleton Pattern** - Brain, ModelRegistry, SessionManager, ToolRegistry ensure single instances
148
+ 2. **State Graph Pattern** (LangGraph) - Agents as stateful graphs with nodes (steps) and edges (transitions)
149
+ 3. **Sub-agent Composition** - Complex agents orchestrate specialized sub-agents
150
+ 4. **History Compression** - Automatic message summarization to manage token usage
151
+ 5. **Tool Registry** - Self-registering tools with JSON schemas for LLM consumption
152
+ 6. **Memory Consolidation** - Periodic extraction of insights into long-term, project, and short-term memory
153
+
154
+ ### Data Flow
155
+
156
+ ```
157
+ run_workflow.py CLI
158
+
159
+ FullWorkflow
160
+ ├─→ DataWorkflow
161
+ │ ├─→ DataAgent (planner → execution loop → finalize)
162
+ │ │ └─→ PaperSubagent (searches papers/datasets)
163
+ │ └─→ Output: data_analysis.md
164
+
165
+ └─→ ExperimentWorkflow
166
+ ├─→ ExperimentAgent (init → coding → exec → summary → revision loop)
167
+ │ ├─→ CodingSubagent
168
+ │ ├─→ ExecSubagent
169
+ │ └─→ SummarySubagent
170
+ └─→ Output: metrics, final_summary
171
+
172
+ All agents use: Brain, ModelRegistry, ToolRegistry, Prompts, ReasoningBank
173
+ ```
174
+
175
+ ## Development Guidelines
176
+
177
+ ### Agent State Management
178
+
179
+ Agents use LangGraph state objects that extend core state types:
180
+ - `HistoryState` - Message history with compression support
181
+ - `ToolsetState` - Available tools
182
+ - `RBankState` - Memory directories
183
+ - `ExecState` - Execution sessions
184
+
185
+ State is passed through node functions and updated via returns.
186
+
187
+ ### Adding New Tools
188
+
189
+ 1. Create tool in `scievo/tools/` directory
190
+ 2. Inherit from `Tool` base class
191
+ 3. Define `json_schema` property
192
+ 4. Implement tool logic
193
+ 5. Tool auto-registers on import via `ToolRegistry`
194
+
195
+ ### Working with Memory
196
+
197
+ - Enable via `REASONING_BANK_ENABLED=true` in `.env`
198
+ - Extraction frequency controlled by `MEM_EXTRACTION_ROUND_FREQ`
199
+ - Three directories: short-term, long-term (MEM_LONG_TERM_DIR), project (MEM_PROJECT_DIR)
200
+ - Memories stored as markdown with embeddings for retrieval
201
+
202
+ ### History Management
203
+
204
+ - Auto-compression enabled via `HISTORY_AUTO_COMPRESSION=true`
205
+ - Triggers at `HISTORY_AUTO_COMPRESSION_TOKEN_THRESHOLD` (default: 64000)
206
+ - Keeps `HISTORY_AUTO_COMPRESSION_KEEP_RATIO` (default: 0.33) of messages
207
+ - Compression patches stored in `HistoryState.history_patches`
208
+
209
+ ## File Locations
210
+
211
+ - Workflow implementations: `scievo/workflows/`
212
+ - Agent logic: `scievo/agents/{agent_name}/`
213
+ - Tool definitions: `scievo/tools/`
214
+ - Prompts: `scievo/prompts/` (YAML files) + `prompt_data.py` (dataclasses)
215
+ - Core infrastructure: `scievo/core/`
216
+ - Memory: Configured via `BRAIN_DIR`, `MEM_LONG_TERM_DIR`, `MEM_PROJECT_DIR`
217
+ - Generated outputs: Within workspace directory specified in CLI
218
+
219
+ ## Testing and Debugging
220
+
221
+ ### Jupyter Notebooks
222
+
223
+ Development notebooks are prefixed with `tmp_*`:
224
+ - `tmp_workflow_w_ideation.ipynb` - Full workflow with ideation
225
+ - `tmp_ideation_test.ipynb` - Ideation agent testing
226
+ - `tmp_paper_agent_test.ipynb` - Paper search testing
227
+ - Other `tmp_*.ipynb` files for component testing
228
+
229
+ ### Logging
230
+
231
+ Control verbosity via `.env`:
232
+ ```bash
233
+ LOGURU_LEVEL=DEBUG # or INFO
234
+ LOG_MEM_SUBGRAPH=true # Memory consolidation logs
235
+ LOG_SYSTEM_PROMPT=false # Show system prompts
236
+ ```
237
+
238
+ ### Running Partial Workflows
239
+
240
+ Use mode-specific commands for testing individual components:
241
+ ```bash
242
+ # Test only data analysis
243
+ python -m scievo.run_workflow data test_data/sample.csv ./debug_workspace
244
+
245
+ # Test experiment with existing analysis
246
+ python -m scievo.run_workflow experiment ./debug_workspace "Test query"
247
+ ```
248
+
249
+ ## Important Notes
250
+
251
+ - **Python Version**: Requires Python >=3.13 (see `pyproject.toml`)
252
+ - **Package Manager**: Uses `uv` for dependency management
253
+ - **PyTorch**: Platform-specific installation via custom indices (see `pyproject.toml` [tool.uv.sources])
254
+ - **Optional Dependencies**: OpenHands (`openhands-sdk`, `openhands-tools`) - enable via `SCIEVO_ENABLE_OPENHANDS`
255
+ - **Pre-commit Hooks**: Always run before committing to maintain code style
256
+ - **Temporary Files**: `tmp_*` directories and notebooks are for development, not production
257
+ - **Brain Directory**: Session state persists in `BRAIN_DIR` - can accumulate over time
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.13.5-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y \
6
+ build-essential \
7
+ curl \
8
+ git \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ COPY requirements.txt ./
12
+ COPY src/ ./src/
13
+
14
+ RUN pip3 install -r requirements.txt
15
+
16
+ EXPOSE 8501
17
+
18
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
+
20
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SciEvo
2
+
3
+ ```shell
4
+ # for cpu
5
+ uv sync --extra cpu
6
+
7
+ # for mac
8
+ uv sync --extra mac
9
+
10
+ # for gpu
11
+ uv sync --extra cu128
12
+ ```
13
+
14
+ Optional: install Claude Code (for `claude_code` toolset):
15
+
16
+ - Ensure the `claude` CLI is installed and authenticated on your machine.
17
+ - If your `claude` command needs extra flags, set `CLAUDE_CODE_CMD`, e.g.:
18
+
19
+ ```shell
20
+ export CLAUDE_CODE_CMD="claude"
21
+ ```
22
+
23
+ Optional: install Claude Agent SDK (for `claude_agent_sdk` toolset):
24
+
25
+ - Docs: `https://platform.claude.com/docs/en/agent-sdk/overview`
26
+ - Install:
27
+
28
+ ```shell
29
+ pip install claude-agent-sdk
30
+ export ANTHROPIC_API_KEY="..."
31
+ ```
32
+
33
+ ## Development Guide
34
+
35
+ First, install `pre-commit`:
36
+ ```shell
37
+ pip install pre-commit
38
+ ```
39
+
40
+ Install the `pre-commit` git hooks so code is formatted automatically on each commit:
41
+ ```shell
42
+ pre-commit install
43
+ ```
44
+
45
+ Then, copy `.env.template` to `.env` and fill in the necessary values.
46
+ ```
47
+ OPENAI_API_KEY=<your_openai_api_key>
48
+ GEMINI_API_KEY=<your_gemini_api_key>
49
+ ```
bench/mlebench_workflow.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLE-Bench Workflow
3
+
4
+ Simple wrapper for running SciEvo FullWorkflow on MLE-Bench competition tasks.
5
+
6
+ MLE-Bench provides:
7
+ - instructions.md: Specific task instructions (used as user_query)
8
+ - description.md: Overall task background description
9
+
10
+ This wrapper registers models, reads these files, builds user_query, and invokes FullWorkflow.
11
+ """
12
+
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ # Add parent directory to path to find scievo and bench modules
17
+ sys.path.insert(0, str(Path(__file__).parent.parent))
18
+
19
+ from loguru import logger
20
+
21
+ from bench.register_models.gemini import (
22
+ register_gemini_low_medium_models,
23
+ register_gemini_medium_high_models,
24
+ )
25
+ from bench.register_models.gpt import (
26
+ register_gpt_low_medium_models,
27
+ register_gpt_medium_high_models,
28
+ )
29
+ from scievo.workflows.full_workflow import run_full_workflow
30
+
31
+
32
def build_mlebench_user_query(
    instructions_path: Path,
    description_path: Path,
) -> tuple[str, str]:
    """Assemble the (user_query, data_desc) pair from MLE-Bench task files.

    Args:
        instructions_path: Location of ``instructions.md`` (task instructions).
        description_path: Location of ``description.md`` (task background).

    Returns:
        Tuple of ``(user_query, data_desc)``:
        - user_query: raw instructions text, used to drive the experiment
        - data_desc: raw description text, used as data-analysis context

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    # Validate both paths up front; instructions are checked first so the
    # error message names the same file the original flow would have.
    if not instructions_path.exists():
        raise FileNotFoundError(f"Instructions file not found: {instructions_path}")
    if not description_path.exists():
        raise FileNotFoundError(f"Description file not found: {description_path}")

    # The files are passed through verbatim — no templating or trimming.
    user_query = instructions_path.read_text(encoding="utf-8")
    data_desc = description_path.read_text(encoding="utf-8")
    return user_query, data_desc
63
+
64
+
65
if __name__ == "__main__":
    import argparse

    # CLI entry point: parse MLE-Bench task files, register the chosen model
    # tier, then run the full SciEvo workflow on the competition data.
    parser = argparse.ArgumentParser(
        description="MLE-Bench Workflow - Run SciEvo on MLE-Bench competition tasks",
        prog="python -m bench.mlebench_workflow",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python -m bench.mlebench_workflow \\
    -i competition/instructions.md \\
    -d competition/description.md \\
    --data competition/data \\
    -w workspace

  # With custom settings
  python -m bench.mlebench_workflow \\
    -i competition/instructions.md \\
    -d competition/description.md \\
    --data competition/data \\
    -w workspace \\
    --max-revisions 10 \\
    --session-name my_experiment
""",
    )

    # Required arguments
    parser.add_argument(
        "--instructions",
        "-i",
        required=True,
        help="Path to instructions.md (task instructions)",
    )
    parser.add_argument(
        "--description",
        "-d",
        required=True,
        help="Path to description.md (task background)",
    )
    parser.add_argument(
        "--data",
        required=True,
        help="Path to the data directory or file",
    )
    parser.add_argument(
        "--workspace",
        "-w",
        required=True,
        help="Workspace directory for the experiment",
    )

    # Optional arguments
    parser.add_argument(
        "--repo-source",
        default=None,
        help="Optional repository source (local path or git URL)",
    )
    parser.add_argument(
        "--max-revisions",
        type=int,
        default=3,
        help="Maximum revision loops (default: 3)",
    )
    parser.add_argument(
        "--data-recursion-limit",
        type=int,
        default=512,
        help="Recursion limit for DataAgent (default: 512)",
    )
    parser.add_argument(
        "--experiment-recursion-limit",
        type=int,
        default=512,
        help="Recursion limit for ExperimentAgent (default: 512)",
    )
    parser.add_argument(
        "--session-name",
        default=None,
        help="Custom session name (otherwise uses timestamp)",
    )
    parser.add_argument(
        "--models",
        choices=[
            "gpt-low-medium",
            "gpt-medium-high",
            "gemini-low-medium",
            "gemini-medium-high",
        ],
        default="gemini-low-medium",
        help="Model configuration to use (default: gemini-low-medium)",
    )

    args = parser.parse_args()

    # Register models based on choice.
    # argparse `choices` guarantees exactly one of the four cases matches,
    # so no fallthrough case is needed.
    logger.info(f"Registering models: {args.models}")
    match args.models:
        case "gpt-low-medium":
            register_gpt_low_medium_models()
        case "gpt-medium-high":
            register_gpt_medium_high_models()
        case "gemini-low-medium":
            register_gemini_low_medium_models()
        case "gemini-medium-high":
            register_gemini_medium_high_models()

    # Build user query and data description from MLE-Bench files
    # (instructions.md -> user_query, description.md -> data_desc).
    logger.info("Building user query from MLE-Bench task files...")
    user_query, data_desc = build_mlebench_user_query(
        instructions_path=Path(args.instructions),
        description_path=Path(args.description),
    )
    logger.info(f"User query built: {len(user_query)} chars")
    logger.info(f"Data description built: {len(data_desc)} chars")

    # Run FullWorkflow — this is the long-running part (data analysis,
    # experiment loop, revision loop).
    result = run_full_workflow(
        data_path=args.data,
        workspace_path=args.workspace,
        user_query=user_query,
        data_desc=data_desc,
        repo_source=args.repo_source,
        max_revisions=args.max_revisions,
        data_agent_recursion_limit=args.data_recursion_limit,
        experiment_agent_recursion_limit=args.experiment_recursion_limit,
        session_name=args.session_name,
    )

    # Save summary — persists a run summary into the workspace.
    result.save_summary()

    print(f"\nStatus: {result.final_status}")
bench/register_models/gemini.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ from pydantic import BaseModel
7
+
8
+ from scievo.core.llms import ModelRegistry
9
+
10
+ LOW_COST_MODEL = "gemini/gemini-2.5-flash-lite"
11
+ MEDIUM_COST_MODEL = "gemini/gemini-2.5-flash"
12
+ HIGH_COST_MODEL = "gemini/gemini-2.5-pro"
13
+
14
+ OPENAI_KEY = os.getenv("OPENAI_API_KEY")
15
+ GEMINI_KEY = os.getenv("GEMINI_API_KEY")
16
+
17
+
18
def register_gemini_low_medium_models(reasoning: str = "low"):
    """Register the low/medium-cost Gemini model lineup in the ModelRegistry.

    Args:
        reasoning: Reasoning-effort level used by the planning, critic and
            experiment roles; memory/embedding/history roles keep defaults.
    """
    # One spec per agent role, registered in the same order as before.
    # NOTE: Use OpenAI embeddings for better performance (the "embed" role).
    role_specs = [
        dict(name="data", model=LOW_COST_MODEL, api_key=GEMINI_KEY, reasoning_effort=reasoning),
        dict(name="plan", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning, temperature=0.3, top_p=0.9),
        dict(name="critic", model=LOW_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning, temperature=0.3, top_p=0.9),
        dict(name="mem", model=LOW_COST_MODEL, api_key=GEMINI_KEY),
        dict(name="embed", model="text-embedding-3-small", api_key=OPENAI_KEY),
        dict(name="history", model=LOW_COST_MODEL, api_key=GEMINI_KEY),
        dict(name="experiment_agent", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning),
        dict(name="experiment_coding", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning),
        dict(name="experiment_execute", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning),
        dict(name="experiment_monitor", model=LOW_COST_MODEL, api_key=GEMINI_KEY,
             temperature=0.3, top_p=0.9),
        dict(name="experiment_summary", model=LOW_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort="low"),
    ]
    for spec in role_specs:
        ModelRegistry.register(**spec)
99
+
100
+
101
def register_gemini_medium_high_models(reasoning: str = "low"):
    """Register the medium/high-cost Gemini model lineup in the ModelRegistry.

    Args:
        reasoning: Reasoning-effort level used by the planning, critic and
            experiment roles; memory/embedding/history roles keep defaults.
    """
    # One spec per agent role, registered in the same order as before.
    # The "embed" role deliberately uses OpenAI embeddings.
    role_specs = [
        dict(name="data", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY, reasoning_effort=reasoning),
        dict(name="plan", model=HIGH_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning, temperature=0.3, top_p=0.9),
        dict(name="critic", model=HIGH_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning, temperature=0.3, top_p=0.9),
        dict(name="mem", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY),
        dict(name="embed", model="text-embedding-3-small", api_key=OPENAI_KEY),
        dict(name="history", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY),
        dict(name="experiment_agent", model=HIGH_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning),
        dict(name="experiment_coding", model=HIGH_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning),
        dict(name="experiment_execute", model=HIGH_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort=reasoning),
        dict(name="experiment_monitor", model=MEDIUM_COST_MODEL, api_key=GEMINI_KEY,
             temperature=0.3, top_p=0.9),
        dict(name="experiment_summary", model=HIGH_COST_MODEL, api_key=GEMINI_KEY,
             reasoning_effort="low"),
    ]
    for spec in role_specs:
        ModelRegistry.register(**spec)
bench/register_models/gpt.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ from pydantic import BaseModel
7
+
8
+ from scievo.core.llms import ModelRegistry
9
+
10
+ LOW_COST_MODEL = "gpt-5-nano"
11
+ MEDIUM_COST_MODEL = "gpt-5-mini"
12
+ HIGH_COST_MODEL = "gpt-5.2"
13
+
14
+ OPENAI_KEY = os.getenv("OPENAI_API_KEY")
15
+
16
+
17
def register_gpt_low_medium_models(reasoning: str = "low"):
    """Register the low/medium-cost GPT model lineup in the ModelRegistry.

    Args:
        reasoning: Reasoning effort for the data/plan/critic/experiment roles;
            background roles (mem/history/monitor) are pinned to "minimal".
    """
    # One spec per agent role, registered in the original order.  Only the
    # "data" role requests detailed reasoning summaries.
    role_specs = [
        dict(name="data", model=LOW_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning, "summary": "detailed"}),
        dict(name="plan", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="critic", model=LOW_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="mem", model=LOW_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "minimal"}),
        # NOTE: Use OpenAI embeddings for better performance
        dict(name="embed", model="text-embedding-3-small", api_key=OPENAI_KEY),
        dict(name="history", model=LOW_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "minimal"}),
        dict(name="experiment_agent", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="experiment_coding", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="experiment_execute", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="experiment_monitor", model=LOW_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "minimal"}),
        dict(name="experiment_summary", model=LOW_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "low"}),
    ]
    for spec in role_specs:
        ModelRegistry.register(**spec)
116
+
117
+
118
def register_gpt_medium_high_models(reasoning: str = "low"):
    """Register the medium/high-cost GPT model lineup in the ModelRegistry.

    Args:
        reasoning: Reasoning effort for the data/plan/critic/experiment roles;
            background roles (mem/history/monitor) are pinned to "minimal".
    """
    # One spec per agent role, registered in the original order.  Only the
    # "data" role requests detailed reasoning summaries.
    role_specs = [
        dict(name="data", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning, "summary": "detailed"}),
        dict(name="plan", model=HIGH_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="critic", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="mem", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "minimal"}),
        dict(name="embed", model="text-embedding-3-small", api_key=OPENAI_KEY),
        dict(name="history", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "minimal"}),
        dict(name="experiment_agent", model=HIGH_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="experiment_coding", model=HIGH_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="experiment_execute", model=HIGH_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": reasoning}),
        dict(name="experiment_monitor", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "minimal"}),
        dict(name="experiment_summary", model=MEDIUM_COST_MODEL, api_key=OPENAI_KEY,
             reasoning={"effort": "low"}),
    ]
    for spec in role_specs:
        ModelRegistry.register(**spec)
case-studies/case1/task.md ADDED
File without changes
case-studies/case2/task.md ADDED
File without changes
pyproject.toml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "scievo"
7
+ version = "0.1.0"
8
+ description = "Add your description here"
9
+ readme = "README.md"
10
+ requires-python = ">=3.13"
11
+ dependencies = [
12
+ "beautifulsoup4>=4.14.2",
13
+ "ddgs>=9.6.1",
14
+ "epam-indigo==1.35.0",
15
+ "feedparser>=6.0.12",
16
+ "filetype>=1.2.0",
17
+ "jinja2>=3.1.6",
18
+ "json-repair>=0.53.0",
19
+ "langchain-text-splitters>=1.0.0",
20
+ "langgraph>=1.0.2",
21
+ "litellm>=1.79.0,<1.80.0",
22
+ "loguru>=0.7.3",
23
+ "numpy>=2.3.4",
24
+ "openhands-sdk==1.3.0",
25
+ "openhands-tools==1.3.0",
26
+ "pandas>=2.3.3",
27
+ "pexpect>=4.9.0",
28
+ "pillow>=12.0.0",
29
+ "pydantic>=2.12.3",
30
+ "pyfunctional>=1.5.0",
31
+ "python-toon>=0.1.2",
32
+ "pyyaml>=6.0.3",
33
+ "rich>=14.2.0",
34
+ "scikit-learn>=1.8.0",
35
+ "tiktoken>=0.12.0",
36
+ ]
37
+
38
+ [dependency-groups]
39
+ dev = [
40
+ "jupyterlab>=4.4.10",
41
+ ]
42
+
43
+ [project.optional-dependencies]
44
+ cpu = [
45
+ "torch>=2.9.0",
46
+ "torchvision",
47
+ ]
48
+ cu128 = [
49
+ "torch>=2.9.0",
50
+ "torchvision",
51
+ ]
52
+ mac = [
53
+ "torch>=2.9.0",
54
+ "torchvision",
55
+ ]
56
+
57
+ [tool.setuptools]
58
+ packages = { find = { include = ["scievo", "scievo.*"] } }
59
+
60
+ [tool.uv]
61
+ conflicts = [
62
+ [
63
+ { extra = "cpu" },
64
+ { extra = "cu128" },
65
+ { extra = "mac" },
66
+ ],
67
+ ]
68
+
69
+ [tool.uv.sources]
70
+ torch = [
71
+ { index = "pytorch-cpu", extra = "cpu" },
72
+ { index = "pytorch-cu128", extra = "cu128" },
73
+ { index = "pytorch-mac", extra = "mac" },
74
+ ]
75
+ torchvision = [
76
+ { index = "pytorch-cpu", extra = "cpu" },
77
+ { index = "pytorch-cu128", extra = "cu128" },
78
+ { index = "pytorch-mac", extra = "mac" },
79
+ ]
80
+
81
+ [[tool.uv.index]]
82
+ name = "pytorch-cpu"
83
+ url = "https://download.pytorch.org/whl/cpu"
84
+ explicit = true
85
+
86
+ [[tool.uv.index]]
87
+ name = "pytorch-cu128"
88
+ url = "https://download.pytorch.org/whl/cu128"
89
+ explicit = true
90
+
91
+ [[tool.uv.index]]
92
+ name = "pytorch-mac"
93
+ url = "https://pypi.org/simple"
94
+ explicit = true
reasoning_bank/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # ReasoningBank Core (Reference Copy)
2
+
3
+ - This directory contains a minimal copy of the core component of Google's ReasoningBank.
4
+ - It is included only as a reminder/reference of what their core looks like.
5
+ - For the complete, authoritative source and updates, please refer to the original Google project.
reasoning_bank/mem_induction.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ """Run mini-SWE-agent on SWE-bench instances in batch mode."""
4
+ # Read this first: https://mini-swe-agent.com/latest/usage/swebench/ (usage docs)
5
+
6
+ import concurrent.futures
7
+ import json
8
+ import os
9
+ import random
10
+ import re
11
+ import threading
12
+ import time
13
+ import traceback
14
+ from pathlib import Path
15
+
16
+ import typer
17
+ import yaml
18
+ from datasets import load_dataset
19
+ from google import genai
20
+ from google.genai.types import GenerateContentConfig, HttpOptions
21
+ from jinja2 import Template
22
+ from minisweagent import Environment
23
+ from minisweagent.agents.default import DefaultAgent
24
+ from minisweagent.config import builtin_config_dir, get_config_path
25
+ from minisweagent.environments import get_environment
26
+ from minisweagent.memory.instruction import FAILED_SI, SUCCESSFUL_SI
27
+ from minisweagent.memory.memory_management import select_memory
28
+ from minisweagent.models import get_model
29
+ from minisweagent.run.extra.utils.batch_progress import RunBatchProgressManager
30
+ from minisweagent.run.utils.save import save_traj
31
+ from minisweagent.utils.log import add_file_handler, logger
32
+ from rich.live import Live
33
+
34
+ client = genai.Client(http_options=HttpOptions(api_version="v1"))
35
+
36
+ _HELP_TEXT = """Run mini-SWE-agent on SWEBench instances.
37
+
38
+ [not dim]
39
+ More information about the usage: [bold green]https://mini-swe-agent.com/latest/usage/swebench/[/bold green]
40
+ [/not dim]
41
+ """
42
+
43
+ app = typer.Typer(rich_markup_mode="rich", add_completion=False)
44
+
45
+ DATASET_MAPPING = {
46
+ "full": "princeton-nlp/SWE-Bench",
47
+ "verified": "princeton-nlp/SWE-Bench_Verified",
48
+ "lite": "princeton-nlp/SWE-Bench_Lite",
49
+ "multimodal": "princeton-nlp/SWE-Bench_Multimodal",
50
+ "multilingual": "swe-bench/SWE-Bench_Multilingual",
51
+ "smith": "SWE-bench/SWE-smith",
52
+ "_test": "klieret/swe-bench-dummy-test-dataset",
53
+ }
54
+
55
+
56
+ _OUTPUT_FILE_LOCK = threading.Lock()
57
+
58
+
59
class ProgressTrackingAgent(DefaultAgent):
    """DefaultAgent variant that reports per-step progress to a batch manager."""

    def __init__(
        self, *args, progress_manager: RunBatchProgressManager, instance_id: str = "", **kwargs
    ):
        super().__init__(*args, **kwargs)
        # Stored for use on every step() call.
        self.instance_id = instance_id
        self.progress_manager: RunBatchProgressManager = progress_manager

    def step(self) -> dict:
        """Publish the upcoming step number and accrued cost, then delegate."""
        upcoming_step = self.model.n_calls + 1
        self.progress_manager.update_instance_status(
            self.instance_id, f"Step {upcoming_step:3d} (${self.model.cost:.2f})"
        )
        return super().step()
75
+
76
+
77
def get_swebench_docker_image_name(instance: dict) -> str:
    """Resolve the Docker image name for a SWEBench instance.

    Prefers the instance's explicit ``image_name`` field; otherwise derives
    the canonical swebench evaluation image tag from the instance id.
    """
    explicit_name = instance.get("image_name")
    if explicit_name is not None:
        return explicit_name
    # Docker doesn't allow double underscore, so we replace them with a
    # magic token ("_1776_") before building the tag.
    sanitized_id = instance["instance_id"].replace("__", "_1776_")
    return f"swebench/sweb.eval.x86_64.{sanitized_id}:latest".lower()
86
+
87
+
88
def get_sb_environment(config: dict, instance: dict) -> Environment:
    """Build the execution environment (docker or singularity) for one instance.

    Mutates ``config["environment"]`` in place: fills in the environment
    class (defaulting to "docker") and the resolved image reference, then
    instantiates the environment and runs the optional startup command.

    Raises:
        RuntimeError: If the configured startup command exits non-zero.
    """
    env_config = config.setdefault("environment", {})
    env_config["environment_class"] = env_config.get("environment_class", "docker")
    image_name = get_swebench_docker_image_name(instance)
    if env_config["environment_class"] == "docker":
        env_config["image"] = image_name
    elif env_config["environment_class"] == "singularity":
        # Singularity pulls docker-hosted images via the docker:// URI scheme.
        env_config["image"] = "docker://" + image_name
    env = get_environment(env_config)
    # Optional per-run startup command, Jinja-templated with instance fields
    # (NOTE(review): presumably repo/commit placeholders — confirm in the
    # YAML configs that use `run.env_startup_command`).
    if startup_command := config.get("run", {}).get("env_startup_command"):
        startup_command = Template(startup_command).render(**instance)
        out = env.execute(startup_command)
        if out["returncode"] != 0:
            raise RuntimeError(f"Error executing startup command: {out}")
    return env
103
+
104
+
105
def update_preds_file(output_path: Path, instance_id: str, model_name: str, result: str):
    """Record one instance's prediction in the shared preds JSON file."""
    # Serialize read-modify-write cycles so concurrent worker threads
    # cannot clobber each other's entries.
    with _OUTPUT_FILE_LOCK:
        preds = json.loads(output_path.read_text()) if output_path.exists() else {}
        preds[instance_id] = {
            "model_name_or_path": model_name,
            "instance_id": instance_id,
            "model_patch": result,
        }
        output_path.write_text(json.dumps(preds, indent=2))
117
+
118
+
119
def remove_from_preds_file(output_path: Path, instance_id: str):
    """Remove an instance's entry from the predictions file, if present.

    Args:
        output_path: Path to the shared ``preds.json`` file.
        instance_id: SWEBench instance id whose entry should be dropped.
    """
    with _OUTPUT_FILE_LOCK:
        # Existence is checked *inside* the lock.  The previous version
        # checked before acquiring it, so a concurrent writer could delete
        # or recreate the file between the check and the locked read (TOCTOU).
        if not output_path.exists():
            return
        output_data = json.loads(output_path.read_text())
        if instance_id in output_data:
            del output_data[instance_id]
            # Only rewrite when something actually changed.
            output_path.write_text(json.dumps(output_data, indent=2))
128
+
129
+
130
def llm_generate(
    prompt: str | list[dict], model: str, verbose: bool = False, si: str | None = None
) -> list[str]:
    """Call the Gemini model and return the response split into paragraphs.

    Fixes vs. previous version: the return annotation said ``-> str`` but the
    function returns a list of chunks, ``si`` was implicit-optional, and the
    docstring wrongly referred to a "gpt model" while the Gemini client is used.

    Args:
        prompt: Content passed to ``generate_content`` (callers in this module
            pass a single trajectory string).
        model: Gemini model name to invoke.
        verbose: When True, print the prompt and the raw response text.
        si: Optional system instruction; stripped of surrounding whitespace.

    Returns:
        The response text split on blank lines (one memory item per chunk).
    """
    if verbose:
        print("Prompt:\n", prompt, "\n\n")
    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=GenerateContentConfig(
            temperature=1.0,
            max_output_tokens=65536,
            system_instruction=si.strip() if si else None,
        ),
    )
    text = response.text
    if verbose:
        print(text)
    return text.split("\n\n")
147
+
148
+
149
def process_instance(
    instance: dict,
    output_dir: Path,
    config: dict,
    progress_manager: RunBatchProgressManager,
) -> None:
    """Process a single SWEBench instance.

    Runs the agent on the instance inside its docker/singularity environment,
    saves the trajectory and prediction, then judges the run and appends
    induced memory items to the per-model memory bank.
    """
    instance_id = instance["instance_id"]
    instance_dir = output_dir / instance_id
    # avoid inconsistent state if something here fails and there's leftover previous files
    remove_from_preds_file(output_dir / "preds.json", instance_id)
    (instance_dir / f"{instance_id}.traj.json").unlink(missing_ok=True)
    model = get_model(config=config.get("model", {}))
    task = instance["problem_statement"]

    # Memory bank is a per-model JSONL file of previously induced items.
    # NOTE(review): this assumes ./memory already exists — open() will fail
    # with FileNotFoundError otherwise; confirm the directory is pre-created.
    if not os.path.exists(f"./memory/{model.config.model_name}.jsonl"):
        open(f"./memory/{model.config.model_name}.jsonl", "w").close()  # create an empty file

    with open(f"./memory/{model.config.model_name}.jsonl", "r") as f:
        memory_bank = [json.loads(line) for line in f.readlines()]

    # Retrieve the top-1 most relevant past memory for this query.
    res = select_memory(
        1,
        reasoning_bank=memory_bank,
        cur_query=task,
        task_id=instance_id,
        cache_path=f"./memory/{model.config.model_name}_embeddings.jsonl",
        prefer_model="gemini",
    )

    # Flatten the retrieved entries' memory items into one prompt string.
    if not res:
        selected_memory = ""
    else:
        mem_items = []
        for item in res:
            for i in item["memory_items"]:
                mem_items.append(i)
        selected_memory = "\n\n".join(mem_items)

    progress_manager.on_instance_start(instance_id)
    progress_manager.update_instance_status(instance_id, "Pulling/starting docker")

    agent = None
    extra_info = None

    try:
        env = get_sb_environment(config, instance)
        agent = ProgressTrackingAgent(
            model,
            env,
            progress_manager=progress_manager,
            instance_id=instance_id,
            **config.get("agent", {}),
        )
        exit_status, result = agent.run(task, selected_memory=selected_memory)
    except Exception as e:
        # Any failure is recorded as the "result" so the run still produces
        # a trajectory + preds entry instead of aborting the batch.
        logger.error(f"Error processing instance {instance_id}: {e}", exc_info=True)
        exit_status, result = type(e).__name__, str(e)
        extra_info = {"traceback": traceback.format_exc()}
    finally:
        # Persist the trajectory and prediction regardless of success/failure.
        save_traj(
            agent,
            instance_dir / f"{instance_id}.traj.json",
            exit_status=exit_status,
            result=result,
            extra_info=extra_info,
            instance_id=instance_id,
            print_fct=logger.info,
        )
        update_preds_file(output_dir / "preds.json", instance_id, model.config.model_name, result)
        progress_manager.on_instance_end(instance_id, exit_status)

    # read trajectory and extract memory
    with open(instance_dir / f"{instance_id}.traj.json", "r") as f:
        messages = json.load(f)["messages"]
    trajectory = "\n".join([m["content"] for m in messages if m["role"] != "system"])
    # LLM-as-judge decides which induction prompt (success vs. failure) to use.
    status = llm_judge_status(task, trajectory, model.config.model_name)

    trajectory = f"**Query:** {task}\n\n**Trajectory:**\n{trajectory}"
    if status:
        generated_memory_item = llm_generate(
            trajectory, model.config.model_name, True, si=SUCCESSFUL_SI
        )
    else:
        generated_memory_item = llm_generate(
            trajectory, model.config.model_name, True, si=FAILED_SI
        )

    # Append the induced memory items to the bank for future retrievals.
    with open(f"./memory/{model.config.model_name}.jsonl", "a") as f:
        f.write(
            json.dumps(
                {
                    "task_id": instance_id,
                    "query": task,
                    "memory_items": generated_memory_item,
                    "status": "success" if status else "fail",
                }
            )
            + "\n"
        )
249
+
250
+
251
def llm_judge_status(task: str, trajectory: str, model: str) -> bool:
    """Ask the LLM to judge whether the agent completed the task.

    Fixes vs. previous version: the return annotation said ``-> str`` but
    the function returns a boolean.

    Args:
        task: The original problem statement.
        trajectory: The agent's full (non-system) message transcript.
        model: Gemini model name used as the judge.

    Returns:
        True when the judge's reply contains "success", False otherwise.
    """
    prompt = f"Task: {task}\n\nTrajectory:\n{trajectory}\n\nDid the agent successfully complete the task? Answer with 'success' or 'fail' only."
    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=GenerateContentConfig(
            temperature=0.0,
            max_output_tokens=65536,
            system_instruction="You are a helpful assistant that judges whether the agent successfully completed the task.",
        ),
    )
    verdict = response.text.strip().lower()
    # NOTE(review): a reply like "not a success" would still count as True;
    # kept as-is to preserve the original substring semantics.
    return "success" in verdict
267
+
268
+
269
def filter_instances(
    instances: list[dict], *, filter_spec: str, slice_spec: str = "", shuffle: bool = False
) -> list[dict]:
    """Select SWEBench instances by regex filter, optional slice and shuffle."""
    if shuffle:
        # Sort first so the fixed-seed shuffle is reproducible regardless of
        # the incoming order.
        instances = sorted(instances, key=lambda inst: inst["instance_id"])
        random.seed(42)
        random.shuffle(instances)

    n_before = len(instances)
    instances = [inst for inst in instances if re.match(filter_spec, inst["instance_id"])]
    n_filtered = len(instances)
    if n_filtered != n_before:
        logger.info(f"Instance filter: {n_before} -> {n_filtered} instances")

    if slice_spec:
        # "a:b" / ":b" / "a:" style slices; empty parts become None.
        bounds = [int(part) if part else None for part in slice_spec.split(":")]
        instances = instances[slice(*bounds)]
        n_sliced = len(instances)
        if n_sliced != n_before:
            logger.info(f"Instance slice: {n_before} -> {n_sliced} instances")
    return instances
289
+
290
+
291
# fmt: off
@app.command(help=_HELP_TEXT)
def main(
    subset: str = typer.Option("lite", "--subset", help="SWEBench subset to use or path to a dataset", rich_help_panel="Data selection"),
    split: str = typer.Option("dev", "--split", help="Dataset split", rich_help_panel="Data selection"),
    slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g., '0:5' for first 5 instances)", rich_help_panel="Data selection"),
    filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex", rich_help_panel="Data selection"),
    shuffle: bool = typer.Option(False, "--shuffle", help="Shuffle instances", rich_help_panel="Data selection"),
    output: str = typer.Option("", "-o", "--output", help="Output directory", rich_help_panel="Basic"),
    workers: int = typer.Option(1, "-w", "--workers", help="Number of worker threads for parallel processing", rich_help_panel="Basic"),
    model: str | None = typer.Option(None, "-m", "--model", help="Model to use", rich_help_panel="Basic"),
    # BUG FIX: removed the "-c" short flag here -- it collided with the "-c" used
    # by --config below, which click/typer rejects as a duplicate option string.
    model_class: str | None = typer.Option(None, "--model-class", help="Model class to use (e.g., 'anthropic' or 'minisweagent.models.anthropic.AnthropicModel')", rich_help_panel="Advanced"),
    redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances", rich_help_panel="Data selection"),
    config_spec: Path = typer.Option(builtin_config_dir / "extra" / "swebench.yaml", "-c", "--config", help="Path to a config file", rich_help_panel="Basic"),
    environment_class: str | None = typer.Option(None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
) -> None:
    # fmt: on
    # Run the agent over a filtered/sliced set of SWEBench instances in parallel,
    # writing predictions, logs, and exit statuses into the output directory.
    output_path = Path(output)
    output_path.mkdir(parents=True, exist_ok=True)
    logger.info(f"Results will be saved to {output_path}")
    add_file_handler(output_path / "minisweagent.log")

    # Resolve either a known subset alias or a literal dataset path.
    dataset_path = DATASET_MAPPING.get(subset, subset)
    logger.info(f"Loading dataset {dataset_path}, split {split}...")
    instances = list(load_dataset(dataset_path, split=split))

    instances = filter_instances(instances, filter_spec=filter_spec, slice_spec=slice_spec, shuffle=shuffle)
    # Resume support: skip instances that already have predictions on disk
    # unless the user explicitly asked to redo them.
    if not redo_existing and (output_path / "preds.json").exists():
        existing_instances = list(json.loads((output_path / "preds.json").read_text()).keys())
        logger.info(f"Skipping {len(existing_instances)} existing instances")
        instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]
    logger.info(f"Running on {len(instances)} instances...")

    # CLI overrides take precedence over what the YAML config file specifies.
    config = yaml.safe_load(get_config_path(config_spec).read_text())
    if environment_class is not None:
        config.setdefault("environment", {})["environment_class"] = environment_class
    if model is not None:
        config.setdefault("model", {})["model_name"] = model
    if model_class is not None:
        config.setdefault("model", {})["model_class"] = model_class

    progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")

    def process_futures(futures: dict[concurrent.futures.Future, str]):
        """Drain all futures; report uncaught exceptions without aborting the batch."""
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except concurrent.futures.CancelledError:
                # Cancelled by the KeyboardInterrupt handler below -- not an error.
                pass
            except Exception as e:
                instance_id = futures[future]
                logger.error(f"Error in future for instance {instance_id}: {e}", exc_info=True)
                progress_manager.on_uncaught_exception(instance_id, e)

    with Live(progress_manager.render_group, refresh_per_second=4):
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(process_instance, instance, output_path, config, progress_manager): instance[
                    "instance_id"
                ]
                for instance in instances
            }
            try:
                process_futures(futures)
            except KeyboardInterrupt:
                # First ^C: cancel everything not yet started, then wait for the
                # already-running instances to finish cleanly.
                logger.info("Cancelling all pending jobs. Press ^C again to exit immediately.")
                for future in futures:
                    if not future.running() and not future.done():
                        future.cancel()
                process_futures(futures)
362
+
363
+
364
if __name__ == "__main__":
    # Script entry point: dispatch to the typer application defined above.
    app()
reasoning_bank/mem_manage.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+
4
+ # from transformers import AutoTokenizer, AutoModel
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Dict, List, Tuple
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch import Tensor
13
+
14
+ logger = logging.getLogger(__name__)
15
+ from google import genai
16
+ from google.genai.types import EmbedContentConfig
17
+ from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
18
+
19
+ client = genai.Client()
20
+
21
+
22
def get_embeddings(texts: list) -> list:
    """Embed *texts* with Google GenAI's gemini-embedding-001 model.

    Performs a network call through the module-level ``client``; returns one
    3072-dimensional embedding per input text, in input order.
    """
    embed_config = EmbedContentConfig(
        task_type="RETRIEVAL_DOCUMENT",
        output_dimensionality=3072,
        title="Memory Embeddings",
    )
    response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=texts,
        config=embed_config,
    )
    return [embedding.embedding for embedding in response.embeddings]
36
+
37
+
38
def l2_normalize(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """Scale *x* to unit L2 norm along *dim* (thin wrapper over F.normalize)."""
    normalized = F.normalize(x, p=2, dim=dim)
    return normalized
40
+
41
+
42
def embed_query_with_qwen(query: str) -> torch.Tensor:
    """Embed *query* with Qwen3-Embedding-8B using mean pooling.

    Returns:
        A (1, D) L2-normalized float tensor on CPU. (The previous annotation
        promised a (tensor, model_name, dim) tuple, but only the tensor was
        ever returned.)

    Note:
        The tokenizer and model are loaded on every call, which is very slow;
        cache them at module level if this becomes a hot path.
    """
    # BUG FIX: the module-level `from transformers import AutoTokenizer, AutoModel`
    # is commented out, so these names were undefined at call time (NameError).
    # Import lazily here so transformers stays an optional dependency.
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-8B", padding_side="left")
    model = AutoModel.from_pretrained("Qwen/Qwen3-Embedding-8B")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    batch = tokenizer([query], max_length=1024, padding=True, truncation=True, return_tensors="pt")
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        out = model(**batch)
    last_hidden = out.last_hidden_state  # (1, L, D)
    # Mean-pool only over non-padding positions.
    masked = last_hidden.masked_fill(~batch["attention_mask"][..., None].bool(), 0.0)
    pooled = masked.sum(dim=1) / batch["attention_mask"].sum(dim=1)[..., None]  # (1, D)
    pooled = pooled.to("cpu")
    pooled = l2_normalize(pooled, dim=1)
    return pooled
59
+
60
+
61
def embed_query_with_gemini(
    query: str, dimensionality: int = 3072
) -> torch.Tensor:
    """Embed *query* with Vertex AI's gemini-embedding-001 model.

    Returns a (1, D) float32 torch tensor on CPU — note this is just the
    tensor, not the (tensor, model_name, dim) tuple the old docstring claimed.
    The vector is NOT L2-normalized here; callers normalize it themselves.
    """

    model_name = "gemini-embedding-001"
    model = TextEmbeddingModel.from_pretrained(model_name)
    text_input = TextEmbeddingInput(query, "RETRIEVAL_DOCUMENT")

    resp = model.get_embeddings([text_input], output_dimensionality=dimensionality)

    # vertexai returns a list of TextEmbedding objects with .values
    vec = torch.tensor([resp[0].values], dtype=torch.float32)  # (1, D)

    return vec
76
+
77
+
78
def load_cached_embeddings(path: str) -> Tuple[List[str], List[str], torch.Tensor]:
    """Load cached embeddings from a JSONL file.

    Each non-blank line must be a JSON object with keys ``id``, ``text``
    (optional, defaults to ""), and ``embedding``.

    Returns:
        (ids, texts, embeddings) where embeddings is an (N, D) L2-normalized
        tensor, or an empty tensor when the cache is missing or empty. A
        missing cache file is created empty as a side effect.
    """
    ids: list = []
    texts: list = []
    raw_vectors: list = []

    if not os.path.exists(path):
        logger.warning(f"Cache file not found: {path}, creating an empty cache.")
        open(path, "w").close()  # create an empty file
        return ids, texts, torch.empty(0)

    with open(path, "r") as fh:
        for raw_line in fh:
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            ids.append(record["id"])
            texts.append(record.get("text", ""))
            raw_vectors.append(record["embedding"])

    if not raw_vectors:
        return ids, texts, torch.empty(0)

    # Normalize rows to unit L2 norm so dot products become cosine similarity.
    embeddings = F.normalize(torch.tensor(raw_vectors, dtype=torch.float32), p=2, dim=1)
    return ids, texts, embeddings
106
+
107
+
108
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings over valid (non-padding) positions.

    Padding positions (mask == 0) are zeroed before summing, and the sum is
    divided by the per-sequence count of valid tokens.
    """
    valid = attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(~valid, 0.0).sum(dim=1)
    counts = attention_mask.sum(dim=1)[..., None]
    return summed / counts
111
+
112
+
113
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Prefix *query* with its task instruction in the two-line prompt format."""
    return "\n".join((f"Instruct: {task_description}", f"Query: {query}"))
115
+
116
+
117
def formalize(queries):
    """Return (items, ids): a shallow copy of *queries* and 0-based indices."""
    items = list(queries)
    ids = list(range(len(items)))
    return items, ids
124
+
125
+
126
def select_memory(
    n: int,
    reasoning_bank: List[Dict],
    cur_query: str,
    task_id: str = None,
    cache_path: str = "./memories/embeddings.jsonl",
    prefer_model: str = "gemini",
) -> List[Dict]:
    """Return up to *n* reasoning-bank entries most similar to *cur_query*.

    Similarity comes from ``screening``, which uses ONLY the cached
    embeddings; nothing is recomputed here. Cached ids are matched back to
    *reasoning_bank* entries by their "task_id" field (first match wins).

    BUG FIX: the empty-result path used to return ``{}`` while the normal
    path returned a list; both now return a list, matching the (corrected)
    return annotation.
    """
    if n > 10:
        logger.error("the number of return experiences shouldn't be greater than 10")

    _id2score, ordered_ids = screening(
        cur_query=cur_query, task_id=task_id, cache_path=cache_path, prefer_model=prefer_model
    )

    if not ordered_ids:
        return []

    selected: List[Dict] = []
    for sid in ordered_ids[:n]:
        # First reasoning-bank entry whose task_id matches the cached id.
        entry = next((item for item in reasoning_bank if item["task_id"] == sid), None)
        if entry is not None:
            selected.append(entry)
    return selected
160
+
161
+
162
def screening(
    cur_query: str,
    cache_path: str,
    task_id: str = None,
    prefer_model: str = "",
) -> Tuple[List[Tuple[str, float]], List[str]]:
    """Score *cur_query* against every cached embedding.

    Side effect: the query's own embedding is appended to the cache file so
    FUTURE calls can retrieve it. Similarity is computed against the cache as
    loaded BEFORE the append, so the query never matches itself in this call.

    Returns:
        (id2score, ordered_ids): (id, score) pairs sorted by descending score,
        and the ids (stringified) in the same order. Both empty if the cache
        held no embeddings.
    """
    cache_ids, cache_texts, cache_emb = load_cached_embeddings(cache_path)

    # choose embedding method to match the cache
    use_qwen = "Qwen" in prefer_model

    if use_qwen:
        q_vec = embed_query_with_qwen(cur_query)
    else:
        q_vec = embed_query_with_gemini(cur_query, dimensionality=3072)

    # write current query embeddings to cache
    record = {
        "id": task_id,
        "text": cur_query,
        "embedding": q_vec.squeeze(0).tolist(),
    }
    with open(cache_path, "a") as f:
        f.write(json.dumps(record) + "\n")
    # NOTE(review): the log message hard-codes a "webarena." prefix — confirm
    # this module is really webarena-specific.
    logger.info(f"Appended new query embedding to cache: webarena.{task_id}")

    if len(cache_emb) == 0:
        logger.warning(f"No cached embeddings found in {cache_path}.")
        return [], []

    # add instruction-aware embedding for calculation
    task = "Given the prior software engineering queries, your task is to analyze a current query's intent and select relevant prior queries that could help resolve it."

    # NOTE(review): the instruction-aware vector is ALWAYS produced with Gemini,
    # even when prefer_model selected Qwen above — confirm the cache and this
    # vector live in the same embedding space.
    instruction_query = get_detailed_instruct(task, cur_query)
    instruct_vec = embed_query_with_gemini(instruction_query, dimensionality=3072)
    instruct_vec = l2_normalize(instruct_vec, dim=1)

    # Cached rows are L2-normalized on load, so this is cosine similarity x 100.
    scores = (instruct_vec @ cache_emb.T).squeeze(0) * 100.0  # (N,)
    id2score = list(zip(cache_ids, scores.tolist()))
    id2score.sort(key=lambda x: x[1], reverse=True)

    return id2score, [str(i) for i, _ in id2score]
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ altair
2
+ pandas
3
+ streamlit
scievo/__init__.py ADDED
File without changes
scievo/agents/__init__.py ADDED
File without changes
scievo/agents/critic_agent/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .build import build
2
+ from .state import CriticAgentState
scievo/agents/critic_agent/build.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, START, StateGraph
2
+ from loguru import logger
3
+
4
+ from . import execute
5
+ from .state import CriticAgentState
6
+
7
+
8
@logger.catch
def build():
    """Assemble the critic agent's LangGraph.

    Topology: START -> create_first_user_msg -> gateway, where the gateway
    loops between llm_chat and tool_calling until the conversation ends,
    then routes to summary -> END.
    """
    graph = StateGraph(CriticAgentState)

    # register nodes
    for node_name, node_fn in (
        ("create_first_user_msg", execute.create_first_user_msg_node),
        ("gateway", execute.gateway_node),
        ("llm_chat", execute.llm_chat_node),
        ("tool_calling", execute.tool_calling_node),
        ("summary", execute.summary_node),
    ):
        graph.add_node(node_name, node_fn)

    # linear entry into the gateway loop
    graph.add_edge(START, "create_first_user_msg")
    graph.add_edge("create_first_user_msg", "gateway")

    # gateway fans out based on the last message's role / tool calls
    graph.add_conditional_edges(
        "gateway",
        execute.gateway_conditional,
        ["llm_chat", "tool_calling", "summary"],
    )

    # chat and tool nodes loop back through the gateway
    graph.add_edge("llm_chat", "gateway")
    graph.add_edge("tool_calling", "gateway")

    # summary terminates the graph
    graph.add_edge("summary", END)

    return graph
scievo/agents/critic_agent/execute.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent for criticizing and giving feedback on the agent's actions
3
+ """
4
+
5
+ from typing import TYPE_CHECKING, TypeVar
6
+
7
+ from loguru import logger
8
+
9
+ from scievo.core import constant
10
+ from scievo.core.llms import ModelRegistry
11
+ from scievo.core.types import Message
12
+ from scievo.core.utils import wrap_dict_to_toon, wrap_text_with_block
13
+ from scievo.prompts import PROMPTS
14
+ from scievo.rbank.subgraph import mem_retrieval
15
+ from scievo.tools import Tool, ToolRegistry
16
+
17
+ from .state import CriticAgentState
18
+
19
+ if TYPE_CHECKING:
20
+ from scievo.core.types import HistoryState, RBankState
21
+ from scievo.rbank.memo import Memo
22
+
23
+ MemHistoryMixin = TypeVar("MemHistoryMixin", HistoryState, RBankState)
24
+
25
+ LLM_NAME = "critic"
26
+ AGENT_NAME = "critic"
27
+
28
+ BUILTIN_TOOLSETS = [
29
+ # "todo",
30
+ "state",
31
+ "history",
32
+ "web",
33
+ ]
34
+ ALLOWED_TOOLSETS = ["fs", "web"]
35
+
36
+
37
def create_first_user_msg_node(agent_state: CriticAgentState) -> CriticAgentState:
    """Render the critic's opening user message from the caller's plan and the
    trajectory under review, and append it to the history."""
    logger.debug("create_first_user_msg_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("create_first_user_msg")

    # Flatten every input message into one delimited trajectory string.
    framed_msgs = [
        f"--- Message {i} Begin ---\n{msg.to_plain_text()}\n--- Message {i} End ---"
        for i, msg in enumerate(agent_state.input_msgs)
    ]
    trajectory_text: str = "\n".join(framed_msgs)

    # Fill the critic's user_prompt template with plan + trajectory.
    user_prompt_content = PROMPTS.critic.user_prompt.render(
        plan_text=agent_state.plan,
        trajectory_text=trajectory_text,
        is_data_agent=agent_state.is_data_agent,
        is_exp_agent=agent_state.is_exp_agent,
    )

    # Seed the conversation with it as the first user message.
    agent_state.add_message(
        Message(role="user", content=user_prompt_content, agent_sender=AGENT_NAME)
    )
    return agent_state
62
+
63
+
64
def gateway_node(agent_state: CriticAgentState) -> CriticAgentState:
    """No-op hub node; the actual routing decision is made by gateway_conditional."""
    # NOTE: Same as data agent
    logger.trace("gateway_node of Agent {}", AGENT_NAME)
    return agent_state
68
+
69
+
70
def gateway_conditional(agent_state: CriticAgentState) -> str:
    """Route from the gateway based on the most recent history message:
    pending tool calls win; user/tool turns go to the LLM; an assistant turn
    without tool calls ends the critique round via the summary node."""
    # NOTE: Same as data agent
    last_msg = agent_state.patched_history[-1]

    pending_calls = last_msg.tool_calls
    if pending_calls and len(pending_calls) > 0:
        return "tool_calling"

    role = last_msg.role
    if role in ("user", "tool"):
        return "llm_chat"
    if role == "assistant":
        # finish this round of critic, go to "summary" node
        return "summary"
    raise ValueError(f"Unknown message role: {last_msg.role}")
84
+
85
+
86
# Build and compile the memory-retrieval subgraph once at import time so every
# llm_chat invocation reuses the same compiled graph.
mem_retrieval_subgraph = mem_retrieval.build()
mem_retrieval_subgraph_compiled = mem_retrieval_subgraph.compile()
88
+
89
+
90
def llm_chat_node(agent_state: CriticAgentState) -> CriticAgentState:
    """One LLM turn for the critic: optionally retrieve memories, render the
    system prompt, assemble the available tools, and append the model's reply
    to the history."""
    logger.debug("llm_chat_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("llm_chat")

    # Snapshot of state exposed to the system prompt.
    selected_state = {
        "current_activated_toolsets": agent_state.toolsets,
    }

    # retrieve memos (best-effort: any failure falls back to no memory text)
    if constant.REASONING_BANK_ENABLED:
        try:
            # Always search the session's short-term store; long-term and
            # project stores only if configured.
            mem_dirs = [agent_state.sess_dir / "short_term"]
            if d := agent_state.long_term_mem_dir:
                mem_dirs.append(d)
            if d := agent_state.project_mem_dir:
                mem_dirs.append(d)
            res = mem_retrieval_subgraph_compiled.invoke(
                mem_retrieval.MemRetrievalState(
                    input_msgs=agent_state.patched_history,
                    mem_dirs=mem_dirs,
                    max_num_memos=constant.MEM_RETRIEVAL_MAX_NUM_MEMOS,
                )
            )
            memos: list[Memo] = res.get("output_memos", [])
            # Imported here (not at module top) to avoid a circular import with
            # the data agent's execute module.
            from scievo.agents.data_agent.execute import _memos_to_markdown

            memory_text = _memos_to_markdown(memos)
        except Exception:
            logger.exception("mem_retrieval_error")
            memory_text = None
    else:
        memory_text = None

    # update system prompt
    # NOTE(review): memory_text may be None here — presumably
    # wrap_text_with_block handles None; confirm.
    system_prompt = PROMPTS.critic.system_prompt.render(
        state_text=wrap_dict_to_toon(selected_state),
        toolsets_desc=ToolRegistry.get_toolsets_desc(BUILTIN_TOOLSETS + ALLOWED_TOOLSETS),
        memory_text=wrap_text_with_block(memory_text, "markdown"),
        is_data_agent=agent_state.is_data_agent,
        is_exp_agent=agent_state.is_exp_agent,
    )

    # construct tools: caller-activated toolsets plus the builtin ones
    tools: dict[str, Tool] = {}
    for toolset in agent_state.toolsets:
        tools.update(ToolRegistry.get_toolset(toolset))
    for toolset in BUILTIN_TOOLSETS:
        tools.update(ToolRegistry.get_toolset(toolset))

    # Run the completion; only tool NAMES are passed, the registry resolves them.
    msg = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=(
            Message(role="system", content=system_prompt)
            .with_log(cond=constant.LOG_SYSTEM_PROMPT)
            .content
        ),
        agent_sender=AGENT_NAME,
        tools=[tool.name for tool in tools.values()],
    ).with_log()
    agent_state.add_message(msg)

    return agent_state
153
+
154
+
155
def tool_calling_node(agent_state: CriticAgentState) -> CriticAgentState:
    """Execute tool calls from the last message and update the graph state.

    Each tool call gets exactly one tool-role reply message: the stringified
    result on success, or a descriptive error on unknown tool / bad arguments /
    execution failure. Errors never abort the remaining calls.

    Raises:
        ValueError: if the last history message carries no tool calls.
    """
    # BUG FIX: `json` is used below but was never imported anywhere in this
    # module; import it locally (matching the module's existing local-import
    # style for `inspect`). Also hoisted `import inspect` out of the per-call
    # loop — it is loop-invariant.
    import inspect
    import json

    logger.debug("tool_calling_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("tool_calling")
    # Get the last message which contains tool calls
    last_msg = agent_state.patched_history[-1]

    if not last_msg.tool_calls:
        raise ValueError("No tool calls found in the last message")

    # construct tools: caller-activated toolsets plus the builtin ones
    tools: dict[str, Tool] = {}
    for toolset in agent_state.toolsets:
        tools.update(ToolRegistry.get_toolset(toolset))
    for toolset in BUILTIN_TOOLSETS:
        tools.update(ToolRegistry.get_toolset(toolset))

    function_map = {tool.name: tool.func for tool in tools.values()}

    def _reply(tool_call, tool_name: str, content: str) -> None:
        # Record a tool-role message answering *tool_call* (result or error).
        agent_state.add_message(
            Message(
                role="tool",
                tool_call_id=tool_call.id,
                tool_name=tool_name,
                content=content,
            ).with_log()
        )

    # Execute each tool call
    for tool_call in last_msg.tool_calls:
        tool_name = tool_call.function.name

        # Check if tool exists in function map
        if tool_name not in function_map:
            _reply(tool_call, tool_name, f"Tool {tool_name} not found")
            continue

        # Parse tool arguments (must be a JSON object)
        try:
            args = json.loads(tool_call.function.arguments)
            assert isinstance(args, dict)
        except json.JSONDecodeError as e:
            _reply(tool_call, tool_name, f"Invalid JSON in tool arguments: {e}")
            continue
        except AssertionError as e:
            _reply(tool_call, tool_name, f"Invalid tool arguments: {e}")
            continue

        # Execute the tool
        try:
            func = function_map[tool_name]

            # Inject the agent state / call context when the tool asks for them.
            sig = inspect.signature(func)
            if constant.__AGENT_STATE_NAME__ in sig.parameters:
                args.update({constant.__AGENT_STATE_NAME__: agent_state})
            if constant.__CTX_NAME__ in sig.parameters:
                args.update({constant.__CTX_NAME__: {"current_agent": AGENT_NAME}})

            # Execute the tool in the agent's local environment
            with agent_state.local_env:
                result = func(**args)

            _reply(tool_call, tool_name, str(result))  # ensure result is a string

        except Exception as e:
            _reply(tool_call, tool_name, f"Tool {tool_name} execution failed: {e}")

    return agent_state
253
+
254
+
255
def summary_node(agent_state: CriticAgentState) -> CriticAgentState:
    """Request the final critique summary from the LLM and expose it on the
    state as ``critic_msg``."""
    logger.debug("summary_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("summary")

    # System prompt without toolset descriptions: summarization is chat-only.
    system_prompt = PROMPTS.critic.system_prompt.render(
        toolsets_desc={},
        is_data_agent=agent_state.is_data_agent,
        is_exp_agent=agent_state.is_exp_agent,
    )

    # Ask for the summary as a user-turn message.
    summary_request = PROMPTS.critic.user_prompt_summary.render(
        is_data_agent=agent_state.is_data_agent,
        is_exp_agent=agent_state.is_exp_agent,
    )
    agent_state.add_message(
        Message(role="user", content=summary_request, agent_sender=AGENT_NAME)
    )

    # Obtain the model's summary reply and record it in the history.
    summary_msg = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=system_prompt,
        agent_sender=AGENT_NAME,
    ).with_log()
    agent_state.add_message(summary_msg)

    # Expose the critique to the caller as this node's output.
    agent_state.critic_msg = summary_msg

    return agent_state
scievo/agents/critic_agent/state.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import model_validator
2
+
3
+ from scievo.core.types import HistoryState, Message, RBankState, ToolsetState
4
+
5
+
6
class CriticAgentState(HistoryState, ToolsetState, RBankState):
    """Graph state for the critic agent.

    Exactly one of ``is_data_agent`` / ``is_exp_agent`` must be True so the
    critic knows which agent produced the trajectory under review; this is
    enforced by the validator below.
    """

    # messages to be criticized (input)
    input_msgs: list[Message]
    # current plan of the caller (input)
    plan: str | None = None
    # whether the input messages are from data agent (input)
    is_data_agent: bool = False
    # whether the input messages are from experiment agent (input)
    is_exp_agent: bool = False
    # critics (output)
    critic_msg: Message | None = None

    @model_validator(mode="after")
    def check_agent_source(self):
        """Reject states where the source flags are both set or both unset."""
        if self.is_data_agent and self.is_exp_agent:
            raise ValueError("CriticAgentState: both is_data_agent and is_exp_agent are True")
        if not self.is_data_agent and not self.is_exp_agent:
            raise ValueError("CriticAgentState: both is_data_agent and is_exp_agent are False")
        return self
scievo/agents/data_agent/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .build import build
2
+ from .state import DataAgentState
scievo/agents/data_agent/build.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, START, StateGraph
2
+ from loguru import logger
3
+
4
+ from scievo.core import constant
5
+ from scievo.core.types import Message
6
+ from scievo.rbank.subgraph import mem_consolidation
7
+
8
+ from . import execute, plan
9
+ from .paper_subagent import build as paper_subagent_build
10
+ from .paper_subagent.state import PaperSearchAgentState
11
+ from .state import DataAgentState
12
+
13
# Build and compile the shared subgraphs once at import time so every node
# invocation reuses the same compiled graphs.
mem_consolidation_subgraph = mem_consolidation.build()
mem_consolidation_subgraph_compiled = mem_consolidation_subgraph.compile()

paper_subagent_graph = paper_subagent_build()
paper_subagent_graph_compiled = paper_subagent_graph.compile()
18
+
19
+
20
def finialize_node(agent_state: DataAgentState) -> DataAgentState:
    """A finalization node to do any final processing before ending the graph.

    (The misspelled "finialize" name is kept: build() registers this function
    by reference and downstream logs use it.)
    """
    completed = len(agent_state.past_plans)
    remaining = len(agent_state.remaining_plans)
    agent_state.intermediate_state.append(
        {
            "node_name": "finalize",
            "output": f"Finalization complete. Plans completed: {completed}, Remaining: {remaining}",
        }
    )
    return agent_state
29
+
30
+
31
def run_paper_subagent(agent_state: DataAgentState) -> DataAgentState:
    """Run paper subagent to search for relevant papers, datasets, and metrics.

    Results (papers/datasets/metrics/summary) are copied back onto the data
    agent's state; any failure is recorded in the transcript rather than
    raised, so the parent graph keeps running.
    """
    logger.debug("run_paper_subagent of Agent data")

    # Seed the subagent with the user query and current data description.
    paper_state = PaperSearchAgentState(
        user_query=agent_state.user_query,
        data_summary=agent_state.data_desc,
    )

    try:
        # invoke() returns a plain dict; re-validate it into the typed state.
        result_state = paper_subagent_graph_compiled.invoke(paper_state)
        result_state = PaperSearchAgentState(**result_state)

        # Copy the findings back onto the parent agent's state.
        agent_state.papers = result_state.papers
        agent_state.datasets = result_state.datasets
        agent_state.metrics = result_state.metrics
        agent_state.paper_search_summary = result_state.output_summary

        agent_state.intermediate_state.append(
            {
                "node_name": "paper_subagent",
                "output": f"Paper subagent completed. Found {len(result_state.papers)} papers, {len(result_state.datasets)} datasets, {len(result_state.metrics)} metrics.\n\nSummary: {result_state.output_summary or 'No summary'}",
            }
        )

        # Surface the summary in the conversation history as well.
        if result_state.output_summary:
            agent_state.add_message(
                Message(
                    role="assistant",
                    content=f"[Paper Search Results]\n{result_state.output_summary}",
                    agent="paper_subagent",
                ).with_log()
            )
    except Exception as e:
        # Best-effort: log the failure into transcript + intermediate state.
        logger.exception("paper_subagent_error")
        error_msg = f"Paper subagent error: {e}"
        agent_state.add_message(
            Message(
                role="assistant",
                content=error_msg,
                agent="paper_subagent",
            ).with_log()
        )
        agent_state.intermediate_state.append(
            {
                "node_name": "paper_subagent",
                "output": error_msg,
            }
        )

    return agent_state
82
+
83
+
84
def prepare_for_talk_mode(agent_state: DataAgentState) -> DataAgentState:
    """Transition the agent into talk mode: collapse the plan to a single
    'respond to the user' step and consolidate short-term memories into the
    long-term/project banks (when the reasoning bank is enabled)."""
    assert agent_state.talk_mode
    agent_state.remaining_plans = ["Response to users' query."]

    mem_output = "Memory consolidation skipped"
    # consolidate mems
    if constant.REASONING_BANK_ENABLED:
        try:
            mem_consolidation_subgraph_compiled.invoke(
                mem_consolidation.MemConsolidationState(
                    mem_dir=agent_state.sess_dir / "short_term",
                    long_term_mem_dir=agent_state.long_term_mem_dir,
                    project_mem_dir=agent_state.project_mem_dir,
                )
            )
            mem_output = "Memory consolidation completed"
        except Exception as e:
            # Consolidation failures are logged into the transcript but do not
            # abort the transition to talk mode.
            error_msg = f"mem_consolidation_error: {e}"
            agent_state.add_message(
                Message(
                    role="assistant",
                    content=error_msg,
                    agent="noname",
                ).with_log()
            )
            mem_output = error_msg

    # Record the outcome for debugging/inspection.
    agent_state.intermediate_state.append(
        {
            "node_name": "prepare_for_talk_mode",
            "output": mem_output,
        }
    )

    return agent_state
119
+
120
+
121
@logger.catch
def build():
    """Assemble the data agent's LangGraph.

    Topology: START -> paper_subagent -> planner -> gateway loop (llm_chat /
    tool_calling / mem_extraction / history_compression), then
    critic_before_replan -> replanner, which either re-enters the gateway or
    proceeds finalize -> generate_summary -> prepare_for_talk_mode -> END.
    """
    graph = StateGraph(DataAgentState)

    # register nodes
    for node_name, node_fn in (
        ("paper_subagent", run_paper_subagent),
        ("planner", plan.planner_node),
        ("replanner", plan.replanner_node),
        ("gateway", execute.gateway_node),
        ("llm_chat", execute.llm_chat_node),
        ("tool_calling", execute.tool_calling_node),
        ("mem_extraction", execute.mem_extraction_node),
        ("history_compression", execute.history_compression_node),
        # ("critic", execute.critic_node),  # not used for now
        ("critic_before_replan", execute.critic_node),
        ("finalize", finialize_node),
        ("generate_summary", execute.generate_summary_node),
        ("prepare_for_talk_mode", prepare_for_talk_mode),
    ):
        graph.add_node(node_name, node_fn)

    # entry: paper search feeds the planner, which hands off to the gateway
    graph.add_edge(START, "paper_subagent")
    graph.add_edge("paper_subagent", "planner")
    graph.add_edge("planner", "gateway")

    # gateway dispatches based on the last message
    graph.add_conditional_edges(
        "gateway",
        execute.gateway_conditional,
        [
            "llm_chat",
            "tool_calling",
            "mem_extraction",
            "history_compression",
            "critic_before_replan",  # plan END
        ],
    )

    # execution nodes loop back into the gateway
    for looping_node in ("llm_chat", "tool_calling", "mem_extraction", "history_compression"):
        graph.add_edge(looping_node, "gateway")

    # critic feedback drives replanning; replanner either resumes or finalizes
    graph.add_edge("critic_before_replan", "replanner")
    graph.add_conditional_edges(
        "replanner",
        plan.should_replan,
        ["gateway", "finalize"],
    )

    # tail: finalize -> summary -> talk-mode preparation -> END
    graph.add_edge("finalize", "generate_summary")
    graph.add_edge("generate_summary", "prepare_for_talk_mode")
    graph.add_edge("prepare_for_talk_mode", END)
    return graph
scievo/agents/data_agent/execute.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent for data understanding and processing
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, TypeVar
8
+
9
+ from loguru import logger
10
+
11
+ from scievo import history_compression
12
+ from scievo.agents import critic_agent
13
+ from scievo.core import constant
14
+ from scievo.core.errors import sprint_chained_exception
15
+ from scievo.core.llms import ModelRegistry
16
+ from scievo.core.types import HistoryState, Message, RBankState
17
+ from scievo.core.utils import wrap_dict_to_toon, wrap_text_with_block
18
+ from scievo.prompts import PROMPTS
19
+ from scievo.rbank.subgraph import mem_extraction, mem_retrieval
20
+ from scievo.tools import Tool, ToolRegistry
21
+
22
+ from .state import DataAgentState
23
+
24
+ if TYPE_CHECKING:
25
+ from scievo.rbank.memo import Memo
26
+
27
+ MemHistoryMixin = TypeVar("MemHistoryMixin", HistoryState, RBankState)
28
+
29
+ LLM_NAME = "data"
30
+ AGENT_NAME = "data"
31
+
32
+ BUILTIN_TOOLSETS = [
33
+ # "todo",
34
+ "state",
35
+ "history",
36
+ "fs",
37
+ ]
38
+ ALLOWED_TOOLSETS = ["web"]
39
+
40
+
41
def gateway_node(agent_state: DataAgentState) -> DataAgentState:
    """No-op pass-through node.

    The routing decision lives in `gateway_conditional`; LangGraph only needs
    a concrete node here to attach the conditional edges to.
    """
    logger.trace("gateway_node of Agent {}", AGENT_NAME)
    return agent_state
46
+
47
+
48
def gateway_conditional(agent_state: DataAgentState) -> str:
    """Pick the next worker node for the gateway.

    Priority: history compression (context too large), then periodic memory
    extraction, then chat/tool-call dispatch based on the last message.
    """
    # Compress once the patched context exceeds the token threshold, but never
    # twice in a row (checked against the last two visited nodes).
    needs_compression = (
        constant.HISTORY_AUTO_COMPRESSION
        and "history_compression" not in agent_state.node_history[-2:]
        and agent_state.total_patched_tokens > constant.HISTORY_AUTO_COMPRESSION_TOKEN_THRESHOLD
    )
    if needs_compression:
        return "history_compression"

    # Periodically extract memories, skipping back-to-back extractions.
    needs_mem_extraction = (
        constant.REASONING_BANK_ENABLED
        and len(agent_state.node_history) > 0
        and agent_state.node_history[-1] != "mem_extraction"
        and agent_state.round > 0
        and agent_state.round % constant.MEM_EXTRACTION_ROUND_FREQ == 0
    )
    if needs_mem_extraction:
        return "mem_extraction"

    if not agent_state.patched_history:
        logger.warning("patched_history is empty, returning llm_chat")
        return "llm_chat"

    tail = agent_state.patched_history[-1]
    # Pending tool calls take precedence over role-based routing.
    if tail.tool_calls:
        return "tool_calling"

    if tail.role in ("user", "tool"):
        return "llm_chat"
    if tail.role == "assistant":
        return "critic_before_replan"
    raise ValueError(f"Unknown message role: {tail.role}")
81
+
82
+
83
+ mem_retrieval_subgraph = mem_retrieval.build()
84
+ mem_retrieval_subgraph_compiled = mem_retrieval_subgraph.compile()
85
+
86
+
87
def _memos_to_markdown(memos: list["Memo"]) -> str:
    """Render retrieved memos as one markdown document, a numbered section each."""
    if not memos:
        return "No memory retrieved."
    return "".join(
        f"# Memo {idx + 1}\n\n{memo.to_markdown()}\n\n" for idx, memo in enumerate(memos)
    )
94
+
95
+
96
def llm_chat_node(agent_state: DataAgentState) -> DataAgentState:
    """Run one LLM turn: retrieve memos, render the system prompt, call the
    model with the merged toolsets, and append the reply to history."""
    logger.debug("llm_chat_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("llm_chat")

    selected_state = {
        "workspace": agent_state.workspace.working_dir,
        "current_activated_toolsets": agent_state.toolsets,
    }

    # Retrieve memos from the short-term bank plus any configured
    # long-term/project banks; failures degrade to "no memory".
    memory_text = None
    if constant.REASONING_BANK_ENABLED:
        try:
            mem_dirs = [agent_state.sess_dir / "short_term"]
            if d := agent_state.long_term_mem_dir:
                mem_dirs.append(d)
            if d := agent_state.project_mem_dir:
                mem_dirs.append(d)
            retrieval = mem_retrieval_subgraph_compiled.invoke(
                mem_retrieval.MemRetrievalState(
                    input_msgs=agent_state.patched_history,
                    mem_dirs=mem_dirs,
                    max_num_memos=constant.MEM_RETRIEVAL_MAX_NUM_MEMOS,
                )
            )
            memos: list[Memo] = retrieval.get("output_memos", [])
            memory_text = _memos_to_markdown(memos)
        except Exception:
            logger.exception("mem_retrieval_error")

    # Render the system prompt with current state, toolset docs, memory and plan.
    system_prompt = PROMPTS.data.system_prompt.render(
        state_text=wrap_dict_to_toon(selected_state),
        toolsets_desc=ToolRegistry.get_toolsets_desc(BUILTIN_TOOLSETS + ALLOWED_TOOLSETS),
        memory_text=wrap_text_with_block(memory_text, "markdown"),
        current_plan=(
            agent_state.remaining_plans[0] if len(agent_state.remaining_plans) > 0 else None
        ),
    )

    # Merge dynamic and builtin toolsets into a single tool table.
    tools: dict[str, Tool] = {}
    for toolset in agent_state.toolsets:
        tools.update(ToolRegistry.get_toolset(toolset))
    for toolset in BUILTIN_TOOLSETS:
        tools.update(ToolRegistry.get_toolset(toolset))

    # Anthropic requires at least one non-system message in the conversation.
    history = agent_state.patched_history
    if len(history) == 0 or all(msg.role == "system" for msg in history):
        logger.warning(
            "patched_history is empty or only contains system messages, adding dummy user message"
        )
        history = [
            Message(
                role="user",
                content="Please continue with the task.",
                agent_sender=AGENT_NAME,
            )
        ]

    msg = ModelRegistry.completion(
        LLM_NAME,
        history,
        system_prompt=(
            Message(role="system", content=system_prompt)
            .with_log(cond=constant.LOG_SYSTEM_PROMPT)
            .content
        ),
        agent_sender=AGENT_NAME,
        tools=[tool.name for tool in tools.values()],
    ).with_log()
    agent_state.add_message(msg)

    # Summarize this turn for intermediate_state bookkeeping.
    if msg.content:
        llm_output = msg.content
    elif msg.tool_calls:
        llm_output = "[Tool calls: " + str(len(msg.tool_calls)) + "]"
    else:
        llm_output = "[No output]"

    agent_state.intermediate_state.append(
        {
            "node_name": "llm_chat",
            "output": llm_output,
        }
    )

    return agent_state
187
+
188
+
189
def tool_calling_node(agent_state: DataAgentState) -> DataAgentState:
    """Execute every tool call attached to the last message and update state.

    For each call: resolve the tool, parse its JSON-object arguments, inject
    the agent-state / context kwargs when the tool's signature accepts them,
    run it inside the agent's workspace, and append a `tool`-role message with
    the result (or a descriptive error). A per-node summary is recorded in
    `intermediate_state`.

    Raises:
        ValueError: if the last patched-history message carries no tool calls
            (the gateway should never route here in that case).
    """
    import inspect  # hoisted: previously re-imported inside the per-tool loop

    logger.debug("tool_calling_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("tool_calling")

    # Get the last message which contains tool calls
    last_msg = agent_state.patched_history[-1]
    if not last_msg.tool_calls:
        raise ValueError("No tool calls found in the last message")

    # Merge dynamic and builtin toolsets into one name -> callable map.
    tools: dict[str, Tool] = {}
    for toolset in agent_state.toolsets:
        tools.update(ToolRegistry.get_toolset(toolset))
    for toolset in BUILTIN_TOOLSETS:
        tools.update(ToolRegistry.get_toolset(toolset))
    function_map = {tool.name: tool.func for tool in tools.values()}

    tool_results = []

    def _record_error(tool_name: str, tool_call_id: str, error_msg: str) -> None:
        # Shared failure path (was triplicated): log a tool-role message and
        # keep the error text for the node summary.
        agent_state.add_message(
            Message(
                role="tool",
                tool_name=tool_name,
                tool_call_id=tool_call_id,
                content=error_msg,
            ).with_log()
        )
        tool_results.append({"tool": tool_name, "result": error_msg})

    # Execute each tool call
    for tool_call in last_msg.tool_calls:
        tool_name = tool_call.function.name

        if tool_name not in function_map:
            _record_error(tool_name, tool_call.id, f"Tool {tool_name} not found")
            continue

        # Parse tool arguments: must decode to a JSON object (dict).
        try:
            args = json.loads(tool_call.function.arguments)
            assert isinstance(args, dict)
        except json.JSONDecodeError as e:
            _record_error(tool_name, tool_call.id, f"Invalid JSON in tool arguments: {e}")
            continue
        except AssertionError as e:
            _record_error(tool_name, tool_call.id, f"Invalid tool arguments: {e}")
            continue

        try:
            func = function_map[tool_name]

            # Inject framework kwargs only if the tool's signature asks for them.
            sig = inspect.signature(func)
            if constant.__AGENT_STATE_NAME__ in sig.parameters:
                args.update({constant.__AGENT_STATE_NAME__: agent_state})
            if constant.__CTX_NAME__ in sig.parameters:
                args.update({constant.__CTX_NAME__: {"current_agent": AGENT_NAME}})

            # Execute the tool in the agent's local environment
            with agent_state.workspace:
                result = func(**args)

            agent_state.add_message(
                Message(
                    role="tool",
                    tool_call_id=tool_call.id,
                    tool_name=tool_name,
                    content=str(result),  # Ensure result is string
                ).with_log()
            )
            # `is not None` (not truthiness) so falsy-but-real results such as
            # "" or 0 are still reported instead of "No result".
            tool_results.append(
                {
                    "tool": tool_name,
                    "result": str(result)[:1000] if result is not None else "No result",
                }
            )

        except Exception as e:
            _record_error(tool_name, tool_call.id, f"Tool {tool_name} execution failed: {e}")

    tool_output_parts = [f"Tool: {tr['tool']}\nResult: {tr['result']}" for tr in tool_results]
    tool_output = "\n\n".join(tool_output_parts) if tool_output_parts else "No tool calls executed"

    agent_state.intermediate_state.append(
        {
            "node_name": "tool_calling",
            "output": tool_output,
        }
    )

    return agent_state
311
+
312
+
313
+ mem_extraction_subgraph = mem_extraction.build()
314
+ mem_extraction_subgraph_compiled = mem_extraction_subgraph.compile()
315
+
316
+
317
def mem_extraction_node(agent_state: MemHistoryMixin) -> MemHistoryMixin:
    """Extract reasoning-bank memos from the recent message window.

    Works for any state carrying history + reasoning-bank fields; the extra
    `intermediate_state` bookkeeping only happens for `DataAgentState`.
    Extraction failures are recorded as an assistant message instead of
    propagating, so the main loop keeps running.
    """
    logger.debug("mem_extraction_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("mem_extraction")
    context_window = agent_state.patched_history[-constant.MEM_EXTRACTION_CONTEXT_WINDOW :]
    logger.info("Agent {} begins to Memory Extraction", AGENT_NAME)
    mem_output = "Memory extraction completed"
    try:
        result = mem_extraction_subgraph_compiled.invoke(
            mem_extraction.MemExtractionState(
                # plain string (was an f-string with no placeholders)
                mem_dir=Path(agent_state.sess_dir) / "short_term",
                input_msgs=context_window,
                input_agent_name=AGENT_NAME,
            )
        )
        if isinstance(result, dict) and "output_memos" in result:
            mem_output = f"Extracted {len(result.get('output_memos', []))} memory entries"
    except Exception as e:
        error_msg = f"mem_extraction_error: {sprint_chained_exception(e)}"
        agent_state.add_message(
            Message(
                role="assistant",
                content=error_msg,
                agent_sender=AGENT_NAME,
            ).with_log()
        )
        mem_output = error_msg

    if isinstance(agent_state, DataAgentState):
        agent_state.intermediate_state.append(
            {
                "node_name": "mem_extraction",
                "output": mem_output,
            }
        )

    return agent_state
353
+
354
+
355
def history_compression_node(agent_state: DataAgentState) -> DataAgentState:
    """Compress the message history and record a summary of the compression.

    Bug fix: this node now registers itself via `add_node_history`.
    `gateway_conditional` checks `node_history[-2:]` to avoid compressing
    twice in a row — that guard can only work if this node records its visit
    like every other node does.
    """
    logger.debug("history_compression_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("history_compression")

    history_before = len(agent_state.history)
    agent_state = history_compression.invoke_history_compression(agent_state)
    history_after = len(agent_state.history)

    # Default summary; refined below if the last patch exposes its content.
    compression_output = f"Compressed history: {history_before} -> {history_after} messages"
    if agent_state.history_patches:
        last_patch = agent_state.history_patches[-1]
        if last_patch.patched_message and last_patch.patched_message.content:
            compression_output = f"Compressed {last_patch.n_messages} messages into:\n{last_patch.patched_message.content[:500]}"

    agent_state.intermediate_state.append(
        {
            "node_name": "history_compression",
            "output": compression_output,
        }
    )

    return agent_state
376
+
377
+
378
def generate_summary_node(agent_state: DataAgentState) -> DataAgentState:
    """Ask the LLM for a final analysis summary and store it on the state.

    On failure an assistant-role error message is appended instead of raising,
    and the error text becomes the node's recorded output. (Refactor: the
    summary/error text is tracked explicitly instead of via fragile
    `"name" in locals()` introspection.)
    """
    logger.debug("generate_summary_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("generate_summary")

    summary_output = "No summary generated"
    try:
        # Construct a summary request message
        summary_system_prompt = PROMPTS.data.summary_system_prompt
        summary_user_prompt = PROMPTS.data.summary_user_prompt

        agent_state.add_message(
            Message(
                role="user",
                content=summary_user_prompt.render(),
            ).with_log(cond=constant.LOG_SYSTEM_PROMPT)
        )

        # Call LLM to generate summary
        summary_msg = ModelRegistry.completion(
            LLM_NAME,
            agent_state.patched_history,
            system_prompt=summary_system_prompt.render(),
            agent_sender=AGENT_NAME,
        ).with_log()

        agent_state.add_message(summary_msg)

        # Extract summary content
        if summary_msg.role != "assistant" or not summary_msg.content:
            raise ValueError("Failed to get summary from LLM")

        # Store summary in state
        agent_state.output_summary = summary_msg.content
        summary_output = summary_msg.content
        logger.info("Analysis summary generated successfully")

    except Exception as e:
        error_msg = f"Failed to generate analysis summary: {sprint_chained_exception(e)}"
        agent_state.add_message(
            Message(
                role="assistant",
                content=error_msg,
                agent_sender=AGENT_NAME,
            ).with_log()
        )
        logger.error("generate_summary_node failed: {}", error_msg)
        summary_output = error_msg

    agent_state.intermediate_state.append(
        {
            "node_name": "generate_summary",
            "output": summary_output,
        }
    )

    return agent_state
438
+
439
+
440
+ critic_subgraph = critic_agent.build()
441
+ critic_subgraph_compiled = critic_subgraph.compile()
442
+
443
+
444
def critic_node(agent_state: DataAgentState) -> DataAgentState:
    """Run the critic subgraph over recent history and append its feedback.

    Bug fix: the guarded `current_plan` value ("N/A" when no plans remain) is
    now actually passed to the critic. Previously it was computed but unused
    and `remaining_plans[0]` was indexed directly, raising IndexError (masked
    as a critic_error message) whenever the plan list was empty.
    """
    logger.trace("critic_node of Agent {}", AGENT_NAME)

    if not constant.CRITIC_ENABLED:
        return agent_state

    try:
        current_plan = (
            agent_state.remaining_plans[0] if len(agent_state.remaining_plans) > 0 else "N/A"
        )
        res = critic_subgraph_compiled.invoke(
            critic_agent.CriticAgentState(
                input_msgs=agent_state.patched_history[-constant.CRITIC_CONTEXT_WINDOW :],
                plan=current_plan,
                is_data_agent=True,
                # RBankState
                sess_dir=agent_state.sess_dir,
                long_term_mem_dir=agent_state.long_term_mem_dir,
                project_mem_dir=agent_state.project_mem_dir,
            )
        )
        assert res.get("critic_msg", None) is not None, "critic_msg is None"
        critic_msg: Message = res.get("critic_msg")
        agent_state.add_message(critic_msg.with_log())
        critic_output = critic_msg.content if critic_msg.content else "No critic feedback"
    except Exception as e:
        error_msg = f"critic_error: {sprint_chained_exception(e)}"
        agent_state.add_message(
            Message(
                role="assistant",
                content=error_msg,
                agent_sender=AGENT_NAME,
            ).with_log()
        )
        critic_output = error_msg

    # critic_output is assigned on both paths, so no locals() check is needed.
    agent_state.intermediate_state.append(
        {
            "node_name": "critic",
            "output": critic_output,
        }
    )

    return agent_state
scievo/agents/data_agent/paper_subagent/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Paper Search Subagent
3
+
4
+ A minimal agent for searching academic papers using arxiv_tool.
5
+ """
6
+
7
+ from .build import build
8
+ from .state import PaperSearchAgentState
9
+
10
+ __all__ = ["build", "PaperSearchAgentState"]
scievo/agents/data_agent/paper_subagent/build.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, START, StateGraph
2
+ from loguru import logger
3
+
4
+ from . import execute
5
+ from .state import PaperSearchAgentState
6
+
7
+
8
@logger.catch
def build():
    """Build paper search agent graph with iterative query refinement.

    Flow:
        START -> optimize_query -> search -> check_results ->
        (if insufficient results) -> optimize_query -> search -> check_results -> ...
        (if sufficient results) -> dataset -> metric -> summary -> END
    """
    graph = StateGraph(PaperSearchAgentState)

    # Register every node of the pipeline.
    for node_name, node_fn in (
        ("optimize_query", execute.optimize_query_node),
        ("search", execute.search_node),
        ("check_results", execute.check_results_node),
        ("dataset", execute.dataset_node),
        ("metric", execute.metric_node),
        ("summary", execute.summary_node),
    ):
        graph.add_node(node_name, node_fn)

    # Search loop: optimize -> search -> check.
    graph.add_edge(START, "optimize_query")
    graph.add_edge("optimize_query", "search")
    graph.add_edge("search", "check_results")

    # Either iterate the search loop or move on with the current results.
    graph.add_conditional_edges(
        "check_results",
        execute.should_continue_search,
        {
            "continue_search": "optimize_query",  # Iterate: optimize query and search again
            "proceed": "dataset",  # Proceed with current results
        },
    )

    # Tail of the pipeline: datasets -> metrics -> summary.
    graph.add_edge("dataset", "metric")
    graph.add_edge("metric", "summary")
    graph.add_edge("summary", END)

    return graph
scievo/agents/data_agent/paper_subagent/execute.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Execution nodes for the Paper Search Agent
3
+
4
+ This module provides a minimal execution flow that searches for papers, datasets,
5
+ extracts metrics, and generates a summary.
6
+ Flow: START -> search_node -> dataset_node -> metric_node -> summary_node -> END
7
+ """
8
+
9
+ from loguru import logger
10
+
11
+ from scievo.core.llms import ModelRegistry
12
+ from scievo.core.types import Message
13
+ from scievo.core.utils import unwrap_dict_from_toon
14
+ from scievo.prompts.prompt_data import PROMPTS
15
+ from scievo.tools.arxiv_tool import search_papers
16
+ from scievo.tools.dataset_search_tool import search_datasets
17
+ from scievo.tools.metric_search_tool import extract_metrics_from_papers
18
+
19
+ from .state import PaperSearchAgentState
20
+
21
+ LLM_NAME = "paper_search"
22
+ AGENT_NAME = "paper_search"
23
+
24
+ # Minimum thresholds for considering search successful
25
+ MIN_PAPERS_THRESHOLD = 3
26
+ MIN_DATASETS_THRESHOLD = 2
27
+
28
+
29
def optimize_query_node(agent_state: PaperSearchAgentState) -> PaperSearchAgentState:
    """Optimize the search query using LLM to improve search results.

    Seeds `current_query` from the user's query on the first visit, then asks
    the LLM for a refined query informed by previous iterations. Any failure
    falls back to the current query so the search loop keeps going.

    Bug fix: the completion's content is guarded against None before
    `.strip()` — previously a None content raised AttributeError that was
    silently swallowed by the broad except below.
    """
    logger.debug("optimize_query_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("optimize_query")

    # Initialize current_query if not set
    if agent_state.current_query is None:
        agent_state.current_query = agent_state.user_query
        agent_state.query_history = [agent_state.user_query]

    # If we've already tried multiple queries, use the best one or stop
    if agent_state.search_iteration >= agent_state.max_search_iterations:
        logger.info("Reached max iterations, using current query")
        return agent_state

    # Build optimization prompt
    previous_results = ""
    if agent_state.search_iteration > 0:
        previous_results = f"""
Previous search results:
- Papers found: {len(agent_state.papers)}
- Datasets found: {len(agent_state.datasets)}
- Previous queries tried: {', '.join(agent_state.query_history[-3:])}
"""

    optimization_prompt = f"""You are a research assistant helping to optimize academic paper search queries.

Original user query: "{agent_state.user_query}"
{previous_results}

Your task is to generate an improved search query that is more likely to find relevant academic papers on arXiv.

Guidelines:
1. If previous search found few/no results, make the query MORE GENERAL (remove specific details, use broader terms)
2. If previous search found too many irrelevant results, make the query MORE SPECIFIC (add key terms, use domain-specific vocabulary)
3. Use standard academic terminology and keywords
4. Keep the query concise (2-5 key terms)
5. Consider synonyms and related terms
6. Focus on the core research topic, not implementation details

Generate ONLY the optimized search query (no explanation, just the query text):"""

    try:
        msg = ModelRegistry.completion(
            LLM_NAME,
            [Message(role="user", content=optimization_prompt)],
            system_prompt="You are an expert at crafting effective academic search queries. Return only the optimized query text.",
            agent_sender=AGENT_NAME,
            tools=None,
        )

        optimized_query = (msg.content or "").strip()
        # Remove quotes if present
        optimized_query = optimized_query.strip('"').strip("'").strip()

        if optimized_query and optimized_query != agent_state.current_query:
            agent_state.current_query = optimized_query
            agent_state.query_history.append(optimized_query)
            logger.info(
                f"Optimized query (iteration {agent_state.search_iteration + 1}): {optimized_query}"
            )

            agent_state.add_message(
                Message(
                    role="assistant",
                    content=f"[Query Optimization] Optimized search query: '{optimized_query}'",
                    agent_sender=AGENT_NAME,
                ).with_log()
            )
        else:
            logger.info("Query optimization did not produce a new query, using current query")

    except Exception:
        logger.exception("Query optimization error")
        # Continue with current query if optimization fails
        if not agent_state.current_query:
            agent_state.current_query = agent_state.user_query

    return agent_state
108
+
109
+
110
def check_results_node(agent_state: PaperSearchAgentState) -> PaperSearchAgentState:
    """Check whether paper search results are sufficient and log the decision.

    Refactor: the iterate-vs-proceed decision is delegated to
    `should_continue_search` so the threshold logic lives in exactly one place
    (it was previously duplicated here and could drift).
    """
    logger.debug("check_results_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("check_results")

    papers_count = len(agent_state.papers)
    has_sufficient_papers = papers_count >= MIN_PAPERS_THRESHOLD
    # Single source of truth: same predicate the conditional edge uses.
    should_continue = should_continue_search(agent_state) == "continue_search"

    logger.info(
        f"Results check: {papers_count} papers found. "
        f"Sufficient: {has_sufficient_papers} (threshold: {MIN_PAPERS_THRESHOLD}). "
        f"Should continue: {should_continue} (iteration {agent_state.search_iteration}/{agent_state.max_search_iterations})"
    )

    # Record the decision in the message history; nothing else is persisted —
    # the conditional edge re-evaluates `should_continue_search` itself.
    agent_state.add_message(
        Message(
            role="assistant",
            content=f"[Results Check] Found {papers_count} papers. "
            f"{'Continuing search iteration' if should_continue else 'Proceeding with current results'}.",
            agent_sender=AGENT_NAME,
        ).with_log()
    )

    return agent_state
143
+
144
+
145
def should_continue_search(agent_state: PaperSearchAgentState) -> str:
    """Conditional edge: decide whether to iterate the paper search.

    Only the paper search iterates; dataset search runs once afterwards.
    Returns "continue_search" while fewer than MIN_PAPERS_THRESHOLD papers
    have been found and iterations remain, otherwise "proceed".
    """
    enough_papers = len(agent_state.papers) >= MIN_PAPERS_THRESHOLD
    iterations_left = agent_state.search_iteration < agent_state.max_search_iterations
    if not enough_papers and iterations_left:
        return "continue_search"
    return "proceed"
160
+
161
+
162
def search_node(agent_state: PaperSearchAgentState) -> PaperSearchAgentState:
    """Run one paper-search iteration with the current (possibly optimized) query."""
    logger.debug("search_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("search")

    # Increment iteration count
    agent_state.search_iteration += 1

    # Use current_query if available, otherwise use user_query
    query_to_use = agent_state.current_query or agent_state.user_query

    try:
        # Call the search_papers tool directly.
        # arXiv only by default: Semantic Scholar rate-limits aggressively (429s).
        raw = search_papers(
            query=query_to_use,
            sources=["arxiv"],  # Use arxiv only to avoid rate limiting
            max_results=10,
        )

        # The tool returns TOON; anything unexpected degrades to an empty list.
        try:
            parsed = unwrap_dict_from_toon(raw)
        except Exception as parse_error:
            logger.warning("Failed to parse search results: {}", parse_error)
            agent_state.papers = []
        else:
            if isinstance(parsed, list):
                agent_state.papers = parsed
            else:
                logger.warning("Unexpected result format from search_papers")
                agent_state.papers = []

        logger.info("Found {} papers", len(agent_state.papers))

        # Add search results to history
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Search Results] Found {len(agent_state.papers)} papers for query: '{query_to_use}' (iteration {agent_state.search_iteration})",
                agent_sender=AGENT_NAME,
            ).with_log()
        )

    except Exception as e:
        logger.exception("Paper search error")
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Search Error] {str(e)}",
                agent_sender=AGENT_NAME,
            ).with_log()
        )

    return agent_state
217
+
218
+
219
def dataset_node(agent_state: PaperSearchAgentState) -> PaperSearchAgentState:
    """Execute dataset search using the search_datasets tool.

    Consistency fix: the history message now reports `query_to_use` (the query
    actually searched), matching `search_node`; previously it always echoed
    the raw user query even when an optimized query was used.
    """
    logger.debug("dataset_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("dataset")

    try:
        # Call the search_datasets tool directly
        # Use current_query if available, otherwise use user_query
        query_to_use = agent_state.current_query or agent_state.user_query

        # Pass data_summary if available to search for similar datasets
        result = search_datasets(
            query=query_to_use,
            sources=["paperswithcode", "huggingface"],  # Default sources
            max_results=10,
            data_summary=agent_state.data_summary,  # Pass data analysis summary
        )

        # Parse the result (tool returns TOON format)
        try:
            datasets = unwrap_dict_from_toon(result)
            if isinstance(datasets, list):
                agent_state.datasets = datasets
            else:
                logger.warning("Unexpected result format from search_datasets")
                agent_state.datasets = []
        except Exception as parse_error:
            logger.warning("Failed to parse dataset search results: {}", parse_error)
            agent_state.datasets = []

        logger.info("Found {} datasets", len(agent_state.datasets))

        # Add search results to history
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Dataset Search Results] Found {len(agent_state.datasets)} datasets for query: '{query_to_use}'",
                agent_sender=AGENT_NAME,
            ).with_log()
        )

    except Exception as e:
        logger.exception("Dataset search error")
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Dataset Search Error] {str(e)}",
                agent_sender=AGENT_NAME,
            ).with_log()
        )

    return agent_state
271
+
272
+
273
def metric_node(agent_state: PaperSearchAgentState) -> PaperSearchAgentState:
    """Extract evaluation metrics from the searched papers.

    The extraction tool is always called: with the found papers when there are
    any, otherwise with an empty list, which triggers the tool's fallback to
    common task metrics. (Refactor: the two cases were duplicated calls
    differing only in the `papers` argument.)
    """
    logger.debug("metric_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("metric")

    try:
        if not agent_state.papers:
            logger.info("No papers available for metric extraction, using fallback")
        # An empty `papers` list triggers the tool's fallback logic.
        result = extract_metrics_from_papers(
            papers=agent_state.papers or [],
            task_query=agent_state.user_query,
            max_results=20,
        )

        # Parse the result (tool returns TOON format)
        try:
            metrics = unwrap_dict_from_toon(result)
            if isinstance(metrics, list):
                agent_state.metrics = metrics
            else:
                logger.warning("Unexpected result format from extract_metrics_from_papers")
                agent_state.metrics = []
        except Exception as parse_error:
            logger.warning("Failed to parse metric extraction results: {}", parse_error)
            agent_state.metrics = []

        logger.info("Extracted {} metrics", len(agent_state.metrics))

        # Add extraction results to history
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Metric Extraction Results] Extracted {len(agent_state.metrics)} evaluation metrics from {len(agent_state.papers)} papers.",
                agent_sender=AGENT_NAME,
            ).with_log()
        )

    except Exception as e:
        logger.exception("Metric extraction error")
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Metric Extraction Error] {str(e)}",
                agent_sender=AGENT_NAME,
            ).with_log()
        )

    return agent_state
330
+
331
+
332
def summary_node(agent_state: PaperSearchAgentState) -> PaperSearchAgentState:
    """Generate a natural-language summary of the search results.

    Formats the collected papers, datasets, and metrics into markdown
    sections, renders the summary prompt template with them, and asks the
    LLM for a final summary which is stored in
    ``agent_state.output_summary``. Short-circuits with a "nothing found"
    message when all three result lists are empty.
    """
    logger.debug("summary_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("summary")

    # Build summary prompt with paper, dataset, and metric details
    if not agent_state.papers and not agent_state.datasets and not agent_state.metrics:
        agent_state.output_summary = (
            f"No papers, datasets, or metrics found for query: '{agent_state.user_query}'"
        )
        agent_state.add_message(
            Message(
                role="assistant",
                content=agent_state.output_summary,
                agent_sender=AGENT_NAME,
            ).with_log()
        )
        return agent_state

    # Format papers for summary (top 10 only; summaries truncated to 300 chars,
    # author lists to 5 entries)
    papers_text = ""
    if agent_state.papers:
        papers_text = "\n\n".join(
            [
                f"**{i+1}. {p.get('title', 'N/A')}**\n"
                f"- Authors: {', '.join(p.get('authors', [])[:5])}{'...' if len(p.get('authors', [])) > 5 else ''}\n"
                f"- Published: {p.get('published', 'N/A')}\n"
                f"- Source: {p.get('source', 'N/A')}\n"
                f"- Summary: {p.get('summary', 'N/A')[:300]}...\n"
                f"- URL: {p.get('url', 'N/A')}"
                for i, p in enumerate(agent_state.papers[:10])
            ]
        )
    else:
        papers_text = "No papers found."

    # Format datasets for summary (more detailed; descriptions truncated to
    # 500 chars, optional fields rendered with explicit placeholders)
    datasets_text = ""
    if agent_state.datasets:
        datasets_text = "\n\n".join(
            [
                f"**Dataset {i+1}: {d.get('name', 'N/A')}**\n"
                f"- **Source**: {d.get('source', 'N/A')}\n"
                f"- **Description**: {d.get('description', 'N/A')[:500]}{'...' if len(d.get('description', '')) > 500 else ''}\n"
                f"- **Domain**: {d.get('domain', 'N/A')}\n"
                f"- **Size**: {d.get('size', 'N/A')}\n"
                f"- **URL**: {d.get('url', 'N/A')}\n"
                f"- **Download URL**: {d.get('download_url', 'N/A') if d.get('download_url') else 'N/A'}\n"
                f"- **License**: {d.get('license', 'N/A') if d.get('license') else 'Not specified'}\n"
                f"- **Paper URL**: {d.get('paper_url', 'N/A') if d.get('paper_url') else 'N/A'}"
                for i, d in enumerate(agent_state.datasets[:15])  # Show more datasets
            ]
        )
    else:
        datasets_text = "No datasets found."

    # Format metrics for summary (more detailed with formulas)
    metrics_text = ""
    if agent_state.metrics:
        metrics_text = "\n\n".join(
            [
                f"**Metric {i+1}: {m.get('name', 'N/A')}**\n"
                f"- **Description**: {m.get('description', 'N/A')}\n"
                f"- **Domain**: {m.get('domain', 'N/A')}\n"
                f"- **Source Paper**: {m.get('paper_title', 'N/A')}\n"
                f"- **Paper URL**: {m.get('paper_url', 'N/A') if m.get('paper_url') else 'N/A'}\n"
                f"- **Reported Value**: {m.get('value', 'N/A') if m.get('value') else 'Not specified'}\n"
                f"- **Formula**: {m.get('formula', 'N/A') if m.get('formula') else 'Not provided'}"
                for i, m in enumerate(agent_state.metrics[:20])  # Show more metrics
            ]
        )
    else:
        metrics_text = "No metrics extracted."

    # Render summary prompt from template and queue it as a user message
    summary_prompt_content = PROMPTS.paper_subagent.summary_prompt.render(
        user_query=agent_state.user_query,
        papers_text=papers_text,
        datasets_text=datasets_text,
        metrics_text=metrics_text,
    )
    summary_prompt = Message(
        role="user",
        content=summary_prompt_content,
        agent_sender=AGENT_NAME,
    )
    agent_state.add_message(summary_prompt)

    # Get summary from LLM
    system_prompt = PROMPTS.paper_subagent.summary_system_prompt.render()
    msg = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=system_prompt,
        agent_sender=AGENT_NAME,
        tools=None,  # No tools needed for summary
    ).with_log()

    # Store the summary text (empty string when the model returned no content)
    agent_state.output_summary = msg.content or ""
    agent_state.add_message(msg)

    logger.info(f"Summary generated: {len(agent_state.output_summary)} characters")

    return agent_state
scievo/agents/data_agent/paper_subagent/state.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scievo.core.types import HistoryState, ToolsetState
2
+
3
+
4
class PaperSearchAgentState(ToolsetState, HistoryState):
    """Minimal state for Paper Search Agent.

    This agent searches for academic papers and datasets using the paper_search
    and dataset_search toolsets. Supports iterative query refinement to improve
    search results.

    NOTE(review): the mutable list defaults below are only safe if the base
    classes are pydantic models (which deep-copy field defaults per instance)
    — confirm against ``scievo.core.types``.
    """

    # Input
    user_query: str  # User's original search query
    data_summary: str | None = (
        None  # Data analysis summary from data agent (for dataset similarity search)
    )
    current_query: str | None = None  # Current optimized query (for iteration)
    max_search_iterations: int = 3  # Maximum number of search iterations

    # Iteration tracking
    search_iteration: int = 0  # Current search iteration count
    query_history: list[str] = []  # History of queries tried

    # Output
    papers: list[dict] = []  # Paper search results
    datasets: list[dict] = []  # Dataset search results
    metrics: list[dict] = []  # Extracted metrics from papers
    output_summary: str | None = None  # Final summary
scievo/agents/data_agent/plan.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+ from pydantic import BaseModel
3
+
4
+ from scievo.core import constant
5
+ from scievo.core.llms import ModelRegistry
6
+ from scievo.core.plan import Plan
7
+ from scievo.core.types import Message
8
+ from scievo.core.utils import parse_json_from_llm_response
9
+ from scievo.prompts import PROMPTS
10
+
11
+ from .state import DataAgentState
12
+
13
+ LLM_NAME = "plan"
14
+ AGENT_NAME = "data_planner"
15
+
16
+
17
@logger.catch
def planner_node(agent_state: DataAgentState) -> DataAgentState:
    """Create the initial plan for the data agent from the user query.

    Sends the user query to the planning LLM, parses the returned plan
    (JSON) into a ``Plan``, and seeds ``plans`` / ``remaining_plans`` /
    ``past_plans`` on the state. The raw planner output is also recorded
    in ``intermediate_state`` for later inspection.
    """
    logger.trace("planner_node of Agent {}", AGENT_NAME)

    user_query_msg = Message(
        role="user",
        content=agent_state.user_query,
        agent_sender=AGENT_NAME,
    )

    agent_state.add_message(user_query_msg)

    msg = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=(
            Message(
                role="system",
                content=PROMPTS.data.planner_system_prompt.render(is_replanner=False),
            )
            .with_log(cond=constant.LOG_SYSTEM_PROMPT)
            .content
        ),
        agent_sender=AGENT_NAME,
    ).with_log()

    agent_state.add_message(msg)

    # NOTE: the parsed plan object itself is not added to the history
    plans = parse_json_from_llm_response(msg, Plan)

    agent_state.add_message(
        Message(
            role="user",
            content="Follow the current plan.",
            agent_sender=AGENT_NAME,
        )
    )

    agent_state.plans = plans
    agent_state.remaining_plans = plans.steps
    agent_state.past_plans = []

    # Dummy user response, just for logging (intentionally NOT added to history)
    if len(agent_state.remaining_plans) > 0:
        Message(
            role="user",
            content=PROMPTS.data.replanner_user_response.render(
                next_step=agent_state.remaining_plans[0],
            ),
            agent_sender=AGENT_NAME,
        ).with_log()
    else:
        logger.warning("No plans generated by planner - remaining_plans is empty")

    # FIX: removed the dead `"msg" in locals()` guard — `msg` is always bound
    # at this point; only fall back when the model returned empty content.
    planner_output = msg.content or "No planner output"

    agent_state.intermediate_state.append(
        {
            "node_name": "planner",
            "output": planner_output,
        }
    )

    return agent_state
83
+
84
+
85
def replanner_node(agent_state: DataAgentState) -> DataAgentState:
    """Advance the plan after a finished step, asking the LLM how to proceed.

    Pops the just-finished step into ``past_plans``, then asks the planning
    LLM whether to continue and whether the remaining plan should be
    modified. Switches the agent into talk mode when the plan is complete.
    """
    logger.trace("replanner_node of Agent {}", AGENT_NAME)

    # NOTE: when all the plans are done, go into the talk mode
    if len(agent_state.remaining_plans) == 0:
        logger.debug("All plans are done, going into talk mode")
        agent_state.talk_mode = True
        return agent_state

    # Move current plan to past_plans
    agent_state.past_plans.append(agent_state.remaining_plans.pop(0))

    user_query = agent_state.user_query

    user_msg = Message(
        role="user",
        content=PROMPTS.data.replanner_user_prompt.render(
            user_query=user_query,
            plan=agent_state.plans.steps,
            past_steps=agent_state.past_plans,
        ),
        agent_sender=AGENT_NAME,
    ).with_log()

    agent_state.add_message(user_msg)

    msg = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=(
            Message(
                role="system",
                content=PROMPTS.data.planner_system_prompt.render(is_replanner=True),
            )
            .with_log(cond=constant.LOG_SYSTEM_PROMPT)
            .content
        ),
        agent_sender=AGENT_NAME,
    ).with_log()

    agent_state.add_message(msg)

    class Replan(BaseModel):
        # Whether execution of the (possibly modified) plan should continue
        continued: bool = False
        # Replacement plan steps; empty means "keep the current plan"
        modified: list[str] = []

    # NOTE: we don't add the message to the history
    plans = parse_json_from_llm_response(msg, Replan)

    if not plans.continued:
        # plans done
        logger.debug("Replanner indicates all plans are done, going into talk mode")
        agent_state.talk_mode = True
        return agent_state

    # BUGFIX: `continued` is declared `bool`, so the original
    # `if True / elif False / else` chain made the `else` branch (which applied
    # `plans.modified`) unreachable — modified plans were silently dropped.
    # Apply the modified plan whenever the replanner provides one.
    if plans.modified:
        agent_state.plans = Plan(steps=plans.modified)
        agent_state.remaining_plans = plans.modified

    if len(agent_state.remaining_plans) > 0:
        agent_state.add_message(
            Message(
                role="user",
                content=PROMPTS.data.replanner_user_response.render(
                    next_step=agent_state.remaining_plans[0],
                ),
                agent_sender=AGENT_NAME,
            )
        )
    else:
        logger.warning("No remaining plans after replan - going to talk mode")
        agent_state.talk_mode = True

    # FIX: removed the dead `"msg" in locals()` guard — `msg` is always bound here.
    replanner_output = msg.content or "No replanner output"

    agent_state.intermediate_state.append(
        {
            "node_name": "replanner",
            "output": replanner_output,
        }
    )

    return agent_state
170
+
171
+
172
def should_replan(agent_state: DataAgentState) -> str:
    """Conditional-edge router: 'finalize' once in talk mode, else 'gateway'."""
    return "finalize" if agent_state.talk_mode else "gateway"
scievo/agents/data_agent/state.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scievo.core.code_env import LocalEnv
2
+ from scievo.core.plan import PlanState
3
+ from scievo.core.types import HistoryState, RBankState, ToolsetState
4
+
5
+
6
class DataAgentState(ToolsetState, PlanState, HistoryState, RBankState):
    """State of the data agent.

    Combines tool management (ToolsetState), plan-and-execute bookkeeping
    (PlanState), conversation history (HistoryState), and reasoning-bank
    memory (RBankState). The "fs" toolset is always enabled in
    ``__init__``.
    """

    # User's original request (input)
    user_query: str
    # Local environment for the agent
    workspace: LocalEnv

    # Optional additional description of the data (input)
    data_desc: str | None = None

    # Talking mode: once True, the agent responds to the user instead of
    # executing further plan steps
    talk_mode: bool = False

    # Output summary generated by the agent (output)
    output_summary: str | None = None

    # Paper subagent results (populated by the paper search subagent)
    papers: list[dict] = []
    datasets: list[dict] = []
    metrics: list[dict] = []
    paper_search_summary: str | None = None

    # Intermediate per-node outputs, e.g. {"node_name": ..., "output": ...}
    intermediate_state: list[dict] = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The filesystem toolset is always available to the data agent.
        self.toolsets.append("fs")
scievo/agents/dummy_agent.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, START, StateGraph
2
+ from loguru import logger
3
+
4
+ from scievo.core.types import GraphState, Message
5
+ from scievo.prompts import PROMPTS
6
+
7
+ LLM_NAME = "dummy"
8
+ AGENT_NAME = "dummy"
9
+
10
+
11
def say_hello(graph_state: GraphState) -> GraphState:
    """Append a canned "Hello" assistant message to this agent's data messages."""
    logger.debug("say_hello of Agent {}", AGENT_NAME)
    greeting = Message(
        role="assistant",
        content="Hello",
        llm_sender=None,
        agent_sender=AGENT_NAME,
    )
    graph_state.agents[AGENT_NAME].data_msgs.append(greeting.with_log())
    return graph_state
21
+
22
+
23
@logger.catch
def build():
    """Assemble the dummy graph: START -> dummy1 -> dummy2 -> END."""
    graph = StateGraph(GraphState)

    # Both nodes run the same trivial greeting handler.
    for node_name in ("dummy1", "dummy2"):
        graph.add_node(node_name, say_hello)

    for src, dst in ((START, "dummy1"), ("dummy1", "dummy2"), ("dummy2", END)):
        graph.add_edge(src, dst)

    return graph
scievo/agents/experiment_agent/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment Agent - High-level orchestrator for code modification experiments.
3
+
4
+ This agent coordinates three sub-agents:
5
+ 1. Coding Subagent V2 - Plans and executes code modifications
6
+ 2. Exec Subagent - Runs experiments/commands in a local shell
7
+ 3. Summary Subagent - Generates comprehensive experiment summaries
8
+
9
+ The agent runs in a revision loop until the experiment succeeds or max revisions is reached.
10
+ """
11
+
12
+ from .build import build
13
+ from .state import ExperimentAgentState
14
+
15
+ __all__ = ["build", "ExperimentAgentState"]
scievo/agents/experiment_agent/build.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Build the Experiment Agent graph.
3
+ """
4
+
5
+ from langgraph.graph import END, START, StateGraph
6
+ from loguru import logger
7
+
8
+ from . import execute
9
+ from .state import ExperimentAgentState
10
+
11
+
12
@logger.catch
def build():
    """Build the Experiment Agent graph with sub-agent composition.

    Pipeline: init -> coding -> exec -> summary -> analysis ->
    revision_judge, which either loops back to coding for another
    revision or proceeds to finalize and END.
    """
    graph = StateGraph(ExperimentAgentState)

    # Register all nodes: initialization, the three sub-agents,
    # analysis, the revision judge, and finalization.
    for name, handler in (
        ("init", execute.init_node),
        ("coding", execute.run_coding_subagent),
        ("exec", execute.run_exec_subagent),
        ("summary", execute.run_summary_subagent),
        ("analysis", execute.analysis_node),
        ("revision_judge", execute.revision_judge_node),
        ("finalize", execute.finalize_node),
    ):
        graph.add_node(name, handler)

    # Linear backbone of the revision loop.
    for src, dst in (
        (START, "init"),
        ("init", "coding"),
        ("coding", "exec"),
        ("exec", "summary"),
        ("summary", "analysis"),
        ("analysis", "revision_judge"),
    ):
        graph.add_edge(src, dst)

    # The judge decides whether to run another revision or to finish.
    graph.add_conditional_edges(
        "revision_judge",
        execute.should_continue_revision,
        {
            "continue": "coding",  # Go back to coding for next revision
            "complete": "finalize",  # Exit the loop
        },
    )

    graph.add_edge("finalize", END)

    return graph
scievo/agents/experiment_agent/coding_subagent_v2/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Coding Subagent V2
3
+
4
+ This agent follows the plan-and-execute paradigm for coding tasks.
5
+ It integrates with OpenHands SDK for external code manipulation.
6
+ """
7
+
8
+ from .build import build
9
+ from .state import CodingAgentState
10
+
11
+ __all__ = ["build", "CodingAgentState"]
scievo/agents/experiment_agent/coding_subagent_v2/build.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, START, StateGraph
2
+ from loguru import logger
3
+
4
+ from . import execute
5
+ from .state import CodingAgentState
6
+
7
+
8
@logger.catch
def build():
    """Build the coding agent graph.

    Minimal graph that delegates all coding work to OpenHands SDK:
    START -> openhands -> summary -> END.

    OpenHands has its own internal planning and execution, so no external
    LLM chat loop or tool calling is needed.
    """
    graph = StateGraph(CodingAgentState)

    # Only two nodes: OpenHands execution, then summary generation.
    graph.add_node("openhands", execute.openhands_node)
    graph.add_node("summary", execute.summary_node)

    for src, dst in ((START, "openhands"), ("openhands", "summary"), ("summary", END)):
        graph.add_edge(src, dst)

    return graph
scievo/agents/experiment_agent/coding_subagent_v2/execute.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Execution nodes for the Coding Subagent V2
3
+
4
+ This module provides a minimal execution flow that delegates all coding work
5
+ to OpenHands SDK. The flow is: START -> openhands_node -> summary_node -> END
6
+ """
7
+
8
+ import os
9
+
10
+ from loguru import logger
11
+ from openhands.sdk.event import ActionEvent
12
+
13
+ from scievo.core import constant
14
+ from scievo.core.llms import ModelRegistry
15
+ from scievo.core.types import Message
16
+ from scievo.prompts import PROMPTS
17
+
18
+ from .state import CodingAgentState
19
+
20
+ LLM_NAME = "experiment_coding"
21
+ AGENT_NAME = "experiment_coding"
22
+
23
+
24
def openhands_node(agent_state: CodingAgentState) -> CodingAgentState:
    """
    Execute the coding task using OpenHands sub-agent.

    This node directly invokes the OpenHands conversation to handle
    the entire coding workflow. OpenHands has its own internal planning,
    tool calling, and execution mechanisms.

    The sub-agent's last textual response (or an error) is appended to the
    state's message history for the downstream summary node; this function
    never raises.
    """
    logger.debug("openhands_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("openhands")

    conversation = agent_state.openhands_conversation
    if conversation is None:
        logger.error("OpenHands conversation not initialized")
        agent_state.output_summary = "Error: OpenHands conversation not initialized."
        return agent_state

    try:
        # Construct the message for OpenHands
        instruction = agent_state.user_query or "No specific coding task provided."
        bg_info = agent_state.data_summary or "No background information available."
        # prefix with `> ` for markdown blockquote
        instruction = "\n".join([f"> {line}" for line in instruction.splitlines()])
        bg_info = "\n".join([f"> {line}" for line in bg_info.splitlines()])
        workspace_dir = os.path.abspath(agent_state.workspace.working_dir)

        message = f"""\
# Requirements:
- At the end of your response, provide a detailed explanation of what you did and why.
- Ensure that all changes are made in a way that maintains the integrity of the codebase.
- Avoid long-running executions of training or data processing; focus on code changes. If needed for code testing, design some simple test code instead.

# Important Notes:
- DO NOT train the full model. Just train a demo if needed for testing code changes.
- DO NOT run large data processing tasks. Just simulate with small data if needed for testing code
- Always ensure that the code runs without errors after your changes.
- I would run the full experiments later after getting your code changes.

# Workspace
{workspace_dir}

# Task:
{instruction}

# Background information:
```
{bg_info}
```
"""

        logger.info("Sending task to OpenHands sub-agent: {}", instruction[:100])

        # Send message to the OpenHands agent
        conversation.send_message(message)

        # Run the agent until completion; the workspace context manager
        # scopes the run (presumably cwd/env setup — see LocalEnv; verify)
        with agent_state.workspace:
            conversation.run()

        # Extract the last response from OpenHands: walk events newest-first
        # and take the first agent-sourced ActionEvent whose message content
        # can be recovered (directly or via to_llm_message()).
        if conversation.state.events:
            for e in reversed(conversation.state.events):
                if isinstance(e, ActionEvent) and e.source == "agent":
                    if hasattr(e, "llm_message") and e.llm_message:
                        content = e.llm_message.content
                    elif (m := getattr(e, "to_llm_message", None)) is not None and callable(m):
                        content = m().content
                    else:
                        # Unable to extract content from this event
                        continue
                    last_response = "\n".join([c.text for c in content])
                    break
            else:
                # No extractable agent response among the events
                last_response = "Coding task completed (no detailed response available)."
        else:
            last_response = "Coding task completed (no detailed response available)."

        # Log the result
        logger.info("OpenHands sub-agent completed task")

        # Store the response in history for summary generation
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[OpenHands Sub-Agent Result]\n{last_response}",
                agent_sender="openhands",
            ).with_log()
        )

    except Exception as e:
        logger.exception("OpenHands agent error")
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[OpenHands Error] {str(e)}",
                agent_sender="openhands",
            ).with_log()
        )

    return agent_state
124
+
125
+
126
def summary_node(agent_state: CodingAgentState) -> CodingAgentState:
    """Ask the LLM for a summary of the coding workflow and store it on the state."""
    logger.debug("summary_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("summary")

    # Queue the summary-request prompt rendered from the template library.
    agent_state.add_message(
        Message(
            role="user",
            content=PROMPTS.experiment_coding_v2.summary_prompt.render(),
            agent_sender=AGENT_NAME,
        )
    )

    # Build (and conditionally log) the system prompt before the completion call.
    system_msg = Message(
        role="system",
        content=PROMPTS.experiment_coding_v2.summary_system_prompt.render(),
    ).with_log(cond=constant.LOG_SYSTEM_PROMPT)

    reply = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=system_msg.content,
        agent_sender=AGENT_NAME,
        tools=None,  # No tools needed for final summary
    ).with_log()

    # Persist the summary (empty string when the model returned no content).
    agent_state.output_summary = reply.content or ""
    agent_state.add_message(reply)

    logger.info(f"Coding task summary generated: {len(agent_state.output_summary)} characters")

    return agent_state
scievo/agents/experiment_agent/coding_subagent_v2/state.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ from typing import TYPE_CHECKING, Optional
4
+
5
+ if TYPE_CHECKING:
6
+ from openhands.sdk import Conversation
7
+
8
+ from pydantic import PrivateAttr
9
+
10
+ from scievo.core.code_env import LocalEnv
11
+ from scievo.core.types import HistoryState, ToolsetState
12
+ from scievo.prompts import SKILLS
13
+
14
+
15
class CodingAgentState(ToolsetState, HistoryState):
    """State of the Coding Subagent V2.

    This agent delegates coding tasks to OpenHands SDK which has its own
    internal planning mechanism. No external planning is needed.

    Note: No RBankState - memory extraction is not used in this agent.
    """

    # Summary of the data from data agent, providing background info for the coding task (input)
    data_summary: str

    # User's coding task description (input, optional)
    user_query: str | None = None

    # Local environment for the agent (input)
    workspace: LocalEnv

    # OpenHands Conversation object - persists throughout the execution (private)
    # This maintains the conversation history with the external coding agent
    _openhands_conversation: Optional["Conversation"] = PrivateAttr(default=None)

    # Output summary (output)
    output_summary: str | None = None

    def __init__(self, _openhands_conversation: Optional["Conversation"] = None, *args, **kwargs):
        # When no conversation is supplied, one is constructed from
        # environment configuration. This is gated behind
        # SCIEVO_ENABLE_OPENHANDS because it pulls in heavy optional
        # dependencies at import time.
        super().__init__(*args, **kwargs)
        # Create a default empty conversation if not provided
        if _openhands_conversation is None:
            enable_openhands = os.getenv("SCIEVO_ENABLE_OPENHANDS", "").strip().lower() in {
                "1",
                "true",
                "yes",
                "y",
            }
            if not enable_openhands:
                raise RuntimeError(
                    "OpenHands coding subagent (v2) is disabled. "
                    "Set env `SCIEVO_ENABLE_OPENHANDS=1` to enable it, or use the Claude coding subagent "
                    "(`CODING_AGENT_VERSION=v3`)."
                )

            # Setup openhands paths first (must be before any openhands imports)
            # Local imports so importing this module doesn't require OpenHands unless v2 is used.
            from openhands.sdk import LLM, Agent, AgentContext, Conversation, Tool
            from openhands.sdk.context.skills import Skill

            from scievo.core import openhands_import  # noqa: F401

            # Try to import LLMSummarizingCondenser if available
            try:
                from openhands.sdk.context.condenser import LLMSummarizingCondenser
            except ImportError:
                # Fallback: LLMSummarizingCondenser is not available in this version
                LLMSummarizingCondenser = None

            # API key falls back from the OpenHands-specific var to the generic one.
            api_key = os.getenv("OPENHANDS_API_KEY") or os.getenv("LLM_API_KEY")
            model = os.getenv("OPENHANDS_MODEL", "anthropic/claude-sonnet-4-5-20250929")

            llm = LLM(
                model=model,
                api_key=api_key,
                # Unique usage id so token usage can be attributed per conversation
                usage_id=f"openhands-coding-agent-{uuid.uuid4().hex[:8]}",
            )

            from openhands.tools.file_editor import FileEditorTool
            from openhands.tools.glob import GlobTool
            from openhands.tools.grep import GrepTool
            from openhands.tools.task_tracker import TaskTrackerTool
            from openhands.tools.terminal import TerminalTool

            tools = [
                Tool(name=FileEditorTool.name),
                Tool(name=TaskTrackerTool.name),
                Tool(name=TerminalTool.name),
                Tool(name=GlobTool.name),
                Tool(name=GrepTool.name),
            ]
            # Skills and a system-message suffix steer the sub-agent toward
            # short-running, uv-based, absolute-path workflows.
            agent_context = AgentContext(
                skills=[
                    Skill(
                        name="Python Dependency Management by `uv` instead of `pip`",
                        content="For Python projects: Always prioritize using 'uv' for managing dependencies and virtual environments. "
                        "Avoid using 'pip' or other package managers that directly affect the native system environment. "
                        "Use 'uv sync' to install dependencies from lock files, 'uv venv' to create isolated environments, "
                        "and 'uv add' to add new packages. This approach ensures project isolation and reproducibility. "
                        "This skill applies only to Python projects.",
                    ),
                    Skill(
                        name="Avoid Long Time Operations",
                        content="Avoid using tools or commands that may lead to long wait times or blocking operations, "
                        "such as training the model directly within this environment. ",
                    ),
                    Skill(
                        name="File Operations Should Use Absolute Paths as Much as Possible",
                        content="When using the File Editor tool and other file-related tools, always refer to files using their absolute paths. "
                        "This ensures that file operations are unambiguous and correctly targeted within the workspace. ",
                    ),
                    Skill(
                        name="UV - Python Package Manager Skill",
                        content=SKILLS.uv_skill,
                    ),
                ],
                system_message_suffix="""\
<CLI_MODE>
You are operating in CLI mode, so all file paths should be absolute paths as much as possible.
Besides, try to avoid long time operations that may block the process, e.g., training the deep learning model directly.
</CLI_MODE>

<SHORT_RUNNING>
- DO NOT train the full model. Just train a demo if needed for testing code changes.
- DO NOT run large data processing tasks. Just simulate with small data if needed for testing code
- The full experiments will be run later by the user after getting the code changes.
- IMPORTANT: If a command takes longer than 10 minutes (a.k.a. 600 seconds), you should leave it to the user to run later.
</SHORT_RUNNING>
""",
            )
            # Build agent kwargs - only include condenser if available
            agent_kwargs = {
                "llm": llm,
                "tools": tools,
                "system_prompt_kwargs": {"cli_mode": True},
                "agent_context": agent_context,
            }
            # Add condenser only if LLMSummarizingCondenser is available
            if LLMSummarizingCondenser is not None:
                agent_kwargs["condenser"] = LLMSummarizingCondenser(
                    llm=llm.model_copy(update={"usage_id": "condenser"}),
                    max_size=48,
                    keep_first=4,
                )

            agent = Agent(**agent_kwargs)
            _openhands_conversation = Conversation(
                agent=agent, workspace=self.workspace.working_dir
            )

        self._openhands_conversation = _openhands_conversation

        # Ensure the openhands toolset is included initially
        self.toolsets.append("openhands")

    @property
    def openhands_conversation(self) -> "Conversation":
        """Get the OpenHands Conversation object."""
        return self._openhands_conversation
scievo/agents/experiment_agent/coding_subagent_v3_claude/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Coding Subagent V3 Claude
3
+
4
+ This agent delegates coding tasks to Claude Agent SDK for external code manipulation.
5
+ Claude Agent SDK has its own internal planning and execution mechanisms.
6
+ """
7
+
8
+ from .build import build
9
+ from .state import ClaudeCodingAgentState, CodingAgentState
10
+
11
+ __all__ = ["build", "ClaudeCodingAgentState", "CodingAgentState"]
scievo/agents/experiment_agent/coding_subagent_v3_claude/build.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, START, StateGraph
2
+ from loguru import logger
3
+
4
+ from . import execute
5
+ from .state import ClaudeCodingAgentState
6
+
7
+
8
@logger.catch
def build():
    """Build the Claude coding agent graph.

    Minimal graph that delegates all coding work to Claude Agent SDK:
    START -> claude -> summary -> END.

    Claude Agent SDK has its own internal planning and execution, so no
    external LLM chat loop or tool calling is needed.
    """
    graph = StateGraph(ClaudeCodingAgentState)

    # Only two nodes: Claude execution, then summary generation.
    graph.add_node("claude", execute.claude_node)
    graph.add_node("summary", execute.summary_node)

    for src, dst in ((START, "claude"), ("claude", "summary"), ("summary", END)):
        graph.add_edge(src, dst)

    return graph
scievo/agents/experiment_agent/coding_subagent_v3_claude/execute.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Execution nodes for the Coding Subagent V3 Claude
3
+
4
+ This module provides a minimal execution flow that delegates all coding work
5
+ to Claude Agent SDK. The flow is: START -> claude_node -> summary_node -> END
6
+ """
7
+
8
+ import os
9
+
10
+ from loguru import logger
11
+
12
+ from scievo.core import constant
13
+ from scievo.core.llms import ModelRegistry
14
+ from scievo.core.types import Message
15
+ from scievo.prompts import PROMPTS
16
+ from scievo.tools.claude_agent_sdk_tool import run_claude_agent_sdk
17
+ from scievo.tools.claude_code_tool import run_claude_code
18
+
19
+ from .state import ClaudeCodingAgentState
20
+
21
+ LLM_NAME = "experiment_coding"
22
+ AGENT_NAME = "experiment_coding"
23
+
24
+
25
def claude_node(agent_state: ClaudeCodingAgentState) -> ClaudeCodingAgentState:
    """
    Execute the coding task using Claude Agent SDK.

    This node directly invokes the Claude Agent SDK to handle
    the entire coding workflow. Claude Agent SDK has its own internal planning,
    tool calling, and execution mechanisms.

    Args:
        agent_state: Carries the task (``user_query``), background context
            (``data_summary``) and the ``workspace`` the SDK operates in.

    Returns:
        The same state, with the SDK (or CLI fallback) result appended to the
        message history and a trace entry added to ``intermediate_state``.
    """
    logger.debug("claude_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("claude")

    try:
        # Construct the message for Claude Agent SDK; fall back to fixed
        # placeholder text when either input is empty/None.
        instruction = agent_state.user_query or "No specific coding task provided."
        bg_info = agent_state.data_summary or "No background information available."
        # prefix with `> ` for markdown blockquote
        instruction = "\n".join([f"> {line}" for line in instruction.splitlines()])
        bg_info = "\n".join([f"> {line}" for line in bg_info.splitlines()])
        workspace_dir = os.path.abspath(agent_state.workspace.working_dir)

        prompt = f"""\
# Requirements:
- At the end of your response, provide a detailed explanation of what you did and why.
- Ensure that all changes are made in a way that maintains the integrity of the codebase.
- Avoid long-running executions of training or data processing; focus on code changes. If needed for code testing, design some simple test code instead.

# Important Notes:
- DO NOT train the full model. Just train a demo if needed for testing code changes.
- DO NOT run large data processing tasks. Just simulate with small data if needed for testing code
- Always ensure that the code runs without errors after your changes.
- I would run the full experiments later after getting your code changes.

# Workspace
{workspace_dir}

# Task:
{instruction}

# Background information:
```
{bg_info}
```
"""

        logger.info("Sending task to Claude Agent SDK: {}", instruction[:100])

        # Call Claude Agent SDK tool (preferred)
        sdk_result = run_claude_agent_sdk(
            prompt=prompt,
            cwd=workspace_dir,
            allowed_tools=["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
            permission_mode="acceptEdits",
            **{constant.__AGENT_STATE_NAME__: agent_state},
        )

        sdk_text = str(sdk_result)
        # Heuristic failure detection: scan only the first 20 lines of the SDK
        # output for a line starting with "error:" (ignoring a benign
        # "error=None" repr field). NOTE(review): this is string matching on
        # the result's textual form — confirm it matches run_claude_agent_sdk's
        # actual error format.
        has_error = any(
            (line.strip().startswith("error:") and "error=None" not in line)
            for line in sdk_text.splitlines()[:20]
        )

        if not has_error:
            logger.info("Claude Agent SDK completed task")
            agent_state.add_message(
                Message(
                    role="assistant",
                    content=(
                        "[Claude Agent SDK Result]\n"
                        "Claude Agent SDK has completed the coding task. The changes have been applied to the workspace.\n\n"
                        f"{sdk_text}"
                    ),
                    agent_sender="claude_agent_sdk",
                ).with_log()
            )
        else:
            # Fallback to Claude Code CLI (still Claude-based, but doesn't require SDK install)
            logger.warning("Claude Agent SDK returned an error; falling back to Claude Code CLI")
            cli_result = run_claude_code(
                instruction=prompt,
                cwd=workspace_dir,
                timeout=1800,  # 30-minute cap on the CLI run
                **{constant.__AGENT_STATE_NAME__: agent_state},
            )
            # Record both the SDK error text and the CLI fallback result so the
            # downstream summary sees the full picture.
            agent_state.add_message(
                Message(
                    role="assistant",
                    content=(
                        "[Claude Agent SDK Error]\n"
                        f"{sdk_text}\n\n"
                        "[Claude Code CLI Fallback Result]\n"
                        f"{str(cli_result)}"
                    ),
                    agent_sender="claude_code",
                ).with_log()
            )

    except Exception as e:
        # Best-effort: surface the failure as an assistant message instead of
        # crashing the graph; summary_node still runs afterwards.
        logger.exception("Claude Agent SDK error")
        agent_state.add_message(
            Message(
                role="assistant",
                content=f"[Claude Agent SDK Error] {str(e)}",
                agent_sender="claude_agent_sdk",
            ).with_log()
        )

    # Derive the trace entry from the last assistant message (truncated).
    claude_output = "Claude Agent SDK execution completed"
    if agent_state.history:
        last_msg = agent_state.history[-1]
        if last_msg.role == "assistant" and last_msg.content:
            claude_output = last_msg.content[:2000]

    agent_state.intermediate_state.append(
        {
            "node_name": "claude",
            "output": claude_output,
        }
    )

    return agent_state
145
+
146
+
147
def summary_node(agent_state: ClaudeCodingAgentState) -> ClaudeCodingAgentState:
    """Summarize the coding workflow and store the result on the state."""
    logger.debug("summary_node of Agent {}", AGENT_NAME)
    agent_state.add_node_history("summary")

    # Append the summary request so the model sees it as the latest turn.
    agent_state.add_message(
        Message(
            role="user",
            content=PROMPTS.experiment_coding_v2.summary_prompt.render(),
            agent_sender=AGENT_NAME,
        )
    )

    # Render (and optionally log) the system prompt before the completion call.
    system_msg = Message(
        role="system",
        content=PROMPTS.experiment_coding_v2.summary_system_prompt.render(),
    ).with_log(cond=constant.LOG_SYSTEM_PROMPT)

    summary_msg = ModelRegistry.completion(
        LLM_NAME,
        agent_state.patched_history,
        system_prompt=system_msg.content,
        agent_sender=AGENT_NAME,
        tools=None,  # No tools needed for final summary
    ).with_log()

    # Persist the summary text and record the reply in the history.
    agent_state.output_summary = summary_msg.content or ""
    agent_state.add_message(summary_msg)

    logger.info(f"Coding task summary generated: {len(agent_state.output_summary)} characters")

    agent_state.intermediate_state.append(
        {
            "node_name": "summary",
            "output": agent_state.output_summary or "No summary generated",
        }
    )

    return agent_state
scievo/agents/experiment_agent/coding_subagent_v3_claude/state.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scievo.core.code_env import LocalEnv
2
+ from scievo.core.types import HistoryState, ToolsetState
3
+
4
+
5
class ClaudeCodingAgentState(ToolsetState, HistoryState):
    """State of the Coding Subagent V3 Claude.

    This agent delegates coding tasks to Claude Agent SDK which has its own
    internal planning mechanism. No external planning is needed.

    Note: No RBankState - memory extraction is not used in this agent.
    """

    # Summary of the data from data agent, providing background info for the coding task (input)
    data_summary: str

    # User's coding task description (input, optional; claude_node substitutes
    # placeholder text when this is None)
    user_query: str | None = None

    # Local environment for the agent (input); its working_dir becomes the
    # cwd handed to the Claude Agent SDK
    workspace: LocalEnv

    # Output summary (output) - filled by summary_node
    output_summary: str | None = None

    # Intermediate states - per-node trace entries of {"node_name", "output"}
    intermediate_state: list[dict] = []


# Alias for consistency with v2 (CodingAgentState)
CodingAgentState = ClaudeCodingAgentState
scievo/agents/experiment_agent/exec_subagent/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment Execution Agent
3
+
4
+ This agent is responsible for executing experiments in local shell sessions.
5
+ It parses natural language queries to determine commands to execute and manages
6
+ the execution using LocalShellSession.
7
+ """
8
+
9
+ from .build import build
10
+ from .state import ExecAgentState
11
+
12
+ __all__ = ["build", "ExecAgentState"]
scievo/agents/experiment_agent/exec_subagent/build.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Build the Experiment Execution Agent graph
3
+ """
4
+
5
+ from langgraph.graph import END, START, StateGraph
6
+ from loguru import logger
7
+
8
+ from . import execute
9
+ from .state import ExecAgentState
10
+
11
+
12
def init_node(agent_state: ExecAgentState) -> ExecAgentState:
    """Initialize the agent with the user query as the first message.

    Seeds an empty history with the rendered execution prompt; when history is
    already populated (e.g. a resumed run) the seeding is skipped with a
    warning. Always records an ``intermediate_state`` trace entry.

    Args:
        agent_state: State carrying ``user_query``, ``workspace`` and optional
            ``coding_summaries`` used to render the initial prompt.

    Returns:
        The (possibly seeded) agent state.
    """
    logger.trace("init_node of ExecAgent")

    # Track the trace output explicitly instead of probing `locals()` for
    # "user_msg" afterwards — the locals() check was fragile and opaque.
    init_output = "Initialization complete"

    # Seed the history exactly once ("not history" covers both None and []).
    if not agent_state.history:
        from scievo.core.types import Message
        from scievo.prompts import PROMPTS

        user_msg = Message(
            role="user",
            content=PROMPTS.experiment_exec.exec_user_prompt.render(
                user_query=agent_state.user_query,
                working_dir=agent_state.workspace,
                # Most recent coding summary, if any exist.
                current_coding_summary=(
                    agent_state.coding_summaries[-1]
                    if agent_state.coding_summaries
                    else None
                ),
                coding_summaries=agent_state.coding_summaries,
            ),
        )
        agent_state.add_message(user_msg)
        init_output = user_msg.content
    else:
        logger.warning("Agent history is not empty during init_node; skipping adding user query.")

    agent_state.intermediate_state.append(
        {
            "node_name": "init",
            "output": init_output,
        }
    )

    return agent_state
47
+
48
+
49
@logger.catch
def build():
    """Build and return the Experiment Execution Agent graph.

    Topology: START -> init -> gateway, where the gateway conditionally routes
    to one of the worker nodes (llm_chat / tool_calling / monitoring /
    history_compression) which all loop back to the gateway, or to summary,
    which terminates the graph.
    """
    graph = StateGraph(ExecAgentState)

    # Register the workflow nodes.
    graph.add_node("init", init_node)
    graph.add_node("gateway", execute.gateway_node)
    graph.add_node("llm_chat", execute.llm_chat_node)
    graph.add_node("tool_calling", execute.tool_calling_node)
    graph.add_node("monitoring", execute.monitoring_node)
    graph.add_node("summary", execute.summary_node)
    graph.add_node("history_compression", execute.history_compression_node)

    # Entry path: START -> init -> gateway.
    graph.add_edge(START, "init")
    graph.add_edge("init", "gateway")

    # The gateway fans out to whichever node `gateway_conditional` picks.
    graph.add_conditional_edges(
        "gateway",
        execute.gateway_conditional,
        [
            "llm_chat",
            "tool_calling",
            "monitoring",
            "summary",
            "history_compression",
        ],
    )

    # Every worker node returns control to the gateway for the next decision.
    for worker in ("llm_chat", "tool_calling", "monitoring", "history_compression"):
        graph.add_edge(worker, "gateway")

    # Summary is the terminal node.
    graph.add_edge("summary", END)

    return graph
scievo/agents/experiment_agent/exec_subagent/execute.py ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment Execution Agent - handles running experiments in local shell sessions
3
+ """
4
+
5
+ import inspect
6
+ import json
7
+ import time
8
+ from pathlib import Path
9
+
10
+ from loguru import logger
11
+ from pydantic import BaseModel
12
+
13
+ from scievo import history_compression
14
+ from scievo.core import constant
15
+ from scievo.core.llms import ModelRegistry
16
+ from scievo.core.types import Message
17
+ from scievo.core.utils import parse_json_from_llm_response, wrap_dict_to_toon
18
+ from scievo.prompts import PROMPTS, SKILLS
19
+ from scievo.tools import Tool, ToolRegistry
20
+
21
+ from .state import ExecAgentState
22
+
23
+ LLM_NAME = "experiment_execute"
24
+ LLM_MONITOR_NAME = "experiment_monitor"
25
+ AGENT_NAME = "experiment_exec"
26
+
27
+ BUILTIN_TOOLSETS = [
28
+ "state",
29
+ "exec", # The exec toolset is built-in for this agent
30
+ "fs",
31
+ ]
32
+ ALLOWED_TOOLSETS = [
33
+ "history",
34
+ ] # Can be extended if needed
35
+
36
+ MONITORING_INTERVALS = [5, 10, 10, 20, 20, 30, 45, 60, 60, 120, 120, 180] # in seconds
37
+
38
+ # load uv skill md
39
+ UV_SKILL = Path(__file__).parent.parent.parent.parent / "tools" / "skills" / "uv_venv_management.md"
40
+
41
+
42
+ def gateway_node(agent_state: ExecAgentState) -> ExecAgentState:
43
+ """Gateway node - placeholder for conditional routing logic"""
44
+ logger.trace("gateway_node of Agent {}", AGENT_NAME)
45
+ return agent_state
46
+
47
+
48
+ def gateway_conditional(agent_state: ExecAgentState) -> str:
49
+ """Determine the next node based on the last message"""
50
+ # compress history if needed
51
+ if (
52
+ constant.HISTORY_AUTO_COMPRESSION
53
+ and "history_compression" not in agent_state.node_history[-2:]
54
+ and agent_state.total_patched_tokens > constant.HISTORY_AUTO_COMPRESSION_TOKEN_THRESHOLD
55
+ ):
56
+ return "history_compression"
57
+
58
+ # Check if there's a command currently running in the session
59
+ if agent_state.is_monitor_mode:
60
+ # A command is running -> go to monitoring node
61
+ time2sleep = MONITORING_INTERVALS[
62
+ min(agent_state.monitoring_attempts, len(MONITORING_INTERVALS) - 1)
63
+ ]
64
+ logger.debug(
65
+ f"A command is currently running. Waiting for {time2sleep} seconds before monitoring again."
66
+ )
67
+ time.sleep(time2sleep)
68
+ return "monitoring"
69
+
70
+ last_msg = agent_state.patched_history[-1]
71
+
72
+ # If the last message contains tool calls, execute them
73
+ if (tool_calls := last_msg.tool_calls) and len(tool_calls) > 0:
74
+ return "tool_calling"
75
+
76
+ # Route based on message role
77
+ match last_msg.role:
78
+ case "user" | "tool":
79
+ # User or tool message -> call LLM
80
+ return "llm_chat"
81
+ case "assistant":
82
+ # Assistant responded without tool calls -> execution is complete, go to summary
83
+ return "summary"
84
+ case _:
85
+ raise ValueError(f"Unknown message role: {last_msg.role}")
86
+
87
+
88
+ def llm_chat_node(agent_state: ExecAgentState) -> ExecAgentState:
89
+ """LLM chat node - gets next action from the model"""
90
+ logger.debug("llm_chat_node of Agent {}", AGENT_NAME)
91
+ agent_state.add_node_history("llm_chat")
92
+
93
+ selected_state = {
94
+ "workspace": agent_state.workspace.working_dir,
95
+ "current_activated_toolsets": agent_state.toolsets,
96
+ }
97
+
98
+ # Update system prompt
99
+ system_prompt = PROMPTS.experiment_exec.exec_system_prompt.render(
100
+ state_text=wrap_dict_to_toon(selected_state),
101
+ toolsets_desc=ToolRegistry.get_toolsets_desc(BUILTIN_TOOLSETS + ALLOWED_TOOLSETS),
102
+ uv_skill=SKILLS.uv_skill,
103
+ )
104
+
105
+ # Construct tools
106
+ tools: dict[str, Tool] = {}
107
+ for toolset in agent_state.toolsets:
108
+ tools.update(ToolRegistry.get_toolset(toolset))
109
+ for toolset in BUILTIN_TOOLSETS:
110
+ tools.update(ToolRegistry.get_toolset(toolset))
111
+
112
+ # Get completion from LLM
113
+ msg = ModelRegistry.completion(
114
+ LLM_NAME,
115
+ agent_state.patched_history,
116
+ system_prompt=(
117
+ Message(role="system", content=system_prompt)
118
+ .with_log(cond=constant.LOG_SYSTEM_PROMPT)
119
+ .content
120
+ ),
121
+ agent_sender=AGENT_NAME,
122
+ tools=[tool.name for tool in tools.values()],
123
+ ).with_log()
124
+
125
+ agent_state.add_message(msg)
126
+
127
+ llm_output = (
128
+ msg.content
129
+ if msg.content
130
+ else ("[Tool calls: " + str(len(msg.tool_calls)) + "]" if msg.tool_calls else "[No output]")
131
+ )
132
+
133
+ agent_state.intermediate_state.append(
134
+ {
135
+ "node_name": "llm_chat",
136
+ "output": llm_output,
137
+ }
138
+ )
139
+
140
+ return agent_state
141
+
142
+
143
+ def monitoring_node(agent_state: ExecAgentState) -> ExecAgentState:
144
+ """Monitor a running command and decide whether to continue waiting or interrupt it"""
145
+ logger.debug("monitoring_node of Agent {}", AGENT_NAME)
146
+ agent_state.add_node_history("monitoring")
147
+ agent_state.monitoring_attempts += 1
148
+
149
+ if agent_state.monitoring_attempts <= len(MONITORING_INTERVALS):
150
+ total_monitoring_seconds = sum(MONITORING_INTERVALS[: agent_state.monitoring_attempts])
151
+ else:
152
+ total_monitoring_seconds = (
153
+ sum(MONITORING_INTERVALS)
154
+ + (agent_state.monitoring_attempts - len(MONITORING_INTERVALS))
155
+ * MONITORING_INTERVALS[-1]
156
+ )
157
+
158
+ # Get the current running command context
159
+ ctx = agent_state.session.get_current_context()
160
+ if ctx is None:
161
+ # No command running, this shouldn't happen but handle it gracefully
162
+ logger.warning("monitoring_node called but no command is running")
163
+ agent_state.monitoring_attempts = 0
164
+ agent_state.is_monitor_mode = False
165
+
166
+ agent_state.intermediate_state.append(
167
+ {
168
+ "node_name": "monitoring",
169
+ "output": "No command running - monitoring stopped",
170
+ }
171
+ )
172
+ return agent_state
173
+
174
+ # Get current output from the running command
175
+ current_output = ctx.get_input_output(max_length=32000)
176
+
177
+ if not agent_state.session.is_running_command():
178
+ # Command has completed while we were waiting
179
+ logger.debug("The monitored command has completed.")
180
+ agent_state.monitoring_attempts = 0
181
+ agent_state.is_monitor_mode = False
182
+
183
+ # Add monitoring end user prompt message
184
+ monitoring_end_user_msg = Message(
185
+ role="user",
186
+ content=PROMPTS.experiment_exec.monitoring_end_user_prompt.render(
187
+ command=ctx.command,
188
+ final_output=current_output,
189
+ error_text=ctx.get_error(),
190
+ total_monitoring_seconds=total_monitoring_seconds,
191
+ ),
192
+ agent_sender=AGENT_NAME,
193
+ ).with_log()
194
+ agent_state.add_message(monitoring_end_user_msg)
195
+
196
+ return agent_state
197
+
198
+ history = agent_state.patched_history.copy()
199
+ # Prepare monitoring prompt
200
+ monitoring_user_msg = Message(
201
+ role="user",
202
+ content=PROMPTS.experiment_exec.monitoring_user_prompt.render(
203
+ command=ctx.command,
204
+ monitoring_attempts=agent_state.monitoring_attempts,
205
+ current_output=current_output,
206
+ total_monitoring_seconds=total_monitoring_seconds,
207
+ ),
208
+ agent_sender=AGENT_NAME,
209
+ )
210
+ history.append(monitoring_user_msg)
211
+
212
+ # Ask monitoring LLM to decide
213
+ msg = ModelRegistry.completion(
214
+ LLM_MONITOR_NAME,
215
+ history,
216
+ system_prompt=(
217
+ Message(
218
+ role="system",
219
+ content=PROMPTS.experiment_exec.monitoring_system_prompt.render(),
220
+ )
221
+ .with_log(cond=constant.LOG_SYSTEM_PROMPT)
222
+ .content
223
+ ),
224
+ agent_sender=AGENT_NAME,
225
+ tools=None,
226
+ ).with_log()
227
+
228
+ class MonitorDecisionModel(BaseModel):
229
+ action: str
230
+
231
+ r = parse_json_from_llm_response(msg, MonitorDecisionModel) # just to validate JSON format
232
+
233
+ if "wait" in r.action.lower():
234
+ logger.debug("Monitoring decision: continue waiting for the command to complete.")
235
+ agent_state.is_monitor_mode = True
236
+ elif "ctrlc" in r.action.lower():
237
+ logger.debug("Monitoring decision: interrupting the running command.")
238
+ ctx.cancel()
239
+ logger.debug("Monitoring is interrupted. Command is cancelled.")
240
+ monitoring_ctrlc_user_msg = Message(
241
+ role="user",
242
+ content=PROMPTS.experiment_exec.monitoring_ctrlc_user_prompt.render(
243
+ command=ctx.command,
244
+ output_before_interrupt=current_output,
245
+ total_monitoring_seconds=total_monitoring_seconds,
246
+ ),
247
+ agent_sender=AGENT_NAME,
248
+ )
249
+ agent_state.add_message(monitoring_ctrlc_user_msg)
250
+ agent_state.is_monitor_mode = False
251
+ else:
252
+ logger.warning(
253
+ f"Unknown monitoring action '{r.action}' received. Continuing to wait by default."
254
+ )
255
+ agent_state.is_monitor_mode = True
256
+
257
+ monitoring_output = f"Monitoring attempt {agent_state.monitoring_attempts}, total time: {total_monitoring_seconds}s"
258
+ if ctx:
259
+ monitoring_output += f"\nCommand: {ctx.command if hasattr(ctx, 'command') else 'Unknown'}\nAction: {r.action}"
260
+
261
+ agent_state.intermediate_state.append(
262
+ {
263
+ "node_name": "monitoring",
264
+ "output": monitoring_output,
265
+ }
266
+ )
267
+
268
+ return agent_state
269
+
270
+
271
+ def summary_node(agent_state: ExecAgentState) -> ExecAgentState:
272
+ """Generate a summary of the experiment execution"""
273
+ logger.debug("summary_node of Agent {}", AGENT_NAME)
274
+ agent_state.add_node_history("summary")
275
+
276
+ # Construct a prompt to generate the summary
277
+ summary_prompt = Message(
278
+ role="user",
279
+ content=PROMPTS.experiment_exec.summary_user_prompt.render(),
280
+ agent_sender=AGENT_NAME,
281
+ )
282
+ agent_state.add_message(summary_prompt)
283
+
284
+ # Get summary from LLM
285
+ msg = ModelRegistry.completion(
286
+ LLM_NAME,
287
+ agent_state.patched_history,
288
+ system_prompt=(
289
+ Message(
290
+ role="system",
291
+ content=PROMPTS.experiment_exec.summary_system_prompt.render(),
292
+ )
293
+ .with_log(cond=constant.LOG_SYSTEM_PROMPT)
294
+ .content
295
+ ),
296
+ agent_sender=AGENT_NAME,
297
+ tools=None, # No tools needed for summary
298
+ ).with_log()
299
+
300
+ # Store the summary text
301
+ agent_state.execution_summary = msg.content or ""
302
+ agent_state.add_message(msg)
303
+
304
+ # Parse JSON summary from the response
305
+ try:
306
+
307
+ class ExecutionSummary(BaseModel):
308
+ status: str
309
+ commands_executed: list[str]
310
+ key_outputs: str
311
+ errors_issues: str
312
+
313
+ summary_dict = parse_json_from_llm_response(msg, ExecutionSummary)
314
+ agent_state.execution_summary_dict = summary_dict.model_dump()
315
+ except Exception as e:
316
+ logger.warning(f"Failed to parse execution summary as JSON: {e}")
317
+ # If JSON parsing fails, store the text response in a basic dict structure
318
+ agent_state.execution_summary_dict = {
319
+ "status": "Unknown",
320
+ "commands_executed": [],
321
+ "key_outputs": agent_state.execution_summary,
322
+ "errors_issues": str(e),
323
+ }
324
+
325
+ summary_output = (
326
+ json.dumps(agent_state.execution_summary_dict, indent=2)
327
+ if agent_state.execution_summary_dict
328
+ else agent_state.execution_summary
329
+ )
330
+
331
+ agent_state.intermediate_state.append(
332
+ {
333
+ "node_name": "summary",
334
+ "output": summary_output,
335
+ }
336
+ )
337
+
338
+ return agent_state
339
+
340
+
341
+ def tool_calling_node(agent_state: ExecAgentState) -> ExecAgentState:
342
+ """Execute tool calls from the last message"""
343
+ logger.debug("tool_calling_node of Agent {}", AGENT_NAME)
344
+ agent_state.add_node_history("tool_calling")
345
+
346
+ # Get the last message which contains tool calls
347
+ last_msg = agent_state.patched_history[-1]
348
+
349
+ if not last_msg.tool_calls:
350
+ raise ValueError("No tool calls found in the last message")
351
+
352
+ # Construct tools
353
+ tools: dict[str, Tool] = {}
354
+ for toolset in agent_state.toolsets:
355
+ tools.update(ToolRegistry.get_toolset(toolset))
356
+ for toolset in BUILTIN_TOOLSETS:
357
+ tools.update(ToolRegistry.get_toolset(toolset))
358
+
359
+ function_map = {tool.name: tool.func for tool in tools.values()}
360
+
361
+ # Execute each tool call
362
+ for tool_call in last_msg.tool_calls:
363
+ tool_name = tool_call.function.name
364
+
365
+ # Check if tool exists in function map
366
+ if tool_name not in function_map:
367
+ error_msg = f"Tool {tool_name} not found"
368
+ tool_response = {
369
+ "role": "tool",
370
+ "tool_name": tool_name,
371
+ "tool_call_id": tool_call.id,
372
+ "content": error_msg,
373
+ }
374
+ agent_state.add_message(Message(**tool_response).with_log())
375
+ continue
376
+
377
+ # Parse tool arguments
378
+ try:
379
+ args = json.loads(tool_call.function.arguments)
380
+ assert isinstance(args, dict)
381
+ except json.JSONDecodeError as e:
382
+ error_msg = f"Invalid JSON in tool arguments: {e}"
383
+ tool_response = {
384
+ "role": "tool",
385
+ "tool_name": tool_name,
386
+ "tool_call_id": tool_call.id,
387
+ "content": error_msg,
388
+ }
389
+ agent_state.add_message(Message(**tool_response).with_log())
390
+ continue
391
+ except AssertionError as e:
392
+ error_msg = f"Invalid tool arguments: {e}"
393
+ tool_response = {
394
+ "role": "tool",
395
+ "tool_name": tool_name,
396
+ "tool_call_id": tool_call.id,
397
+ "content": error_msg,
398
+ }
399
+ agent_state.add_message(Message(**tool_response).with_log())
400
+ continue
401
+
402
+ # Execute the tool
403
+ try:
404
+ func = function_map[tool_name]
405
+
406
+ # Check if function expects agent_state parameter
407
+ sig = inspect.signature(func)
408
+ if constant.__AGENT_STATE_NAME__ in sig.parameters:
409
+ args.update({constant.__AGENT_STATE_NAME__: agent_state})
410
+ if constant.__CTX_NAME__ in sig.parameters:
411
+ args.update({constant.__CTX_NAME__: {"current_agent": AGENT_NAME}})
412
+
413
+ # Execute the tool
414
+ result = func(**args)
415
+
416
+ # Create tool response message
417
+ tool_response = {
418
+ "role": "tool",
419
+ "tool_call_id": tool_call.id,
420
+ "tool_name": tool_name,
421
+ "content": str(result), # Ensure result is string
422
+ }
423
+
424
+ # if this is a long-running exec_command, check for monitoring flag
425
+ flag_text = "Try to check the execution status later."
426
+ if tool_name == "exec_command" and flag_text in tool_response["content"]:
427
+ logger.debug("The executed command is still running, entering monitor mode.")
428
+ assert (
429
+ agent_state.session.get_current_context() is not None
430
+ ), "Expected a current context when entering monitor mode"
431
+ # The command is still running, go into monitor mode in the next step
432
+ agent_state.is_monitor_mode = True
433
+
434
+ except Exception as e:
435
+ logger.exception(f"Tool {tool_name} execution failed")
436
+ error_msg = f"Tool {tool_name} execution failed: {e}"
437
+ tool_response = {
438
+ "role": "tool",
439
+ "tool_call_id": tool_call.id,
440
+ "tool_name": tool_name,
441
+ "content": error_msg,
442
+ }
443
+
444
+ agent_state.add_message(Message(**tool_response).with_log())
445
+
446
+ # Reset monitoring attempts after tool execution
447
+ agent_state.monitoring_attempts = 0
448
+
449
+ tool_results = []
450
+ for tool_call in last_msg.tool_calls:
451
+ tool_name = tool_call.function.name
452
+ for msg in reversed(agent_state.history):
453
+ if (
454
+ msg.role == "tool"
455
+ and hasattr(msg, "tool_call_id")
456
+ and msg.tool_call_id == tool_call.id
457
+ ):
458
+ tool_results.append(
459
+ {
460
+ "tool": tool_name,
461
+ "result": msg.content[:1000] if msg.content else "No result",
462
+ }
463
+ )
464
+ break
465
+
466
+ tool_output_parts = []
467
+ for tr in tool_results:
468
+ tool_output_parts.append(f"Tool: {tr['tool']}\nResult: {tr['result']}")
469
+
470
+ tool_output = "\n\n".join(tool_output_parts) if tool_output_parts else "No tool calls executed"
471
+
472
+ agent_state.intermediate_state.append(
473
+ {
474
+ "node_name": "tool_calling",
475
+ "output": tool_output,
476
+ }
477
+ )
478
+
479
+ return agent_state
480
+
481
+
482
+ def history_compression_node(agent_state: ExecAgentState) -> ExecAgentState:
483
+ logger.debug("history_compression_node of Agent {}", AGENT_NAME)
484
+
485
+ history_before = len(agent_state.history)
486
+ agent_state = history_compression.invoke_history_compression(agent_state)
487
+ history_after = len(agent_state.history)
488
+
489
+ compression_output = f"Compressed history: {history_before} -> {history_after} messages"
490
+ if agent_state.history_patches:
491
+ last_patch = agent_state.history_patches[-1]
492
+ if last_patch.patched_message and last_patch.patched_message.content:
493
+ compression_output = f"Compressed {last_patch.n_messages} messages into:\n{last_patch.patched_message.content[:500]}"
494
+
495
+ agent_state.intermediate_state.append(
496
+ {
497
+ "node_name": "history_compression",
498
+ "output": compression_output,
499
+ }
500
+ )
501
+
502
+ return agent_state
scievo/agents/experiment_agent/exec_subagent/state.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scievo.core.code_env import LocalEnv
2
+ from scievo.core.exec.manager import SessionManager
3
+ from scievo.core.exec.pty_session import LocalShellSession
4
+ from scievo.core.types import ExecState, HistoryState, ToolsetState
5
+
6
+
7
+ class ExecAgentState(ExecState, ToolsetState, HistoryState):
8
+ """State of the Experiment Execution Agent.
9
+
10
+ This agent is responsible for executing experiments in local shell sessions.
11
+ It combines:
12
+ - ToolsetState: for managing available toolsets
13
+ - HistoryState: for managing conversation history
14
+ """
15
+
16
+ # The natural language query describing what experiment to run (input)
17
+ user_query: str
18
+
19
+ # Current working directory where experiments are executed (input)
20
+ workspace: LocalEnv
21
+
22
+ # Coding summaries from previous revisions (input, optional)
23
+ # Used to provide context about code changes made in each revision
24
+ coding_summaries: list[str] | None = None
25
+
26
+ # Raw summary of the experiment execution, try to use `execution_summary_dict` instead (output)
27
+ execution_summary: str = ""
28
+
29
+ # Structured summary of the experiment execution (output)
30
+ # Should be:
31
+ # ```json
32
+ # {
33
+ # "status": "Success" or "Failed",
34
+ # "commands_executed": ["command 1", "command 2", ...],
35
+ # "key_outputs": "Highlight any important output or results",
36
+ # "errors_issues": "Note any errors or issues encountered, or 'None' if successful"
37
+ # }
38
+ # ```
39
+ execution_summary_dict: dict = {}
40
+
41
+ # Number of monitoring attempts for the current running command (internal use)
42
+ monitoring_attempts: int = 0
43
+
44
+ # Whether to force monitoring in the next step (internal use)
45
+ is_monitor_mode: bool = False
46
+
47
+ # Intermediate states
48
+ intermediate_state: list[dict] = []
49
+
50
+ def __init__(self, *args, **kwargs):
51
+ super().__init__(*args, **kwargs)
52
+ if self.session_id is None:
53
+ s = LocalShellSession(cwd=self.workspace.working_dir)
54
+ # Store session ID instead of the session instance
55
+ self.session_id = s.session_id
56
+ # add initial toolset
57
+ self.toolsets.append("exec")
scievo/agents/experiment_agent/execute.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Execution nodes for the Experiment Agent.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ from typing import Literal
8
+
9
+ from dotenv import load_dotenv
10
+ from loguru import logger
11
+ from pydantic import BaseModel
12
+
13
+ from scievo.core import constant
14
+ from scievo.core.llms import ModelRegistry
15
+ from scievo.core.types import Message
16
+ from scievo.core.utils import parse_json_from_llm_response
17
+ from scievo.prompts import PROMPTS
18
+
19
+ from .exec_subagent import build as exec_build
20
+ from .exec_subagent.state import ExecAgentState
21
+ from .state import ExperimentAgentState
22
+ from .summary_subagent import build as summary_build
23
+ from .summary_subagent.state import SummaryAgentState
24
+
25
# Name stamped on outgoing Messages produced by this agent.
AGENT_NAME = "experiment_agent"
# Registry key passed to ModelRegistry.completion for this agent's LLM calls.
LLM_NAME = "experiment_agent"

# Load .env before reading configuration from the process environment.
load_dotenv()
# Which coding subagent implementation to import: "v2" (OpenHands) or "v3" (Claude).
CODING_AGENT_VERSION = os.getenv("CODING_AGENT_VERSION", "v3")  # default to Claude (v3)
# OpenHands support is opt-in; any of the truthy spellings below enables it.
_OPENHANDS_ENABLED = os.getenv("SCIEVO_ENABLE_OPENHANDS", "").strip().lower() in {
    "1",
    "true",
    "yes",
    "y",
}
# Select the coding subagent implementation at import time; the chosen
# `coding_build` / `CodingAgentState` names are used by the rest of this module.
match CODING_AGENT_VERSION:
    case "v2":
        # v2 depends on OpenHands; fail fast with an actionable message if disabled.
        if not _OPENHANDS_ENABLED:
            raise RuntimeError(
                "CODING_AGENT_VERSION=v2 requires OpenHands, but OpenHands is disabled.\n"
                "Hint: set `CODING_AGENT_VERSION=v3` to use the Claude coding agent, or enable OpenHands with "
                "`SCIEVO_ENABLE_OPENHANDS=1`."
            )
        from .coding_subagent_v2 import build as coding_build
        from .coding_subagent_v2.state import CodingAgentState
    case "v3":
        from .coding_subagent_v3_claude import build as coding_build
        from .coding_subagent_v3_claude.state import CodingAgentState
    case _:
        raise ValueError(f"Unsupported CODING_AGENT_VERSION: {CODING_AGENT_VERSION}")

# Compile sub-agent graphs as global variables (once, at import time); the node
# functions below invoke them statelessly per call.
coding_graph = coding_build().compile()
exec_graph = exec_build().compile()
summary_graph = summary_build().compile()
56
+
57
+
58
def init_node(agent_state: ExperimentAgentState) -> ExperimentAgentState:
    """Prepare the experiment agent for its first loop.

    Renders the initial context prompt (data summary + user query + repo
    source) and records it both in the conversation history and in the
    intermediate-state trace. The repo_source itself is consumed later by
    the coding subagent, which handles git cloning and workspace setup.

    Args:
        agent_state: Shared experiment-agent state to initialize.

    Returns:
        The same state object, mutated in place.
    """
    logger.info("Initializing Experiment Agent")
    agent_state.current_phase = "init"

    # Render the opening context message for the conversation history.
    rendered_prompt = PROMPTS.experiment_agent.init_prompt.render(
        data_summary=agent_state.data_summary,
        user_query=agent_state.user_query,
        repo_source=agent_state.repo_source or "Not specified",
    )
    opening_message = Message(
        role="user",
        content=rendered_prompt,
        agent_sender=AGENT_NAME,
    ).with_log()
    agent_state.add_message(opening_message)

    # Trace this node's output for later inspection.
    agent_state.intermediate_state.append(
        {"node_name": "init", "output": opening_message.content}
    )

    return agent_state
88
+
89
+
90
def run_coding_subagent(agent_state: ExperimentAgentState) -> ExperimentAgentState:
    """Invoke the Coding Subagent once (stateless invocation).

    The coding subagent receives repo_source and handles git cloning and
    workspace setup internally. By default this uses the Claude Agent
    SDK/Claude Code path (v3).

    Args:
        agent_state: Shared experiment-agent state.

    Returns:
        The same state object, with history, loop_results, and
        intermediate_state updated in place.
    """
    logger.info(f"Running Coding Subagent (revision {agent_state.current_revision})")
    agent_state.current_phase = "coding"

    # Feedback from earlier revisions, oldest first (1-based revision numbers).
    feedback_entries = [
        {"revision_number": idx + 1, "summary": text}
        for idx, text in enumerate(agent_state.revision_summaries or [])
    ]

    # Coding summaries produced by earlier loops (0-based revision index).
    earlier_coding = [
        {"revision": idx, "summary": loop.get("coding_summary", "")}
        for idx, loop in enumerate(agent_state.loop_results)
        if loop.get("coding_summary", "")
    ]

    # Accumulated cross-revision analysis, if any.
    analysis_so_far = agent_state.revision_analysis or "No previous analysis yet."

    # Build the subagent's user query from the prompt template.
    coding_query = PROMPTS.experiment_agent.coding_subagent_query_prompt.render(
        user_query=agent_state.user_query,
        repo_source=agent_state.repo_source or "Not specified",
        # TODO: limit to last revision and coding summary for now
        revision_feedback_list=feedback_entries[-1:],
        previous_coding_summaries=earlier_coding[-1:],
        revision_analysis=analysis_so_far,
        current_revision=agent_state.current_revision,
    )

    subagent_input = CodingAgentState(
        data_summary=agent_state.data_summary,  # keep data_summary separate
        user_query=coding_query,
        workspace=agent_state.workspace,
    )

    # Stateless call; graph.invoke returns a plain dict, not a state object.
    invoke_result = coding_graph.invoke(subagent_input)

    # Merge back only what the parent state needs.
    agent_state.history = invoke_result["history"]

    # `output_summary` may be absent/empty; fall back to a placeholder.
    coding_summary = invoke_result.get("output_summary") or "No summary available"

    last_loop = agent_state.loop_results[-1] if agent_state.loop_results else None
    if last_loop is not None and last_loop.get("revision") == agent_state.current_revision:
        # A record for this revision already exists; update it.
        last_loop["coding_summary"] = coding_summary
    else:
        agent_state.loop_results.append(
            {"revision": agent_state.current_revision, "coding_summary": coding_summary}
        )

    # Optionally append the subagent's own trace (truncated per entry).
    coding_output = coding_summary
    if isinstance(invoke_result, dict) and "intermediate_state" in invoke_result:
        sub_trace = invoke_result.get("intermediate_state", [])
        if sub_trace:
            trace_lines = "\n".join(
                f"{entry.get('node_name', 'unknown')}: {entry.get('output', '')[:200]}"
                for entry in sub_trace
            )
            coding_output = (
                f"{coding_summary}\n\n[Coding Subagent Intermediate States]\n" + trace_lines
            )

    agent_state.intermediate_state.append(
        {"node_name": "run_coding_subagent", "output": coding_output}
    )

    return agent_state
181
+
182
+
183
def run_exec_subagent(agent_state: ExperimentAgentState) -> ExperimentAgentState:
    """Invoke the Exec Subagent once (stateless invocation).

    The workspace is handed over directly; coding summaries collected from
    prior loops provide context about what code was changed.

    Args:
        agent_state: Shared experiment-agent state.

    Returns:
        The same state object, with history, all_execution_results,
        loop_results, and intermediate_state updated in place.
    """
    logger.info(f"Running Exec Subagent (revision {agent_state.current_revision})")
    agent_state.current_phase = "exec"

    # Every non-empty coding summary recorded so far, in loop order.
    summaries = [
        loop["coding_summary"]
        for loop in agent_state.loop_results
        if loop.get("coding_summary")
    ]

    subagent_input = ExecAgentState(
        user_query="Run the modified code/experiments and verify the output.",
        workspace=agent_state.workspace,
        coding_summaries=summaries or None,
        toolsets=["exec"],
    )

    # Stateless call; graph.invoke returns a plain dict, not a state object.
    invoke_result = exec_graph.invoke(subagent_input)

    # Merge back only what the parent state needs.
    agent_state.history = invoke_result["history"]
    exec_summary = invoke_result["execution_summary_dict"]
    agent_state.all_execution_results.append(exec_summary)

    # Attach the exec result to this revision's loop record, if present.
    last_loop = agent_state.loop_results[-1] if agent_state.loop_results else None
    if last_loop is not None and last_loop.get("revision") == agent_state.current_revision:
        last_loop["exec_result"] = exec_summary

    # Optionally append the subagent's own trace (truncated per entry).
    exec_output = json.dumps(exec_summary, indent=2)
    if isinstance(invoke_result, dict) and "intermediate_state" in invoke_result:
        sub_trace = invoke_result.get("intermediate_state", [])
        if sub_trace:
            trace_lines = "\n".join(
                f"{entry.get('node_name', 'unknown')}: {entry.get('output', '')[:200]}"
                for entry in sub_trace
            )
            exec_output = f"{exec_output}\n\n[Exec Subagent Intermediate States]\n" + trace_lines

    agent_state.intermediate_state.append(
        {"node_name": "run_exec_subagent", "output": exec_output}
    )

    return agent_state
239
+
240
+
241
def run_summary_subagent(agent_state: ExperimentAgentState) -> ExperimentAgentState:
    """Invoke the Summary Subagent once (stateless invocation).

    The subagent receives a copy of the conversation history and the
    workspace, and returns a textual summary of the revision.

    Args:
        agent_state: Shared experiment-agent state.

    Returns:
        The same state object, with history, revision_summaries,
        loop_results, and intermediate_state updated in place.
    """
    logger.info(f"Running Summary Subagent (revision {agent_state.current_revision})")
    agent_state.current_phase = "summary"

    subagent_input = SummaryAgentState(
        workspace=agent_state.workspace,
        history=agent_state.history.copy(),
        output_path=None,  # or specify a path to persist the summary
        toolsets=["fs"],
    )

    # Stateless call; graph.invoke returns a plain dict, not a state object.
    invoke_result = summary_graph.invoke(subagent_input)

    # Merge back only what the parent state needs.
    agent_state.history = invoke_result["history"]
    summary_text = invoke_result["summary_text"]
    agent_state.revision_summaries.append(summary_text)

    # Attach the summary to this revision's loop record, if present.
    last_loop = agent_state.loop_results[-1] if agent_state.loop_results else None
    if last_loop is not None and last_loop.get("revision") == agent_state.current_revision:
        last_loop["summary"] = summary_text

    # Optionally append the subagent's own trace (truncated per entry).
    summary_output = summary_text
    if isinstance(invoke_result, dict) and "intermediate_state" in invoke_result:
        sub_trace = invoke_result.get("intermediate_state", [])
        if sub_trace:
            trace_lines = "\n".join(
                f"{entry.get('node_name', 'unknown')}: {entry.get('output', '')[:200]}"
                for entry in sub_trace
            )
            summary_output = (
                f"{summary_output}\n\n[Summary Subagent Intermediate States]\n" + trace_lines
            )

    agent_state.intermediate_state.append(
        {"node_name": "run_summary_subagent", "output": summary_output}
    )

    return agent_state
292
+
293
+
294
def analysis_node(agent_state: ExperimentAgentState) -> ExperimentAgentState:
    """Analyze the current loop results and generate insights.

    Uses an LLM to analyze what went wrong, what succeeded, and what needs
    improvement. The analysis is accumulated across revisions in
    ``agent_state.revision_analysis`` (separated by horizontal rules) and
    also persisted, best-effort, to a markdown file under the workspace.

    Args:
        agent_state: Shared experiment-agent state.

    Returns:
        The same state object, mutated in place.
    """
    logger.info(f"Analyzing loop results for revision {agent_state.current_revision}")
    agent_state.current_phase = "analysis"

    # Most recent loop record (coding summary / exec result / summary).
    current_loop = agent_state.loop_results[-1] if agent_state.loop_results else {}

    # Use LLM to analyze the loop
    analysis_prompt = PROMPTS.experiment_agent.analysis_prompt.render(
        revision_number=agent_state.current_revision,
        coding_summary=current_loop.get("coding_summary", "No coding summary available"),
        exec_result=json.dumps(current_loop.get("exec_result", {}), indent=2),
        summary=current_loop.get("summary", "No summary available"),
        previous_analysis=agent_state.revision_analysis or "No previous analysis.",
        user_query=agent_state.user_query,
    )

    response = ModelRegistry.completion(
        LLM_NAME,
        [Message(role="user", content=analysis_prompt)],
        system_prompt=(
            Message(
                role="system",
                content=PROMPTS.experiment_agent.analysis_system_prompt.render(),
            )
            .with_log(cond=constant.LOG_SYSTEM_PROMPT)
            .content
        ),
        agent_sender=AGENT_NAME,
    )

    # Accumulate analysis across revisions. The section header is built once
    # (previously duplicated across both branches).
    analysis_text = response.content
    section = f"## Revision {agent_state.current_revision} Analysis\n{analysis_text}"
    if agent_state.revision_analysis:
        agent_state.revision_analysis += f"\n\n---\n\n{section}"
    else:
        agent_state.revision_analysis = section

    # Persist the analysis to disk (best-effort; failure only logs a warning).
    # Fix: the original re-imported `os` inside this try block even though it
    # is already imported at module level.
    try:
        analysis_dir = os.path.join(agent_state.workspace.working_dir, "experiment_analyses")
        os.makedirs(analysis_dir, exist_ok=True)

        analysis_file = os.path.join(
            analysis_dir, f"revision_{agent_state.current_revision}_analysis.md"
        )

        with open(analysis_file, "w", encoding="utf-8") as f:
            f.write(f"# Revision {agent_state.current_revision} Analysis\n\n")
            f.write(analysis_text)

        logger.info(f"Analysis saved to {analysis_file}")
    except Exception as e:
        logger.warning(f"Failed to save analysis to file: {e}")

    logger.debug(f"Analysis for revision {agent_state.current_revision} completed")

    agent_state.intermediate_state.append(
        {
            "node_name": "analysis",
            "output": analysis_text,
        }
    )

    return agent_state
370
+
371
+
372
def revision_judge_node(agent_state: ExperimentAgentState) -> ExperimentAgentState:
    """Judge whether a revision is needed based on the summary.

    This node analyzes the experiment summary and decides:
    1. COMPLETE - Experiment succeeded, no more revisions needed
    2. CONTINUE - Issues found, need another revision loop
    3. COMPLETE (max_revisions) - Hit max revisions limit

    Side effects: sets ``final_status`` when stopping, increments
    ``current_revision`` and appends a feedback message to history when
    continuing, and always appends a trace entry to ``intermediate_state``.
    """
    logger.info("Revision Judge evaluating results")
    agent_state.current_phase = "judge"

    # Hard stop: hit the revision cap (current_revision is 0-based).
    if agent_state.current_revision >= agent_state.max_revisions - 1:
        logger.warning("Max revisions reached")
        agent_state.final_status = "max_revisions_reached"
        judge_output = "Max revisions reached - stopping"
        agent_state.intermediate_state.append(
            {
                "node_name": "revision_judge",
                "output": judge_output,
            }
        )
        return agent_state

    # Get the latest summary (placeholder text if no summary was produced).
    latest_summary = (
        agent_state.revision_summaries[-1]
        if agent_state.revision_summaries
        else "No summary available"
    )
    exec_result = agent_state.all_execution_results[-1] if agent_state.all_execution_results else {}

    # Use LLM to judge whether revision is needed (with accumulated analysis).
    judge_prompt = PROMPTS.experiment_agent.judge_prompt.render(
        latest_summary=latest_summary,
        exec_result=json.dumps(exec_result, indent=2),
        user_query=agent_state.user_query,
        revision_analysis=agent_state.revision_analysis or "No analysis available.",
    )

    response = ModelRegistry.completion(
        LLM_NAME,
        [Message(role="user", content=judge_prompt)],
        system_prompt=(
            Message(
                role="system",
                content=PROMPTS.experiment_agent.judge_system_prompt.render(),
            )
            .with_log(cond=constant.LOG_SYSTEM_PROMPT)
            .content
        ),
        agent_sender=AGENT_NAME,
    )

    class JudgeDecisionModel(BaseModel):
        """Schema for the judge's structured JSON decision."""

        decision: str  # "COMPLETE" or "CONTINUE"
        reason: str
        issues_to_fix: list[str] = []

    # Default trace output is the raw LLM response (used if parsing fails
    # before judge_output is reassigned below).
    judge_output = response.content
    try:
        result = parse_json_from_llm_response(response, JudgeDecisionModel)

        if result.decision == "COMPLETE":
            logger.info("Revision judge decided: COMPLETE")
            agent_state.final_status = "success"
            judge_output = f"Decision: COMPLETE\nReason: {result.reason}"
        else:
            # Anything other than "COMPLETE" is treated as CONTINUE.
            logger.info(f"Revision judge decided: CONTINUE - {result.reason}")
            # Prepare for next revision: bump the counter FIRST so the
            # feedback prompt below renders the upcoming attempt number.
            agent_state.current_revision += 1
            # Add feedback to history for next coding iteration
            feedback_msg = Message(
                role="user",
                content=PROMPTS.experiment_agent.revision_feedback_prompt.render(
                    attempt_number=agent_state.current_revision + 1,
                    reason=result.reason,
                    issues_to_fix=result.issues_to_fix,
                ),
                agent_sender=AGENT_NAME,
            )
            agent_state.add_message(feedback_msg)
            judge_output = f"Decision: CONTINUE\nReason: {result.reason}\nIssues to fix: {result.issues_to_fix}"
    except Exception as e:
        # Fail-safe: an unparseable judge response ends the loop as "success"
        # rather than looping forever. NOTE(review): deliberate best-effort —
        # confirm "success" (vs. a distinct error status) is the intended
        # terminal state here.
        logger.error(f"Error parsing judge response: {e}")
        agent_state.final_status = "success"
        judge_output = f"Error parsing judge response: {e}"

    agent_state.intermediate_state.append(
        {
            "node_name": "revision_judge",
            "output": judge_output,
        }
    )

    return agent_state
471
+
472
+
473
def should_continue_revision(
    agent_state: ExperimentAgentState,
) -> Literal["continue", "complete"]:
    """Route after the revision judge: loop again or finish.

    The judge leaves ``final_status`` unset (None) when another revision is
    required; any non-None status means the run should proceed to finalize.
    """
    return "continue" if agent_state.final_status is None else "complete"
480
+
481
+
482
+ def finalize_node(agent_state: ExperimentAgentState) -> ExperimentAgentState:
483
+ """Finalize the experiment and prepare output."""
484
+ logger.info("Finalizing Experiment Agent")
485
+ agent_state.current_phase = "complete"
486
+
487
+ # Compile final summary
488
+ exec_results_text = json.dumps(agent_state.all_execution_results, indent=2)
489
+
490
+ agent_state.final_summary = f"""# Experiment Complete
491
+
492
+ ## Status: {agent_state.final_status}
493
+
494
+ ## Total Revisions: {agent_state.current_revision + 1}
495
+
496
+ ## Final Summary
497
+ {agent_state.revision_summaries[-1] if agent_state.revision_summaries else 'No summary generated'}
498
+
499
+ ## Accumulated Analysis
500
+ {agent_state.revision_analysis or 'No analysis available'}
501
+
502
+ ## All Execution Results
503
+ {exec_results_text}
504
+ """
505
+
506
+ agent_state.intermediate_state.append(
507
+ {
508
+ "node_name": "finalize",
509
+ "output": agent_state.final_summary,
510
+ }
511
+ )
512
+
513
+ return agent_state