Add files using upload-large-folder tool
Browse files — This view is limited to 50 files because it contains too many changes. See raw diff
- .github/workflows/ci.yml +51 -0
- .gitignore +60 -0
- .gitmodules +3 -0
- LICENSE +201 -0
- README.md +388 -0
- benchmarks/kernelbench/__init__.py +0 -0
- benchmarks/kernelbench/requirements.txt +4 -0
- docs/.gitignore +3 -0
- docs/README.md +13 -0
- docs/app/api/search/route.ts +6 -0
- docs/app/docs/layout.tsx +11 -0
- docs/app/global.css +3 -0
- docs/app/page.tsx +22 -0
- docs/content/docs/getting-started/index.mdx +11 -0
- docs/content/docs/getting-started/installation.mdx +51 -0
- docs/content/docs/getting-started/meta.json +4 -0
- docs/content/docs/meta.json +7 -0
- docs/lib/source.ts +7 -0
- docs/mdx-components.tsx +18 -0
- docs/next-env.d.ts +6 -0
- docs/next.config.mjs +19 -0
- docs/package-lock.json +0 -0
- docs/package.json +30 -0
- docs/postcss.config.mjs +5 -0
- docs/source.config.ts +7 -0
- docs/tsconfig.json +36 -0
- examples/text_similarity/config.yaml +41 -0
- examples/text_similarity/evaluator/Dockerfile +9 -0
- examples/text_similarity/evaluator/evaluate.sh +5 -0
- examples/text_similarity/evaluator/evaluator.py +48 -0
- examples/text_similarity/evaluator/pairs.json +51 -0
- examples/text_similarity/initial_program.py +34 -0
- scripts/reproduce/adrs.sh +66 -0
- scripts/reproduce/ale_bench.sh +63 -0
- scripts/reproduce/arc.sh +45 -0
- scripts/reproduce/frontier_cs.sh +52 -0
- scripts/reproduce/gpu.sh +58 -0
- scripts/reproduce/math.sh +77 -0
- scripts/reproduce/prompt_opt.sh +45 -0
- scripts/reproduce/run_all.sh +18 -0
- scripts/run_cp.sh +20 -0
- setup.py +3 -0
- skydiscover/README.md +50 -0
- skydiscover/__init__.py +19 -0
- skydiscover/_version.py +3 -0
- skydiscover/api.py +296 -0
- skydiscover/benchmarks/__init__.py +5 -0
- skydiscover/benchmarks/base.py +48 -0
- skydiscover/benchmarks/resolution.py +38 -0
- skydiscover/cli.py +327 -0
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
pull_request:
|
| 7 |
+
|
| 8 |
+
concurrency:
|
| 9 |
+
group: ${{ github.workflow }}-${{ github.ref }}
|
| 10 |
+
cancel-in-progress: true
|
| 11 |
+
|
| 12 |
+
jobs:
|
| 13 |
+
lint:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
timeout-minutes: 5
|
| 16 |
+
steps:
|
| 17 |
+
- uses: actions/checkout@v4
|
| 18 |
+
- uses: astral-sh/setup-uv@v4
|
| 19 |
+
with:
|
| 20 |
+
python-version: "3.10"
|
| 21 |
+
enable-cache: true
|
| 22 |
+
- run: uv sync --frozen --extra dev
|
| 23 |
+
- run: uv run black --check skydiscover/
|
| 24 |
+
- run: uv run isort --check skydiscover/
|
| 25 |
+
|
| 26 |
+
test:
|
| 27 |
+
runs-on: ubuntu-latest
|
| 28 |
+
timeout-minutes: 10
|
| 29 |
+
steps:
|
| 30 |
+
- uses: actions/checkout@v4
|
| 31 |
+
- uses: astral-sh/setup-uv@v4
|
| 32 |
+
with:
|
| 33 |
+
python-version: "3.10"
|
| 34 |
+
enable-cache: true
|
| 35 |
+
- run: uv sync --frozen --extra dev
|
| 36 |
+
- name: Smoke test — package imports cleanly
|
| 37 |
+
run: uv run python -c "from skydiscover import Runner, run_discovery, discover_solution, __version__; print(f'skydiscover {__version__} OK')"
|
| 38 |
+
- name: Run tests
|
| 39 |
+
run: uv run pytest tests/ -v
|
| 40 |
+
|
| 41 |
+
build:
|
| 42 |
+
runs-on: ubuntu-latest
|
| 43 |
+
timeout-minutes: 5
|
| 44 |
+
needs: [lint, test]
|
| 45 |
+
steps:
|
| 46 |
+
- uses: actions/checkout@v4
|
| 47 |
+
- uses: astral-sh/setup-uv@v4
|
| 48 |
+
with:
|
| 49 |
+
python-version: "3.10"
|
| 50 |
+
enable-cache: true
|
| 51 |
+
- run: uv build
|
.gitignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
*.egg-info/
|
| 7 |
+
*.egg
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
.eggs/
|
| 11 |
+
|
| 12 |
+
# Virtual environments
|
| 13 |
+
.venv/
|
| 14 |
+
venv/
|
| 15 |
+
env/
|
| 16 |
+
|
| 17 |
+
# IDE
|
| 18 |
+
.idea/
|
| 19 |
+
.vscode/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
.claude/
|
| 23 |
+
|
| 24 |
+
# OS
|
| 25 |
+
.DS_Store
|
| 26 |
+
|
| 27 |
+
# Testing
|
| 28 |
+
.pytest_cache/
|
| 29 |
+
.coverage
|
| 30 |
+
htmlcov/
|
| 31 |
+
|
| 32 |
+
# Secrets
|
| 33 |
+
.env
|
| 34 |
+
secrets.yaml
|
| 35 |
+
|
| 36 |
+
# Logs & outputs
|
| 37 |
+
*.log
|
| 38 |
+
*.jsonl
|
| 39 |
+
output*/
|
| 40 |
+
outputs*/
|
| 41 |
+
outputs_*/
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Benchmark generated data
|
| 45 |
+
benchmarks/image_gen/sky_festival/sky_festival_output/
|
| 46 |
+
benchmarks/image_gen/sky_festival/sky_festival_paradigm_output_*/
|
| 47 |
+
benchmarks/frontier-cs-eval/Frontier-CS
|
| 48 |
+
benchmarks/ADRS/eplb/expert-load.json
|
| 49 |
+
benchmarks/ADRS/cloudcast/profiles/
|
| 50 |
+
benchmarks/ADRS/cloudcast/examples/
|
| 51 |
+
benchmarks/ADRS/llm_sql/datasets/
|
| 52 |
+
|
| 53 |
+
# Generated test outputs (re-generate with test_all_benchmarks.sh)
|
| 54 |
+
tests/**/test_outputs_*/
|
| 55 |
+
|
| 56 |
+
# Evaluation run outputs
|
| 57 |
+
eval_runs/
|
| 58 |
+
|
| 59 |
+
# Local documentation
|
| 60 |
+
tasks/
|
.gitmodules
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[submodule "benchmarks/ale_bench/ALE-Bench"]
|
| 2 |
+
path = benchmarks/ale_bench/ALE-Bench
|
| 3 |
+
url = https://github.com/SakanaAI/ALE-Bench.git
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright 2025 SkyDiscover Team
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
README.md
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<img src="assets/logo_vector.png" height="80" alt="SkyDiscover logo" style="vertical-align: middle;">
|
| 3 |
+
|
| 4 |
+
<b>SkyDiscover</b>
|
| 5 |
+
</h1>
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
<p align="center"> A Flexible Framework for AI-Driven Scientific and Algorithmic Discovery</p>
|
| 9 |
+
<p align="center">
|
| 10 |
+
<a href="https://skydiscover-ai.github.io/blog.html"><img src="https://img.shields.io/badge/blog-SkyDiscover-orange?style=flat-square" alt="Blog" /></a>
|
| 11 |
+
<a href="https://arxiv.org/abs/2602.20133"><img src="https://img.shields.io/badge/paper-AdaEvolve-red?style=flat-square" alt="AdaEvolve Paper" /></a>
|
| 12 |
+
<a href="https://arxiv.org/abs/2602.23413"><img src="https://img.shields.io/badge/paper-EvoX-lightblue?style=flat-square" alt="EvoX Paper" /></a>
|
| 13 |
+
<a href="LICENSE"><img src="https://img.shields.io/badge/license-Apache--2.0-green?style=flat-square" alt="License" /></a>
|
| 14 |
+
</p>
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
<p align="center">
|
| 19 |
+
<img src="assets/architecture.png" width="720" alt="SkyDiscover architecture"><br>
|
| 20 |
+
</p>
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
**SkyDiscover** is a modular framework for AI-driven scientific and algorithmic discovery, providing a unified interface for implementing, running, and fairly comparing discovery algorithms across 200+ optimization tasks.
|
| 24 |
+
|
| 25 |
+
SkyDiscover introduces two new adaptive optimization algorithms:
|
| 26 |
+
|
| 27 |
+
- **[AdaEvolve](https://arxiv.org/abs/2602.20133)**, which dynamically adjusts its optimization behavior based on observed progress.
|
| 28 |
+
- **[EvoX](https://arxiv.org/abs/2602.23413)**, which dynamically evolves the optimization (evolution) strategy itself using LLMs on the fly.
|
| 29 |
+
|
| 30 |
+
SkyDiscover can also run OpenEvolve, ShinkaEvolve, and GEPA directly from their own source code, so you can quickly benchmark against them. In addition, it hosts native versions of OpenEvolve and GEPA, built on the modular interface, as the `openevolve_native` and `gepa_native` algorithms.
|
| 31 |
+
|
| 32 |
+
SkyDiscover natively supports [Harbor](https://harborframework.com/)-format benchmarks, so you can run external benchmark suites out of the box, including [AlgoTune](https://github.com/oripress/AlgoTune), [EvoEval](https://github.com/evo-eval/evoeval), [HumanEvalFix](https://github.com/bigcode-project/octopack), [BigCodeBench](https://github.com/bigcode-project/bigcodebench), [LiveCodeBench](https://livecodebench.github.io/), [USACO](https://usaco.org/), [CRUSTBench](https://github.com/AInfinity/CRUSTBench), and [CodePDE](https://github.com/).
|
| 33 |
+
> 🚧 This project is under active development.
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## 🏆 Benchmark Performance
|
| 38 |
+
|
| 39 |
+
Across ~200 optimization benchmarks, AdaEvolve and EvoX achieve the strongest open-source results: matching or exceeding AlphaEvolve and human SOTA, and outperforming OpenEvolve, GEPA, and ShinkaEvolve under identical generation budgets.
|
| 40 |
+
|
| 41 |
+
- **Frontier-CS (172 problems)**: ~34% median score improvement over OpenEvolve, GEPA, and ShinkaEvolve
|
| 42 |
+
- **Math + Systems Optimization (14 tasks evaluated)**: Matches or exceeds AlphaEvolve and human-designed SOTA on 6/6 systems and 6/8 math tasks
|
| 43 |
+
- **Real-world systems impact**: 41% lower cross-cloud transfer cost, 14% better GPU load balance for MoE serving, and 29% lower KV-cache pressure via GPU model placement
|
| 44 |
+
|
| 45 |
+
<p align="center">
|
| 46 |
+
<img src="assets/benchmarks.png" width="900" alt="SkyDiscover benchmarks">
|
| 47 |
+
</p>
|
| 48 |
+
|
| 49 |
+
<details>
|
| 50 |
+
<summary><b>📊 Complete results of AdaEvolve and EvoX (100 iterations)</b></summary>
|
| 51 |
+
|
| 52 |
+
> AdaEvolve and EvoX are **complementary**: AdaEvolve adapts search *parameters* for fast early gains; EvoX evolves the search *strategy itself* for stronger long-horizon gains. Both are built on SkyDiscover.
|
| 53 |
+
|
| 54 |
+
<p align="center">
|
| 55 |
+
<img src="assets/comparison.png" width="900" alt="Main results for systems and math problems">
|
| 56 |
+
</p>
|
| 57 |
+
|
| 58 |
+
</details>
|
| 59 |
+
|
| 60 |
+
<details>
|
| 61 |
+
<summary><b>📈 Scaling behavior of AdaEvolve and EvoX</b></summary>
|
| 62 |
+
|
| 63 |
+
The scaling behavior of AdaEvolve and EvoX shows a **complementary crossover**. AdaEvolve's per-iteration parameter adaptation yields fast early gains in low-budget runs (T≤50), while EvoX's demand-driven strategy evolution unlocks step-change improvements in longer runs (T≥50).
|
| 64 |
+
|
| 65 |
+
<p align="center">
|
| 66 |
+
<img src="assets/scaling_comparison.png" width="900" alt="Scaling behavior of AdaEvolve vs EvoX across 500 iterations">
|
| 67 |
+
<br><em>Best-so-far score vs. iteration for Signal Processing, Heilbronn Convex, Prism, and Cloudcast (500 iterations, GPT-5).</em>
|
| 68 |
+
</p>
|
| 69 |
+
|
| 70 |
+
</details>
|
| 71 |
+
|
| 72 |
+
<details>
|
| 73 |
+
<summary><b>🔗 Evolving AdaEvolve's policy with EvoX (coming soon)</b></summary>
|
| 74 |
+
|
| 75 |
+
The two methods are **composable**: EvoX can evolve using AdaEvolve as its starting strategy, achieving the best results on 3 out of 4 benchmarks (100 iterations, GPT-5). This combined mode will be available in SkyDiscover soon.
|
| 76 |
+
|
| 77 |
+
| Benchmark | AdaEvolve | EvoX (Random Init) | EvoX (AdaEvolve Init) |
|
| 78 |
+
|:--|--:|--:|--:|
|
| 79 |
+
| Signal Proc. (↑) | 0.718 | 0.721 | **0.760** |
|
| 80 |
+
| Heilbronn Cvx. (↑) | 0.0290 | 0.0270 | **0.0291** |
|
| 81 |
+
| Cloudcast (↓) | 640.5 | 637.1 | **623.4** |
|
| 82 |
+
| Prism (↑) | 26.37 | **30.52** | 26.27 |
|
| 83 |
+
|
| 84 |
+
</details>
|
| 85 |
+
|
| 86 |
+
<details>
|
| 87 |
+
<summary><b>Task breakdown across math, systems, and programming challenges</b></summary>
|
| 88 |
+
|
| 89 |
+
| | Benchmark | Domain | Tasks | Description |
|
| 90 |
+
|-|-----------|--------|------:|-------------|
|
| 91 |
+
| 🔢 | [math/](benchmarks/math/) | Math | 14 | Circle packing, Erdos problems, geometric optimization |
|
| 92 |
+
| 🖥️ | [ADRS/](benchmarks/ADRS/) | Systems | 5 | Cloud scheduling, load balancing, MoE expert placement |
|
| 93 |
+
| ⚡ | [gpu_mode/](benchmarks/gpu_mode/) | Systems | 4 | GPU kernel optimization |
|
| 94 |
+
| 🔧 | [kernelbench/](benchmarks/kernelbench/) | Systems | 250+ | [KernelBench](https://github.com/ScalingIntelligence/KernelBench) GPU kernel speedup optimization |
|
| 95 |
+
| 🧩 | [frontier-cs-eval/](benchmarks/frontier-cs-eval/) | Algorithms | 172 | [Frontier-CS](https://frontier-cs.org/) competitive programming |
|
| 96 |
+
| 🧠 | [arc_benchmark/](benchmarks/arc_benchmark/) | Reasoning | — | ARC-AGI visual reasoning |
|
| 97 |
+
| 💻 | [ale_bench/](benchmarks/ale_bench/) | Algorithms | 10 | Algorithmic programming contests |
|
| 98 |
+
| 🎨 | [image_gen/](benchmarks/image_gen/) | Creative | 1 | AI image generation evolution |
|
| 99 |
+
| 💬 | [prompt_optimization/](benchmarks/prompt_optimization/) | NLP | 1 | HotPotQA prompt evolution |
|
| 100 |
+
|
| 101 |
+
See [Dependency extras](#dependency-extras) for install commands per benchmark.
|
| 102 |
+
|
| 103 |
+
</details>
|
| 104 |
+
|
| 105 |
+
## 🚀 Quick Start
|
| 106 |
+
|
| 107 |
+
**Prerequisites:** Python >= 3.10, [uv](https://docs.astral.sh/uv/)
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
# Install
|
| 111 |
+
uv sync
|
| 112 |
+
export OPENAI_API_KEY="<your-key>"
|
| 113 |
+
|
| 114 |
+
# Try the circle packing benchmark
|
| 115 |
+
uv sync --extra math
|
| 116 |
+
uv run skydiscover-run benchmarks/math/circle_packing/initial_program.py \
|
| 117 |
+
benchmarks/math/circle_packing/evaluator.py \
|
| 118 |
+
--config benchmarks/math/circle_packing/config.yaml \
|
| 119 |
+
--search evox \
|
| 120 |
+
--iterations 100
|
| 121 |
+
|
| 122 |
+
uv run skydiscover-run benchmarks/math/circle_packing/initial_program.py \
|
| 123 |
+
benchmarks/math/circle_packing/evaluator.py \
|
| 124 |
+
--config benchmarks/math/circle_packing/config.yaml \
|
| 125 |
+
--search adaevolve \
|
| 126 |
+
--iterations 100
|
| 127 |
+
|
| 128 |
+
# Or run on your own problem
|
| 129 |
+
# algo can be "evox", "adaevolve", "openevolve", "gepa", "shinkaevolve"
|
| 130 |
+
uv run skydiscover-run initial_program.py evaluator.py \
|
| 131 |
+
--search <algo> \
|
| 132 |
+
--model gpt-5 \
|
| 133 |
+
--iterations 100
|
| 134 |
+
|
| 135 |
+
# initial_program is optional — omit it to let the LLM start from scratch
|
| 136 |
+
uv run skydiscover-run evaluator.py \
|
| 137 |
+
--search <algo> \
|
| 138 |
+
--model gpt-5 \
|
| 139 |
+
--iterations 100
|
| 140 |
+
|
| 141 |
+
# Run a Harbor benchmark (e.g. AlgoTune) — no seed program needed
|
| 142 |
+
pip install harbor
|
| 143 |
+
harbor datasets download algotune@1.0 -o /tmp/algotune
|
| 144 |
+
uv run skydiscover-run /tmp/algotune/<id>/algotune-set-cover \
|
| 145 |
+
--model anthropic/claude-sonnet-4-6 \
|
| 146 |
+
--search best_of_n -i 10
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
Or use the Python API:
|
| 150 |
+
|
| 151 |
+
```python
|
| 152 |
+
from skydiscover import run_discovery
|
| 153 |
+
|
| 154 |
+
result = run_discovery(
|
| 155 |
+
initial_program="initial_program.py",
|
| 156 |
+
evaluator="evaluator.py",
|
| 157 |
+
search="adaevolve", # or "evox", "openevolve", "gepa", "shinkaevolve"
|
| 158 |
+
model="gpt-5",
|
| 159 |
+
iterations=100,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
print(result.best_score, result.best_solution)
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
## ✏️ What You Write
|
| 167 |
+
|
| 168 |
+
### Scoring Function (required)
|
| 169 |
+
|
| 170 |
+
SkyDiscover supports three evaluator formats — pick whichever fits your use case:
|
| 171 |
+
|
| 172 |
+
| Format | When to use | What you point `evaluation_file` at |
|
| 173 |
+
|:---|:---|:---|
|
| 174 |
+
| **Python function** | Simple tasks, no system deps | `evaluator.py` |
|
| 175 |
+
| **Containerized** | Custom deps, data files, isolation | `evaluator/` directory (must contain `Dockerfile` + `evaluate.sh`) |
|
| 176 |
+
| **Harbor task** | External benchmark suites (AlgoTune, EvoEval, HumanEvalFix, BigCodeBench, LiveCodeBench, USACO, CRUSTBench, CodePDE, and more) | Task directory (must contain `instruction.md` + `tests/` + `environment/Dockerfile`) |
|
| 177 |
+
|
| 178 |
+
SkyDiscover auto-detects the format. See [`benchmarks/README.md`](benchmarks/README.md#adding-a-benchmark) for full setup instructions.
|
| 179 |
+
|
| 180 |
+
**Python evaluator** — a file with an `evaluate(program_path)` function:
|
| 181 |
+
|
| 182 |
+
```python
|
| 183 |
+
def evaluate(program_path):
|
| 184 |
+
score = run_and_grade(program_path)
|
| 185 |
+
return {
|
| 186 |
+
"combined_score": score, # primary optimization target (maximized)
|
| 187 |
+
"artifacts": { # optional — stored with the solution for future context
|
| 188 |
+
"feedback": "Off by one in the loop boundary",
|
| 189 |
+
},
|
| 190 |
+
}
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
**Containerized evaluator** — a directory with a `Dockerfile` and `evaluate.sh` that writes JSON to stdout. Runs in Docker, so it can have arbitrary dependencies.
|
| 194 |
+
|
| 195 |
+
**Harbor task** — a directory following the [Harbor](https://harborframework.com/) format (`instruction.md`, `environment/Dockerfile`, `tests/test.sh`). Works out of the box with 8+ tested benchmark suites (see [benchmarks/README.md](benchmarks/README.md#tested-harbor-datasets) for the full list).
|
| 196 |
+
|
| 197 |
+
- **combined_score** drives evolution. If omitted, SkyDiscover averages all numeric values in the dict.
|
| 198 |
+
- **artifacts** is optional — entries are injected into the next LLM prompt as context.
|
| 199 |
+
|
| 200 |
+
For `search.type: adaevolve`, you can also enable explicit Pareto optimization by configuring `search.database.pareto_objectives` and returning those objective metrics directly from the evaluator. In that mode, `combined_score` becomes optional and is only used as a scalar fallback/proxy when configured.
|
| 201 |
+
|
| 202 |
+
### Starting Solution (optional)
|
| 203 |
+
|
| 204 |
+
The initial program is **optional**. When omitted, the LLM generates a solution from scratch. When provided, mark the region to mutate with `EVOLVE-BLOCK-START` / `EVOLVE-BLOCK-END` comments; everything outside the markers is left untouched.
|
| 205 |
+
|
| 206 |
+
```python
|
| 207 |
+
# EVOLVE-BLOCK-START
|
| 208 |
+
def solve(input_data):
|
| 209 |
+
return input_data # baseline — SkyDiscover will improve this
|
| 210 |
+
# EVOLVE-BLOCK-END
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
If no markers are present, the entire file is treated as mutable.
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
## 🧬 Pick an Algorithm
|
| 217 |
+
|
| 218 |
+
See [Benchmark Performance](#-benchmark-performance) for a detailed comparison of AdaEvolve and EvoX against other algorithms.
|
| 219 |
+
|
| 220 |
+
| Algorithm | Flag | Description |
|
| 221 |
+
|:---|:---|:---|
|
| 222 |
+
| ⭐ **AdaEvolve** | `--search adaevolve` | Multi-island adaptive search with UCB, migration, and paradigm breakthroughs |
|
| 223 |
+
| 🧠 **EvoX** | `--search evox` | Self-evolving paradigm that co-adapts solution generation and experience management |
|
| 224 |
+
| 📊 **Top-K** | `--search topk` | Selects top-K solutions to refine |
|
| 225 |
+
| 🔍 **Beam Search** | `--search beam_search` | Breadth-first expansion of a beam of top solutions |
|
| 226 |
+
| 🎲 **Best-of-N** | `--search best_of_n` | Generates N variants per iteration, keeps the best |
|
| 227 |
+
| 🧪 **GEPA Native** | `--search gepa_native` | Pareto-efficient search with reflective prompting and LLM-mediated merge |
|
| 228 |
+
| 🗺️ **OpenEvolve Native** | `--search openevolve_native` | MAP-Elites + island-based evolutionary search |
|
| 229 |
+
|
| 230 |
+
### External backends
|
| 231 |
+
|
| 232 |
+
Install with `uv sync --extra external`, then use the corresponding flag:
|
| 233 |
+
|
| 234 |
+
| Backend | Flag | Source |
|
| 235 |
+
|:---|:---|:---|
|
| 236 |
+
| **OpenEvolve** | `--search openevolve` | [codelion/openevolve](https://github.com/codelion/openevolve) |
|
| 237 |
+
| **GEPA** | `--search gepa` | [gepa-ai/gepa](https://github.com/gepa-ai/gepa) |
|
| 238 |
+
| **ShinkaEvolve** | `--search shinkaevolve` | [SakanaAI/ShinkaEvolve](https://github.com/SakanaAI/ShinkaEvolve) (manual install) |
|
| 239 |
+
|
| 240 |
+
<details>
|
| 241 |
+
<summary>ShinkaEvolve manual install</summary>
|
| 242 |
+
|
| 243 |
+
```bash
|
| 244 |
+
git clone --depth 1 https://github.com/SakanaAI/ShinkaEvolve.git external_repos/ShinkaEvolve
|
| 245 |
+
uv pip install -e external_repos/ShinkaEvolve
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
</details>
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
## ⚙️ Configuration
|
| 252 |
+
|
| 253 |
+
Pass a YAML config with `-c`. See [configs/](configs/) for full annotated templates.
|
| 254 |
+
|
| 255 |
+
```yaml
|
| 256 |
+
max_iterations: 100
|
| 257 |
+
llm:
|
| 258 |
+
models: [{ name: "gemini/gemini-3-pro-preview", weight: 1.0 }]
|
| 259 |
+
search:
|
| 260 |
+
type: "adaevolve" # or "evox", "topk", "beam_search", "best_of_n"
|
| 261 |
+
prompt:
|
| 262 |
+
system_message: |
|
| 263 |
+
You are an expert at optimizing algorithms.
|
| 264 |
+
```
|
| 265 |
+
|
| 266 |
+
API keys (OPENAI_API_KEY, GEMINI_API_KEY, etc.) are resolved from environment variables automatically.
|
| 267 |
+
|
| 268 |
+
### 📊 Live Monitor & Human Feedback
|
| 269 |
+
|
| 270 |
+
Add `monitor: { enabled: true }` to your config. The dashboard URL is printed when the run starts; the dashboard shows a scatter plot of all programs, code diffs, metrics, and AI summaries. A **Human Feedback** panel lets you steer evolution in real time.
|
| 271 |
+
Replay a completed run:
|
| 272 |
+
|
| 273 |
+
```bash
|
| 274 |
+
uv run skydiscover-viewer /path/to/checkpoints/checkpoint_100
|
| 275 |
+
```
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
## 📖 Reference
|
| 279 |
+
|
| 280 |
+
<details>
|
| 281 |
+
<summary><b>CLI flags</b></summary>
|
| 282 |
+
|
| 283 |
+
```
|
| 284 |
+
uv run skydiscover-run [INITIAL_PROGRAM] EVALUATOR [options]
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
| Flag | Description |
|
| 288 |
+
|:---|:---|
|
| 289 |
+
| `-c, --config FILE` | Config YAML |
|
| 290 |
+
| `-i, --iterations N` | Number of iterations |
|
| 291 |
+
| `-m, --model MODEL` | LLM model (overrides config) |
|
| 292 |
+
| `-s, --search TYPE` | Search algorithm |
|
| 293 |
+
| `-o, --output DIR` | Output directory |
|
| 294 |
+
| `--api-base URL` | Override LLM API endpoint |
|
| 295 |
+
| `--checkpoint DIR` | Resume from checkpoint |
|
| 296 |
+
| `--agentic` | Enable agentic mode (LLM can read your files) |
|
| 297 |
+
| `-l, --log-level LEVEL` | DEBUG, INFO, WARNING, or ERROR |
|
| 298 |
+
|
| 299 |
+
</details>
|
| 300 |
+
|
| 301 |
+
<details>
|
| 302 |
+
<summary><b>Python API — discover_solution() (convenience wrapper)</b></summary>
|
| 303 |
+
|
| 304 |
+
`discover_solution()` is a convenience wrapper around `run_discovery()` (shown in [Quick Start](#-quick-start)) for inline string solutions and callable evaluators:
|
| 305 |
+
|
| 306 |
+
```python
|
| 307 |
+
from skydiscover import discover_solution
|
| 308 |
+
|
| 309 |
+
result = discover_solution(
|
| 310 |
+
initial_solution="def solve(x): return x", # optional — omit to start from scratch
|
| 311 |
+
evaluator=lambda path: {"combined_score": run_tests(path)},
|
| 312 |
+
iterations=50,
|
| 313 |
+
search="evox",
|
| 314 |
+
)
|
| 315 |
+
```
|
| 316 |
+
|
| 317 |
+
</details>
|
| 318 |
+
|
| 319 |
+
<details>
|
| 320 |
+
<summary><b>Model providers</b></summary>
|
| 321 |
+
|
| 322 |
+
Any [LiteLLM](https://docs.litellm.ai/)-compatible model works using `provider/model` format:
|
| 323 |
+
|
| 324 |
+
```bash
|
| 325 |
+
--model gpt-5 # OpenAI (default)
|
| 326 |
+
--model gemini/gemini-3-pro-preview # Gemini
|
| 327 |
+
--model anthropic/claude-sonnet-4-20250514 # Anthropic
|
| 328 |
+
--model ollama/llama3 --api-base http://localhost:11434/v1 # Local (Ollama, vLLM, etc.)
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
Multi-model pools with weighted sampling are supported in config:
|
| 332 |
+
|
| 333 |
+
```yaml
|
| 334 |
+
llm:
|
| 335 |
+
models:
|
| 336 |
+
- name: "gpt-5-mini"
|
| 337 |
+
weight: 0.7
|
| 338 |
+
- name: "gemini/gemini-2.0-flash"
|
| 339 |
+
weight: 0.3
|
| 340 |
+
```
|
| 341 |
+
|
| 342 |
+
</details>
|
| 343 |
+
|
| 344 |
+
<details id="dependency-extras">
|
| 345 |
+
<summary><b>Benchmark dependency extras</b></summary>
|
| 346 |
+
|
| 347 |
+
```bash
|
| 348 |
+
uv sync # Base install
|
| 349 |
+
uv sync --extra math # Math benchmarks (SciPy, JAX, PyWavelets, …)
|
| 350 |
+
uv sync --extra adrs # ADRS systems benchmarks
|
| 351 |
+
uv sync --extra frontier-cs # Frontier-CS benchmark tooling
|
| 352 |
+
uv sync --extra external # OpenEvolve / GEPA / ShinkaEvolve backends
|
| 353 |
+
uv sync --extra prompt-optimization # HotPotQA prompt optimization
|
| 354 |
+
```
|
| 355 |
+
|
| 356 |
+
Combine extras as needed: `uv sync --extra external --extra math`
|
| 357 |
+
|
| 358 |
+
If a benchmark ships its own `requirements.txt`, also run: `uv pip install -r path/to/requirements.txt`
|
| 359 |
+
|
| 360 |
+
</details>
|
| 361 |
+
|
| 362 |
+
---
|
| 363 |
+
|
| 364 |
+
## 🛠️ Extending SkyDiscover
|
| 365 |
+
|
| 366 |
+
- **New benchmark** → [`benchmarks/README.md`](benchmarks/README.md#adding-a-benchmark)
|
| 367 |
+
- **New search algorithm** → [`skydiscover/search/README.md`](skydiscover/search/README.md)
|
| 368 |
+
- **New context builder** → [`skydiscover/context_builder/README.md`](skydiscover/context_builder/README.md)
|
| 369 |
+
|
| 370 |
+
---
|
| 371 |
+
|
| 372 |
+
## 🔗 Related Work
|
| 373 |
+
SkyDiscover is inspired by [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and incorporates useful code components from open-source efforts such as [OpenEvolve](https://github.com/codelion/openevolve). Its interface is compatible with the [optimize_anything](https://gepa-ai.github.io/gepa/blog/2026/02/18/introducing-optimize-anything/) API.
|
| 374 |
+
|
| 375 |
+
## ✍️ Citation
|
| 376 |
+
|
| 377 |
+
```bibtex
|
| 378 |
+
@misc{skydiscover2026,
|
| 379 |
+
title = {SkyDiscover: A Flexible Framework for AI-Driven Scientific and Algorithmic Discovery},
|
| 380 |
+
author = {Liu, Shu and Cemri, Mert and Agarwal, Shubham and Krentsel, Alexander and Naren, Ashwin and Mang, Qiuyang and Li, Zhifei and Gupta, Akshat and Maheswaran, Monishwaran and Cheng, Audrey and Pan, Melissa and Boneh, Ethan and Ramchandran, Kannan and Sen, Koushik and Dimakis, Alexandros G. and Zaharia, Matei and Stoica, Ion},
|
| 381 |
+
year = {2026},
|
| 382 |
+
url = {https://skydiscover-ai.github.io/blog.html}
|
| 383 |
+
}
|
| 384 |
+
```
|
| 385 |
+
|
| 386 |
+
## 📬 Contact Us
|
| 387 |
+
For questions or feedback, reach out to us:
|
| 388 |
+
[lshu@berkeley.edu](mailto:lshu@berkeley.edu) · [mert_cemri@berkeley.edu](mailto:mert_cemri@berkeley.edu) · [shubham3@berkeley.edu](mailto:shubham3@berkeley.edu)
|
benchmarks/kernelbench/__init__.py
ADDED
|
File without changes
|
benchmarks/kernelbench/requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KernelBench library for problem fetching (resolver)
|
| 2 |
+
# Note: The evaluator uses kernelbench[gpu] which includes GPU support
|
| 3 |
+
# For resolver-only usage (fetching problems), the base package is sufficient
|
| 4 |
+
kernelbench @ git+https://github.com/ScalingIntelligence/KernelBench.git
|
docs/.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
node_modules/
|
| 2 |
+
.next/
|
| 3 |
+
.source/
|
docs/README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SkyDiscover Documentation
|
| 2 |
+
|
| 3 |
+
Built with [Next.js](https://nextjs.org/) + [Fumadocs](https://fumadocs.vercel.app/).
|
| 4 |
+
|
| 5 |
+
## Local Development
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
cd docs
|
| 9 |
+
npm install
|
| 10 |
+
npm run dev
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
Then open [http://localhost:3000](http://localhost:3000).
|
docs/app/api/search/route.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { source } from '@/lib/source';
|
| 2 |
+
import { createFromSource } from 'fumadocs-core/search/server';
|
| 3 |
+
|
| 4 |
+
export const { GET } = createFromSource(source, {
|
| 5 |
+
language: 'english',
|
| 6 |
+
});
|
docs/app/docs/layout.tsx
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { source } from '@/lib/source';
|
| 2 |
+
import { DocsLayout } from 'fumadocs-ui/layouts/docs';
|
| 3 |
+
import { baseOptions } from '@/lib/layout.shared';
|
| 4 |
+
|
| 5 |
+
/**
 * Layout for the `/docs` route segment.
 *
 * Wraps the routed content in the Fumadocs `DocsLayout`, handing it the page
 * tree from the content source plus the shared base layout options.
 */
export default function Layout(props: LayoutProps<'/docs'>) {
  const pageTree = source.getPageTree();
  const sharedOptions = baseOptions();

  return (
    <DocsLayout tree={pageTree} {...sharedOptions}>
      {props.children}
    </DocsLayout>
  );
}
|
docs/app/global.css
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@import 'tailwindcss';
|
| 2 |
+
@import 'fumadocs-ui/css/neutral.css';
|
| 3 |
+
@import 'fumadocs-ui/css/preset.css';
|
docs/app/page.tsx
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Link from 'next/link';
|
| 2 |
+
|
| 3 |
+
/**
 * Landing page: a centered title, a one-line description, and a button that
 * links to the documentation at `/docs`.
 *
 * NOTE(review): next.config.mjs permanently redirects `/` to `/docs`, so this
 * page may never actually be displayed — confirm which behavior is intended.
 */
export default function HomePage() {
  // Call-to-action button, built separately to keep the page markup shallow.
  const docsLink = (
    <Link
      href="/docs"
      className="inline-block bg-blue-600 hover:bg-blue-700 text-white font-bold py-3 px-6 rounded text-lg"
    >
      View Documentation
    </Link>
  );

  return (
    <main className="flex min-h-screen flex-col items-center justify-center p-24">
      <div className="z-10 max-w-5xl w-full items-center justify-center font-mono text-sm">
        <h1 className="text-4xl font-bold mb-8 text-center">SkyDiscover Documentation</h1>
        <p className="text-xl mb-8 text-center">Documentation for SkyDiscover.</p>
        <div className="flex justify-center">{docsLink}</div>
      </div>
    </main>
  );
}
|
docs/content/docs/getting-started/index.mdx
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: "Getting Started"
|
| 3 |
+
description: "Set up SkyDiscover, run your first discovery task, and learn how to configure it."
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
Get up and running with SkyDiscover in a few minutes. This section covers
|
| 7 |
+
everything you need to go from zero to your first AI-driven discovery:
|
| 8 |
+
|
| 9 |
+
- **[Installation](/docs/getting-started/installation)** — install SkyDiscover and set up your API keys
|
| 10 |
+
- **[Quick Start](/docs/getting-started/quick-start)** — run your first task and understand the core workflow
|
| 11 |
+
- **[Configuration](/docs/getting-started/configuration)** — models, algorithms, config files, and the Python API
|
docs/content/docs/getting-started/installation.mdx
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: "Installation"
|
| 3 |
+
description: "Install SkyDiscover and configure your LLM API keys."
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## Prerequisites
|
| 7 |
+
|
| 8 |
+
- **Python** >= 3.10
|
| 9 |
+
- **[uv](https://docs.astral.sh/uv/)** — fast Python package manager
|
| 10 |
+
- **[Docker](https://docs.docker.com/get-docker/)** — for containerized evaluators
|
| 11 |
+
- An **LLM API key** (OpenAI, Gemini, Anthropic, or a local model)
|
| 12 |
+
|
| 13 |
+
## Install SkyDiscover
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
git clone https://github.com/skydiscover-ai/skydiscover.git
|
| 17 |
+
cd skydiscover
|
| 18 |
+
uv sync
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
## Verify the installation
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
uv run skydiscover-run --help
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
You should see the help text with available flags.
|
| 28 |
+
|
| 29 |
+
## Set your API key
|
| 30 |
+
|
| 31 |
+
SkyDiscover uses [LiteLLM](https://docs.litellm.ai/) under the hood, so any
|
| 32 |
+
provider works. Set the key for the provider you want to use:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
export ANTHROPIC_API_KEY="..."
|
| 36 |
+
# Or for other providers:
|
| 37 |
+
# export OPENAI_API_KEY="sk-..."
|
| 38 |
+
# export GEMINI_API_KEY="..."
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Optional extras
|
| 42 |
+
|
| 43 |
+
Running some comparison benchmarks requires additional dependencies. Install them as needed:
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
uv sync --extra adrs # ADRS systems benchmarks
|
| 47 |
+
uv sync --extra external # OpenEvolve / GEPA / ShinkaEvolve backends
|
| 48 |
+
uv sync --extra math # Math benchmarks (SciPy, JAX, etc.)
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
You can combine extras: `uv sync --extra external --extra math`
|
docs/content/docs/getting-started/meta.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "Getting Started",
|
| 3 |
+
"pages": ["index", "installation", "quick-start", "configuration"]
|
| 4 |
+
}
|
docs/content/docs/meta.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "Documentation",
|
| 3 |
+
"pages": [
|
| 4 |
+
"index",
|
| 5 |
+
"getting-started"
|
| 6 |
+
]
|
| 7 |
+
}
|
docs/lib/source.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { docs } from 'fumadocs-mdx:collections/server';
|
| 2 |
+
import { loader } from 'fumadocs-core/source';
|
| 3 |
+
|
| 4 |
+
export const source = loader({
|
| 5 |
+
baseUrl: '/docs',
|
| 6 |
+
source: docs.toFumadocsSource(),
|
| 7 |
+
});
|
docs/mdx-components.tsx
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import defaultMdxComponents from 'fumadocs-ui/mdx';
|
| 2 |
+
import { Callout } from 'fumadocs-ui/components/callout';
|
| 3 |
+
import { File, Files, Folder } from 'fumadocs-ui/components/files';
|
| 4 |
+
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
|
| 5 |
+
import type { MDXComponents } from 'mdx/types';
|
| 6 |
+
|
| 7 |
+
/**
 * Build the component map used when rendering MDX content.
 *
 * Precedence, lowest to highest: the Fumadocs default MDX components, the
 * extra Fumadocs UI components registered here, then any caller-supplied
 * `components` (which therefore override both).
 */
export function getMDXComponents(components?: MDXComponents): MDXComponents {
  const extraComponents = { Callout, File, Files, Folder, Tab, Tabs };

  return {
    ...defaultMdxComponents,
    ...extraComponents,
    ...components,
  };
}
|
docs/next-env.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/// <reference types="next" />
|
| 2 |
+
/// <reference types="next/image-types/global" />
|
| 3 |
+
import "./.next/dev/types/routes.d.ts";
|
| 4 |
+
|
| 5 |
+
// NOTE: This file should not be edited
|
| 6 |
+
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
|
docs/next.config.mjs
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { createMDX } from 'fumadocs-mdx/next';
|
| 2 |
+
|
| 3 |
+
const withMDX = createMDX();
|
| 4 |
+
|
| 5 |
+
// Redirect rule: the site root goes straight to the docs section (permanent).
const rootToDocsRedirect = {
  source: '/',
  destination: '/docs',
  permanent: true,
};

/** @type {import('next').NextConfig} */
const config = {
  reactStrictMode: true,
  redirects: async () => [rootToDocsRedirect],
};
|
| 18 |
+
|
| 19 |
+
export default withMDX(config);
|
docs/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
docs/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "skydiscover-docs",
|
| 3 |
+
"version": "0.1.0",
|
| 4 |
+
"type": "module",
|
| 5 |
+
"private": true,
|
| 6 |
+
"scripts": {
|
| 7 |
+
"build": "next build",
|
| 8 |
+
"dev": "next dev",
|
| 9 |
+
"start": "next start",
|
| 10 |
+
"types:check": "fumadocs-mdx && next typegen && tsc --noEmit"
|
| 11 |
+
},
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"fumadocs-core": "^16.4.8",
|
| 14 |
+
"fumadocs-mdx": "^14.2.6",
|
| 15 |
+
"fumadocs-ui": "^16.4.8",
|
| 16 |
+
"next": "^16.1.4",
|
| 17 |
+
"react": "^19.0.0",
|
| 18 |
+
"react-dom": "^19.0.0"
|
| 19 |
+
},
|
| 20 |
+
"devDependencies": {
|
| 21 |
+
"@tailwindcss/postcss": "^4.1.18",
|
| 22 |
+
"@types/mdx": "^2.0.13",
|
| 23 |
+
"@types/node": "25.0.10",
|
| 24 |
+
"@types/react": "^19.2.9",
|
| 25 |
+
"@types/react-dom": "^19.2.3",
|
| 26 |
+
"postcss": "^8.5.6",
|
| 27 |
+
"tailwindcss": "^4.1.18",
|
| 28 |
+
"typescript": "^5.9.3"
|
| 29 |
+
}
|
| 30 |
+
}
|
docs/postcss.config.mjs
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
export default {
|
| 2 |
+
plugins: {
|
| 3 |
+
'@tailwindcss/postcss': {},
|
| 4 |
+
},
|
| 5 |
+
};
|
docs/source.config.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig, defineDocs } from 'fumadocs-mdx/config';
|
| 2 |
+
|
| 3 |
+
export const docs = defineDocs({
|
| 4 |
+
dir: 'content/docs',
|
| 5 |
+
});
|
| 6 |
+
|
| 7 |
+
export default defineConfig();
|
docs/tsconfig.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"baseUrl": ".",
|
| 4 |
+
"target": "ESNext",
|
| 5 |
+
"lib": ["dom", "dom.iterable", "esnext"],
|
| 6 |
+
"allowJs": true,
|
| 7 |
+
"skipLibCheck": true,
|
| 8 |
+
"strict": true,
|
| 9 |
+
"forceConsistentCasingInFileNames": true,
|
| 10 |
+
"noEmit": true,
|
| 11 |
+
"esModuleInterop": true,
|
| 12 |
+
"module": "esnext",
|
| 13 |
+
"moduleResolution": "bundler",
|
| 14 |
+
"resolveJsonModule": true,
|
| 15 |
+
"isolatedModules": true,
|
| 16 |
+
"jsx": "react-jsx",
|
| 17 |
+
"incremental": true,
|
| 18 |
+
"paths": {
|
| 19 |
+
"@/*": ["./*"],
|
| 20 |
+
"fumadocs-mdx:collections/*": [".source/*"]
|
| 21 |
+
},
|
| 22 |
+
"plugins": [
|
| 23 |
+
{
|
| 24 |
+
"name": "next"
|
| 25 |
+
}
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
"include": [
|
| 29 |
+
"next-env.d.ts",
|
| 30 |
+
"**/*.ts",
|
| 31 |
+
"**/*.tsx",
|
| 32 |
+
".next/types/**/*.ts",
|
| 33 |
+
".next/dev/types/**/*.ts"
|
| 34 |
+
],
|
| 35 |
+
"exclude": ["node_modules"]
|
| 36 |
+
}
|
examples/text_similarity/config.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Text similarity benchmark — evolve a function that matches human judgments
|
| 2 |
+
# Usage: skydiscover-run examples/text_similarity/initial_program.py examples/text_similarity/evaluator/ -c examples/text_similarity/config.yaml
|
| 3 |
+
language: python
|
| 4 |
+
max_iterations: 50
|
| 5 |
+
checkpoint_interval: 5
|
| 6 |
+
|
| 7 |
+
llm:
|
| 8 |
+
models:
|
| 9 |
+
- name: "anthropic/claude-sonnet-4-6"
|
| 10 |
+
weight: 1.0
|
| 11 |
+
max_tokens: 8192
|
| 12 |
+
timeout: 300
|
| 13 |
+
|
| 14 |
+
prompt:
|
| 15 |
+
system_message: |-
|
| 16 |
+
You are an expert in natural language processing and string similarity
|
| 17 |
+
algorithms. Your task is to write a similarity function that scores how
|
| 18 |
+
alike two strings are, matching human intuition as closely as possible.
|
| 19 |
+
|
| 20 |
+
The function must return a float between 0.0 (unrelated) and 1.0 (identical).
|
| 21 |
+
Only use the Python standard library — no external packages.
|
| 22 |
+
|
| 23 |
+
The evaluation dataset includes:
|
| 24 |
+
- Typos and misspellings (should score high)
|
| 25 |
+
- Paraphrases (same meaning, different words — should score high)
|
| 26 |
+
- Word reordering (should score fairly high)
|
| 27 |
+
- Negation (high word overlap but opposite meaning — should score low)
|
| 28 |
+
- Unrelated strings (should score near 0)
|
| 29 |
+
|
| 30 |
+
A basic edit distance baseline gets ~0.3 correlation. Character n-grams,
|
| 31 |
+
token overlap, and word-level features can push it higher. Think about
|
| 32 |
+
combining multiple signals — there is no single trick that handles all
|
| 33 |
+
cases. Consider synonym awareness, word order, negation detection, and
|
| 34 |
+
length normalization.
|
| 35 |
+
|
| 36 |
+
evaluator:
|
| 37 |
+
timeout: 30
|
| 38 |
+
|
| 39 |
+
# Live dashboard — opens in your browser
|
| 40 |
+
monitor:
|
| 41 |
+
enabled: true
|
examples/text_similarity/evaluator/Dockerfile
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
WORKDIR /benchmark
|
| 3 |
+
|
| 4 |
+
RUN pip install --no-cache-dir scipy
|
| 5 |
+
|
| 6 |
+
COPY evaluator.py pairs.json evaluate.sh ./
|
| 7 |
+
RUN chmod +x evaluate.sh
|
| 8 |
+
|
| 9 |
+
ENTRYPOINT ["./evaluate.sh"]
|
examples/text_similarity/evaluator/evaluate.sh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Score a candidate program by passing its path to evaluator.py.
#
# Usage: evaluate.sh PROGRAM_PATH
set -euo pipefail

# Fail with a usage message (rather than bash's bare "unbound variable" error)
# when the program path argument is missing.
PROGRAM="${1:?usage: evaluate.sh PROGRAM_PATH}"
python /benchmark/evaluator.py "$PROGRAM"
|
examples/text_similarity/evaluator/evaluator.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Score a candidate text-similarity function against human judgments."""
|
| 3 |
+
|
| 4 |
+
import importlib.util
|
| 5 |
+
import json
|
| 6 |
+
import random
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
from scipy.stats import spearmanr
|
| 10 |
+
|
| 11 |
+
# Benchmark pairs, loaded once at import time; main() unpacks each entry as
# (text_a, text_b, human_score). The ``with`` block closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("/benchmark/pairs.json", encoding="utf-8") as _pairs_file:
    PAIRS = json.load(_pairs_file)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main():
    """Score the candidate program named on the command line and print JSON.

    Loads the module at ``sys.argv[1]``, calls its ``similarity(a, b)`` on
    every pair in ``PAIRS`` and prints a single JSON object to stdout with:

    * ``status`` -- always ``"success"``;
    * ``combined_score`` -- Spearman correlation between the predictions and
      the human scores, floored at 0.0 and rounded to 4 decimal places;
    * ``artifacts.feedback`` -- the correlation, any per-pair failures, and up
      to three randomly chosen pairs with predicted and human scores.

    A pair whose ``similarity`` call raises, returns a non-number, or returns
    NaN is scored 0.0 and reported in the feedback.
    """
    import math  # local import: only needed here for NaN detection

    program_path = sys.argv[1]

    # Load the candidate's similarity() function
    spec = importlib.util.spec_from_file_location("candidate", program_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    # Score every pair
    predicted = []
    failures = []
    for a, b, _ in PAIRS:
        try:
            value = float(mod.similarity(a, b))
            # min(1.0, nan) evaluates to 1.0, so an unchecked NaN would be
            # clamped to a perfect score; treat it as a failure instead.
            if math.isnan(value):
                raise ValueError("similarity() returned NaN")
            score = max(0.0, min(1.0, value))
        except Exception as exc:
            # Best effort: one bad pair must not abort the whole evaluation,
            # but the error is kept so it can be surfaced in the feedback.
            score = 0.0
            failures.append(f"{type(exc).__name__}: {exc}")
        predicted.append(score)

    human = [h for _, _, h in PAIRS]
    correlation = float(spearmanr(predicted, human).statistic)

    # spearmanr returns NaN when an input is constant (e.g. every prediction
    # is the same value); report that explicitly and score it as 0.
    if math.isnan(correlation):
        correlation = 0.0
        lines = ["Spearman correlation: undefined (constant input); scored as 0.0", ""]
    else:
        lines = [f"Spearman correlation: {correlation:.4f}", ""]

    if failures:
        lines.append(
            f"{len(failures)} of {len(PAIRS)} pairs failed and were scored 0.0; "
            f"first error: {failures[0]}"
        )
        lines.append("")

    # Show a few example pairs; never request more samples than pairs exist.
    samples = random.sample(range(len(PAIRS)), min(3, len(PAIRS)))
    for i in samples:
        a, b, h = PAIRS[i]
        lines.append(f"  '{a}' vs '{b}': predicted={predicted[i]:.2f}, human={h:.2f}")

    print(json.dumps({
        "status": "success",
        "combined_score": round(max(0.0, correlation), 4),
        "artifacts": {"feedback": "\n".join(lines)},
    }))
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":
|
| 48 |
+
main()
|
examples/text_similarity/evaluator/pairs.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
["the cat sat on the mat", "the cat sat on the mat", 1.0],
|
| 3 |
+
["the cat sat on the mat", "the cat sat on a mat", 0.9],
|
| 4 |
+
["restaurant", "restarant", 0.9],
|
| 5 |
+
|
| 6 |
+
["the movie was great", "the film was excellent", 0.85],
|
| 7 |
+
["she is happy", "she feels joyful", 0.8],
|
| 8 |
+
["the car is fast", "the vehicle moves quickly", 0.8],
|
| 9 |
+
["it is raining outside", "rain is falling outdoors", 0.85],
|
| 10 |
+
["he fixed the bug", "he resolved the defect", 0.8],
|
| 11 |
+
["prices went up", "costs increased", 0.85],
|
| 12 |
+
|
| 13 |
+
["the dog chased the cat", "the cat was chased by the dog", 0.85],
|
| 14 |
+
["I love programming in Python", "programming in Python is something I love", 0.8],
|
| 15 |
+
|
| 16 |
+
["machine learning is a subset of AI", "machine learning uses data", 0.4],
|
| 17 |
+
["the weather is nice today", "today is a good day", 0.5],
|
| 18 |
+
["I went to the store", "I drove to the mall", 0.45],
|
| 19 |
+
|
| 20 |
+
["the cat is on the mat", "the dog is in the yard", 0.25],
|
| 21 |
+
["she plays piano", "he plays guitar", 0.35],
|
| 22 |
+
["New York is a big city", "Tokyo has a large population", 0.3],
|
| 23 |
+
["I ate breakfast", "the morning meal was consumed", 0.7],
|
| 24 |
+
|
| 25 |
+
["the test passed", "the test did not pass", 0.2],
|
| 26 |
+
["I love this movie", "I hate this movie", 0.2],
|
| 27 |
+
["the system is working", "the system is not working", 0.15],
|
| 28 |
+
|
| 29 |
+
["the cat sat on the mat", "quantum physics is fascinating", 0.0],
|
| 30 |
+
["hello world", "purple elephants dance on mars", 0.0],
|
| 31 |
+
["database optimization", "chocolate cake recipe", 0.0],
|
| 32 |
+
["she went to school", "the stock market crashed", 0.05],
|
| 33 |
+
|
| 34 |
+
["yes", "yeah", 0.8],
|
| 35 |
+
["no", "nope", 0.8],
|
| 36 |
+
["hi", "hello", 0.75],
|
| 37 |
+
["error", "bug", 0.6],
|
| 38 |
+
["fast", "quick", 0.85],
|
| 39 |
+
["big", "large", 0.9],
|
| 40 |
+
["happy", "sad", 0.15],
|
| 41 |
+
["good", "bad", 0.15],
|
| 42 |
+
|
| 43 |
+
["the server returned a 500 error", "the server threw an internal error", 0.85],
|
| 44 |
+
["null pointer exception", "segmentation fault", 0.4],
|
| 45 |
+
["open a pull request", "submit a PR", 0.85],
|
| 46 |
+
["the function returns a list", "the method outputs an array", 0.7],
|
| 47 |
+
|
| 48 |
+
["the meeting is at 3pm", "the meeting is at 3:00 PM", 0.95],
|
| 49 |
+
["version 2.0", "v2.0", 0.9],
|
| 50 |
+
["100 dollars", "$100", 0.9]
|
| 51 |
+
]
|
examples/text_similarity/initial_program.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# EVOLVE-BLOCK-START
|
| 2 |
+
def similarity(a: str, b: str) -> float:
    """
    Score how alike two strings are, from 0.0 (unrelated) to 1.0 (identical).

    The goal is to reflect meaning as well as spelling: paraphrases should
    score high, negations should score low, and typos should be forgiven.

    Only use the Python standard library (no external packages).
    """
    # Baseline: normalized Levenshtein distance
    if a == b:
        return 1.0
    if not (a and b):
        return 0.0

    # previous_row[k] is the edit distance between the prefix of `a` handled
    # so far and the first k characters of `b`.
    previous_row = list(range(len(b) + 1))
    for row_index, char_a in enumerate(a, start=1):
        current_row = [row_index]
        for col_index, char_b in enumerate(b, start=1):
            if char_a == char_b:
                cost = previous_row[col_index - 1]
            else:
                cost = 1 + min(
                    previous_row[col_index],
                    current_row[col_index - 1],
                    previous_row[col_index - 1],
                )
            current_row.append(cost)
        previous_row = current_row

    longest = max(len(a), len(b))
    return 1.0 - previous_row[-1] / longest
|
| 34 |
+
# EVOLVE-BLOCK-END
|
scripts/reproduce/adrs.sh
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce ADRS benchmarks (5 problems x 2 search methods).
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync --extra adrs

# ── Download Data ────────────────────────────────────────────────────────────

if [[ ! -f benchmarks/ADRS/cloudcast/profiles/cost.csv ]]; then
  echo "Downloading cloudcast dataset..."
  bash benchmarks/ADRS/cloudcast/download_dataset.sh
fi

if [[ ! -d benchmarks/ADRS/llm_sql/datasets ]] || \
   [[ -z "$(ls benchmarks/ADRS/llm_sql/datasets/*.csv 2>/dev/null)" ]]; then
  echo "Downloading llm_sql dataset..."
  bash benchmarks/ADRS/llm_sql/download_dataset.sh
fi

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/ADRS/cloudcast adaevolve
launch benchmarks/ADRS/eplb adaevolve
launch benchmarks/ADRS/llm_sql adaevolve
launch benchmarks/ADRS/prism adaevolve
launch benchmarks/ADRS/txn_scheduling adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/ADRS/cloudcast evox
launch benchmarks/ADRS/eplb evox
launch benchmarks/ADRS/llm_sql evox
launch benchmarks/ADRS/prism evox
launch benchmarks/ADRS/txn_scheduling evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "adrs.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "adrs.sh: all 10 runs finished."
|
scripts/reproduce/ale_bench.sh
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce ALE-Bench benchmarks (10 problems x 2 search methods).
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync --extra external

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/ale_bench/ale-bench-lite-problems/ahc008 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc011 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc015 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc016 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc024 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc025 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc026 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc027 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc039 adaevolve
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc046 adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/ale_bench/ale-bench-lite-problems/ahc008 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc011 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc015 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc016 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc024 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc025 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc026 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc027 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc039 evox
launch benchmarks/ale_bench/ale-bench-lite-problems/ahc046 evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "ale_bench.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "ale_bench.sh: all 20 runs finished."
|
scripts/reproduce/arc.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce ARC benchmark (1 problem x 2 search methods).
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/arc_benchmark adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/arc_benchmark evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "arc.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "arc.sh: all 2 runs finished."
|
scripts/reproduce/frontier_cs.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce Frontier-CS benchmark (1 problem x 2 search methods).
# Requires Docker to be installed and running.
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync --extra frontier-cs

# ── Check Docker ─────────────────────────────────────────────────────────────

# Warning only: the runs are still launched if the docker CLI is missing.
if ! command -v docker &>/dev/null; then
  echo "Warning: Docker not found. The evaluator requires Docker." >&2
fi

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/frontier-cs-eval adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/frontier-cs-eval evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "frontier_cs.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "frontier_cs.sh: all 2 runs finished."
|
scripts/reproduce/gpu.sh
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce GPU benchmarks (4 problems x 2 search methods).
# Requires a CUDA-capable GPU with Triton support.
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync

# ── Check GPU ────────────────────────────────────────────────────────────────

# Warning only: the runs are still launched if nvidia-smi is missing.
if ! command -v nvidia-smi &>/dev/null; then
  echo "Warning: nvidia-smi not found. GPU benchmarks may fail." >&2
fi

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/gpu_mode/grayscale adaevolve
launch benchmarks/gpu_mode/mla_decode adaevolve
launch benchmarks/gpu_mode/trimul adaevolve
launch benchmarks/gpu_mode/vecadd adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/gpu_mode/grayscale evox
launch benchmarks/gpu_mode/mla_decode evox
launch benchmarks/gpu_mode/trimul evox
launch benchmarks/gpu_mode/vecadd evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "gpu.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "gpu.sh: all 8 runs finished."
|
scripts/reproduce/math.sh
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce math benchmarks (17 problems x 2 search methods).
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync --extra math

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/math/circle_packing adaevolve
launch benchmarks/math/circle_packing_rect adaevolve
launch benchmarks/math/erdos_min_overlap adaevolve
launch benchmarks/math/first_autocorr_ineq adaevolve
launch benchmarks/math/second_autocorr_ineq adaevolve
launch benchmarks/math/third_autocorr_ineq adaevolve
launch benchmarks/math/uncertainty_ineq adaevolve
launch benchmarks/math/hexagon_packing/11 adaevolve
launch benchmarks/math/hexagon_packing/12 adaevolve
launch benchmarks/math/heilbronn_convex/13 adaevolve
launch benchmarks/math/heilbronn_convex/14 adaevolve
launch benchmarks/math/heilbronn_triangle adaevolve
launch benchmarks/math/minimizing_max_min_dist/2 adaevolve
launch benchmarks/math/minimizing_max_min_dist/3 adaevolve
launch benchmarks/math/matmul adaevolve
launch benchmarks/math/signal_processing adaevolve
launch benchmarks/math/sums_diffs_finite_sets adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/math/circle_packing evox
launch benchmarks/math/circle_packing_rect evox
launch benchmarks/math/erdos_min_overlap evox
launch benchmarks/math/first_autocorr_ineq evox
launch benchmarks/math/second_autocorr_ineq evox
launch benchmarks/math/third_autocorr_ineq evox
launch benchmarks/math/uncertainty_ineq evox
launch benchmarks/math/hexagon_packing/11 evox
launch benchmarks/math/hexagon_packing/12 evox
launch benchmarks/math/heilbronn_convex/13 evox
launch benchmarks/math/heilbronn_convex/14 evox
launch benchmarks/math/heilbronn_triangle evox
launch benchmarks/math/minimizing_max_min_dist/2 evox
launch benchmarks/math/minimizing_max_min_dist/3 evox
launch benchmarks/math/matmul evox
launch benchmarks/math/signal_processing evox
launch benchmarks/math/sums_diffs_finite_sets evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "math.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "math.sh: all 34 runs finished."
|
scripts/reproduce/prompt_opt.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Reproduce prompt optimization benchmark (1 problem x 2 search methods).
# All benchmarks launch in parallel.
# Exits non-zero if any run fails.
set -euo pipefail

# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change:

MODEL="gpt-5"                              # main generation model
# MODEL="gemini/gemini-3.0-pro-preview"    # alternative
ITERATIONS=100

# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)

# ── Install ──────────────────────────────────────────────────────────────────

cd "$(dirname "$0")/../.."
uv sync --extra prompt-optimization

# ── Helper ───────────────────────────────────────────────────────────────────

# run <benchmark_dir> <search>
# Picks the initial artifact (initial_program.py, overridden by
# initial_program.cpp, then by initial_prompt.txt when present) and the config
# (config_<search>.yaml preferred over config.yaml), then runs skydiscover-run.
run() {
  local dir=$1 search=$2
  local init="$dir/initial_program.py"
  [[ -f "$dir/initial_program.cpp" ]] && init="$dir/initial_program.cpp"
  [[ -f "$dir/initial_prompt.txt" ]] && init="$dir/initial_prompt.txt"
  local cfg="$dir/config.yaml"
  [[ -f "$dir/config_${search}.yaml" ]] && cfg="$dir/config_${search}.yaml"
  echo "== $search: ${dir#benchmarks/} =="
  uv run skydiscover-run "$init" "$dir/evaluator.py" \
    -c "$cfg" -s "$search" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$search/${dir#benchmarks/}"
}

# launch <benchmark_dir> <search>
# Starts run in the background and records its PID. A bare `wait` always
# returns 0, so the PIDs are needed to collect each run's exit status.
pids=()
launch() {
  run "$@" &
  pids+=("$!")
}

# ── AdaEvolve ────────────────────────────────────────────────────────────────

launch benchmarks/prompt_optimization/hotpot_qa adaevolve

# ── EvoX ─────────────────────────────────────────────────────────────────────

launch benchmarks/prompt_optimization/hotpot_qa evox

# ── Collect results ──────────────────────────────────────────────────────────

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every run has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "prompt_opt.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "prompt_opt.sh: all 2 runs finished."
|
scripts/reproduce/run_all.sh
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Run all reproduce scripts in parallel.
# Each category launches in the background; we wait for all to finish.
# Exits non-zero if any category script exits non-zero.
# Tip: set ITERATIONS=2 in each script for a quick smoke test.
set -euo pipefail

DIR="$(dirname "$0")"

# Record each child's PID: a bare `wait` always returns 0 and would hide a
# failing category script.
pids=()
for script in math adrs ale_bench frontier_cs gpu arc prompt_opt; do
  bash "$DIR/$script.sh" &
  pids+=("$!")
done

failed=0
for pid in "${pids[@]}"; do
  # `|| ...` keeps `set -e` from aborting before every child has been reaped.
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "run_all.sh: $failed of ${#pids[@]} reproduce scripts failed." >&2
  exit 1
fi
echo "All reproduce scripts finished."
|
scripts/run_cp.sh
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Run circle_packing benchmark with topk search.
# Usage: ./scripts/run_cp.sh [ITERATIONS]
# Prerequisites: uv sync --extra math, OPENAI_API_KEY set

set -euo pipefail

# Work from the repository root so the benchmark paths below resolve.
cd "$(dirname "$0")/.."

# First positional argument, defaulting to 3 iterations.
ITERATIONS="${1:-3}"

bench_dir="benchmarks/math/circle_packing"

echo "Running circle_packing benchmark (search=topk, iterations=$ITERATIONS)..."
uv run skydiscover-run \
  "$bench_dir/initial_program.py" \
  "$bench_dir/evaluator.py" \
  --config "$bench_dir/config.yaml" \
  --search topk \
  --iterations "$ITERATIONS"

echo "Done."
|
setup.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup
|
| 2 |
+
|
| 3 |
+
setup() # All config in pyproject.toml
|
skydiscover/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SkyDiscover
|
| 2 |
+
|
| 3 |
+
SkyDiscover is an iterative LLM-driven discovery engine. Each iteration runs a
|
| 4 |
+
five-step loop:
|
| 5 |
+
|
| 6 |
+
```
|
| 7 |
+
sample → prompt → generate → evaluate → add
|
| 8 |
+
↑ │
|
| 9 |
+
└───────────────────────────────────────┘
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
1. **Sample** — the search algorithm (`search/`) picks a parent solution and
|
| 13 |
+
any relevant context solutions from the database.
|
| 14 |
+
2. **Prompt** — the context builder (`context_builder/`) turns the parent solution,
|
| 15 |
+
relevant context solutions (if any), and problem spec into system + user messages.
|
| 16 |
+
3. **Generate** — the LLM (`llm/`) produces a candidate solution (code, text,
|
| 17 |
+
or image).
|
| 18 |
+
4. **Evaluate** — the evaluator (`evaluation/`) scores the candidate and
|
| 19 |
+
returns metrics.
|
| 20 |
+
5. **Add** — the scored candidate is stored back in the database, closing the
|
| 21 |
+
loop.
|
| 22 |
+
|
| 23 |
+
The `DiscoveryController` (`search/default_discovery_controller.py`) orchestrates
|
| 24 |
+
this loop. Search algorithms that need custom orchestration (e.g. co-evolution)
|
| 25 |
+
subclass it and override `run_discovery()`.
|
| 26 |
+
|
| 27 |
+
## Components
|
| 28 |
+
|
| 29 |
+
| Component | Subfolder | What it does | Extend by |
|
| 30 |
+
|:---|:---|:---|:---|
|
| 31 |
+
| **Context Builder** | `context_builder/` | Assembles LLM prompts from the problem spec, prior solutions, and feedback | Subclass `ContextBuilder` ([README](context_builder/README.md)) |
|
| 32 |
+
| **Solution Generator** | `llm/` | Produces candidates via LLM calls, with optional tool use | Subclass `LLMInterface` |
|
| 33 |
+
| **Evaluator** | `evaluation/` | Scores candidates and logs metadata back into the solution database | Provide an `evaluate.py` script |
|
| 34 |
+
| **Solution Selector** | `search/` | Maintains the solution database and picks parents for the next iteration | Subclass `ProgramDatabase` ([README](search/README.md)) |
|
| 35 |
+
|
| 36 |
+
## Additional subfolders
|
| 37 |
+
|
| 38 |
+
| Subfolder | What it does |
|
| 39 |
+
|:---|:---|
|
| 40 |
+
| `extras/` | External backends (OpenEvolve, GEPA, ShinkaEvolve) and the live monitor dashboard |
|
| 41 |
+
| `utils/` | Shared helpers — code parsing, metrics, formatting, async utilities, repo mapping |
|
| 42 |
+
|
| 43 |
+
## Entry points
|
| 44 |
+
|
| 45 |
+
| Entry point | Use case |
|
| 46 |
+
|:---|:---|
|
| 47 |
+
| `api.py` | Python API — `run_discovery()`, `discover_solution()` |
|
| 48 |
+
| `cli.py` | CLI — `skydiscover-run` |
|
| 49 |
+
| `runner.py` | Setup and run (used by both API and CLI) |
|
| 50 |
+
| `config.py` | Configuration loading and overrides |
|
skydiscover/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SkyDiscover: Self-Improving Framework for LLMs
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from skydiscover._version import __version__
|
| 6 |
+
from skydiscover.api import (
|
| 7 |
+
DiscoveryResult,
|
| 8 |
+
discover_solution,
|
| 9 |
+
run_discovery,
|
| 10 |
+
)
|
| 11 |
+
from skydiscover.runner import Runner
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"Runner",
|
| 15 |
+
"__version__",
|
| 16 |
+
"run_discovery",
|
| 17 |
+
"discover_solution",
|
| 18 |
+
"DiscoveryResult",
|
| 19 |
+
]
|
skydiscover/_version.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Version information for skydiscover package."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.0.0"
|
skydiscover/api.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Public library API for SkyDiscover.
|
| 3 |
+
|
| 4 |
+
This module exposes the two main entry points for programmatic use:
|
| 5 |
+
|
| 6 |
+
* `run_discovery`: accepts file paths or inline strings for the initial program and evaluator,
|
| 7 |
+
wires up configuration, and returns a `DiscoveryResult`.
|
| 8 |
+
* `discover_solution`: convenience wrapper when the initial solution is a plain string and
|
| 9 |
+
the evaluator is a Python callable.
|
| 10 |
+
|
| 11 |
+
Quick-start::
|
| 12 |
+
|
| 13 |
+
from skydiscover import run_discovery
|
| 14 |
+
|
| 15 |
+
result = run_discovery(
|
| 16 |
+
evaluator="examples/my_problem/eval.py",
|
| 17 |
+
initial_program="examples/my_problem/init.py", # optional
|
| 18 |
+
model="gpt-5",
|
| 19 |
+
iterations=50,
|
| 20 |
+
)
|
| 21 |
+
print(result.best_score, result.best_solution)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import asyncio
|
| 25 |
+
import logging
|
| 26 |
+
import os
|
| 27 |
+
import tempfile
|
| 28 |
+
from dataclasses import dataclass
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
| 31 |
+
|
| 32 |
+
from skydiscover.benchmarks.resolution import resolve_benchmark_problem
|
| 33 |
+
from skydiscover.config import Config, apply_overrides, load_config
|
| 34 |
+
from skydiscover.runner import Runner
|
| 35 |
+
from skydiscover.search.base_database import Program
|
| 36 |
+
from skydiscover.utils.metrics import get_score
|
| 37 |
+
from skydiscover.utils.prepare import cleanup_temp, prepare_evaluator, prepare_program
|
| 38 |
+
|
| 39 |
+
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
class DiscoveryResult:
    """Result of a single discovery run.

    Bundles the best program, its score, the best solution string, the
    metrics dictionary, and the output directory of the run.
    ``initial_score`` is optional and defaults to ``None``.
    """

    best_program: Optional[Program]
    best_score: float
    best_solution: str
    metrics: Dict[str, Any]
    output_dir: Optional[str]
    initial_score: Optional[float] = None

    def __repr__(self) -> str:
        """Return a compact summary showing scores to four decimal places.

        ``initial_score`` is rendered as ``N/A`` when it is ``None``; the
        other fields are omitted from the summary.
        """
        init = f"{self.initial_score:.4f}" if self.initial_score is not None else "N/A"
        return f"DiscoveryResult(best_score={self.best_score:.4f}, initial_score={init})"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def run_discovery(
    evaluator: Union[str, Path, Callable],
    initial_program: Optional[Union[str, Path, List[str]]] = None,
    model: Optional[str] = None,
    iterations: Optional[int] = None,
    search: Optional[str] = None,
    config: Union[str, Path, Config, None] = None,
    agentic: bool = False,
    output_dir: Optional[str] = None,
    system_prompt: Optional[str] = None,
    api_base: Optional[str] = None,
    cleanup: bool = True,
) -> DiscoveryResult:
    """Synchronously execute a discovery run and hand back its best result.

    This is a blocking wrapper: it drives ``_run_discovery_async`` to
    completion with ``asyncio.run``. Per the asyncio documentation,
    ``asyncio.run`` cannot be called while another event loop is running in
    the same thread.

    Args:
        evaluator: File path or callable (program_path) -> metrics_dict.
        initial_program: File path or inline source code (string / list of lines).
            Optional — when omitted the LLM generates a solution from scratch.
        model: Model name(s), comma-separated, e.g. "gpt-5" or
            "gpt-5,gemini/gemini-3-pro".
        iterations: Max iterations (overrides config).
        search: Algorithm name ("topk", "adaevolve", "evox",
            "openevolve_native", etc.).
        config: YAML path, Config object, or None for defaults.
        agentic: Enable agentic mode (codebase root derived from initial_program).
        output_dir: Where to write results (temp dir if None).
        system_prompt: Domain-specific context for the LLM.
        api_base: Base URL for an OpenAI-compatible API.
        cleanup: Remove temp files after the run.

    Returns:
        DiscoveryResult with best program, score, solution, metrics, and
        output directory.
    """
    # Everything except the three leading positionals is keyword-only on the
    # async implementation, so gather those options in one place.
    options = dict(
        model=model,
        iterations=iterations,
        search=search,
        agentic=agentic,
        output_dir=output_dir,
        system_prompt=system_prompt,
        api_base=api_base,
        cleanup=cleanup,
    )
    coroutine = _run_discovery_async(initial_program, evaluator, config, **options)
    return asyncio.run(coroutine)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
async def _run_discovery_async(
    initial_program: Optional[Union[str, Path, List[str]]],
    evaluator: Union[str, Path, Callable],
    config: Union[str, Path, Config, None],
    *,
    model: Optional[str] = None,
    iterations: Optional[int] = None,
    search: Optional[str] = None,
    agentic: bool = False,
    output_dir: Optional[str] = None,
    system_prompt: Optional[str] = None,
    api_base: Optional[str] = None,
    cleanup: bool = True,
) -> DiscoveryResult:
    """Async implementation of run_discovery.

    Steps, in order:

    1. Load the config (or use the given ``Config``) and apply the keyword
       overrides.
    2. If no initial program was given and a benchmark is enabled in the
       config, resolve the benchmark to an initial program, an evaluator and
       evaluator environment variables.
    3. Prepare the program and evaluator via ``prepare_program`` /
       ``prepare_evaluator``.
    4. Choose the output directory: a fresh temp dir when ``output_dir`` is
       None and ``cleanup`` is true, otherwise ``output_dir`` or a directory
       built by ``build_output_dir``.
    5. Dispatch to an external backend runner when the search type is
       external, otherwise run the native ``Runner``.

    The returned ``output_dir`` is None only when results were written to the
    auto-created temp dir (which is passed to ``cleanup_temp``); a
    caller-supplied or built directory is always reported.

    Raises:
        ValueError: Benchmark resolution failed, evaluator env vars were
            combined with an external backend, or no LLM models are configured.
        ImportError: The selected external backend's package is not installed.
    """
    temp_dir: Optional[str] = None
    temp_files: List[str] = []
    evaluator_env_vars: Dict[str, str] = {}

    try:
        if isinstance(config, Config):
            config_obj = config
        else:
            config_obj = load_config(str(config) if config else None)

        apply_overrides(
            config_obj,
            model=model,
            api_base=api_base,
            agentic=agentic,
            search=search,
            system_prompt=system_prompt,
        )

        # Resolve benchmark problem if configured and no initial_program provided
        if initial_program is None and config_obj.benchmark and config_obj.benchmark.enabled:
            try:
                resolution = resolve_benchmark_problem(config_obj.benchmark)
                initial_program = resolution.initial_program_path
                evaluator = resolution.evaluator_path
                evaluator_env_vars = resolution.evaluator_env_vars
                logger.info(
                    "[Benchmark Loader] Benchmark: %s, Initial program: %s, Evaluator: %s",
                    config_obj.benchmark.name,
                    initial_program,
                    evaluator,
                )
            except Exception as exc:
                raise ValueError(f"Failed to load benchmark problem: {exc}") from exc

        # Prepare the program (optional — None means "from scratch").
        # NOTE(review): temp_dir is still None at this point (it is only
        # created further down); confirm the prepare_* helpers expect that.
        program_path = (
            prepare_program(initial_program, temp_dir, temp_files)
            if initial_program is not None
            else None
        )

        if program_path and config_obj.agentic.enabled and not config_obj.agentic.codebase_root:
            config_obj.agentic.codebase_root = os.path.dirname(os.path.abspath(program_path))

        # Prepare the evaluator
        evaluator_path = prepare_evaluator(evaluator, temp_dir, temp_files)

        # Prepare the output directory
        search_type = (
            getattr(config_obj.search, "type", None) if hasattr(config_obj, "search") else None
        )
        if output_dir is None and cleanup:
            temp_dir = tempfile.mkdtemp(prefix="skydiscover_")
            actual_output_dir = temp_dir
        else:
            from skydiscover.config import build_output_dir

            actual_output_dir = output_dir or build_output_dir(
                search_type or "default", program_path or "scratch"
            )
        os.makedirs(actual_output_dir, exist_ok=True)

        # Only the auto-created temp dir is handed to cleanup_temp, so that is
        # the only case in which there is no directory worth reporting.
        reported_output_dir = None if temp_dir is not None else actual_output_dir

        # External backends (openevolve, shinkaevolve, gepa)
        if search_type:
            from skydiscover.extras.external import (
                KNOWN_EXTERNAL,
                get_package_name,
                get_runner,
                is_external,
            )

            if is_external(search_type):
                if evaluator_env_vars:
                    env_var_names = ", ".join(sorted(evaluator_env_vars))
                    raise ValueError(
                        "Passing evaluator environment variables to external backends is not yet supported. "
                        f"External backend '{search_type}' cannot be used with evaluator env vars: "
                        f"{env_var_names}"
                    )

                from skydiscover.extras.monitor import start_monitor, stop_monitor

                monitor_server, monitor_callback, feedback_reader = start_monitor(
                    config_obj, actual_output_dir
                )
                try:
                    result = await get_runner(search_type)(
                        program_path=program_path,
                        evaluator_path=evaluator_path,
                        config_obj=config_obj,
                        iterations=iterations or config_obj.max_iterations,
                        output_dir=actual_output_dir,
                        monitor_callback=monitor_callback,
                        feedback_reader=feedback_reader,
                    )
                except ModuleNotFoundError as exc:
                    pkg = get_package_name(search_type)
                    raise ImportError(
                        f"{exc}\n\nThe '{search_type}' backend requires its package. "
                        f"Install with: pip install {pkg}"
                    ) from exc
                finally:
                    # Always shut the monitor down, even when the backend fails.
                    stop_monitor(monitor_server)
                result.output_dir = reported_output_dir
                return result

            # A known external backend that is_external() did not accept.
            if search_type in KNOWN_EXTERNAL:
                pkg = get_package_name(search_type)
                raise ImportError(
                    f"Search type '{search_type}' requires the '{pkg}' package. "
                    f"Install with: pip install {pkg}"
                )

        if not config_obj.llm.models:
            raise ValueError(
                "No LLM models configured. Provide a config with models or "
                "pass model= directly:\n\n"
                "    result = run_discovery(evaluator, model='gpt-5')"
            )

        # Initialize the runner
        controller = Runner(
            initial_program_path=program_path,
            evaluation_file=evaluator_path,
            config=config_obj,
            output_dir=actual_output_dir,
            evaluator_env_vars=evaluator_env_vars,
        )

        best_program = await controller.run(iterations=iterations)

        # Defaults used when the run produced no best program.
        best_score = 0.0
        best_solution = ""
        metrics: Dict[str, Any] = {}

        if best_program:
            best_solution = best_program.solution
            metrics = best_program.metrics or {}
            best_score = get_score(metrics)

        return DiscoveryResult(
            best_program=best_program,
            best_score=best_score,
            best_solution=best_solution,
            metrics=metrics,
            output_dir=reported_output_dir,
            initial_score=controller.initial_score,
        )

    finally:
        if cleanup:
            cleanup_temp(temp_files, temp_dir)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def discover_solution(
    evaluator: Callable[[str], Dict[str, Any]],
    initial_solution: Optional[str] = None,
    iterations: int = 100,
    search: Optional[str] = None,
    model: Optional[str] = None,
    **kwargs: Any,
) -> DiscoveryResult:
    """Evolve a plain-string solution scored by a Python callable.

    Thin convenience layer over ``run_discovery``: ``initial_solution`` is
    forwarded as ``initial_program`` and any extra keyword arguments are
    passed through untouched.

    Args:
        evaluator: Callable used to score a candidate.
        initial_solution: Optional starting solution text.
        iterations: Iteration budget (defaults to 100 here).
        search: Optional search algorithm name.
        model: Optional model name(s).
        **kwargs: Additional ``run_discovery`` keyword arguments.

    Returns:
        The ``DiscoveryResult`` produced by ``run_discovery``.
    """
    # Spelled out as explicit keywords so that a clashing key in ``kwargs``
    # (e.g. ``initial_program``) still raises TypeError.
    outcome = run_discovery(
        model=model,
        search=search,
        iterations=iterations,
        initial_program=initial_solution,
        evaluator=evaluator,
        **kwargs,
    )
    return outcome
|
skydiscover/benchmarks/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Benchmark resolver system for external problem sources."""
|
| 2 |
+
|
| 3 |
+
from skydiscover.benchmarks.base import BenchmarkResolver
|
| 4 |
+
|
| 5 |
+
__all__ = ["BenchmarkResolver"]
|
skydiscover/benchmarks/base.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base interface for benchmark resolvers.
|
| 2 |
+
|
| 3 |
+
Benchmark resolvers fetch problems from external sources (e.g., datasets, APIs)
|
| 4 |
+
and generate the necessary files (initial_program, evaluator configuration) for
|
| 5 |
+
SkyDiscover to run optimization on them.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from abc import ABC, abstractmethod
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any, Dict
|
| 11 |
+
|
| 12 |
+
from skydiscover.benchmarks.resolution import BenchmarkResolution
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BenchmarkResolver(ABC):
    """Base class for benchmark-specific problem resolvers.

    Resolvers are responsible for:
    1. Fetching problem specifications from external sources
    2. Generating initial_program files with appropriate structure
    3. Configuring evaluators (via environment variables or generated files)

    Example usage::

        resolver = KernelBenchResolver()
        resolution = resolver.resolve(
            config={'level': 1, 'problem_id': 3},
            output_dir=Path('/tmp/skydiscover_kernelbench_123'),
        )
        initial_program = resolution.initial_program_path
        evaluator = resolution.evaluator_path
        env_vars = resolution.evaluator_env_vars
    """

    @abstractmethod
    def resolve(self, config: Dict[str, Any], output_dir: Path) -> BenchmarkResolution:
        """Resolve a benchmark problem to concrete file paths and evaluator config.

        Args:
            config: Benchmark configuration dictionary containing benchmark-specific
                problem specifications and parameters.
                The exact keys depend on the benchmark implementation.
            output_dir: Directory where generated files should be placed.

        Returns:
            BenchmarkResolution containing:
                - initial_program_path: Path to the generated initial program file
                - evaluator_path: Path to the evaluator (file or directory)
                - evaluator_env_vars: Per-run environment variables for the evaluator
        """
|
skydiscover/benchmarks/resolution.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Benchmark resolution helpers."""
|
| 2 |
+
|
| 3 |
+
import importlib
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
import tempfile
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, Dict
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class BenchmarkResolution:
    """Resolved benchmark assets and evaluator-scoped configuration."""

    # Path of the initial program produced by the resolver.
    initial_program_path: str
    # Path of the evaluator to run against.
    evaluator_path: str
    # Environment variables scoped to the evaluator; empty unless provided.
    evaluator_env_vars: Dict[str, str] = field(default_factory=dict)


def resolve_benchmark_problem(benchmark_config: Any) -> BenchmarkResolution:
    """Load a benchmark problem using the resolver named in the config.

    The resolver is located by importing the module path stored in
    ``benchmark_config.resolver`` (the current working directory is put on
    ``sys.path`` first) and reading that module's ``resolver`` attribute. A
    fresh temporary directory is created for the resolver's generated files.

    Args:
        benchmark_config: Object exposing ``resolver`` (importable module
            path), and optionally ``name`` and ``params``.

    Returns:
        Whatever ``resolver.resolve(config=params, output_dir=...)`` returns.

    Raises:
        ValueError: ``benchmark_config.resolver`` is missing or empty.
        AttributeError: The resolver module has no module-level ``resolver``.
    """
    resolver_path = getattr(benchmark_config, "resolver", None)
    if not resolver_path:
        raise ValueError("BenchmarkConfig.resolver must be set to use benchmark loading")

    # Allow resolver modules that live in the directory the tool is run from.
    cwd = os.getcwd()
    if cwd not in sys.path:
        sys.path.insert(0, cwd)

    resolver_module = importlib.import_module(resolver_path)
    try:
        resolver = resolver_module.resolver
    except AttributeError as exc:
        raise AttributeError(
            f"Benchmark resolver module '{resolver_path}' must define a "
            "module-level 'resolver' object"
        ) from exc

    # The name ends up inside a directory name, so replace anything that is
    # not filename-safe; a path separator in the prefix would make mkdtemp
    # target a non-existent sub-directory and fail.
    benchmark_name = str(getattr(benchmark_config, "name", None) or "benchmark")
    safe_name = "".join(
        ch if (ch.isalnum() or ch in "-_.") else "_" for ch in benchmark_name
    )
    output_dir = Path(tempfile.mkdtemp(prefix=f"skydiscover_{safe_name}_"))

    # Treat a missing or None ``params`` as "no parameters".
    params = getattr(benchmark_config, "params", None) or {}
    return resolver.resolve(config=params, output_dir=output_dir)
|
skydiscover/cli.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command-line interface for SkyDiscover."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import asyncio
|
| 5 |
+
import logging
|
| 6 |
+
import multiprocessing
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import traceback
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
from skydiscover import Runner
|
| 13 |
+
from skydiscover.benchmarks.resolution import resolve_benchmark_problem
|
| 14 |
+
from skydiscover.config import _parse_model_spec, apply_overrides, load_config
|
| 15 |
+
|
| 16 |
+
# Select the "spawn" multiprocessing start method when this module is imported.
try:
    multiprocessing.set_start_method("spawn")
except RuntimeError:
    # set_start_method raises RuntimeError when a start method has already
    # been fixed for this process; in that case keep the existing one.
    pass
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# Values accepted by the --search / -s command-line option.
_SEARCH_CHOICES = [
    "evox",
    "adaevolve",
    "best_of_n",
    "beam_search",
    "topk",
    "openevolve_native",
    "openevolve",
    "shinkaevolve",
    "gepa",
    "gepa_native",
    "claude_code",
]


def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Build the CLI argument parser and parse arguments with it.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads them from ``sys.argv[1:]``, which is the previous behavior.

    Returns:
        Namespace with ``initial_program``, ``evaluation_file``, ``config``,
        ``output``, ``iterations``, ``log_level``, ``checkpoint``,
        ``api_base``, ``agentic``, ``model`` and ``search``.
    """
    parser = argparse.ArgumentParser(
        description="SkyDiscover - AI-Driven Scientific and Algorithmic Discovery",
    )

    # Positionals: the initial program may be omitted, the evaluator may not.
    parser.add_argument(
        "initial_program",
        nargs="?",
        default=None,
        help="Path to the initial program file (can be optional)",
    )
    parser.add_argument(
        "evaluation_file",
        help=(
            "Evaluator: path to a Python file (must define evaluate()) "
            "or a benchmark directory containing Dockerfile + evaluate.sh"
        ),
    )
    parser.add_argument("--config", "-c", help="Path to configuration file (YAML)", default=None)
    parser.add_argument("--output", "-o", help="Output directory for results", default=None)
    parser.add_argument(
        "--iterations", "-i", type=int, default=None, help="Maximum number of iterations"
    )
    parser.add_argument(
        "--log-level",
        "-l",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default=None,
        help="Logging level",
    )
    parser.add_argument(
        "--checkpoint",
        default=None,
        help="Path to a checkpoint directory to resume from",
    )
    parser.add_argument("--api-base", default=None, help="Base URL for the LLM API")
    parser.add_argument(
        "--agentic",
        action="store_true",
        default=False,
        help="Enable agentic mode (codebase root derived from initial program location)",
    )
    parser.add_argument(
        "--model",
        "-m",
        default=None,
        help="LLM model(s) for solution generation, comma-separated (e.g. 'gpt-5', 'gpt-5,gemini/gemini-3-pro')",
    )
    parser.add_argument(
        "--search",
        "-s",
        choices=_SEARCH_CHOICES,
        default=None,
        help="Search algorithm to use",
    )

    return parser.parse_args(argv)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def main() -> int:
    """Console-script entry point: run ``main_async`` to completion.

    Returns:
        The exit code produced by ``main_async``.
    """
    exit_code = asyncio.run(main_async())
    return exit_code
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
async def main_async() -> int:
    """Async entry point for the CLI.

    Parses arguments, validates the input paths, loads the configuration
    (only when ``--config`` or an override flag is given), optionally resolves
    a configured benchmark, and then runs either an external backend or the
    native ``Runner``.

    Returns:
        0 on success, 1 on any reported error.
    """
    args = parse_args()
    _configure_logging(args.log_level)

    if args.initial_program and not os.path.exists(args.initial_program):
        print(f"Error: Initial program file '{args.initial_program}' not found", file=sys.stderr)
        return 1
    if not os.path.exists(args.evaluation_file):
        print(f"Error: Evaluation file '{args.evaluation_file}' not found", file=sys.stderr)
        return 1

    has_overrides = any((args.api_base, args.model, args.agentic, args.search))
    config = None
    evaluator_env_vars: Optional[dict[str, str]] = None

    # Load the configuration
    if args.config or has_overrides:
        config = load_config(args.config)

    # Everything below dereferences the config, so it only applies when one
    # was loaded; with no config there are no overrides and no benchmark.
    if config is not None:
        try:
            apply_overrides(
                config,
                model=args.model,
                api_base=args.api_base,
                agentic=args.agentic,
                search=args.search,
            )
        except ValueError as exc:
            print(f"Error: {exc}", file=sys.stderr)
            return 1

        # Resolve benchmark problem if configured and no initial_program provided
        if args.initial_program is None and config.benchmark and config.benchmark.enabled:
            try:
                resolution = resolve_benchmark_problem(config.benchmark)
                args.initial_program = resolution.initial_program_path
                args.evaluation_file = resolution.evaluator_path
                evaluator_env_vars = resolution.evaluator_env_vars
                print(
                    f"[Benchmark Loader] Benchmark: {config.benchmark.name}, Initial program: {args.initial_program}, Evaluator: {args.evaluation_file}"
                )
            except Exception as exc:
                print(f"Error: Failed to load benchmark problem: {exc}", file=sys.stderr)
                traceback.print_exc()
                return 1

        if args.model:
            print("Active models:")
            for i, m in enumerate(config.llm.models):
                provider, *_ = _parse_model_spec(m.name)
                print(f"  {i + 1}. {m.name} (provider: {provider}, weight: {m.weight})")
        if args.api_base:
            print(f"Using API base: {config.llm.api_base}")
        if args.agentic:
            if not config.agentic.codebase_root and args.initial_program:
                config.agentic.codebase_root = os.path.dirname(
                    os.path.abspath(args.initial_program)
                )
            print(f"Agentic mode enabled (codebase: {config.agentic.codebase_root})")
        if args.search:
            print(f"Using search algorithm: {args.search}")

    # Run the discovery
    try:
        search_type = config.search.type if config and hasattr(config, "search") else None

        if search_type:
            from skydiscover.extras.external import (
                KNOWN_EXTERNAL,
                get_package_name,
                get_runner,
                is_external,
            )

            # External backends (openevolve, shinkaevolve, gepa)
            if is_external(search_type):
                if evaluator_env_vars:
                    env_var_names = ", ".join(sorted(evaluator_env_vars))
                    print(
                        "Error: Passing evaluator environment variables to external backends "
                        "is not yet supported. "
                        f"External backend '{search_type}' cannot be used with evaluator env vars: "
                        f"{env_var_names}",
                        file=sys.stderr,
                    )
                    return 1

                from skydiscover.config import build_output_dir

                output_dir = args.output or build_output_dir(
                    search_type, args.initial_program or "scratch"
                )
                os.makedirs(output_dir, exist_ok=True)

                from skydiscover.extras.monitor import start_monitor, stop_monitor

                # Start monitor for external backends as well
                monitor_server, monitor_callback, feedback_reader = start_monitor(
                    config, output_dir
                )
                try:
                    result = await get_runner(search_type)(
                        program_path=args.initial_program,
                        evaluator_path=args.evaluation_file,
                        config_obj=config,
                        iterations=args.iterations or config.max_iterations,
                        output_dir=output_dir,
                        monitor_callback=monitor_callback,
                        feedback_reader=feedback_reader,
                    )
                except ModuleNotFoundError as exc:
                    pkg = get_package_name(search_type)
                    print(f"Error: {exc}", file=sys.stderr)
                    print(f"\nThe '{search_type}' backend requires its package.", file=sys.stderr)
                    print(f"Install with: pip install {pkg}", file=sys.stderr)
                    return 1
                finally:
                    # Shut the monitor down whether or not the backend succeeded.
                    stop_monitor(monitor_server)

                print(f"\nDiscovery complete! Best score: {result.best_score:.4f}")
                return 0

            # A known external backend that is_external() did not accept.
            if search_type in KNOWN_EXTERNAL:
                pkg = get_package_name(search_type)
                print(
                    f"Error: Search type '{search_type}' requires the '{pkg}' package. "
                    f"Install with: pip install {pkg}",
                    file=sys.stderr,
                )
                return 1

        # Validate the checkpoint before building the runner so a bad path
        # fails fast without constructing anything.
        if args.checkpoint:
            if not os.path.exists(args.checkpoint):
                print(f"Error: Checkpoint directory '{args.checkpoint}' not found", file=sys.stderr)
                return 1
            print(f"Will resume from checkpoint: {args.checkpoint}")

        # Initialize the runner
        runner = Runner(
            initial_program_path=args.initial_program,
            evaluation_file=args.evaluation_file,
            config=config,
            config_path=args.config if config is None else None,
            output_dir=args.output,
            evaluator_env_vars=evaluator_env_vars,
        )

        # Run the discovery
        best_program = await runner.run(
            iterations=args.iterations,
            checkpoint_path=args.checkpoint,
        )

        checkpoint_dir = os.path.join(runner.output_dir, "checkpoints")
        latest_checkpoint = _find_latest_checkpoint(checkpoint_dir)

        print("\nDiscovery complete!")
        if best_program is None:
            print("No valid programs were found.")
        else:
            print("Best program metrics:")
            for name, value in best_program.metrics.items():
                formatted = f"{value:.4f}" if isinstance(value, (int, float)) else str(value)
                print(f"  {name}: {formatted}")

        if latest_checkpoint:
            print(f"\nLatest checkpoint: {latest_checkpoint}")
            print(f"To resume: --checkpoint {latest_checkpoint}")

        return 0

    except Exception as exc:
        # Top-level CLI boundary: report the failure and turn it into an exit code.
        print(f"Error: {exc}", file=sys.stderr)
        traceback.print_exc()
        return 1
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def _configure_logging(level_name: Optional[str]) -> None:
    """Set up the root logger with the SkyDiscover console format.

    Args:
        level_name: Name of a ``logging`` level (e.g. ``"DEBUG"``) or None.
            With None, the root logger is set to WARNING and the
            ``skydiscover`` logger to INFO. With an explicit name, both use
            that level so the request also governs the package's own loggers.
    """
    from skydiscover.search.utils.logging_utils import _ConsoleFilter, _ConsoleFormatter

    log_level = getattr(logging, level_name) if level_name else logging.WARNING
    root = logging.getLogger()
    root.setLevel(log_level)
    # Attach the console handler only once, so repeated calls (or handlers
    # installed by the host application) do not duplicate output.
    if not root.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(_ConsoleFormatter())
        handler.addFilter(_ConsoleFilter())
        root.addHandler(handler)

    # Package loggers default to INFO, but an explicit level must win:
    # pinning INFO here would hide DEBUG records and keep INFO records
    # flowing even when the user asked for ERROR.
    package_level = log_level if level_name else logging.INFO
    logging.getLogger("skydiscover").setLevel(package_level)
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _find_latest_checkpoint(checkpoint_dir: str) -> Optional[str]:
    """Return the path of the latest checkpoint directory named like ``checkpoint_<n>``.

    Only sub-directories whose name is exactly ``checkpoint_`` followed by
    decimal digits are considered; other entries (files, differently named
    directories) are ignored.

    Args:
        checkpoint_dir: Directory that holds the checkpoint sub-directories.

    Returns:
        Full path of the sub-directory with the highest ``<n>``, or None when
        ``checkpoint_dir`` is not a directory or holds no matching entry.
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    prefix = "checkpoint_"
    latest_iteration = -1
    latest_path: Optional[str] = None

    for name in os.listdir(checkpoint_dir):
        if not name.startswith(prefix):
            continue
        suffix = name[len(prefix):]
        # isascii() keeps out non-ASCII digit characters that isdigit() accepts.
        if not (suffix.isascii() and suffix.isdigit()):
            continue
        full_path = os.path.join(checkpoint_dir, name)
        if not os.path.isdir(full_path):
            continue
        iteration = int(suffix)
        if iteration > latest_iteration:
            latest_iteration = iteration
            latest_path = full_path

    return latest_path
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
if __name__ == "__main__":
|
| 327 |
+
sys.exit(main())
|