JustinTX committed on
Commit
e530698
·
verified ·
1 Parent(s): 730e01e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/workflows/ci.yml +51 -0
  2. .gitignore +60 -0
  3. .gitmodules +3 -0
  4. LICENSE +201 -0
  5. README.md +388 -0
  6. benchmarks/kernelbench/__init__.py +0 -0
  7. benchmarks/kernelbench/requirements.txt +4 -0
  8. docs/.gitignore +3 -0
  9. docs/README.md +13 -0
  10. docs/app/api/search/route.ts +6 -0
  11. docs/app/docs/layout.tsx +11 -0
  12. docs/app/global.css +3 -0
  13. docs/app/page.tsx +22 -0
  14. docs/content/docs/getting-started/index.mdx +11 -0
  15. docs/content/docs/getting-started/installation.mdx +51 -0
  16. docs/content/docs/getting-started/meta.json +4 -0
  17. docs/content/docs/meta.json +7 -0
  18. docs/lib/source.ts +7 -0
  19. docs/mdx-components.tsx +18 -0
  20. docs/next-env.d.ts +6 -0
  21. docs/next.config.mjs +19 -0
  22. docs/package-lock.json +0 -0
  23. docs/package.json +30 -0
  24. docs/postcss.config.mjs +5 -0
  25. docs/source.config.ts +7 -0
  26. docs/tsconfig.json +36 -0
  27. examples/text_similarity/config.yaml +41 -0
  28. examples/text_similarity/evaluator/Dockerfile +9 -0
  29. examples/text_similarity/evaluator/evaluate.sh +5 -0
  30. examples/text_similarity/evaluator/evaluator.py +48 -0
  31. examples/text_similarity/evaluator/pairs.json +51 -0
  32. examples/text_similarity/initial_program.py +34 -0
  33. scripts/reproduce/adrs.sh +66 -0
  34. scripts/reproduce/ale_bench.sh +63 -0
  35. scripts/reproduce/arc.sh +45 -0
  36. scripts/reproduce/frontier_cs.sh +52 -0
  37. scripts/reproduce/gpu.sh +58 -0
  38. scripts/reproduce/math.sh +77 -0
  39. scripts/reproduce/prompt_opt.sh +45 -0
  40. scripts/reproduce/run_all.sh +18 -0
  41. scripts/run_cp.sh +20 -0
  42. setup.py +3 -0
  43. skydiscover/README.md +50 -0
  44. skydiscover/__init__.py +19 -0
  45. skydiscover/_version.py +3 -0
  46. skydiscover/api.py +296 -0
  47. skydiscover/benchmarks/__init__.py +5 -0
  48. skydiscover/benchmarks/base.py +48 -0
  49. skydiscover/benchmarks/resolution.py +38 -0
  50. skydiscover/cli.py +327 -0
.github/workflows/ci.yml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ concurrency:
9
+ group: ${{ github.workflow }}-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ lint:
14
+ runs-on: ubuntu-latest
15
+ timeout-minutes: 5
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - uses: astral-sh/setup-uv@v4
19
+ with:
20
+ python-version: "3.10"
21
+ enable-cache: true
22
+ - run: uv sync --frozen --extra dev
23
+ - run: uv run black --check skydiscover/
24
+ - run: uv run isort --check skydiscover/
25
+
26
+ test:
27
+ runs-on: ubuntu-latest
28
+ timeout-minutes: 10
29
+ steps:
30
+ - uses: actions/checkout@v4
31
+ - uses: astral-sh/setup-uv@v4
32
+ with:
33
+ python-version: "3.10"
34
+ enable-cache: true
35
+ - run: uv sync --frozen --extra dev
36
+ - name: Smoke test — package imports cleanly
37
+ run: uv run python -c "from skydiscover import Runner, run_discovery, discover_solution, __version__; print(f'skydiscover {__version__} OK')"
38
+ - name: Run tests
39
+ run: uv run pytest tests/ -v
40
+
41
+ build:
42
+ runs-on: ubuntu-latest
43
+ timeout-minutes: 5
44
+ needs: [lint, test]
45
+ steps:
46
+ - uses: actions/checkout@v4
47
+ - uses: astral-sh/setup-uv@v4
48
+ with:
49
+ python-version: "3.10"
50
+ enable-cache: true
51
+ - run: uv build
.gitignore ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .eggs/
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+ *.swo
22
+ .claude/
23
+
24
+ # OS
25
+ .DS_Store
26
+
27
+ # Testing
28
+ .pytest_cache/
29
+ .coverage
30
+ htmlcov/
31
+
32
+ # Secrets
33
+ .env
34
+ secrets.yaml
35
+
36
+ # Logs & outputs
37
+ *.log
38
+ *.jsonl
39
+ output*/
40
+ outputs*/
41
+ outputs_*/
42
+
43
+
44
+ # Benchmark generated data
45
+ benchmarks/image_gen/sky_festival/sky_festival_output/
46
+ benchmarks/image_gen/sky_festival/sky_festival_paradigm_output_*/
47
+ benchmarks/frontier-cs-eval/Frontier-CS
48
+ benchmarks/ADRS/eplb/expert-load.json
49
+ benchmarks/ADRS/cloudcast/profiles/
50
+ benchmarks/ADRS/cloudcast/examples/
51
+ benchmarks/ADRS/llm_sql/datasets/
52
+
53
+ # Generated test outputs (re-generate with test_all_benchmarks.sh)
54
+ tests/**/test_outputs_*/
55
+
56
+ # Evaluation run outputs
57
+ eval_runs/
58
+
59
+ # Local documentation
60
+ tasks/
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "benchmarks/ale_bench/ALE-Bench"]
2
+ path = benchmarks/ale_bench/ALE-Bench
3
+ url = https://github.com/SakanaAI/ALE-Bench.git
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2025 SkyDiscover Team
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center">
2
+   <img src="assets/logo_vector.png" height="80" alt="SkyDiscover logo" style="vertical-align: middle;">&nbsp;
3
+
4
+   <b>SkyDiscover</b>
5
+ </h1>
6
+
7
+
8
+ <p align="center"> A Flexible Framework for AI-Driven Scientific and Algorithmic Discovery</p>
9
+ <p align="center">
10
+   <a href="https://skydiscover-ai.github.io/blog.html"><img src="https://img.shields.io/badge/blog-SkyDiscover-orange?style=flat-square" alt="Blog" /></a>
11
+ <a href="https://arxiv.org/abs/2602.20133"><img src="https://img.shields.io/badge/paper-AdaEvolve-red?style=flat-square" alt="AdaEvolve Paper" /></a>
12
+ <a href="https://arxiv.org/abs/2602.23413"><img src="https://img.shields.io/badge/paper-EvoX-lightblue?style=flat-square" alt="EvoX Paper" /></a>
13
+   <a href="LICENSE"><img src="https://img.shields.io/badge/license-Apache--2.0-green?style=flat-square" alt="License: Apache-2.0" /></a>
14
+ </p>
15
+
16
+
17
+
18
+ <p align="center">
19
+ <img src="assets/architecture.png" width="720" alt="SkyDiscover architecture"><br>
20
+ </p>
21
+
22
+
23
+ **SkyDiscover** is a modular framework for AI-driven scientific and algorithmic discovery, providing a unified interface for implementing, running, and fairly comparing discovery algorithms across 200+ optimization tasks.
24
+
25
+ SkyDiscover introduces two new adaptive optimization algorithms:
26
+
27
+ - **[AdaEvolve](https://arxiv.org/abs/2602.20133)**, which dynamically adjusts its optimization behavior based on observed progress.
28
+ - **[EvoX](https://arxiv.org/abs/2602.23413)**, which dynamically evolves the optimization (evolution) strategy itself using LLMs on the fly.
29
+
30
+ SkyDiscover can also run OpenEvolve, ShinkaEvolve, and GEPA from their own source code as external backends, so you can quickly benchmark against them. It additionally ships native implementations of OpenEvolve and GEPA, available as the `openevolve_native` and `gepa_native` algorithms built on the modular interface.
31
+
32
+ SkyDiscover natively supports [Harbor](https://harborframework.com/)-format benchmarks, so you can run external benchmark suites out of the box, including [AlgoTune](https://github.com/oripress/AlgoTune), [EvoEval](https://github.com/evo-eval/evoeval), [HumanEvalFix](https://github.com/bigcode-project/octopack), [BigCodeBench](https://github.com/bigcode-project/bigcodebench), [LiveCodeBench](https://livecodebench.github.io/), [USACO](https://usaco.org/), [CRUSTBench](https://github.com/AInfinity/CRUSTBench), and [CodePDE](https://github.com/LithiumDA/CodePDE).
33
+ > 🚧 This project is under active development.
34
+
35
+ ---
36
+
37
+ ## 🏆 Benchmark Performance
38
+
39
+ Across ~200 optimization benchmarks, AdaEvolve and EvoX achieve the strongest open-source results: matching or exceeding AlphaEvolve and human SOTA, and outperforming OpenEvolve, GEPA, and ShinkaEvolve under identical generation budgets.
40
+
41
+ - **Frontier-CS (172 problems)**: ~34% median score improvement over OpenEvolve, GEPA, and ShinkaEvolve
42
+ - **Math + Systems Optimization (14 tasks evaluated)**: Matches or exceeds AlphaEvolve and human-designed SOTA on 6/6 systems and 6/8 math tasks
43
+ - **Real-world systems impact**: 41% lower cross-cloud transfer cost, 14% better GPU load balance for MoE serving, and 29% lower KV-cache pressure via GPU model placement
44
+
45
+ <p align="center">
46
+ <img src="assets/benchmarks.png" width="900" alt="SkyDiscover benchmarks">
47
+ </p>
48
+
49
+ <details>
50
+ <summary><b>📊 Complete results of AdaEvolve and EvoX (100 iterations)</b></summary>
51
+
52
+ > AdaEvolve and EvoX are **complementary**: AdaEvolve adapts search *parameters* for fast early gains; EvoX evolves the search *strategy itself* for stronger long-horizon gains. Both are built on SkyDiscover.
53
+
54
+ <p align="center">
55
+ <img src="assets/comparison.png" width="900" alt="Main results for systems and math problems">
56
+ </p>
57
+
58
+ </details>
59
+
60
+ <details>
61
+ <summary><b>📈 Scaling behavior of AdaEvolve and EvoX</b></summary>
62
+
63
+ The scaling behavior of AdaEvolve and EvoX shows a **complementary crossover**. AdaEvolve's per-iteration parameter adaptation yields fast early gains in low-budget runs (T≤50), while EvoX's demand-driven strategy evolution unlocks step-change improvements in longer runs (T≥50).
64
+
65
+ <p align="center">
66
+ <img src="assets/scaling_comparison.png" width="900" alt="Scaling behavior of AdaEvolve vs EvoX across 500 iterations">
67
+ <br><em>Best-so-far score vs. iteration for Signal Processing, Heilbronn Convex, Prism, and Cloudcast (500 iterations, GPT-5).</em>
68
+ </p>
69
+
70
+ </details>
71
+
72
+ <details>
73
+ <summary><b>🔗 Evolving AdaEvolve's policy with EvoX (coming soon)</b></summary>
74
+
75
+ The two methods are **composable**: EvoX can evolve using AdaEvolve as its starting strategy, achieving the best results on 3 out of 4 benchmarks (100 iterations, GPT-5). This combined mode will be available in SkyDiscover soon.
76
+
77
+ | Benchmark | AdaEvolve | EvoX (Random Init) | EvoX (AdaEvolve Init) |
78
+ |:--|--:|--:|--:|
79
+ | Signal Proc. (↑) | 0.718 | 0.721 | **0.760** |
80
+ | Heilbronn Cvx. (↑) | 0.0290 | 0.0270 | **0.0291** |
81
+ | Cloudcast (↓) | 640.5 | 637.1 | **623.4** |
82
+ | Prism (↑) | 26.37 | **30.52** | 26.27 |
83
+
84
+ </details>
85
+
86
+ <details>
87
+ <summary><b>Task breakdown across math, systems, and programming challenges</b></summary>
88
+
89
+ | | Benchmark | Domain | Tasks | Description |
90
+ |-|-----------|--------|------:|-------------|
91
+ | 🔢 | [math/](benchmarks/math/) | Math | 14 | Circle packing, Erdős problems, geometric optimization |
92
+ | 🖥️ | [ADRS/](benchmarks/ADRS/) | Systems | 5 | Cloud scheduling, load balancing, MoE expert placement |
93
+ | ⚡ | [gpu_mode/](benchmarks/gpu_mode/) | Systems | 4 | GPU kernel optimization |
94
+ | 🔧 | [kernelbench/](benchmarks/kernelbench/) | Systems | 250+ | [KernelBench](https://github.com/ScalingIntelligence/KernelBench) GPU kernel speedup optimization |
95
+ | 🧩 | [frontier-cs-eval/](benchmarks/frontier-cs-eval/) | Algorithms | 172 | [Frontier-CS](https://frontier-cs.org/) competitive programming |
96
+ | 🧠 | [arc_benchmark/](benchmarks/arc_benchmark/) | Reasoning | — | ARC-AGI visual reasoning |
97
+ | 💻 | [ale_bench/](benchmarks/ale_bench/) | Algorithms | 10 | Algorithmic programming contests |
98
+ | 🎨 | [image_gen/](benchmarks/image_gen/) | Creative | 1 | AI image generation evolution |
99
+ | 💬 | [prompt_optimization/](benchmarks/prompt_optimization/) | NLP | 1 | HotPotQA prompt evolution |
100
+
101
+ See [Dependency extras](#dependency-extras) for install commands per benchmark.
102
+
103
+ </details>
104
+
105
+ ## 🚀 Quick Start
106
+
107
+ **Prerequisites:** Python >= 3.10, [uv](https://docs.astral.sh/uv/)
108
+
109
+ ```bash
110
+ # Install
111
+ uv sync
112
+ export OPENAI_API_KEY="<your-key>"
113
+
114
+ # Try the circle packing benchmark
115
+ uv sync --extra math
116
+ uv run skydiscover-run benchmarks/math/circle_packing/initial_program.py \
117
+ benchmarks/math/circle_packing/evaluator.py \
118
+ --config benchmarks/math/circle_packing/config.yaml \
119
+ --search evox \
120
+ --iterations 100
121
+
122
+ uv run skydiscover-run benchmarks/math/circle_packing/initial_program.py \
123
+ benchmarks/math/circle_packing/evaluator.py \
124
+ --config benchmarks/math/circle_packing/config.yaml \
125
+ --search adaevolve \
126
+ --iterations 100
127
+
128
+ # Or run on your own problem
129
+ # algo can be "evox", "adaevolve", "openevolve", "gepa", "shinkaevolve"
130
+ uv run skydiscover-run initial_program.py evaluator.py \
131
+ --search <algo> \
132
+ --model gpt-5 \
133
+ --iterations 100
134
+
135
+ # initial_program is optional — omit it to let the LLM start from scratch
136
+ uv run skydiscover-run evaluator.py \
137
+ --search <algo> \
138
+ --model gpt-5 \
139
+ --iterations 100
140
+
141
+ # Run a Harbor benchmark (e.g. AlgoTune) — no seed program needed
142
+ pip install harbor
143
+ harbor datasets download algotune@1.0 -o /tmp/algotune
144
+ uv run skydiscover-run /tmp/algotune/<id>/algotune-set-cover \
145
+ --model anthropic/claude-sonnet-4-6 \
146
+ --search best_of_n -i 10
147
+ ```
148
+
149
+ Or use the Python API:
150
+
151
+ ```python
152
+ from skydiscover import run_discovery
153
+
154
+ result = run_discovery(
155
+ initial_program="initial_program.py",
156
+ evaluator="evaluator.py",
157
+ search="adaevolve", # or "evox", "openevolve", "gepa", "shinkaevolve"
158
+ model="gpt-5",
159
+ iterations=100,
160
+ )
161
+
162
+ print(result.best_score, result.best_solution)
163
+ ```
164
+
165
+
166
+ ## ✏️ What You Write
167
+
168
+ ### Scoring Function (required)
169
+
170
+ SkyDiscover supports three evaluator formats — pick whichever fits your use case:
171
+
172
+ | Format | When to use | What you point `evaluation_file` at |
173
+ |:---|:---|:---|
174
+ | **Python function** | Simple tasks, no system deps | `evaluator.py` |
175
+ | **Containerized** | Custom deps, data files, isolation | `evaluator/` directory (must contain `Dockerfile` + `evaluate.sh`) |
176
+ | **Harbor task** | External benchmark suites (AlgoTune, EvoEval, HumanEvalFix, BigCodeBench, LiveCodeBench, USACO, CRUSTBench, CodePDE, and more) | Task directory (must contain `instruction.md` + `tests/` + `environment/Dockerfile`) |
177
+
178
+ SkyDiscover auto-detects the format. See [`benchmarks/README.md`](benchmarks/README.md#adding-a-benchmark) for full setup instructions.
179
+
180
+ **Python evaluator** — a file with an `evaluate(program_path)` function:
181
+
182
+ ```python
183
+ def evaluate(program_path):
184
+ score = run_and_grade(program_path)
185
+ return {
186
+ "combined_score": score, # primary optimization target (maximized)
187
+ "artifacts": { # optional — stored with the solution for future context
188
+ "feedback": "Off by one in the loop boundary",
189
+ },
190
+ }
191
+ ```
192
+
193
+ **Containerized evaluator** — a directory with a `Dockerfile` and `evaluate.sh` that writes JSON to stdout. Runs in Docker, so it can have arbitrary dependencies.
194
+
195
+ **Harbor task** — a directory following the [Harbor](https://harborframework.com/) format (`instruction.md`, `environment/Dockerfile`, `tests/test.sh`). Works out of the box with 8+ tested benchmark suites (see [benchmarks/README.md](benchmarks/README.md#tested-harbor-datasets) for the full list).
196
+
197
+ - **combined_score** drives evolution. If omitted, SkyDiscover averages all numeric values in the dict.
198
+ - **artifacts** is optional — entries are injected into the next LLM prompt as context.
199
+
200
+ For `search.type: adaevolve`, you can also enable explicit Pareto optimization by configuring `search.database.pareto_objectives` and returning those objective metrics directly from the evaluator. In that mode, `combined_score` becomes optional and is only used as a scalar fallback/proxy when configured.
201
+
202
+ ### Starting Solution (optional)
203
+
204
+ The initial program is **optional**. When omitted, the LLM generates a solution from scratch. When provided, mark the region to mutate with `EVOLVE-BLOCK-START` / `EVOLVE-BLOCK-END` markers; everything outside the markers is left untouched.
205
+
206
+ ```python
207
+ # EVOLVE-BLOCK-START
208
+ def solve(input_data):
209
+ return input_data # baseline — SkyDiscover will improve this
210
+ # EVOLVE-BLOCK-END
211
+ ```
212
+
213
+ If no markers are present, the entire file is treated as mutable.
214
+
215
+
216
+ ## 🧬 Pick an Algorithm
217
+
218
+ See [Benchmark Performance](#-benchmark-performance) for a detailed comparison of AdaEvolve and EvoX against other algorithms.
219
+
220
+ | Algorithm | Flag | Description |
221
+ |:---|:---|:---|
222
+ | ⭐&nbsp;**AdaEvolve** | `--search adaevolve` | Multi-island adaptive search with UCB, migration, and paradigm breakthroughs |
223
+ | 🧠&nbsp;**EvoX** | `--search evox` | Self-evolving paradigm that co-adapts solution generation and experience management |
224
+ | 📊&nbsp;**Top-K** | `--search topk` | Selects top-K solutions to refine |
225
+ | 🔍&nbsp;**Beam&nbsp;Search** | `--search beam_search` | Breadth-first expansion of a beam of top solutions |
226
+ | 🎲&nbsp;**Best-of-N** | `--search best_of_n` | Generates N variants per iteration, keeps the best |
227
+ | 🧪&nbsp;**GEPA&nbsp;Native** | `--search gepa_native` | Pareto-efficient search with reflective prompting and LLM-mediated merge |
228
+ | 🗺️&nbsp;**OpenEvolve&nbsp;Native** | `--search openevolve_native` | MAP-Elites + island-based evolutionary search |
229
+
230
+ ### External backends
231
+
232
+ Install with `uv sync --extra external`, then use the corresponding flag:
233
+
234
+ | Backend | Flag | Source |
235
+ |:---|:---|:---|
236
+ | **OpenEvolve** | `--search openevolve` | [codelion/openevolve](https://github.com/codelion/openevolve) |
237
+ | **GEPA** | `--search gepa` | [gepa-ai/gepa](https://github.com/gepa-ai/gepa) |
238
+ | **ShinkaEvolve** | `--search shinkaevolve` | [SakanaAI/ShinkaEvolve](https://github.com/SakanaAI/ShinkaEvolve) (manual install) |
239
+
240
+ <details>
241
+ <summary>ShinkaEvolve manual install</summary>
242
+
243
+ ```bash
244
+ git clone --depth 1 https://github.com/SakanaAI/ShinkaEvolve.git external_repos/ShinkaEvolve
245
+ uv pip install -e external_repos/ShinkaEvolve
246
+ ```
247
+
248
+ </details>
249
+
250
+
251
+ ## ⚙️ Configuration
252
+
253
+ Pass a YAML config with `-c`. See [configs/](configs/) for full annotated templates.
254
+
255
+ ```yaml
256
+ max_iterations: 100
257
+ llm:
258
+ models: [{ name: "gemini/gemini-3-pro-preview", weight: 1.0 }]
259
+ search:
260
+ type: "adaevolve" # or "evox", "topk", "beam_search", "best_of_n"
261
+ prompt:
262
+ system_message: |
263
+ You are an expert at optimizing algorithms.
264
+ ```
265
+
266
+ API keys (OPENAI_API_KEY, GEMINI_API_KEY, etc.) are resolved from environment variables automatically.
267
+
268
+ ### 📊 Live Monitor & Human Feedback
269
+
270
+ Add `monitor: { enabled: true }` to your config. The dashboard URL is printed when the run starts; the dashboard shows a scatter plot of all programs, code diffs, metrics, and AI summaries. A **Human Feedback** panel lets you steer evolution in real time.
271
+ Replay a completed run:
272
+
273
+ ```bash
274
+ uv run skydiscover-viewer /path/to/checkpoints/checkpoint_100
275
+ ```
276
+
277
+
278
+ ## 📖 Reference
279
+
280
+ <details>
281
+ <summary><b>CLI flags</b></summary>
282
+
283
+ ```
284
+ uv run skydiscover-run [INITIAL_PROGRAM] EVALUATOR [options]
285
+ ```
286
+
287
+ | Flag | Description |
288
+ |:---|:---|
289
+ | `-c, --config FILE` | Config YAML |
290
+ | `-i, --iterations N` | Number of iterations |
291
+ | `-m, --model MODEL` | LLM model (overrides config) |
292
+ | `-s, --search TYPE` | Search algorithm |
293
+ | `-o, --output DIR` | Output directory |
294
+ | `--api-base URL` | Override LLM API endpoint |
295
+ | `--checkpoint DIR` | Resume from checkpoint |
296
+ | `--agentic` | Enable agentic mode (LLM can read your files) |
297
+ | `-l, --log-level LEVEL` | DEBUG, INFO, WARNING, or ERROR |
298
+
299
+ </details>
300
+
301
+ <details>
302
+ <summary><b>Python API — discover_solution() (convenience wrapper)</b></summary>
303
+
304
+ `discover_solution()` is a convenience wrapper around `run_discovery()` (shown in [Quick Start](#-quick-start)) for inline string solutions and callable evaluators:
305
+
306
+ ```python
307
+ from skydiscover import discover_solution
308
+
309
+ result = discover_solution(
310
+ initial_solution="def solve(x): return x", # optional — omit to start from scratch
311
+ evaluator=lambda path: {"combined_score": run_tests(path)},
312
+ iterations=50,
313
+ search="evox",
314
+ )
315
+ ```
316
+
317
+ </details>
318
+
319
+ <details>
320
+ <summary><b>Model providers</b></summary>
321
+
322
+ Any [LiteLLM](https://docs.litellm.ai/)-compatible model works using `provider/model` format:
323
+
324
+ ```bash
325
+ --model gpt-5 # OpenAI (default)
326
+ --model gemini/gemini-3-pro-preview # Gemini
327
+ --model anthropic/claude-sonnet-4-20250514 # Anthropic
328
+ --model ollama/llama3 --api-base http://localhost:11434/v1 # Local (Ollama, vLLM, etc.)
329
+ ```
330
+
331
+ Multi-model pools with weighted sampling are supported in config:
332
+
333
+ ```yaml
334
+ llm:
335
+ models:
336
+ - name: "gpt-5-mini"
337
+ weight: 0.7
338
+ - name: "gemini/gemini-2.0-flash"
339
+ weight: 0.3
340
+ ```
341
+
342
+ </details>
343
+
344
+ <details id="dependency-extras">
345
+ <summary><b>Benchmark dependency extras</b></summary>
346
+
347
+ ```bash
348
+ uv sync # Base install
349
+ uv sync --extra math # Math benchmarks (SciPy, JAX, PyWavelets, …)
350
+ uv sync --extra adrs # ADRS systems benchmarks
351
+ uv sync --extra frontier-cs # Frontier-CS benchmark tooling
352
+ uv sync --extra external # OpenEvolve / GEPA / ShinkaEvolve backends
353
+ uv sync --extra prompt-optimization # HotPotQA prompt optimization
354
+ ```
355
+
356
+ Combine extras as needed: `uv sync --extra external --extra math`
357
+
358
+ If a benchmark ships its own `requirements.txt`, also run: `uv pip install -r path/to/requirements.txt`
359
+
360
+ </details>
361
+
362
+ ---
363
+
364
+ ## 🛠️ Extending SkyDiscover
365
+
366
+ - **New benchmark** → [`benchmarks/README.md`](benchmarks/README.md#adding-a-benchmark)
367
+ - **New search algorithm** → [`skydiscover/search/README.md`](skydiscover/search/README.md)
368
+ - **New context builder** → [`skydiscover/context_builder/README.md`](skydiscover/context_builder/README.md)
369
+
370
+ ---
371
+
372
+ ## 🔗 Related Work
373
+ SkyDiscover is inspired by [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and incorporates useful code components from open-source efforts such as [OpenEvolve](https://github.com/codelion/openevolve). Its interface is compatible with the [optimize_anything](https://gepa-ai.github.io/gepa/blog/2026/02/18/introducing-optimize-anything/) API.
374
+
375
+ ## ✍️ Citation
376
+
377
+ ```bibtex
378
+ @misc{skydiscover2026,
379
+ title = {SkyDiscover: A Flexible Framework for AI-Driven Scientific and Algorithmic Discovery},
380
+ author = {Liu, Shu and Cemri, Mert and Agarwal, Shubham and Krentsel, Alexander and Naren, Ashwin and Mang, Qiuyang and Li, Zhifei and Gupta, Akshat and Maheswaran, Monishwaran and Cheng, Audrey and Pan, Melissa and Boneh, Ethan and Ramchandran, Kannan and Sen, Koushik and Dimakis, Alexandros G. and Zaharia, Matei and Stoica, Ion},
381
+ year = {2026},
382
+ url = {https://skydiscover-ai.github.io/blog.html}
383
+ }
384
+ ```
385
+
386
+ ## 📬 Contact Us
387
+ For questions or feedback, reach out to us:
388
+ [lshu@berkeley.edu](mailto:lshu@berkeley.edu) · [mert_cemri@berkeley.edu](mailto:mert_cemri@berkeley.edu) · [shubham3@berkeley.edu](mailto:shubham3@berkeley.edu)
benchmarks/kernelbench/__init__.py ADDED
File without changes
benchmarks/kernelbench/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # KernelBench library for problem fetching (resolver)
2
+ # Note: The evaluator uses kernelbench[gpu] which includes GPU support
3
+ # For resolver-only usage (fetching problems), the base package is sufficient
4
+ kernelbench @ git+https://github.com/ScalingIntelligence/KernelBench.git
docs/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ node_modules/
2
+ .next/
3
+ .source/
docs/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SkyDiscover Documentation
2
+
3
+ Built with [Next.js](https://nextjs.org/) + [Fumadocs](https://fumadocs.vercel.app/).
4
+
5
+ ## Local Development
6
+
7
+ ```bash
8
+ cd docs
9
+ npm install
10
+ npm run dev
11
+ ```
12
+
13
+ Then open [http://localhost:3000](http://localhost:3000).
docs/app/api/search/route.ts ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import { source } from '@/lib/source';
2
+ import { createFromSource } from 'fumadocs-core/search/server';
3
+
4
// Search API route: the `GET` handler is produced by `createFromSource`
// from the docs `source`, with the search language set to English.
export const { GET } = createFromSource(source, {
  language: 'english',
});
docs/app/docs/layout.tsx ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { source } from '@/lib/source';
2
+ import { DocsLayout } from 'fumadocs-ui/layouts/docs';
3
+ import { baseOptions } from '@/lib/layout.shared';
4
+
5
/**
 * Layout for the `/docs` route segment.
 *
 * Wraps the routed page (`children`) in `DocsLayout`, passing it the page
 * tree from `source.getPageTree()` and the shared options returned by
 * `baseOptions()`.
 */
export default function Layout({ children }: LayoutProps<'/docs'>) {
  return (
    <DocsLayout tree={source.getPageTree()} {...baseOptions()}>
      {children}
    </DocsLayout>
  );
}
docs/app/global.css ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ @import 'tailwindcss';
2
+ @import 'fumadocs-ui/css/neutral.css';
3
+ @import 'fumadocs-ui/css/preset.css';
docs/app/page.tsx ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Link from 'next/link';
2
+
3
/**
 * Landing page: a centered title, a one-line description, and a single
 * button-styled link that navigates to `/docs`.
 */
export default function HomePage() {
  return (
    <main className="flex min-h-screen flex-col items-center justify-center p-24">
      <div className="z-10 max-w-5xl w-full items-center justify-center font-mono text-sm">
        <h1 className="text-4xl font-bold mb-8 text-center">SkyDiscover Documentation</h1>
        <p className="text-xl mb-8 text-center">
          Documentation for SkyDiscover.
        </p>
        <div className="flex justify-center">
          <Link
            href="/docs"
            className="inline-block bg-blue-600 hover:bg-blue-700 text-white font-bold py-3 px-6 rounded text-lg"
          >
            View Documentation
          </Link>
        </div>
      </div>
    </main>
  );
}
docs/content/docs/getting-started/index.mdx ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Getting Started"
3
+ description: "Set up SkyDiscover, run your first discovery task, and learn how to configure it."
4
+ ---
5
+
6
+ Get up and running with SkyDiscover in a few minutes. This section covers
7
+ everything you need to go from zero to your first AI-driven discovery:
8
+
9
+ - **[Installation](/docs/getting-started/installation)** — install SkyDiscover and set up your API keys
10
+ - **[Quick Start](/docs/getting-started/quick-start)** — run your first task and understand the core workflow
11
+ - **[Configuration](/docs/getting-started/configuration)** — models, algorithms, config files, and the Python API
docs/content/docs/getting-started/installation.mdx ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Installation"
3
+ description: "Install SkyDiscover and configure your LLM API keys."
4
+ ---
5
+
6
+ ## Prerequisites
7
+
8
+ - **Python** >= 3.10
9
+ - **[uv](https://docs.astral.sh/uv/)** — fast Python package manager
10
+ - **[Docker](https://docs.docker.com/get-docker/)** — for containerized evaluators
11
+ - An **LLM API key** (OpenAI, Gemini, Anthropic, or a local model)
12
+
13
+ ## Install SkyDiscover
14
+
15
+ ```bash
16
+ git clone https://github.com/skydiscover-ai/skydiscover.git
17
+ cd skydiscover
18
+ uv sync
19
+ ```
20
+
21
+ ## Verify the installation
22
+
23
+ ```bash
24
+ uv run skydiscover-run --help
25
+ ```
26
+
27
+ You should see the help text with available flags.
28
+
29
+ ## Set your API key
30
+
31
+ SkyDiscover uses [LiteLLM](https://docs.litellm.ai/) under the hood, so any
32
+ provider works. Set the key for the provider you want to use:
33
+
34
+ ```bash
35
+ export ANTHROPIC_API_KEY="..."
36
+ # Or for other providers:
37
+ # export OPENAI_API_KEY="sk-..."
38
+ # export GEMINI_API_KEY="..."
39
+ ```
40
+
41
+ ## Optional extras
42
+
43
+ Running some comparison benchmarks requires additional dependencies. Install them as needed:
44
+
45
+ ```bash
46
+ uv sync --extra adrs # ADRS systems benchmarks
47
+ uv sync --extra external # OpenEvolve / GEPA / ShinkaEvolve backends
48
+ uv sync --extra math # Math benchmarks (SciPy, JAX, etc.)
49
+ ```
50
+
51
+ You can combine extras: `uv sync --extra external --extra math`
docs/content/docs/getting-started/meta.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "title": "Getting Started",
3
+ "pages": ["index", "installation", "quick-start", "configuration"]
4
+ }
docs/content/docs/meta.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "Documentation",
3
+ "pages": [
4
+ "index",
5
+ "getting-started"
6
+ ]
7
+ }
docs/lib/source.ts ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import { docs } from 'fumadocs-mdx:collections/server';
2
+ import { loader } from 'fumadocs-core/source';
3
+
4
// Content source for the docs: built by `loader` from the generated `docs`
// collection, with every page served under the `/docs` base URL.
export const source = loader({
  baseUrl: '/docs',
  source: docs.toFumadocsSource(),
});
docs/mdx-components.tsx ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import defaultMdxComponents from 'fumadocs-ui/mdx';
2
+ import { Callout } from 'fumadocs-ui/components/callout';
3
+ import { File, Files, Folder } from 'fumadocs-ui/components/files';
4
+ import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
5
+ import type { MDXComponents } from 'mdx/types';
6
+
7
/**
 * Build the component map used when rendering MDX.
 *
 * Precedence, lowest to highest: the fumadocs defaults, the extra
 * components registered here, then any caller-supplied `components`.
 */
export function getMDXComponents(components?: MDXComponents): MDXComponents {
  // Extra components made available to every MDX page.
  const extras = { Callout, File, Files, Folder, Tab, Tabs };

  return { ...defaultMdxComponents, ...extras, ...components };
}
docs/next-env.d.ts ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ /// <reference types="next" />
2
+ /// <reference types="next/image-types/global" />
3
+ import "./.next/dev/types/routes.d.ts";
4
+
5
+ // NOTE: This file should not be edited
6
+ // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
docs/next.config.mjs ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { createMDX } from 'fumadocs-mdx/next';
2
+
3
// Wrapper that adds fumadocs MDX support to the Next.js config.
const withMDX = createMDX();

/** @type {import('next').NextConfig} */
const config = {
  reactStrictMode: true,
  // Permanently redirect the site root to the docs section.
  async redirects() {
    return [
      {
        source: '/',
        destination: '/docs',
        permanent: true,
      },
    ];
  },
};

export default withMDX(config);
docs/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
docs/package.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "skydiscover-docs",
3
+ "version": "0.1.0",
4
+ "type": "module",
5
+ "private": true,
6
+ "scripts": {
7
+ "build": "next build",
8
+ "dev": "next dev",
9
+ "start": "next start",
10
+ "types:check": "fumadocs-mdx && next typegen && tsc --noEmit"
11
+ },
12
+ "dependencies": {
13
+ "fumadocs-core": "^16.4.8",
14
+ "fumadocs-mdx": "^14.2.6",
15
+ "fumadocs-ui": "^16.4.8",
16
+ "next": "^16.1.4",
17
+ "react": "^19.0.0",
18
+ "react-dom": "^19.0.0"
19
+ },
20
+ "devDependencies": {
21
+ "@tailwindcss/postcss": "^4.1.18",
22
+ "@types/mdx": "^2.0.13",
23
+ "@types/node": "25.0.10",
24
+ "@types/react": "^19.2.9",
25
+ "@types/react-dom": "^19.2.3",
26
+ "postcss": "^8.5.6",
27
+ "tailwindcss": "^4.1.18",
28
+ "typescript": "^5.9.3"
29
+ }
30
+ }
docs/postcss.config.mjs ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ export default {
2
+ plugins: {
3
+ '@tailwindcss/postcss': {},
4
+ },
5
+ };
docs/source.config.ts ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import { defineConfig, defineDocs } from 'fumadocs-mdx/config';
2
+
3
+ export const docs = defineDocs({
4
+ dir: 'content/docs',
5
+ });
6
+
7
+ export default defineConfig();
docs/tsconfig.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "baseUrl": ".",
4
+ "target": "ESNext",
5
+ "lib": ["dom", "dom.iterable", "esnext"],
6
+ "allowJs": true,
7
+ "skipLibCheck": true,
8
+ "strict": true,
9
+ "forceConsistentCasingInFileNames": true,
10
+ "noEmit": true,
11
+ "esModuleInterop": true,
12
+ "module": "esnext",
13
+ "moduleResolution": "bundler",
14
+ "resolveJsonModule": true,
15
+ "isolatedModules": true,
16
+ "jsx": "react-jsx",
17
+ "incremental": true,
18
+ "paths": {
19
+ "@/*": ["./*"],
20
+ "fumadocs-mdx:collections/*": [".source/*"]
21
+ },
22
+ "plugins": [
23
+ {
24
+ "name": "next"
25
+ }
26
+ ]
27
+ },
28
+ "include": [
29
+ "next-env.d.ts",
30
+ "**/*.ts",
31
+ "**/*.tsx",
32
+ ".next/types/**/*.ts",
33
+ ".next/dev/types/**/*.ts"
34
+ ],
35
+ "exclude": ["node_modules"]
36
+ }
examples/text_similarity/config.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text similarity benchmark — evolve a function that matches human judgments
2
+ # Usage: skydiscover-run examples/text_similarity/initial_program.py examples/text_similarity/evaluator/ -c examples/text_similarity/config.yaml
3
+ language: python
4
+ max_iterations: 50
5
+ checkpoint_interval: 5
6
+
7
+ llm:
8
+ models:
9
+ - name: "anthropic/claude-sonnet-4-6"
10
+ weight: 1.0
11
+ max_tokens: 8192
12
+ timeout: 300
13
+
14
+ prompt:
15
+ system_message: |-
16
+ You are an expert in natural language processing and string similarity
17
+ algorithms. Your task is to write a similarity function that scores how
18
+ alike two strings are, matching human intuition as closely as possible.
19
+
20
+ The function must return a float between 0.0 (unrelated) and 1.0 (identical).
21
+ Only use the Python standard library — no external packages.
22
+
23
+ The evaluation dataset includes:
24
+ - Typos and misspellings (should score high)
25
+ - Paraphrases (same meaning, different words — should score high)
26
+ - Word reordering (should score fairly high)
27
+ - Negation (high word overlap but opposite meaning — should score low)
28
+ - Unrelated strings (should score near 0)
29
+
30
+ A basic edit distance baseline gets ~0.3 correlation. Character n-grams,
31
+ token overlap, and word-level features can push it higher. Think about
32
+ combining multiple signals — there is no single trick that handles all
33
+ cases. Consider synonym awareness, word order, negation detection, and
34
+ length normalization.
35
+
36
+ evaluator:
37
+ timeout: 30
38
+
39
+ # Live dashboard — opens in your browser
40
+ monitor:
41
+ enabled: true
examples/text_similarity/evaluator/Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+ WORKDIR /benchmark
3
+
4
+ RUN pip install --no-cache-dir scipy
5
+
6
+ COPY evaluator.py pairs.json evaluate.sh ./
7
+ RUN chmod +x evaluate.sh
8
+
9
+ ENTRYPOINT ["./evaluate.sh"]
examples/text_similarity/evaluator/evaluate.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ PROGRAM="$1"
5
+ python /benchmark/evaluator.py "$PROGRAM"
examples/text_similarity/evaluator/evaluator.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Score a candidate text-similarity function against human judgments."""
3
+
4
+ import importlib.util
5
+ import json
6
+ import random
7
+ import sys
8
+
9
+ from scipy.stats import spearmanr
10
+
11
# Human-judgment dataset, loaded once at import time. Each entry is unpacked
# by main() as (text_a, text_b, human_score). The context manager closes the
# file handle deterministically, and the explicit encoding makes the read
# independent of the container's locale.
with open("/benchmark/pairs.json", encoding="utf-8") as _pairs_file:
    PAIRS = json.load(_pairs_file)
12
+
13
+
14
def _clamp_score(value):
    """Coerce a candidate's raw output to a float in [0.0, 1.0]; NaN becomes 0.0."""
    import math  # local import: the file's import header lies outside this block

    score = float(value)
    if math.isnan(score):
        # Every comparison with NaN is False, so the plain
        # max(0.0, min(1.0, score)) clamp would turn NaN into 1.0 and
        # reward a broken candidate. Treat it like any other failure.
        return 0.0
    return max(0.0, min(1.0, score))


def main():
    """Score the candidate program given in ``sys.argv[1]`` and print a JSON result.

    The candidate module must define ``similarity(a, b)``. It is called on
    every pair in ``PAIRS``; a call that raises, or whose result cannot be
    converted to a float, counts as a prediction of 0.0. The Spearman
    correlation between the predictions and the human scores, floored at
    0.0 and rounded to 4 decimals, is reported as ``combined_score``,
    together with up to three randomly chosen example pairs as feedback.
    """
    program_path = sys.argv[1]

    # Load the candidate's similarity() function
    spec = importlib.util.spec_from_file_location("candidate", program_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    # Score every pair
    predicted = []
    for a, b, _ in PAIRS:
        try:
            score = _clamp_score(mod.similarity(a, b))
        except Exception:
            # Deliberate best-effort: one failing pair must not abort the
            # whole evaluation, it just earns the lowest score.
            score = 0.0
        predicted.append(score)

    human = [h for _, _, h in PAIRS]
    correlation = float(spearmanr(predicted, human).statistic)
    # The correlation is NaN when the predictions are constant; NaN and
    # negative values both fail the `> 0.0` test and score 0.0.
    combined_score = round(correlation, 4) if correlation > 0.0 else 0.0

    # Never ask for more samples than there are pairs (random.sample raises).
    samples = random.sample(range(len(PAIRS)), min(3, len(PAIRS)))
    lines = [f"Spearman correlation: {correlation:.4f}", ""]
    for i in samples:
        a, b, h = PAIRS[i]
        lines.append(f" '{a}' vs '{b}': predicted={predicted[i]:.2f}, human={h:.2f}")

    print(json.dumps({
        "status": "success",
        "combined_score": combined_score,
        "artifacts": {"feedback": "\n".join(lines)},
    }))


if __name__ == "__main__":
    main()
examples/text_similarity/evaluator/pairs.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ ["the cat sat on the mat", "the cat sat on the mat", 1.0],
3
+ ["the cat sat on the mat", "the cat sat on a mat", 0.9],
4
+ ["restaurant", "restarant", 0.9],
5
+
6
+ ["the movie was great", "the film was excellent", 0.85],
7
+ ["she is happy", "she feels joyful", 0.8],
8
+ ["the car is fast", "the vehicle moves quickly", 0.8],
9
+ ["it is raining outside", "rain is falling outdoors", 0.85],
10
+ ["he fixed the bug", "he resolved the defect", 0.8],
11
+ ["prices went up", "costs increased", 0.85],
12
+
13
+ ["the dog chased the cat", "the cat was chased by the dog", 0.85],
14
+ ["I love programming in Python", "programming in Python is something I love", 0.8],
15
+
16
+ ["machine learning is a subset of AI", "machine learning uses data", 0.4],
17
+ ["the weather is nice today", "today is a good day", 0.5],
18
+ ["I went to the store", "I drove to the mall", 0.45],
19
+
20
+ ["the cat is on the mat", "the dog is in the yard", 0.25],
21
+ ["she plays piano", "he plays guitar", 0.35],
22
+ ["New York is a big city", "Tokyo has a large population", 0.3],
23
+ ["I ate breakfast", "the morning meal was consumed", 0.7],
24
+
25
+ ["the test passed", "the test did not pass", 0.2],
26
+ ["I love this movie", "I hate this movie", 0.2],
27
+ ["the system is working", "the system is not working", 0.15],
28
+
29
+ ["the cat sat on the mat", "quantum physics is fascinating", 0.0],
30
+ ["hello world", "purple elephants dance on mars", 0.0],
31
+ ["database optimization", "chocolate cake recipe", 0.0],
32
+ ["she went to school", "the stock market crashed", 0.05],
33
+
34
+ ["yes", "yeah", 0.8],
35
+ ["no", "nope", 0.8],
36
+ ["hi", "hello", 0.75],
37
+ ["error", "bug", 0.6],
38
+ ["fast", "quick", 0.85],
39
+ ["big", "large", 0.9],
40
+ ["happy", "sad", 0.15],
41
+ ["good", "bad", 0.15],
42
+
43
+ ["the server returned a 500 error", "the server threw an internal error", 0.85],
44
+ ["null pointer exception", "segmentation fault", 0.4],
45
+ ["open a pull request", "submit a PR", 0.85],
46
+ ["the function returns a list", "the method outputs an array", 0.7],
47
+
48
+ ["the meeting is at 3pm", "the meeting is at 3:00 PM", 0.95],
49
+ ["version 2.0", "v2.0", 0.9],
50
+ ["100 dollars", "$100", 0.9]
51
+ ]
examples/text_similarity/initial_program.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EVOLVE-BLOCK-START
2
def similarity(a: str, b: str) -> float:
    """
    Score how alike two strings are, from 0.0 (unrelated) to 1.0 (identical).

    The goal is to track meaning as well as spelling: paraphrases should
    score high, negations should score low, and typos should be tolerated.

    Standard library only (no external packages).
    """
    # Baseline: normalized Levenshtein distance
    if a == b:
        return 1.0
    if not (a and b):
        return 0.0

    len_a, len_b = len(a), len(b)
    # previous_row[k] is the edit distance between the prefix of `a`
    # processed so far and the first k characters of `b`.
    previous_row = list(range(len_b + 1))
    for row, char_a in enumerate(a, start=1):
        current_row = [row]
        for col, char_b in enumerate(b, start=1):
            if char_a == char_b:
                cost = previous_row[col - 1]
            else:
                cost = 1 + min(
                    previous_row[col],       # delete from a
                    current_row[col - 1],    # insert into a
                    previous_row[col - 1],   # substitute
                )
            current_row.append(cost)
        previous_row = current_row

    return 1.0 - previous_row[len_b] / max(len_a, len_b)
34
+ # EVOLVE-BLOCK-END
scripts/reproduce/adrs.sh ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce ADRS benchmarks (5 problems x 2 search methods).
3
+ # All benchmarks launch in parallel.
4
+ set -euo pipefail
5
+
6
+ # ── Settings ─────────────────────────────────────────────────────────────────
7
+ # Only two things to change:
8
+
9
+ MODEL="gpt-5" # main generation model
10
+ # MODEL="gemini/gemini-3-pro-preview" # alternative
11
+ ITERATIONS=100
12
+
13
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
14
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
15
+
16
+ # ── Install ──────────────────────────────────────────────────────────────────
17
+
18
+ cd "$(dirname "$0")/../.."
19
+ uv sync --extra adrs
20
+
21
+ # ── Download Data ────────────────────────────────────────────────────────────
22
+
23
+ if [[ ! -f benchmarks/ADRS/cloudcast/profiles/cost.csv ]]; then
24
+ echo "Downloading cloudcast dataset..."
25
+ bash benchmarks/ADRS/cloudcast/download_dataset.sh
26
+ fi
27
+
28
+ if [[ ! -d benchmarks/ADRS/llm_sql/datasets ]] || \
29
+ [[ -z "$(ls benchmarks/ADRS/llm_sql/datasets/*.csv 2>/dev/null)" ]]; then
30
+ echo "Downloading llm_sql dataset..."
31
+ bash benchmarks/ADRS/llm_sql/download_dataset.sh
32
+ fi
33
+
34
+ # ── Helper ───────────────────────────────────────────────────────────────────
35
+
36
# run DIR SEARCH — launch one discovery run for benchmark DIR using SEARCH.
# Reads the MODEL and ITERATIONS settings defined at the top of the script.
run() {
  local bench_dir=$1 method=$2
  local label="${bench_dir#benchmarks/}"

  # Seed program: the .py file is the default; a .cpp file overrides it,
  # and a prompt file overrides both.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config when the benchmark ships one.
  local config="$bench_dir/config.yaml"
  local method_config="$bench_dir/config_${method}.yaml"
  if [[ -f "$method_config" ]]; then
    config="$method_config"
  fi

  echo "== $method: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config" -s "$method" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$method/$label"
}
48
+
49
# ── Launch (AdaEvolve, then EvoX) ────────────────────────────────────────────
# Start every (problem, search) pair in the background and remember its PID.
# A bare `wait` always returns 0, so each PID is waited on individually to
# detect failed runs instead of silently reporting success.

pids=()
for search in adaevolve evox; do
  for problem in cloudcast eplb llm_sql prism txn_scheduling; do
    run "benchmarks/ADRS/$problem" "$search" &
    pids+=("$!")
  done
done

failed=0
for pid in "${pids[@]}"; do
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "adrs.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "adrs.sh: all ${#pids[@]} runs finished."
scripts/reproduce/ale_bench.sh ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce ALE-Bench benchmarks (10 problems x 2 search methods).
3
+ # All benchmarks launch in parallel.
4
+ set -euo pipefail
5
+
6
+ # ── Settings ─────────────────────────────────────────────────────────────────
7
+ # Only two things to change:
8
+
9
+ MODEL="gpt-5" # main generation model
10
+ # MODEL="gemini/gemini-3-pro-preview" # alternative
11
+ ITERATIONS=100
12
+
13
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
14
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
15
+
16
+ # ── Install ──────────────────────────────────────────────────────────────────
17
+
18
+ cd "$(dirname "$0")/../.."
19
+ uv sync --extra external
20
+
21
+ # ── Helper ───────────────────────────────────────────────────────────────────
22
+
23
# run DIR SEARCH — launch one discovery run for benchmark DIR using SEARCH.
# Reads the MODEL and ITERATIONS settings defined at the top of the script.
run() {
  local bench_dir=$1 method=$2
  local label="${bench_dir#benchmarks/}"

  # Seed program: the .py file is the default; a .cpp file overrides it,
  # and a prompt file overrides both.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config when the benchmark ships one.
  local config="$bench_dir/config.yaml"
  local method_config="$bench_dir/config_${method}.yaml"
  if [[ -f "$method_config" ]]; then
    config="$method_config"
  fi

  echo "== $method: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config" -s "$method" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$method/$label"
}
35
+
36
# ── Launch (AdaEvolve, then EvoX) ────────────────────────────────────────────
# Start every (problem, search) pair in the background and remember its PID.
# A bare `wait` always returns 0, so each PID is waited on individually to
# detect failed runs instead of silently reporting success.

problems=(ahc008 ahc011 ahc015 ahc016 ahc024 ahc025 ahc026 ahc027 ahc039 ahc046)

pids=()
for search in adaevolve evox; do
  for problem in "${problems[@]}"; do
    run "benchmarks/ale_bench/ale-bench-lite-problems/$problem" "$search" &
    pids+=("$!")
  done
done

failed=0
for pid in "${pids[@]}"; do
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "ale_bench.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "ale_bench.sh: all ${#pids[@]} runs finished."
scripts/reproduce/arc.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce ARC benchmark (1 problem x 2 search methods).
3
+ # All benchmarks launch in parallel.
4
+ set -euo pipefail
5
+
6
+ # ── Settings ─────────────────────────────────────────────────────────────────
7
+ # Only two things to change:
8
+
9
+ MODEL="gpt-5" # main generation model
10
+ # MODEL="gemini/gemini-3-pro-preview" # alternative
11
+ ITERATIONS=100
12
+
13
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
14
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
15
+
16
+ # ── Install ──────────────────────────────────────────────────────────────────
17
+
18
+ cd "$(dirname "$0")/../.."
19
+ uv sync
20
+
21
+ # ── Helper ───────────────────────────────────────────────────────────────────
22
+
23
# run DIR SEARCH — launch one discovery run for benchmark DIR using SEARCH.
# Reads the MODEL and ITERATIONS settings defined at the top of the script.
run() {
  local bench_dir=$1 method=$2
  local label="${bench_dir#benchmarks/}"

  # Seed program: the .py file is the default; a .cpp file overrides it,
  # and a prompt file overrides both.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config when the benchmark ships one.
  local config="$bench_dir/config.yaml"
  local method_config="$bench_dir/config_${method}.yaml"
  if [[ -f "$method_config" ]]; then
    config="$method_config"
  fi

  echo "== $method: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config" -s "$method" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$method/$label"
}
35
+
36
# ── Launch (AdaEvolve, then EvoX) ────────────────────────────────────────────
# Start both searches in the background and remember their PIDs. A bare
# `wait` always returns 0, so each PID is waited on individually to detect
# failed runs instead of silently reporting success.

pids=()
for search in adaevolve evox; do
  run benchmarks/arc_benchmark "$search" &
  pids+=("$!")
done

failed=0
for pid in "${pids[@]}"; do
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "arc.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "arc.sh: all ${#pids[@]} runs finished."
scripts/reproduce/frontier_cs.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce Frontier-CS benchmark (1 problem x 2 search methods).
3
+ # Requires Docker to be installed and running.
4
+ # All benchmarks launch in parallel.
5
+ set -euo pipefail
6
+
7
+ # ── Settings ─────────────────────────────────────────────────────────────────
8
+ # Only two things to change:
9
+
10
+ MODEL="gpt-5" # main generation model
11
+ # MODEL="gemini/gemini-3-pro-preview" # alternative
12
+ ITERATIONS=100
13
+
14
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
15
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
16
+
17
+ # ── Install ──────────────────────────────────────────────────────────────────
18
+
19
+ cd "$(dirname "$0")/../.."
20
+ uv sync --extra frontier-cs
21
+
22
+ # ── Check Docker ─────────────────────────────────────────────────────────────
23
+
24
+ if ! command -v docker &>/dev/null; then
25
+ echo "Warning: Docker not found. The evaluator requires Docker." >&2
26
+ fi
27
+
28
+ # ── Helper ───────────────────────────────────────────────────────────────────
29
+
30
# run DIR SEARCH — launch one discovery run for benchmark DIR using SEARCH.
# Reads the MODEL and ITERATIONS settings defined at the top of the script.
run() {
  local bench_dir=$1 method=$2
  local label="${bench_dir#benchmarks/}"

  # Seed program: the .py file is the default; a .cpp file overrides it,
  # and a prompt file overrides both.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config when the benchmark ships one.
  local config="$bench_dir/config.yaml"
  local method_config="$bench_dir/config_${method}.yaml"
  if [[ -f "$method_config" ]]; then
    config="$method_config"
  fi

  echo "== $method: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config" -s "$method" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$method/$label"
}
42
+
43
# ── Launch (AdaEvolve, then EvoX) ────────────────────────────────────────────
# Start both searches in the background and remember their PIDs. A bare
# `wait` always returns 0, so each PID is waited on individually to detect
# failed runs instead of silently reporting success.

pids=()
for search in adaevolve evox; do
  run benchmarks/frontier-cs-eval "$search" &
  pids+=("$!")
done

failed=0
for pid in "${pids[@]}"; do
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "frontier_cs.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "frontier_cs.sh: all ${#pids[@]} runs finished."
scripts/reproduce/gpu.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce GPU benchmarks (4 problems x 2 search methods).
3
+ # Requires a CUDA-capable GPU with Triton support.
4
+ # All benchmarks launch in parallel.
5
+ set -euo pipefail
6
+
7
+ # ── Settings ─────────────────────────────────────────────────────────────────
8
+ # Only two things to change:
9
+
10
+ MODEL="gpt-5" # main generation model
11
+ # MODEL="gemini/gemini-3-pro-preview" # alternative
12
+ ITERATIONS=100
13
+
14
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
15
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
16
+
17
+ # ── Install ──────────────────────────────────────────────────────────────────
18
+
19
+ cd "$(dirname "$0")/../.."
20
+ uv sync
21
+
22
+ # ── Check GPU ────────────────────────────────────────────────────────────────
23
+
24
+ if ! command -v nvidia-smi &>/dev/null; then
25
+ echo "Warning: nvidia-smi not found. GPU benchmarks may fail." >&2
26
+ fi
27
+
28
+ # ── Helper ───────────────────────────────────────────────────────────────────
29
+
30
# run DIR SEARCH — launch one discovery run for benchmark DIR using SEARCH.
# Reads the MODEL and ITERATIONS settings defined at the top of the script.
run() {
  local bench_dir=$1 method=$2
  local label="${bench_dir#benchmarks/}"

  # Seed program: the .py file is the default; a .cpp file overrides it,
  # and a prompt file overrides both.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config when the benchmark ships one.
  local config="$bench_dir/config.yaml"
  local method_config="$bench_dir/config_${method}.yaml"
  if [[ -f "$method_config" ]]; then
    config="$method_config"
  fi

  echo "== $method: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config" -s "$method" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$method/$label"
}
42
+
43
# ── Launch (AdaEvolve, then EvoX) ────────────────────────────────────────────
# Start every (problem, search) pair in the background and remember its PID.
# A bare `wait` always returns 0, so each PID is waited on individually to
# detect failed runs instead of silently reporting success.

pids=()
for search in adaevolve evox; do
  for problem in grayscale mla_decode trimul vecadd; do
    run "benchmarks/gpu_mode/$problem" "$search" &
    pids+=("$!")
  done
done

failed=0
for pid in "${pids[@]}"; do
  wait "$pid" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "gpu.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "gpu.sh: all ${#pids[@]} runs finished."
scripts/reproduce/math.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce math benchmarks (17 problems x 2 search methods).
3
+ # All benchmarks launch in parallel.
4
+ set -euo pipefail
5
+
6
+ # ── Settings ─────────────────────────────────────────────────────────────────
7
+ # Only two things to change:
8
+
9
+ MODEL="gpt-5" # main generation model
10
+ # MODEL="gemini/gemini-3.0-pro-preview" # alternative
11
+ ITERATIONS=100
12
+
13
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
14
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
15
+
16
+ # ── Install ──────────────────────────────────────────────────────────────────
17
+
18
+ cd "$(dirname "$0")/../.."
19
+ uv sync --extra math
20
+
21
+ # ── Helper ───────────────────────────────────────────────────────────────────
22
+
23
# run <benchmark_dir> <search>
#
# Launch one skydiscover-run job for the benchmark stored in <benchmark_dir>
# using the <search> algorithm. Reads MODEL and ITERATIONS from the script's
# settings section and writes results under outputs/reproduce/<search>/.
run() {
  local bench_dir=$1 algo=$2
  # Benchmark path without the leading "benchmarks/" (used for the banner
  # and for the output directory).
  local label=${bench_dir#benchmarks/}

  # Seed file: the Python program by default; a C++ program replaces it when
  # present, and a prompt file replaces either of them when present.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config and fall back to the shared one.
  local config_file="$bench_dir/config_${algo}.yaml"
  if [[ ! -f "$config_file" ]]; then
    config_file="$bench_dir/config.yaml"
  fi

  echo "== $algo: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config_file" -s "$algo" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$algo/$label"
}
35
+
36
+ # ── AdaEvolve ────────────────────────────────────────────────────────────────
37
+
38
+ run benchmarks/math/circle_packing adaevolve &
39
+ run benchmarks/math/circle_packing_rect adaevolve &
40
+ run benchmarks/math/erdos_min_overlap adaevolve &
41
+ run benchmarks/math/first_autocorr_ineq adaevolve &
42
+ run benchmarks/math/second_autocorr_ineq adaevolve &
43
+ run benchmarks/math/third_autocorr_ineq adaevolve &
44
+ run benchmarks/math/uncertainty_ineq adaevolve &
45
+ run benchmarks/math/hexagon_packing/11 adaevolve &
46
+ run benchmarks/math/hexagon_packing/12 adaevolve &
47
+ run benchmarks/math/heilbronn_convex/13 adaevolve &
48
+ run benchmarks/math/heilbronn_convex/14 adaevolve &
49
+ run benchmarks/math/heilbronn_triangle adaevolve &
50
+ run benchmarks/math/minimizing_max_min_dist/2 adaevolve &
51
+ run benchmarks/math/minimizing_max_min_dist/3 adaevolve &
52
+ run benchmarks/math/matmul adaevolve &
53
+ run benchmarks/math/signal_processing adaevolve &
54
+ run benchmarks/math/sums_diffs_finite_sets adaevolve &
55
+
56
+ # ── EvoX ─────────────────────────────────────────────────────────────────────
57
+
58
+ run benchmarks/math/circle_packing evox &
59
+ run benchmarks/math/circle_packing_rect evox &
60
+ run benchmarks/math/erdos_min_overlap evox &
61
+ run benchmarks/math/first_autocorr_ineq evox &
62
+ run benchmarks/math/second_autocorr_ineq evox &
63
+ run benchmarks/math/third_autocorr_ineq evox &
64
+ run benchmarks/math/uncertainty_ineq evox &
65
+ run benchmarks/math/hexagon_packing/11 evox &
66
+ run benchmarks/math/hexagon_packing/12 evox &
67
+ run benchmarks/math/heilbronn_convex/13 evox &
68
+ run benchmarks/math/heilbronn_convex/14 evox &
69
+ run benchmarks/math/heilbronn_triangle evox &
70
+ run benchmarks/math/minimizing_max_min_dist/2 evox &
71
+ run benchmarks/math/minimizing_max_min_dist/3 evox &
72
+ run benchmarks/math/matmul evox &
73
+ run benchmarks/math/signal_processing evox &
74
+ run benchmarks/math/sums_diffs_finite_sets evox &
75
+
76
+ wait
77
+ echo "math.sh: all 34 runs finished."
scripts/reproduce/prompt_opt.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Reproduce prompt optimization benchmark (1 problem x 2 search methods).
3
+ # All benchmarks launch in parallel.
4
+ set -euo pipefail
5
+
6
+ # ── Settings ─────────────────────────────────────────────────────────────────
7
+ # Only two things to change:
8
+
9
+ MODEL="gpt-5" # main generation model
10
+ # MODEL="gemini/gemini-3.0-pro-preview" # alternative
11
+ ITERATIONS=100
12
+
13
+ # -m sets all models (main + guide/paradigm) to the same MODEL.
14
+ # API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
15
+
16
+ # ── Install ──────────────────────────────────────────────────────────────────
17
+
18
+ cd "$(dirname "$0")/../.."
19
+ uv sync --extra prompt-optimization
20
+
21
+ # ── Helper ───────────────────────────────────────────────────────────────────
22
+
23
# run <benchmark_dir> <search>
#
# Launch one skydiscover-run job for the benchmark stored in <benchmark_dir>
# using the <search> algorithm. Reads MODEL and ITERATIONS from the script's
# settings section and writes results under outputs/reproduce/<search>/.
run() {
  local bench_dir=$1 algo=$2
  # Benchmark path without the leading "benchmarks/" (used for the banner
  # and for the output directory).
  local label=${bench_dir#benchmarks/}

  # Seed file: the Python program by default; a C++ program replaces it when
  # present, and a prompt file replaces either of them when present.
  local seed="$bench_dir/initial_program.py"
  if [[ -f "$bench_dir/initial_program.cpp" ]]; then
    seed="$bench_dir/initial_program.cpp"
  fi
  if [[ -f "$bench_dir/initial_prompt.txt" ]]; then
    seed="$bench_dir/initial_prompt.txt"
  fi

  # Prefer a search-specific config and fall back to the shared one.
  local config_file="$bench_dir/config_${algo}.yaml"
  if [[ ! -f "$config_file" ]]; then
    config_file="$bench_dir/config.yaml"
  fi

  echo "== $algo: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config_file" -s "$algo" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$algo/$label"
}
35
+
36
+ # ── AdaEvolve ────────────────────────────────────────────────────────────────
37
+
38
+ run benchmarks/prompt_optimization/hotpot_qa adaevolve &
39
+
40
+ # ── EvoX ─────────────────────────────────────────────────────────────────────
41
+
42
+ run benchmarks/prompt_optimization/hotpot_qa evox &
43
+
44
+ wait
45
+ echo "prompt_opt.sh: all 2 runs finished."
scripts/reproduce/run_all.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Run all reproduce scripts in parallel.
3
+ # Each category launches in the background; we wait for all to finish.
4
+ # Tip: set ITERATIONS=2 in each script for a quick smoke test.
5
+ set -euo pipefail
6
+
7
+ DIR="$(dirname "$0")"
8
+
9
+ bash "$DIR/math.sh" &
10
+ bash "$DIR/adrs.sh" &
11
+ bash "$DIR/ale_bench.sh" &
12
+ bash "$DIR/frontier_cs.sh" &
13
+ bash "$DIR/gpu.sh" &
14
+ bash "$DIR/arc.sh" &
15
+ bash "$DIR/prompt_opt.sh" &
16
+
17
+ wait
18
+ echo "All reproduce scripts finished."
scripts/run_cp.sh ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Run circle_packing benchmark with topk search.
3
+ # Usage: ./scripts/run_cp.sh [ITERATIONS]
4
+ # Prerequisites: uv sync --extra math, OPENAI_API_KEY set
5
+
6
+ set -euo pipefail
7
+
8
+ cd "$(dirname "$0")/.."
9
+
10
+ ITERATIONS="${1:-3}"
11
+
12
+ echo "Running circle_packing benchmark (search=topk, iterations=$ITERATIONS)..."
13
+ uv run skydiscover-run \
14
+ benchmarks/math/circle_packing/initial_program.py \
15
+ benchmarks/math/circle_packing/evaluator.py \
16
+ --config benchmarks/math/circle_packing/config.yaml \
17
+ --search topk \
18
+ --iterations "$ITERATIONS"
19
+
20
+ echo "Done."
setup.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from setuptools import setup
2
+
3
+ setup() # All config in pyproject.toml
skydiscover/README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SkyDiscover
2
+
3
+ SkyDiscover is an iterative LLM-driven discovery engine. Each iteration runs a
4
+ five-step loop:
5
+
6
+ ```
7
+ sample → prompt → generate → evaluate → add
8
+ ↑ │
9
+ └───────────────────────────────────────┘
10
+ ```
11
+
12
+ 1. **Sample** — the search algorithm (`search/`) picks a parent solution and
13
+ any relevant context solutions from the database.
14
+ 2. **Prompt** — the context builder (`context_builder/`) turns the parent solution,
15
+ relevant context solutions (if any), and problem spec into system + user messages.
16
+ 3. **Generate** — the LLM (`llm/`) produces a candidate solution (code, text,
17
+ or image).
18
+ 4. **Evaluate** — the evaluator (`evaluation/`) scores the candidate and
19
+ returns metrics.
20
+ 5. **Add** — the scored candidate is stored back in the database, closing the
21
+ loop.
22
+
23
+ The `DiscoveryController` (`search/default_discovery_controller.py`) orchestrates
24
+ this loop. Search algorithms that need custom orchestration (e.g. co-evolution)
25
+ subclass it and override `run_discovery()`.
26
+
27
+ ## Components
28
+
29
+ | Component | Subfolder | What it does | Extend by |
30
+ |:---|:---|:---|:---|
31
+ | **Context Builder** | `context_builder/` | Assembles LLM prompts from the problem spec, prior solutions, and feedback | Subclass `ContextBuilder` ([README](context_builder/README.md)) |
32
+ | **Solution Generator** | `llm/` | Produces candidates via LLM calls, with optional tool use | Subclass `LLMInterface` |
33
+ | **Evaluator** | `evaluation/` | Scores candidates and logs metadata back into the solution database | Provide an `evaluate.py` script |
34
+ | **Solution Selector** | `search/` | Maintains the solution database and picks parents for the next iteration | Subclass `ProgramDatabase` ([README](search/README.md)) |
35
+
36
+ ## Additional subfolders
37
+
38
+ | Subfolder | What it does |
39
+ |:---|:---|
40
+ | `extras/` | External backends (OpenEvolve, GEPA, ShinkaEvolve) and the live monitor dashboard |
41
+ | `utils/` | Shared helpers — code parsing, metrics, formatting, async utilities, repo mapping |
42
+
43
+ ## Entry points
44
+
45
+ | Entry point | Use case |
46
+ |:---|:---|
47
+ | `api.py` | Python API — `run_discovery()`, `discover_solution()` |
48
+ | `cli.py` | CLI — `skydiscover-run` |
49
+ | `runner.py` | Setup and run (used by both API and CLI) |
50
+ | `config.py` | Configuration loading and overrides |
skydiscover/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SkyDiscover: Self-Improving Framework for LLMs
3
+ """
4
+
5
+ from skydiscover._version import __version__
6
+ from skydiscover.api import (
7
+ DiscoveryResult,
8
+ discover_solution,
9
+ run_discovery,
10
+ )
11
+ from skydiscover.runner import Runner
12
+
13
+ __all__ = [
14
+ "Runner",
15
+ "__version__",
16
+ "run_discovery",
17
+ "discover_solution",
18
+ "DiscoveryResult",
19
+ ]
skydiscover/_version.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """Version information for skydiscover package."""
2
+
3
+ __version__ = "0.0.0"
skydiscover/api.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Public library API for SkyDiscover.
3
+
4
+ This module exposes the two main entry points for programmatic use:
5
+
6
+ * `run_discovery`: accepts file paths or inline strings for the initial program and evaluator,
7
+ wires up configuration, and returns a `DiscoveryResult`.
8
+ * `discover_solution`: convenience wrapper when the initial solution is a plain string and
9
+ the evaluator is a Python callable.
10
+
11
+ Quick-start::
12
+
13
+ from skydiscover import run_discovery
14
+
15
+ result = run_discovery(
16
+ evaluator="examples/my_problem/eval.py",
17
+ initial_program="examples/my_problem/init.py", # optional
18
+ model="gpt-5",
19
+ iterations=50,
20
+ )
21
+ print(result.best_score, result.best_solution)
22
+ """
23
+
24
+ import asyncio
25
+ import logging
26
+ import os
27
+ import tempfile
28
+ from dataclasses import dataclass
29
+ from pathlib import Path
30
+ from typing import Any, Callable, Dict, List, Optional, Union
31
+
32
+ from skydiscover.benchmarks.resolution import resolve_benchmark_problem
33
+ from skydiscover.config import Config, apply_overrides, load_config
34
+ from skydiscover.runner import Runner
35
+ from skydiscover.search.base_database import Program
36
+ from skydiscover.utils.metrics import get_score
37
+ from skydiscover.utils.prepare import cleanup_temp, prepare_evaluator, prepare_program
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
@dataclass
class DiscoveryResult:
    """Outcome of one discovery run, as handed back to API callers."""

    # Best program found by the run, or None when the run produced none.
    best_program: Optional[Program]
    # Score of the best program (0.0 when there is no best program).
    best_score: float
    # Solution text of the best program ("" when there is no best program).
    best_solution: str
    # Metrics attached to the best program ({} when there is none).
    metrics: Dict[str, Any]
    # Directory holding the run's results, or None when no directory is
    # reported back to the caller.
    output_dir: Optional[str]
    # Score of the initial program, when one is available.
    initial_score: Optional[float] = None

    def __repr__(self) -> str:
        """Return a compact summary showing the best and initial scores."""
        if self.initial_score is None:
            initial_text = "N/A"
        else:
            initial_text = format(self.initial_score, ".4f")
        return (
            f"DiscoveryResult(best_score={self.best_score:.4f}, "
            f"initial_score={initial_text})"
        )
56
+
57
+
58
def run_discovery(
    evaluator: Union[str, Path, Callable],
    initial_program: Optional[Union[str, Path, List[str]]] = None,
    model: Optional[str] = None,
    iterations: Optional[int] = None,
    search: Optional[str] = None,
    config: Union[str, Path, Config, None] = None,
    agentic: bool = False,
    output_dir: Optional[str] = None,
    system_prompt: Optional[str] = None,
    api_base: Optional[str] = None,
    cleanup: bool = True,
) -> DiscoveryResult:
    """Run one discovery process synchronously and return its best result.

    This is the blocking front door: it builds the coroutine for the async
    implementation and drives it to completion with ``asyncio.run``.

    Args:
        evaluator: File path, or a callable ``(program_path) -> metrics_dict``.
        initial_program: File path or inline source (string / list of lines).
            Optional; when omitted the LLM generates a solution from scratch.
        model: Model name(s), comma-separated, e.g. ``"gpt-5"`` or
            ``"gpt-5,gemini/gemini-3-pro"``.
        iterations: Maximum number of iterations (overrides the config).
        search: Algorithm name (``"topk"``, ``"adaevolve"``, ``"evox"``,
            ``"openevolve_native"``, etc.).
        config: YAML path, ``Config`` object, or None for defaults.
        agentic: Enable agentic mode (codebase root derived from
            ``initial_program``).
        output_dir: Where to write results. When None, a temporary directory
            is used if ``cleanup`` is true; otherwise a path is built with
            ``build_output_dir``.
        system_prompt: Domain-specific context for the LLM.
        api_base: Base URL for an OpenAI-compatible API.
        cleanup: Remove temp files after the run.

    Returns:
        A ``DiscoveryResult`` carrying the best program, its score, solution
        text, metrics, and the output directory.
    """
    pending_run = _run_discovery_async(
        initial_program,
        evaluator,
        config,
        model=model,
        iterations=iterations,
        search=search,
        agentic=agentic,
        output_dir=output_dir,
        system_prompt=system_prompt,
        api_base=api_base,
        cleanup=cleanup,
    )
    return asyncio.run(pending_run)
105
+
106
+
107
async def _run_discovery_async(
    initial_program: Optional[Union[str, Path, List[str]]],
    evaluator: Union[str, Path, Callable],
    config: Union[str, Path, Config, None],
    *,
    model: Optional[str] = None,
    iterations: Optional[int] = None,
    search: Optional[str] = None,
    agentic: bool = False,
    output_dir: Optional[str] = None,
    system_prompt: Optional[str] = None,
    api_base: Optional[str] = None,
    cleanup: bool = True,
) -> DiscoveryResult:
    """Async implementation of run_discovery.

    Steps, in order: load the config and apply overrides; resolve a benchmark
    problem when no initial program was given and a benchmark is enabled;
    prepare the program and evaluator; choose the output directory; then run
    either an external backend or the built-in ``Runner``.

    Args:
        initial_program: File path or inline source, or None.
        evaluator: File path or callable evaluator.
        config: YAML path, ``Config`` object, or None for defaults.
        model: Model override passed to ``apply_overrides``.
        iterations: Iteration limit; external backends fall back to
            ``config_obj.max_iterations`` when this is falsy.
        search: Search-algorithm override passed to ``apply_overrides``.
        agentic: Agentic-mode override passed to ``apply_overrides``.
        output_dir: Explicit results directory, or None.
        system_prompt: System-prompt override passed to ``apply_overrides``.
        api_base: API base URL override passed to ``apply_overrides``.
        cleanup: When true, ``cleanup_temp`` is called on exit; also selects a
            temporary results directory when ``output_dir`` is None.

    Returns:
        The ``DiscoveryResult`` of the run. ``output_dir`` is None only when
        results went to the throwaway temporary directory; a caller-supplied
        or built directory is always reported.

    Raises:
        ValueError: If benchmark resolution fails, if an external backend is
            combined with evaluator environment variables, or if no LLM
            models are configured.
        ImportError: If the selected external backend's package is missing.
    """
    temp_dir: Optional[str] = None
    temp_files: List[str] = []
    evaluator_env_vars: Dict[str, str] = {}

    try:
        if isinstance(config, Config):
            config_obj = config
        else:
            config_obj = load_config(str(config) if config else None)

        apply_overrides(
            config_obj,
            model=model,
            api_base=api_base,
            agentic=agentic,
            search=search,
            system_prompt=system_prompt,
        )

        # Resolve benchmark problem if configured and no initial_program provided
        if initial_program is None and config_obj.benchmark and config_obj.benchmark.enabled:
            try:
                resolution = resolve_benchmark_problem(config_obj.benchmark)
                initial_program = resolution.initial_program_path
                evaluator = resolution.evaluator_path
                evaluator_env_vars = resolution.evaluator_env_vars
                logger.info(
                    f"[Benchmark Loader] Benchmark: {config_obj.benchmark.name}, Initial program: {initial_program}, Evaluator: {evaluator}"
                )
            except Exception as exc:
                raise ValueError(f"Failed to load benchmark problem: {exc}") from exc

        # Prepare the program (optional — None means "from scratch")
        program_path = (
            prepare_program(initial_program, temp_dir, temp_files)
            if initial_program is not None
            else None
        )

        if program_path and config_obj.agentic.enabled and not config_obj.agentic.codebase_root:
            config_obj.agentic.codebase_root = os.path.dirname(os.path.abspath(program_path))

        # Prepare the evaluator
        evaluator_path = prepare_evaluator(evaluator, temp_dir, temp_files)

        # Prepare the output directory
        search_type = (
            getattr(config_obj.search, "type", None) if hasattr(config_obj, "search") else None
        )
        if output_dir is None and cleanup:
            temp_dir = tempfile.mkdtemp(prefix="skydiscover_")
            actual_output_dir = temp_dir
        else:
            from skydiscover.config import build_output_dir

            actual_output_dir = output_dir or build_output_dir(
                search_type or "default", program_path or "scratch"
            )
        os.makedirs(actual_output_dir, exist_ok=True)

        # Report the directory whenever it is not the throwaway temp dir that
        # is handed to cleanup_temp in ``finally``. Previously an explicit
        # ``output_dir`` was also reported as None under cleanup=True, even
        # though this function never removes it.
        reported_output_dir = None if temp_dir is not None else actual_output_dir

        # External backends (openevolve, shinkaevolve, gepa)
        if search_type:
            from skydiscover.extras.external import KNOWN_EXTERNAL, get_runner, is_external

            if is_external(search_type):
                if evaluator_env_vars:
                    env_var_names = ", ".join(sorted(evaluator_env_vars))
                    raise ValueError(
                        "Passing evaluator environment variables to external backends is not yet supported. "
                        f"External backend '{search_type}' cannot be used with evaluator env vars: "
                        f"{env_var_names}"
                    )

                from skydiscover.extras.monitor import start_monitor, stop_monitor

                monitor_server, monitor_callback, feedback_reader = start_monitor(
                    config_obj, actual_output_dir
                )
                try:
                    result = await get_runner(search_type)(
                        program_path=program_path,
                        evaluator_path=evaluator_path,
                        config_obj=config_obj,
                        iterations=iterations or config_obj.max_iterations,
                        output_dir=actual_output_dir,
                        monitor_callback=monitor_callback,
                        feedback_reader=feedback_reader,
                    )
                except ModuleNotFoundError as exc:
                    from skydiscover.extras.external import get_package_name

                    pkg = get_package_name(search_type)
                    raise ImportError(
                        f"{exc}\n\nThe '{search_type}' backend requires its package. "
                        f"Install with: pip install {pkg}"
                    ) from exc
                finally:
                    # Always stop the monitor, whether the backend succeeded or raised.
                    stop_monitor(monitor_server)
                result.output_dir = reported_output_dir
                return result

            # Known external name that is_external() did not accept: report
            # the package the backend needs.
            if search_type in KNOWN_EXTERNAL:
                from skydiscover.extras.external import get_package_name

                pkg = get_package_name(search_type)
                raise ImportError(
                    f"Search type '{search_type}' requires the '{pkg}' package. "
                    f"Install with: pip install {pkg}"
                )

        if not config_obj.llm.models:
            raise ValueError(
                "No LLM models configured. Provide a config with models or "
                "pass model= directly:\n\n"
                " result = run_discovery(evaluator, model='gpt-5')"
            )

        # Initialize the runner
        controller = Runner(
            initial_program_path=program_path,
            evaluation_file=evaluator_path,
            config=config_obj,
            output_dir=actual_output_dir,
            evaluator_env_vars=evaluator_env_vars,
        )

        best_program = await controller.run(iterations=iterations)

        # Defaults used when the run yields no best program.
        best_score = 0.0
        best_solution = ""
        metrics: Dict[str, Any] = {}

        if best_program:
            best_solution = best_program.solution
            metrics = best_program.metrics or {}
            best_score = get_score(metrics)

        initial_score = controller.initial_score

        # Return the result
        return DiscoveryResult(
            best_program=best_program,
            best_score=best_score,
            best_solution=best_solution,
            metrics=metrics,
            output_dir=reported_output_dir,
            initial_score=initial_score,
        )

    finally:
        if cleanup:
            cleanup_temp(temp_files, temp_dir)
275
+
276
+
277
def discover_solution(
    evaluator: Callable[[str], Dict[str, Any]],
    initial_solution: Optional[str] = None,
    iterations: int = 100,
    search: Optional[str] = None,
    model: Optional[str] = None,
    **kwargs: Any,
) -> DiscoveryResult:
    """Evolve a string solution using a callable evaluator.

    Thin convenience wrapper around ``run_discovery``: ``initial_solution``
    is forwarded as ``initial_program`` and every extra keyword argument is
    passed through unchanged.

    Args:
        evaluator: Callable that receives a string and returns a metrics dict.
        initial_solution: Starting solution text, or None.
        iterations: Maximum number of iterations (defaults to 100 here).
        search: Search algorithm name, or None.
        model: Model name(s), or None.
        **kwargs: Any further ``run_discovery`` keyword arguments.

    Returns:
        The ``DiscoveryResult`` produced by ``run_discovery``.
    """
    named_args = {
        "evaluator": evaluator,
        "initial_program": initial_solution,
        "iterations": iterations,
        "search": search,
        "model": model,
    }
    return run_discovery(**named_args, **kwargs)
skydiscover/benchmarks/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Benchmark resolver system for external problem sources."""
2
+
3
+ from skydiscover.benchmarks.base import BenchmarkResolver
4
+
5
+ __all__ = ["BenchmarkResolver"]
skydiscover/benchmarks/base.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base interface for benchmark resolvers.
2
+
3
+ Benchmark resolvers fetch problems from external sources (e.g., datasets, APIs)
4
+ and generate the necessary files (initial_program, evaluator configuration) for
5
+ SkyDiscover to run optimization on them.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from pathlib import Path
10
+ from typing import Any, Dict
11
+
12
+ from skydiscover.benchmarks.resolution import BenchmarkResolution
13
+
14
+
15
class BenchmarkResolver(ABC):
    """Base class for benchmark-specific problem resolvers.

    Resolvers are responsible for:
    1. Fetching problem specifications from external sources
    2. Generating initial_program files with appropriate structure
    3. Configuring evaluators (via environment variables or generated files)

    Example usage (``resolve`` returns a single ``BenchmarkResolution``
    object, not a tuple):
        resolver = KernelBenchResolver()
        resolution = resolver.resolve(
            config={'level': 1, 'problem_id': 3},
            output_dir=Path('/tmp/skydiscover_kernelbench_123')
        )
        initial_program = resolution.initial_program_path
        evaluator = resolution.evaluator_path
        env_vars = resolution.evaluator_env_vars
    """

    @abstractmethod
    def resolve(self, config: Dict[str, Any], output_dir: Path) -> BenchmarkResolution:
        """Resolve a benchmark problem to concrete file paths and evaluator config.

        Args:
            config: Benchmark configuration dictionary containing benchmark-specific
                problem specifications and parameters.
                The exact keys depend on the benchmark implementation.
            output_dir: Directory where generated files should be placed.

        Returns:
            BenchmarkResolution containing:
            - initial_program_path: Path to the generated initial program file
            - evaluator_path: Path to the evaluator (file or directory)
            - evaluator_env_vars: Per-run environment variables for the evaluator
        """
        pass
skydiscover/benchmarks/resolution.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Benchmark resolution helpers."""
2
+
3
+ import importlib
4
+ import os
5
+ import sys
6
+ import tempfile
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any, Dict
10
+
11
+
12
@dataclass
class BenchmarkResolution:
    """Resolved benchmark assets and evaluator-scoped configuration."""

    # Path of the initial program produced for the benchmark problem.
    initial_program_path: str
    # Path of the evaluator to run against candidate programs.
    evaluator_path: str
    # Environment variables scoped to the evaluator; empty by default.
    evaluator_env_vars: Dict[str, str] = field(default_factory=dict)


def resolve_benchmark_problem(benchmark_config: Any) -> BenchmarkResolution:
    """Load a benchmark problem through the resolver named in the config.

    The module named by ``benchmark_config.resolver`` is imported and its
    module-level ``resolver`` object is asked to resolve the problem into a
    freshly created temporary directory.

    Args:
        benchmark_config: Object exposing ``resolver`` (dotted module path),
            and optionally ``name`` and ``params``.

    Returns:
        Whatever the resolver's ``resolve(config=..., output_dir=...)`` call
        returns (annotated as ``BenchmarkResolution``).

    Raises:
        ValueError: If ``benchmark_config.resolver`` is missing or empty.
        AttributeError: If the imported module has no ``resolver`` attribute.
        ImportError: If the resolver module cannot be imported.
    """
    resolver_path = getattr(benchmark_config, "resolver", None)
    if not resolver_path:
        raise ValueError("BenchmarkConfig.resolver must be set to use benchmark loading")

    # Make the current working directory importable — presumably so resolver
    # modules that live under it (rather than in an installed package) resolve.
    cwd = os.getcwd()
    if cwd not in sys.path:
        sys.path.insert(0, cwd)

    resolver_module = importlib.import_module(resolver_path)
    try:
        resolver = resolver_module.resolver
    except AttributeError as exc:
        # Same exception type as before, but say what the module must provide.
        raise AttributeError(
            f"Benchmark resolver module '{resolver_path}' must define a "
            "module-level 'resolver' object"
        ) from exc

    benchmark_name = getattr(benchmark_config, "name", None) or "benchmark"
    output_dir = Path(tempfile.mkdtemp(prefix=f"skydiscover_{benchmark_name}_"))

    # Treat a missing or None ``params`` as "no parameters" so resolvers
    # always receive a mapping.
    params = getattr(benchmark_config, "params", None) or {}
    return resolver.resolve(config=params, output_dir=output_dir)
skydiscover/cli.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Command-line interface for SkyDiscover."""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import logging
6
+ import multiprocessing
7
+ import os
8
+ import sys
9
+ import traceback
10
+ from typing import Optional
11
+
12
+ from skydiscover import Runner
13
+ from skydiscover.benchmarks.resolution import resolve_benchmark_problem
14
+ from skydiscover.config import _parse_model_spec, apply_overrides, load_config
15
+
16
+ try:
17
+ multiprocessing.set_start_method("spawn")
18
+ except RuntimeError:
19
+ pass
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ _SEARCH_CHOICES = [
24
+ "evox",
25
+ "adaevolve",
26
+ "best_of_n",
27
+ "beam_search",
28
+ "topk",
29
+ "openevolve_native",
30
+ "openevolve",
31
+ "shinkaevolve",
32
+ "gepa",
33
+ "gepa_native",
34
+ "claude_code",
35
+ ]
36
+
37
+
38
def parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    """Build the CLI argument parser and parse the given arguments.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads them from ``sys.argv``, which is the original behaviour;
            passing a list lets callers drive the parser programmatically.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="SkyDiscover - AI-Driven Scientific and Algorithmic Discovery",
    )

    # Positionals: the initial program may be omitted, the evaluator may not.
    parser.add_argument(
        "initial_program",
        nargs="?",
        default=None,
        help="Path to the initial program file (can be optional)",
    )
    parser.add_argument(
        "evaluation_file",
        help=(
            "Evaluator: path to a Python file (must define evaluate()) "
            "or a benchmark directory containing Dockerfile + evaluate.sh"
        ),
    )
    parser.add_argument("--config", "-c", help="Path to configuration file (YAML)", default=None)
    parser.add_argument("--output", "-o", help="Output directory for results", default=None)
    parser.add_argument(
        "--iterations", "-i", type=int, default=None, help="Maximum number of iterations"
    )
    parser.add_argument(
        "--log-level",
        "-l",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default=None,
        help="Logging level",
    )
    parser.add_argument(
        "--checkpoint",
        default=None,
        help="Path to a checkpoint directory to resume from",
    )
    parser.add_argument("--api-base", default=None, help="Base URL for the LLM API")
    parser.add_argument(
        "--agentic",
        action="store_true",
        default=False,
        help="Enable agentic mode (codebase root derived from initial program location)",
    )
    parser.add_argument(
        "--model",
        "-m",
        default=None,
        help="LLM model(s) for solution generation, comma-separated (e.g. 'gpt-5', 'gpt-5,gemini/gemini-3-pro')",
    )
    parser.add_argument(
        "--search",
        "-s",
        choices=_SEARCH_CHOICES,
        default=None,
        help="Search algorithm to use",
    )

    return parser.parse_args(argv)
96
+
97
+
98
def main() -> int:
    """Run the async CLI entry point to completion and return its exit code."""
    exit_code = asyncio.run(main_async())
    return exit_code
101
+
102
+
103
+ async def main_async() -> int:
104
+ """Async entry point for the CLI. Returns exit code."""
105
+ args = parse_args()
106
+ _configure_logging(args.log_level)
107
+
108
+ if args.initial_program and not os.path.exists(args.initial_program):
109
+ print(f"Error: Initial program file '{args.initial_program}' not found", file=sys.stderr)
110
+ return 1
111
+ if not os.path.exists(args.evaluation_file):
112
+ print(f"Error: Evaluation file '{args.evaluation_file}' not found", file=sys.stderr)
113
+ return 1
114
+
115
+ has_overrides = any((args.api_base, args.model, args.agentic, args.search))
116
+ config = None
117
+ evaluator_env_vars: Optional[dict[str, str]] = None
118
+
119
+ # Load the configuration
120
+ if args.config or has_overrides:
121
+ config = load_config(args.config)
122
+
123
+ evaluator_env_vars = None
124
+
125
+ try:
126
+ apply_overrides(
127
+ config,
128
+ model=args.model,
129
+ api_base=args.api_base,
130
+ agentic=args.agentic,
131
+ search=args.search,
132
+ )
133
+ except ValueError as exc:
134
+ print(f"Error: {exc}", file=sys.stderr)
135
+ return 1
136
+
137
+ # Resolve benchmark problem if configured and no initial_program provided
138
+ if args.initial_program is None and config.benchmark and config.benchmark.enabled:
139
+ try:
140
+ resolution = resolve_benchmark_problem(config.benchmark)
141
+ args.initial_program = resolution.initial_program_path
142
+ args.evaluation_file = resolution.evaluator_path
143
+ evaluator_env_vars = resolution.evaluator_env_vars
144
+ print(
145
+ f"[Benchmark Loader] Benchmark: {config.benchmark.name}, Initial program: {args.initial_program}, Evaluator: {args.evaluation_file}"
146
+ )
147
+ except Exception as exc:
148
+ print(f"Error: Failed to load benchmark problem: {exc}", file=sys.stderr)
149
+ traceback.print_exc()
150
+ return 1
151
+
152
+ if args.model:
153
+ print("Active models:")
154
+ for i, m in enumerate(config.llm.models):
155
+ provider, *_ = _parse_model_spec(m.name)
156
+ print(f" {i + 1}. {m.name} (provider: {provider}, weight: {m.weight})")
157
+ if args.api_base:
158
+ print(f"Using API base: {config.llm.api_base}")
159
+ if args.agentic:
160
+ if not config.agentic.codebase_root and args.initial_program:
161
+ config.agentic.codebase_root = os.path.dirname(
162
+ os.path.abspath(args.initial_program)
163
+ )
164
+ print(f"Agentic mode enabled (codebase: {config.agentic.codebase_root})")
165
+ if args.search:
166
+ print(f"Using search algorithm: {args.search}")
167
+
168
+ # Run the discovery
169
+ try:
170
+ search_type = config.search.type if config and hasattr(config, "search") else None
171
+
172
+ if search_type:
173
+ from skydiscover.extras.external import (
174
+ KNOWN_EXTERNAL,
175
+ get_package_name,
176
+ get_runner,
177
+ is_external,
178
+ )
179
+
180
+ # External backends (openevolve, shinkaevolve, gepa)
181
+ if is_external(search_type):
182
+ if evaluator_env_vars:
183
+ env_var_names = ", ".join(sorted(evaluator_env_vars))
184
+ print(
185
+ "Error: Passing evaluator environment variables to external backends "
186
+ "is not yet supported. "
187
+ f"External backend '{search_type}' cannot be used with evaluator env vars: "
188
+ f"{env_var_names}",
189
+ file=sys.stderr,
190
+ )
191
+ return 1
192
+
193
+ from skydiscover.config import build_output_dir
194
+
195
+ output_dir = args.output or build_output_dir(
196
+ search_type, args.initial_program or "scratch"
197
+ )
198
+ os.makedirs(output_dir, exist_ok=True)
199
+
200
+ from skydiscover.extras.monitor import start_monitor, stop_monitor
201
+
202
+ # Start monitor for external backends as well
203
+ monitor_server, monitor_callback, feedback_reader = start_monitor(
204
+ config, output_dir
205
+ )
206
+ try:
207
+ result = await get_runner(search_type)(
208
+ program_path=args.initial_program,
209
+ evaluator_path=args.evaluation_file,
210
+ config_obj=config,
211
+ iterations=args.iterations or config.max_iterations,
212
+ output_dir=output_dir,
213
+ monitor_callback=monitor_callback,
214
+ feedback_reader=feedback_reader,
215
+ )
216
+ except ModuleNotFoundError as exc:
217
+ pkg = get_package_name(search_type)
218
+ print(f"Error: {exc}", file=sys.stderr)
219
+ print(f"\nThe '{search_type}' backend requires its package.", file=sys.stderr)
220
+ print(f"Install with: pip install {pkg}", file=sys.stderr)
221
+ return 1
222
+ finally:
223
+ stop_monitor(monitor_server)
224
+
225
+ print(f"\nDiscovery complete! Best score: {result.best_score:.4f}")
226
+ return 0
227
+
228
+ if search_type in KNOWN_EXTERNAL:
229
+ pkg = get_package_name(search_type)
230
+ print(
231
+ f"Error: Search type '{search_type}' requires the '{pkg}' package. "
232
+ f"Install with: pip install {pkg}",
233
+ file=sys.stderr,
234
+ )
235
+ return 1
236
+
237
+ # Initialize the runner
238
+ runner = Runner(
239
+ initial_program_path=args.initial_program,
240
+ evaluation_file=args.evaluation_file,
241
+ config=config,
242
+ config_path=args.config if config is None else None,
243
+ output_dir=args.output,
244
+ evaluator_env_vars=evaluator_env_vars,
245
+ )
246
+
247
+ # Load the checkpoint if provided
248
+ if args.checkpoint:
249
+ if not os.path.exists(args.checkpoint):
250
+ print(f"Error: Checkpoint directory '{args.checkpoint}' not found", file=sys.stderr)
251
+ return 1
252
+ print(f"Will resume from checkpoint: {args.checkpoint}")
253
+
254
+ # Run the discovery
255
+ best_program = await runner.run(
256
+ iterations=args.iterations,
257
+ checkpoint_path=args.checkpoint,
258
+ )
259
+
260
+ checkpoint_dir = os.path.join(runner.output_dir, "checkpoints")
261
+ latest_checkpoint = _find_latest_checkpoint(checkpoint_dir)
262
+
263
+ print("\nDiscovery complete!")
264
+ if best_program is None:
265
+ print("No valid programs were found.")
266
+ else:
267
+ print("Best program metrics:")
268
+ for name, value in best_program.metrics.items():
269
+ formatted = f"{value:.4f}" if isinstance(value, (int, float)) else str(value)
270
+ print(f" {name}: {formatted}")
271
+
272
+ if latest_checkpoint:
273
+ print(f"\nLatest checkpoint: {latest_checkpoint}")
274
+ print(f"To resume: --checkpoint {latest_checkpoint}")
275
+
276
+ return 0
277
+
278
+ except Exception as exc:
279
+ print(f"Error: {exc}", file=sys.stderr)
280
+ traceback.print_exc()
281
+ return 1
282
+
283
+
284
def _configure_logging(level_name: Optional[str]) -> None:
    """Configure the root logger for SkyDiscover-style console output.

    Args:
        level_name: Name of a ``logging`` level attribute (for example
            ``"DEBUG"``) to apply to the root logger.  A falsy value selects
            ``logging.WARNING``.
    """
    from skydiscover.search.utils.logging_utils import _ConsoleFilter, _ConsoleFormatter

    if level_name:
        chosen_level = getattr(logging, level_name)
    else:
        chosen_level = logging.WARNING

    root_logger = logging.getLogger()
    root_logger.setLevel(chosen_level)

    # Only install the console handler when the root logger has none yet, so
    # repeated calls do not stack duplicate handlers.
    if not root_logger.handlers:
        console = logging.StreamHandler()
        console.addFilter(_ConsoleFilter())
        console.setFormatter(_ConsoleFormatter())
        root_logger.addHandler(console)

    # NOTE(review): the source's indentation was lost in extraction; this line
    # is assumed to run unconditionally (outside the handler guard) - confirm.
    logging.getLogger("skydiscover").setLevel(logging.INFO)
297
+
298
+
299
def _find_latest_checkpoint(checkpoint_dir: str) -> Optional[str]:
    """Return the path of the newest ``checkpoint_<n>`` directory, if any.

    Only direct subdirectories of ``checkpoint_dir`` whose name is exactly
    ``checkpoint_`` followed by ASCII digits are considered; the one with the
    largest ``<n>`` wins.  Anything else (plain files, or directories such as
    ``backup_3`` or ``2024``) is ignored so that a stray directory is never
    reported as a resumable checkpoint.

    Args:
        checkpoint_dir: Directory expected to contain the checkpoint folders.

    Returns:
        ``checkpoint_dir`` joined with the name of the matching directory that
        has the highest iteration number, or ``None`` when ``checkpoint_dir``
        is not a directory or contains no matching entry.
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    prefix = "checkpoint_"

    def parse_iteration(name: str) -> Optional[int]:
        """Return ``<n>`` for a ``checkpoint_<n>`` name, else ``None``."""
        if not name.startswith(prefix):
            return None
        digits = name[len(prefix):]
        # isascii() rejects characters (e.g. superscript digits) that
        # str.isdigit() accepts but int() cannot parse.
        if not (digits.isascii() and digits.isdigit()):
            return None
        return int(digits)

    best_iteration = -1
    best_path = None
    for name in os.listdir(checkpoint_dir):
        iteration = parse_iteration(name)
        # Strict comparison keeps the first entry seen when two names map to
        # the same iteration number.
        if iteration is None or iteration <= best_iteration:
            continue
        full_path = os.path.join(checkpoint_dir, name)
        if not os.path.isdir(full_path):
            continue
        best_iteration = iteration
        best_path = full_path

    return best_path
324
+
325
+
326
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())