Spaces:
Sleeping
Sleeping
Deploy OpenClaw PR API
Browse files
- README.md +6 -0
- pyproject.toml +2 -9
- src/slop_farmer.egg-info/PKG-INFO +34 -18
- src/slop_farmer.egg-info/SOURCES.txt +7 -0
- src/slop_farmer.egg-info/requires.txt +1 -1
- src/slop_farmer/__init__.py +1 -1
- src/slop_farmer/app/analysis_id.py +81 -0
- src/slop_farmer/app/cli.py +437 -136
- src/slop_farmer/app/dataset_refresh.py +36 -8
- src/slop_farmer/app/dataset_status.py +161 -27
- src/slop_farmer/app/deploy.py +15 -4
- src/slop_farmer/app/hf_checkpoint_import.py +14 -72
- src/slop_farmer/app/pipeline.py +12 -101
- src/slop_farmer/app/pr_search.py +74 -0
- src/slop_farmer/app/pr_search_api.py +191 -8
- src/slop_farmer/app/publish_analysis.py +366 -0
- src/slop_farmer/app/publish_dataset_snapshot.py +62 -0
- src/slop_farmer/app/save_cache.py +115 -0
- src/slop_farmer/app_config.py +28 -17
- src/slop_farmer/config.py +57 -23
- src/slop_farmer/data/search_duckdb.py +146 -0
- src/slop_farmer/data/snapshot_materialize.py +272 -177
- src/slop_farmer/data/snapshot_paths.py +385 -3
- src/slop_farmer/reports/analysis.py +418 -217
- src/slop_farmer/reports/analysis_service.py +165 -58
- src/slop_farmer/reports/dashboard.py +53 -3
- src/slop_farmer/reports/new_contributor_report.py +11 -3
- src/slop_farmer/reports/pr_scope.py +9 -16
- src/slop_farmer/reports/pr_search_scope.py +12 -16
- src/slop_farmer/reports/pr_search_service.py +166 -1
- src/slop_farmer/reports/read_views.py +67 -18
- uv.lock +36 -36
README.md
CHANGED
|
@@ -20,6 +20,12 @@ tags:
|
|
| 20 |
|
| 21 |
Machine-oriented API for PR similarity search.
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
Defaults for this deployment:
|
| 24 |
|
| 25 |
- repo: `openclaw/openclaw`
|
|
|
|
| 20 |
|
| 21 |
Machine-oriented API for PR similarity search.
|
| 22 |
|
| 23 |
+
Canonical storage roles:
|
| 24 |
+
|
| 25 |
+
- dataset repo: published latest state and canonical current analysis
|
| 26 |
+
- mounted bucket: mutable operational cache only
|
| 27 |
+
- Space disk: ephemeral runtime storage
|
| 28 |
+
|
| 29 |
Defaults for this deployment:
|
| 30 |
|
| 31 |
- repo: `openclaw/openclaw`
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "slop-farmer"
|
| 7 |
-
version = "0.1.
|
| 8 |
description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.13.5"
|
|
@@ -12,7 +12,7 @@ dependencies = [
|
|
| 12 |
"duckdb>=1.2.2",
|
| 13 |
"pyarrow>=18.0.0",
|
| 14 |
"fastapi>=0.115.0",
|
| 15 |
-
"huggingface_hub>=
|
| 16 |
"pydantic>=2.11",
|
| 17 |
"PyYAML>=6.0.2",
|
| 18 |
"rank-bm25>=0.2.2",
|
|
@@ -60,13 +60,6 @@ select = [
|
|
| 60 |
]
|
| 61 |
ignore = ["E501"]
|
| 62 |
|
| 63 |
-
[tool.slop-farmer.analyze]
|
| 64 |
-
output-dir = "eval_data"
|
| 65 |
-
hf-repo-id = "evalstate/transformers-pr"
|
| 66 |
-
ranking-backend = "hybrid"
|
| 67 |
-
model = "gpt-5.4-mini"
|
| 68 |
-
max-clusters = 10
|
| 69 |
-
|
| 70 |
[tool.slop-farmer.dashboard-data]
|
| 71 |
output-dir = "web/public/data"
|
| 72 |
window-days = 14
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "slop-farmer"
|
| 7 |
+
version = "0.1.1"
|
| 8 |
description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.13.5"
|
|
|
|
| 12 |
"duckdb>=1.2.2",
|
| 13 |
"pyarrow>=18.0.0",
|
| 14 |
"fastapi>=0.115.0",
|
| 15 |
+
"huggingface_hub>=1.11.0",
|
| 16 |
"pydantic>=2.11",
|
| 17 |
"PyYAML>=6.0.2",
|
| 18 |
"rank-bm25>=0.2.2",
|
|
|
|
| 60 |
]
|
| 61 |
ignore = ["E501"]
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
[tool.slop-farmer.dashboard-data]
|
| 64 |
output-dir = "web/public/data"
|
| 65 |
window-days = 14
|
src/slop_farmer.egg-info/PKG-INFO
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
Metadata-Version: 2.4
|
| 2 |
Name: slop-farmer
|
| 3 |
-
Version: 0.1.
|
| 4 |
Summary: GitHub-to-Hub data pipeline for transformers issue and PR triage research.
|
| 5 |
Requires-Python: >=3.13.5
|
| 6 |
Description-Content-Type: text/markdown
|
| 7 |
Requires-Dist: duckdb>=1.2.2
|
| 8 |
Requires-Dist: pyarrow>=18.0.0
|
| 9 |
Requires-Dist: fastapi>=0.115.0
|
| 10 |
-
Requires-Dist: huggingface_hub>=
|
| 11 |
Requires-Dist: pydantic>=2.11
|
| 12 |
Requires-Dist: PyYAML>=6.0.2
|
| 13 |
Requires-Dist: rank-bm25>=0.2.2
|
|
@@ -126,18 +126,25 @@ Authentication defaults:
|
|
| 126 |
- GitHub: `GITHUB_TOKEN`, then `gh auth token`
|
| 127 |
- Hugging Face: `HF_TOKEN`, otherwise existing `hf auth` login
|
| 128 |
|
| 129 |
-
##
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
```bash
|
| 135 |
-
scripts/
|
| 136 |
```
|
| 137 |
|
| 138 |
By default this creates a scheduled HF Job that:
|
| 139 |
|
| 140 |
-
- reads `configs/transformers.yaml`
|
| 141 |
- refreshes `dataset_id` incrementally against the current Hub dataset state
|
| 142 |
- regenerates the new contributor report
|
| 143 |
- uploads the updated snapshot back to the dataset repo
|
|
@@ -146,20 +153,28 @@ Useful overrides:
|
|
| 146 |
|
| 147 |
```bash
|
| 148 |
# fire once immediately instead of creating a schedule
|
| 149 |
-
MODE=run scripts/
|
| 150 |
|
| 151 |
# change the cron schedule
|
| 152 |
-
SCHEDULE="0 */6 * * *" scripts/
|
| 153 |
|
| 154 |
# optionally mount a writable HF bucket for temp files
|
| 155 |
SCRATCH_BUCKET=evalstate/slop-farmer-scratch \
|
| 156 |
-
scripts/
|
| 157 |
```
|
| 158 |
|
| 159 |
Buckets are best treated here as optional scratch space via `TMPDIR`, not as the canonical
|
| 160 |
published dataset. The repo's local analysis and PR-scope tooling already knows how to
|
| 161 |
materialize versioned Hub **dataset repos**; it does not currently read HF buckets directly.
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
## Analyze a Hub dataset
|
| 164 |
|
| 165 |
You can analyze the published Hugging Face dataset directly without scraping GitHub again:
|
|
@@ -176,15 +191,12 @@ This materializes the dataset-viewer parquet export into a local snapshot cache
|
|
| 176 |
|
| 177 |
Repo-local defaults for `analyze` can be stored in `pyproject.toml` under `[tool.slop-farmer.analyze]`. This repo currently defaults to:
|
| 178 |
|
| 179 |
-
- `output-dir = "
|
| 180 |
-
- `hf-repo-id = "evalstate/transformers-pr"`
|
| 181 |
-
- `ranking-backend = "hybrid"`
|
| 182 |
-
- `model = "gpt-5-mini?reasoning=low"`
|
| 183 |
|
| 184 |
-
|
| 185 |
|
| 186 |
```bash
|
| 187 |
-
uv run slop-farmer analyze
|
| 188 |
```
|
| 189 |
|
| 190 |
## Cluster open PRs by code scope
|
|
@@ -353,15 +365,19 @@ scrape:
|
|
| 353 |
Then commands stay aligned without repeating repo/workspace/window settings:
|
| 354 |
|
| 355 |
```bash
|
| 356 |
-
uv run slop-farmer --config configs/diffusers.yaml
|
| 357 |
uv run slop-farmer --config configs/diffusers.yaml analyze
|
| 358 |
uv run slop-farmer --config configs/diffusers.yaml pr-scope
|
|
|
|
| 359 |
uv run slop-farmer --config configs/diffusers.yaml new-contributor-report
|
| 360 |
uv run slop-farmer --config configs/diffusers.yaml dashboard-data
|
| 361 |
-
uv run slop-farmer --config configs/diffusers.yaml publish-snapshot
|
| 362 |
uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
|
|
|
|
| 363 |
```
|
| 364 |
|
|
|
|
|
|
|
|
|
|
| 365 |
If you run `analyze` before `publish-snapshot`, the uploaded snapshot will also include
|
| 366 |
`analysis-state/`, which makes the hybrid cache portable across machines and reusable in
|
| 367 |
later snapshots when `analysis.cached_analysis: true` is enabled.
|
|
|
|
| 1 |
Metadata-Version: 2.4
|
| 2 |
Name: slop-farmer
|
| 3 |
+
Version: 0.1.1
|
| 4 |
Summary: GitHub-to-Hub data pipeline for transformers issue and PR triage research.
|
| 5 |
Requires-Python: >=3.13.5
|
| 6 |
Description-Content-Type: text/markdown
|
| 7 |
Requires-Dist: duckdb>=1.2.2
|
| 8 |
Requires-Dist: pyarrow>=18.0.0
|
| 9 |
Requires-Dist: fastapi>=0.115.0
|
| 10 |
+
Requires-Dist: huggingface_hub>=1.11.0
|
| 11 |
Requires-Dist: pydantic>=2.11
|
| 12 |
Requires-Dist: PyYAML>=6.0.2
|
| 13 |
Requires-Dist: rank-bm25>=0.2.2
|
|
|
|
| 126 |
- GitHub: `GITHUB_TOKEN`, then `gh auth token`
|
| 127 |
- Hugging Face: `HF_TOKEN`, otherwise existing `hf auth` login
|
| 128 |
|
| 129 |
+
## Canonical dataset upkeep
|
| 130 |
|
| 131 |
+
`dataset_id` is the canonical latest dataset repo.
|
| 132 |
+
|
| 133 |
+
Use the remote-first writer:
|
| 134 |
+
|
| 135 |
+
```bash
|
| 136 |
+
uv run slop-farmer --config configs/transformers.yaml refresh-dataset
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
Or submit the generic HF Job wrapper:
|
| 140 |
|
| 141 |
```bash
|
| 142 |
+
scripts/submit_dataset_job.sh
|
| 143 |
```
|
| 144 |
|
| 145 |
By default this creates a scheduled HF Job that:
|
| 146 |
|
| 147 |
+
- reads `CONFIG_PATH` (defaults to `configs/transformers.yaml`)
|
| 148 |
- refreshes `dataset_id` incrementally against the current Hub dataset state
|
| 149 |
- regenerates the new contributor report
|
| 150 |
- uploads the updated snapshot back to the dataset repo
|
|
|
|
| 153 |
|
| 154 |
```bash
|
| 155 |
# fire once immediately instead of creating a schedule
|
| 156 |
+
MODE=run scripts/submit_dataset_job.sh
|
| 157 |
|
| 158 |
# change the cron schedule
|
| 159 |
+
SCHEDULE="0 */6 * * *" scripts/submit_dataset_job.sh
|
| 160 |
|
| 161 |
# optionally mount a writable HF bucket for temp files
|
| 162 |
SCRATCH_BUCKET=evalstate/slop-farmer-scratch \
|
| 163 |
+
scripts/submit_dataset_job.sh
|
| 164 |
```
|
| 165 |
|
| 166 |
Buckets are best treated here as optional scratch space via `TMPDIR`, not as the canonical
|
| 167 |
published dataset. The repo's local analysis and PR-scope tooling already knows how to
|
| 168 |
materialize versioned Hub **dataset repos**; it does not currently read HF buckets directly.
|
| 169 |
|
| 170 |
+
Compatibility wrappers remain available:
|
| 171 |
+
|
| 172 |
+
- `scripts/submit_transformers_dataset_job.sh`
|
| 173 |
+
- `scripts/submit_openclaw_dataset_job.sh`
|
| 174 |
+
|
| 175 |
+
For the current storage model and recommended modes, see
|
| 176 |
+
[`docs/data-architecture.md`](docs/data-architecture.md).
|
| 177 |
+
|
| 178 |
## Analyze a Hub dataset
|
| 179 |
|
| 180 |
You can analyze the published Hugging Face dataset directly without scraping GitHub again:
|
|
|
|
| 191 |
|
| 192 |
Repo-local command defaults can be stored in `pyproject.toml` under `[tool.slop-farmer.<command>]` tables (for example `[tool.slop-farmer.analyze]` or `[tool.slop-farmer.dashboard-data]`). This repo currently defaults to:
|
| 193 |
|
| 194 |
+
- `dashboard-data.output-dir = "web/public/data"`
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
+
For repo-specific remote-first analysis, prefer a YAML config with `dataset_id`, e.g.:
|
| 197 |
|
| 198 |
```bash
|
| 199 |
+
uv run slop-farmer --config configs/openclaw.yaml analyze
|
| 200 |
```
|
| 201 |
|
| 202 |
## Cluster open PRs by code scope
|
|
|
|
| 365 |
Then commands stay aligned without repeating repo/workspace/window settings:
|
| 366 |
|
| 367 |
```bash
|
| 368 |
+
uv run slop-farmer --config configs/diffusers.yaml refresh-dataset
|
| 369 |
uv run slop-farmer --config configs/diffusers.yaml analyze
|
| 370 |
uv run slop-farmer --config configs/diffusers.yaml pr-scope
|
| 371 |
+
uv run slop-farmer --config configs/diffusers.yaml pr-search refresh
|
| 372 |
uv run slop-farmer --config configs/diffusers.yaml new-contributor-report
|
| 373 |
uv run slop-farmer --config configs/diffusers.yaml dashboard-data
|
|
|
|
| 374 |
uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
|
| 375 |
+
uv run slop-farmer --config configs/diffusers.yaml dataset-status
|
| 376 |
```
|
| 377 |
|
| 378 |
+
Those reader commands default to `dataset_id` when configured. Pass `--snapshot-dir` to force
|
| 379 |
+
an explicit local snapshot instead.
|
| 380 |
+
|
| 381 |
If you run `analyze` before `publish-snapshot`, the uploaded snapshot will also include
|
| 382 |
`analysis-state/`, which makes the hybrid cache portable across machines and reusable in
|
| 383 |
later snapshots when `analysis.cached_analysis: true` is enabled.
|
src/slop_farmer.egg-info/SOURCES.txt
CHANGED
|
@@ -11,6 +11,8 @@ src/slop_farmer.egg-info/requires.txt
|
|
| 11 |
src/slop_farmer.egg-info/top_level.txt
|
| 12 |
src/slop_farmer/app/__init__.py
|
| 13 |
src/slop_farmer/app/cli.py
|
|
|
|
|
|
|
| 14 |
src/slop_farmer/app/deploy.py
|
| 15 |
src/slop_farmer/app/duplicate_prs.py
|
| 16 |
src/slop_farmer/app/hf_checkpoint_import.py
|
|
@@ -21,8 +23,10 @@ src/slop_farmer/app/publish.py
|
|
| 21 |
src/slop_farmer/app/snapshot_state.py
|
| 22 |
src/slop_farmer/app/workflow.py
|
| 23 |
src/slop_farmer/data/__init__.py
|
|
|
|
| 24 |
src/slop_farmer/data/ghreplica_api.py
|
| 25 |
src/slop_farmer/data/github_api.py
|
|
|
|
| 26 |
src/slop_farmer/data/http.py
|
| 27 |
src/slop_farmer/data/links.py
|
| 28 |
src/slop_farmer/data/normalize.py
|
|
@@ -30,9 +34,11 @@ src/slop_farmer/data/parquet_io.py
|
|
| 30 |
src/slop_farmer/data/search_duckdb.py
|
| 31 |
src/slop_farmer/data/snapshot_materialize.py
|
| 32 |
src/slop_farmer/data/snapshot_paths.py
|
|
|
|
| 33 |
src/slop_farmer/reports/__init__.py
|
| 34 |
src/slop_farmer/reports/analysis.py
|
| 35 |
src/slop_farmer/reports/analysis_cache.py
|
|
|
|
| 36 |
src/slop_farmer/reports/canonical_duplicate_pr.py
|
| 37 |
src/slop_farmer/reports/dashboard.py
|
| 38 |
src/slop_farmer/reports/duplicate_prs.py
|
|
@@ -49,6 +55,7 @@ tests/test_canonical_duplicate_pr.py
|
|
| 49 |
tests/test_cli.py
|
| 50 |
tests/test_config.py
|
| 51 |
tests/test_dashboard.py
|
|
|
|
| 52 |
tests/test_farmer_setup_assets.py
|
| 53 |
tests/test_ghreplica_api.py
|
| 54 |
tests/test_github_api.py
|
|
|
|
| 11 |
src/slop_farmer.egg-info/top_level.txt
|
| 12 |
src/slop_farmer/app/__init__.py
|
| 13 |
src/slop_farmer/app/cli.py
|
| 14 |
+
src/slop_farmer/app/dataset_refresh.py
|
| 15 |
+
src/slop_farmer/app/dataset_status.py
|
| 16 |
src/slop_farmer/app/deploy.py
|
| 17 |
src/slop_farmer/app/duplicate_prs.py
|
| 18 |
src/slop_farmer/app/hf_checkpoint_import.py
|
|
|
|
| 23 |
src/slop_farmer/app/snapshot_state.py
|
| 24 |
src/slop_farmer/app/workflow.py
|
| 25 |
src/slop_farmer/data/__init__.py
|
| 26 |
+
src/slop_farmer/data/dataset_card.py
|
| 27 |
src/slop_farmer/data/ghreplica_api.py
|
| 28 |
src/slop_farmer/data/github_api.py
|
| 29 |
+
src/slop_farmer/data/hf_dataset_repo.py
|
| 30 |
src/slop_farmer/data/http.py
|
| 31 |
src/slop_farmer/data/links.py
|
| 32 |
src/slop_farmer/data/normalize.py
|
|
|
|
| 34 |
src/slop_farmer/data/search_duckdb.py
|
| 35 |
src/slop_farmer/data/snapshot_materialize.py
|
| 36 |
src/slop_farmer/data/snapshot_paths.py
|
| 37 |
+
src/slop_farmer/data/snapshot_source.py
|
| 38 |
src/slop_farmer/reports/__init__.py
|
| 39 |
src/slop_farmer/reports/analysis.py
|
| 40 |
src/slop_farmer/reports/analysis_cache.py
|
| 41 |
+
src/slop_farmer/reports/analysis_service.py
|
| 42 |
src/slop_farmer/reports/canonical_duplicate_pr.py
|
| 43 |
src/slop_farmer/reports/dashboard.py
|
| 44 |
src/slop_farmer/reports/duplicate_prs.py
|
|
|
|
| 55 |
tests/test_cli.py
|
| 56 |
tests/test_config.py
|
| 57 |
tests/test_dashboard.py
|
| 58 |
+
tests/test_dataset_status.py
|
| 59 |
tests/test_farmer_setup_assets.py
|
| 60 |
tests/test_ghreplica_api.py
|
| 61 |
tests/test_github_api.py
|
src/slop_farmer.egg-info/requires.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
duckdb>=1.2.2
|
| 2 |
pyarrow>=18.0.0
|
| 3 |
fastapi>=0.115.0
|
| 4 |
-
huggingface_hub>=
|
| 5 |
pydantic>=2.11
|
| 6 |
PyYAML>=6.0.2
|
| 7 |
rank-bm25>=0.2.2
|
|
|
|
| 1 |
duckdb>=1.2.2
|
| 2 |
pyarrow>=18.0.0
|
| 3 |
fastapi>=0.115.0
|
| 4 |
+
huggingface_hub>=1.11.0
|
| 5 |
pydantic>=2.11
|
| 6 |
PyYAML>=6.0.2
|
| 7 |
rank-bm25>=0.2.2
|
src/slop_farmer/__init__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
__all__ = ["__version__"]
|
| 2 |
|
| 3 |
-
__version__ = "0.1.
|
|
|
|
| 1 |
__all__ = ["__version__"]
|
| 2 |
|
| 3 |
+
__version__ = "0.1.1"
|
src/slop_farmer/app/analysis_id.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from slop_farmer.app_config import command_defaults
|
| 7 |
+
from slop_farmer.data.parquet_io import read_json
|
| 8 |
+
from slop_farmer.data.snapshot_paths import ROOT_MANIFEST_FILENAME, resolve_snapshot_dir_from_output
|
| 9 |
+
|
| 10 |
+
# Matches each run of one or more characters outside lowercase ASCII letters and digits.
MODEL_SLUG_PATTERN = re.compile(r"[^a-z0-9]+")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def model_slug(model: str) -> str:
    """Return a compact, lowercase slug for a model string.

    Everything from the first ``?`` onward is ignored. The remaining text is
    trimmed and lower-cased, and every run of characters matched by
    ``MODEL_SLUG_PATTERN`` is removed (not replaced by a separator).

    Returns:
        The slug, or the literal ``"model"`` when nothing is left.
    """
    name, _, _ = model.partition("?")
    compact = MODEL_SLUG_PATTERN.sub("", name.strip().lower())
    if not compact:
        return "model"
    return compact
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_analysis_id(
    *,
    snapshot_id: str,
    model: str,
    ranking_backend: str,
    suffix: str | None = None,
) -> str:
    """Join backend, model, and snapshot identifiers into one analysis id.

    The id is ``<backend>-<model>-<snapshot>[-<suffix>]`` where:

    - the backend is trimmed, lower-cased, and stripped of every character
      matched by ``MODEL_SLUG_PATTERN``, falling back to ``"analysis"``;
    - the model part comes from ``model_slug``;
    - the snapshot id is only trimmed and lower-cased;
    - the optional suffix has each matched run collapsed to a single ``-``
      and leading/trailing ``-`` removed; it is omitted when that leaves
      nothing.
    """
    backend_slug = MODEL_SLUG_PATTERN.sub("", ranking_backend.strip().lower())
    segments = [
        backend_slug or "analysis",
        model_slug(model),
        snapshot_id.strip().lower(),
    ]
    suffix_slug = ""
    if suffix:
        suffix_slug = MODEL_SLUG_PATTERN.sub("-", suffix.strip().lower()).strip("-")
    if suffix_slug:
        segments.append(suffix_slug)
    return "-".join(segments)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def analysis_id_from_snapshot(
    *,
    snapshot_dir: Path,
    model: str,
    ranking_backend: str,
    suffix: str | None = None,
) -> str:
    """Build an analysis id for the snapshot stored in ``snapshot_dir``.

    The snapshot id is read from the ``snapshot_id`` key of the manifest file
    (``ROOT_MANIFEST_FILENAME`` inside ``snapshot_dir``) when that file exists
    and the value is truthy; otherwise the directory name is used.

    Raises:
        ValueError: If the manifest is not a JSON object, or the resolved
            snapshot id is empty after trimming.
    """
    manifest_file = snapshot_dir / ROOT_MANIFEST_FILENAME
    if manifest_file.exists():
        payload = read_json(manifest_file)
    else:
        payload = {}
    if not isinstance(payload, dict):
        raise ValueError(f"Snapshot manifest at {manifest_file} must contain a JSON object.")

    recorded_id = payload.get("snapshot_id")
    resolved_id = str(recorded_id if recorded_id else snapshot_dir.name).strip()
    if resolved_id:
        return build_analysis_id(
            snapshot_id=resolved_id,
            model=model,
            ranking_backend=ranking_backend,
            suffix=suffix,
        )
    raise ValueError(f"Could not resolve snapshot_id from {manifest_file}")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def analysis_id_from_config(
    *,
    config_path: Path,
    output_dir: Path | None = None,
    snapshot_dir: Path | None = None,
    model: str | None = None,
    ranking_backend: str | None = None,
    suffix: str | None = None,
) -> str:
    """Build an analysis id using the ``analyze`` command defaults as fallbacks.

    Explicit (truthy) arguments win. Otherwise values come from
    ``command_defaults("analyze", config_path=...)``, and finally from the
    hard-coded fallbacks ``"data"`` (output dir),
    ``"gpt-5.4-mini?service_tier=flex"`` (model) and ``"hybrid"`` (backend).
    The snapshot directory is resolved by ``resolve_snapshot_dir_from_output``
    from the output dir and the optional ``snapshot_dir``.
    """
    analyze_defaults = command_defaults("analyze", config_path=config_path)

    workspace_root = Path(output_dir or analyze_defaults.get("output-dir", "data"))
    target_snapshot = resolve_snapshot_dir_from_output(workspace_root, snapshot_dir)

    chosen_model = model or analyze_defaults.get("model", "gpt-5.4-mini?service_tier=flex")
    chosen_backend = ranking_backend or analyze_defaults.get("ranking-backend", "hybrid")

    return analysis_id_from_snapshot(
        snapshot_dir=target_snapshot,
        model=str(chosen_model),
        ranking_backend=str(chosen_backend),
        suffix=suffix,
    )
|
src/slop_farmer/app/cli.py
CHANGED
|
@@ -13,15 +13,17 @@ from slop_farmer.config import (
|
|
| 13 |
AnalysisOptions,
|
| 14 |
CheckpointImportOptions,
|
| 15 |
DashboardDataOptions,
|
|
|
|
|
|
|
| 16 |
DeployDashboardOptions,
|
| 17 |
-
FullPipelineOptions,
|
| 18 |
MarkdownReportOptions,
|
| 19 |
NewContributorReportOptions,
|
| 20 |
PipelineOptions,
|
| 21 |
PrScopeOptions,
|
| 22 |
PrSearchRefreshOptions,
|
| 23 |
-
|
| 24 |
RepoRef,
|
|
|
|
| 25 |
SnapshotAdoptOptions,
|
| 26 |
)
|
| 27 |
from slop_farmer.reports.duplicate_prs import DEFAULT_DUPLICATE_PR_MODEL
|
|
@@ -29,6 +31,16 @@ from slop_farmer.reports.duplicate_prs import DEFAULT_DUPLICATE_PR_MODEL
|
|
| 29 |
CommandHandler = Callable[[argparse.Namespace, Path | None], None]
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
| 33 |
defaults = _load_parser_defaults(config_path)
|
| 34 |
|
|
@@ -41,6 +53,7 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
|
| 41 |
subparsers = parser.add_subparsers(dest="command", required=True)
|
| 42 |
|
| 43 |
_add_scrape_parser(subparsers, defaults["scrape"])
|
|
|
|
| 44 |
_add_analyze_parser(subparsers, defaults["analyze"])
|
| 45 |
_add_pr_scope_parser(subparsers, defaults["pr-scope"])
|
| 46 |
_add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
|
|
@@ -50,15 +63,17 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
|
| 50 |
_add_pr_search_parser(subparsers, defaults["pr-search"])
|
| 51 |
_add_new_contributor_report_parser(subparsers, defaults["new-contributor-report"])
|
| 52 |
_add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
|
| 53 |
-
|
|
|
|
| 54 |
_add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
|
| 55 |
-
|
| 56 |
return parser
|
| 57 |
|
| 58 |
|
| 59 |
def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
|
| 60 |
commands = (
|
| 61 |
"scrape",
|
|
|
|
| 62 |
"analyze",
|
| 63 |
"import-hf-checkpoint",
|
| 64 |
"pr-scope",
|
|
@@ -66,9 +81,10 @@ def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]
|
|
| 66 |
"adopt-snapshot",
|
| 67 |
"new-contributor-report",
|
| 68 |
"dashboard-data",
|
| 69 |
-
"publish-
|
|
|
|
| 70 |
"deploy-dashboard",
|
| 71 |
-
"
|
| 72 |
)
|
| 73 |
return {command: command_defaults(command, config_path=config_path) for command in commands}
|
| 74 |
|
|
@@ -141,52 +157,110 @@ def _add_scrape_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 141 |
help="Fetch issue timeline events for linkage rows.",
|
| 142 |
)
|
| 143 |
scrape.add_argument(
|
| 144 |
-
"--
|
|
|
|
| 145 |
action="store_true",
|
| 146 |
-
default=
|
| 147 |
-
help="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
)
|
| 149 |
scrape.add_argument(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
"--hf-repo-id",
|
| 151 |
default=defaults.get("hf-repo-id"),
|
| 152 |
-
|
|
|
|
| 153 |
)
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
action="store_true",
|
| 157 |
-
default=bool(defaults.get("
|
| 158 |
-
help="Create the Hub dataset repo as private.",
|
| 159 |
)
|
| 160 |
-
|
| 161 |
"--new-contributor-report",
|
| 162 |
dest="new_contributor_report",
|
| 163 |
action="store_true",
|
| 164 |
-
default=defaults.get("new-contributor-report"),
|
| 165 |
-
help="Generate new contributor dataset/report artifacts. Defaults to enabled when --publish is used.",
|
| 166 |
)
|
| 167 |
-
|
| 168 |
"--no-new-contributor-report",
|
| 169 |
dest="new_contributor_report",
|
| 170 |
action="store_false",
|
| 171 |
-
help="Skip new contributor dataset/report generation.",
|
| 172 |
)
|
| 173 |
-
|
| 174 |
"--new-contributor-window-days",
|
| 175 |
type=int,
|
| 176 |
default=int(defaults.get("new-contributor-window-days", 42)),
|
| 177 |
-
help="Recent public activity window for contributor enrichment.",
|
| 178 |
)
|
| 179 |
-
|
| 180 |
"--new-contributor-max-authors",
|
| 181 |
type=int,
|
| 182 |
default=int(defaults.get("new-contributor-max-authors", 25)),
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
)
|
| 185 |
|
| 186 |
|
| 187 |
def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 188 |
analyze = subparsers.add_parser(
|
| 189 |
-
"analyze",
|
|
|
|
| 190 |
)
|
| 191 |
analyze.add_argument(
|
| 192 |
"--snapshot-dir",
|
|
@@ -200,7 +274,7 @@ def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 200 |
analyze.add_argument(
|
| 201 |
"--hf-repo-id",
|
| 202 |
default=defaults.get("hf-repo-id"),
|
| 203 |
-
help="Analyze a Hugging Face dataset repo by materializing
|
| 204 |
)
|
| 205 |
analyze.add_argument(
|
| 206 |
"--hf-revision",
|
|
@@ -223,7 +297,7 @@ def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 223 |
)
|
| 224 |
analyze.add_argument(
|
| 225 |
"--model",
|
| 226 |
-
default=defaults.get("model", "gpt-5-mini?
|
| 227 |
help="Model string used by fast-agent when enabled.",
|
| 228 |
)
|
| 229 |
analyze.add_argument(
|
|
@@ -232,6 +306,15 @@ def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 232 |
default=int(defaults.get("max-clusters", 10)),
|
| 233 |
help="Maximum number of meta clusters to include in the report.",
|
| 234 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
analyze.add_argument(
|
| 236 |
"--open-prs-only",
|
| 237 |
action="store_true",
|
|
@@ -637,6 +720,61 @@ def _add_pr_search_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 637 |
status.add_argument("--repo", help="Optional repo override.")
|
| 638 |
status.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 639 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
|
| 641 |
def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 642 |
new_contributor = subparsers.add_parser(
|
|
@@ -659,6 +797,24 @@ def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]
|
|
| 659 |
new_contributor.add_argument(
|
| 660 |
"--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
|
| 661 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
new_contributor.add_argument(
|
| 663 |
"--window-days",
|
| 664 |
type=int,
|
|
@@ -690,17 +846,35 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
|
|
| 690 |
dashboard.add_argument(
|
| 691 |
"--analysis-input",
|
| 692 |
type=Path,
|
| 693 |
-
help="Optional analysis report JSON. Defaults to analysis
|
| 694 |
)
|
| 695 |
dashboard.add_argument(
|
| 696 |
"--contributors-input",
|
| 697 |
type=Path,
|
| 698 |
-
help="Optional
|
| 699 |
)
|
| 700 |
dashboard.add_argument(
|
| 701 |
"--pr-scope-input",
|
| 702 |
type=Path,
|
| 703 |
-
help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
)
|
| 705 |
dashboard.add_argument(
|
| 706 |
"--window-days",
|
|
@@ -710,27 +884,77 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
|
|
| 710 |
)
|
| 711 |
|
| 712 |
|
| 713 |
-
def
|
| 714 |
-
|
| 715 |
-
"publish-
|
| 716 |
-
help="Publish
|
| 717 |
)
|
| 718 |
-
|
| 719 |
"--output-dir",
|
| 720 |
type=Path,
|
| 721 |
default=Path(defaults.get("output-dir", "data")),
|
| 722 |
help="Pipeline workspace root containing snapshots/latest.json.",
|
| 723 |
)
|
| 724 |
-
|
| 725 |
-
"--snapshot-dir",
|
|
|
|
|
|
|
| 726 |
)
|
| 727 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
"--hf-repo-id",
|
| 729 |
default=defaults.get("hf-repo-id"),
|
| 730 |
required=defaults.get("hf-repo-id") is None,
|
| 731 |
help="Target Hugging Face dataset repo id.",
|
| 732 |
)
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
"--private-hf-repo",
|
| 735 |
action="store_true",
|
| 736 |
default=bool(defaults.get("private-hf-repo", False)),
|
|
@@ -740,7 +964,8 @@ def _add_publish_snapshot_parser(subparsers: Any, defaults: dict[str, Any]) -> N
|
|
| 740 |
|
| 741 |
def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 742 |
deploy_dashboard = subparsers.add_parser(
|
| 743 |
-
"deploy-dashboard",
|
|
|
|
| 744 |
)
|
| 745 |
deploy_dashboard.add_argument(
|
| 746 |
"--pipeline-data-dir",
|
|
@@ -756,10 +981,37 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
|
|
| 756 |
help="Optional snapshot directory to publish. Defaults to the latest snapshot in --pipeline-data-dir.",
|
| 757 |
)
|
| 758 |
deploy_dashboard.add_argument(
|
| 759 |
-
"--analysis-input",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
)
|
| 761 |
deploy_dashboard.add_argument(
|
| 762 |
-
"--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 763 |
)
|
| 764 |
deploy_dashboard.add_argument(
|
| 765 |
"--refresh-contributors",
|
|
@@ -817,71 +1069,29 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
|
|
| 817 |
)
|
| 818 |
|
| 819 |
|
| 820 |
-
def
|
| 821 |
-
|
| 822 |
-
"
|
| 823 |
-
help="
|
| 824 |
-
)
|
| 825 |
-
full_pipeline.add_argument(
|
| 826 |
-
"--repo",
|
| 827 |
-
default=defaults.get("repo"),
|
| 828 |
-
required=defaults.get("repo") is None,
|
| 829 |
-
help="GitHub repository in owner/name form.",
|
| 830 |
-
)
|
| 831 |
-
full_pipeline.add_argument(
|
| 832 |
-
"--dataset",
|
| 833 |
-
default=defaults.get("dataset"),
|
| 834 |
-
required=defaults.get("dataset") is None,
|
| 835 |
-
help="Target Hugging Face dataset repo id.",
|
| 836 |
)
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
"--
|
| 840 |
type=Path,
|
| 841 |
-
default=Path(defaults.get("
|
| 842 |
-
|
| 843 |
-
full_pipeline.add_argument("--private-hf-repo", action="store_true")
|
| 844 |
-
full_pipeline.add_argument(
|
| 845 |
-
"--ranking-backend",
|
| 846 |
-
choices=("hybrid", "deterministic"),
|
| 847 |
-
default=defaults.get("ranking-backend", "hybrid"),
|
| 848 |
-
)
|
| 849 |
-
full_pipeline.add_argument(
|
| 850 |
-
"--max-clusters", type=int, default=int(defaults.get("max-clusters", 10))
|
| 851 |
-
)
|
| 852 |
-
full_pipeline.add_argument(
|
| 853 |
-
"--fetch-timeline", dest="fetch_timeline", action="store_true", default=True
|
| 854 |
-
)
|
| 855 |
-
full_pipeline.add_argument("--no-fetch-timeline", dest="fetch_timeline", action="store_false")
|
| 856 |
-
full_pipeline.add_argument(
|
| 857 |
-
"--dashboard-window-days",
|
| 858 |
-
type=int,
|
| 859 |
-
default=int(defaults.get("dashboard-window-days", 14)),
|
| 860 |
-
)
|
| 861 |
-
full_pipeline.add_argument(
|
| 862 |
-
"--new-contributor-window-days",
|
| 863 |
-
type=int,
|
| 864 |
-
default=int(defaults.get("new-contributor-window-days", 42)),
|
| 865 |
-
)
|
| 866 |
-
full_pipeline.add_argument(
|
| 867 |
-
"--new-contributor-max-authors",
|
| 868 |
-
type=int,
|
| 869 |
-
default=int(defaults.get("new-contributor-max-authors", 25)),
|
| 870 |
-
help="Contributor enrichment cap override. Full pipeline treats 0 as no cap and currently forces no cap.",
|
| 871 |
-
)
|
| 872 |
-
full_pipeline.add_argument(
|
| 873 |
-
"--issue-max-age-days", type=int, default=defaults.get("issue-max-age-days")
|
| 874 |
)
|
| 875 |
-
|
| 876 |
-
"--
|
|
|
|
|
|
|
| 877 |
)
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
"
|
| 882 |
-
action="store_true",
|
| 883 |
-
default=bool(defaults.get("open-prs-only", False)),
|
| 884 |
)
|
|
|
|
| 885 |
|
| 886 |
|
| 887 |
# Dispatch helpers
|
|
@@ -905,9 +1115,7 @@ def _resolve_hf_inputs(args: argparse.Namespace) -> tuple[str | None, str | None
|
|
| 905 |
def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 906 |
from slop_farmer.app.pipeline import run_pipeline
|
| 907 |
|
| 908 |
-
new_contributor_report = args.new_contributor_report
|
| 909 |
-
if new_contributor_report is None:
|
| 910 |
-
new_contributor_report = bool(args.publish)
|
| 911 |
options = PipelineOptions(
|
| 912 |
repo=RepoRef.parse(args.repo),
|
| 913 |
output_dir=args.output_dir,
|
|
@@ -921,9 +1129,6 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 921 |
max_reviews_per_pr=args.max_reviews_per_pr,
|
| 922 |
max_review_comments_per_pr=args.max_review_comments_per_pr,
|
| 923 |
fetch_timeline=args.fetch_timeline,
|
| 924 |
-
publish=args.publish,
|
| 925 |
-
hf_repo_id=args.hf_repo_id,
|
| 926 |
-
private_hf_repo=args.private_hf_repo,
|
| 927 |
new_contributor_report=new_contributor_report,
|
| 928 |
new_contributor_window_days=args.new_contributor_window_days,
|
| 929 |
new_contributor_max_authors=args.new_contributor_max_authors,
|
|
@@ -933,6 +1138,34 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 933 |
print(run_pipeline(options))
|
| 934 |
|
| 935 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 937 |
from slop_farmer.reports.analysis import run_analysis
|
| 938 |
|
|
@@ -948,6 +1181,7 @@ def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 948 |
ranking_backend=args.ranking_backend,
|
| 949 |
model=args.model,
|
| 950 |
max_clusters=args.max_clusters,
|
|
|
|
| 951 |
open_prs_only=args.open_prs_only,
|
| 952 |
cached_analysis=bool(analyze_defaults.get("cached_analysis", False)),
|
| 953 |
pr_template_cleanup_mode=str(
|
|
@@ -1041,12 +1275,18 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 1041 |
explain_pr_search_pair,
|
| 1042 |
format_pr_search_candidate_clusters,
|
| 1043 |
format_pr_search_cluster,
|
|
|
|
|
|
|
| 1044 |
format_pr_search_pair,
|
| 1045 |
format_pr_search_probe,
|
|
|
|
| 1046 |
format_pr_search_similar,
|
| 1047 |
format_pr_search_status,
|
| 1048 |
get_pr_search_candidate_clusters,
|
| 1049 |
get_pr_search_cluster,
|
|
|
|
|
|
|
|
|
|
| 1050 |
get_pr_search_similar,
|
| 1051 |
get_pr_search_status,
|
| 1052 |
probe_pr_search_github,
|
|
@@ -1140,6 +1380,36 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 1140 |
print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
|
| 1141 |
return
|
| 1142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
|
| 1144 |
|
| 1145 |
|
|
@@ -1181,6 +1451,7 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
|
|
| 1181 |
del config_path
|
| 1182 |
from slop_farmer.reports.new_contributor_report import run_new_contributor_report
|
| 1183 |
|
|
|
|
| 1184 |
print(
|
| 1185 |
run_new_contributor_report(
|
| 1186 |
NewContributorReportOptions(
|
|
@@ -1188,6 +1459,9 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
|
|
| 1188 |
output_dir=args.output_dir,
|
| 1189 |
output=args.output,
|
| 1190 |
json_output=args.json_output,
|
|
|
|
|
|
|
|
|
|
| 1191 |
window_days=args.window_days,
|
| 1192 |
max_authors=args.max_authors,
|
| 1193 |
)
|
|
@@ -1199,6 +1473,7 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
|
|
| 1199 |
from slop_farmer.reports.dashboard import run_dashboard_data
|
| 1200 |
|
| 1201 |
dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
|
|
|
|
| 1202 |
print(
|
| 1203 |
run_dashboard_data(
|
| 1204 |
DashboardDataOptions(
|
|
@@ -1207,6 +1482,9 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
|
|
| 1207 |
analysis_input=args.analysis_input,
|
| 1208 |
contributors_input=args.contributors_input,
|
| 1209 |
pr_scope_input=args.pr_scope_input,
|
|
|
|
|
|
|
|
|
|
| 1210 |
window_days=args.window_days,
|
| 1211 |
snapshot_root=(
|
| 1212 |
Path(dashboard_defaults["snapshot-root"])
|
|
@@ -1222,6 +1500,7 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
|
|
| 1222 |
del config_path
|
| 1223 |
from slop_farmer.app.deploy import run_deploy_dashboard
|
| 1224 |
|
|
|
|
| 1225 |
run_deploy_dashboard(
|
| 1226 |
DeployDashboardOptions(
|
| 1227 |
pipeline_data_dir=args.pipeline_data_dir,
|
|
@@ -1229,6 +1508,10 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
|
|
| 1229 |
snapshot_dir=args.snapshot_dir,
|
| 1230 |
analysis_input=args.analysis_input,
|
| 1231 |
contributors_input=args.contributors_input,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1232 |
refresh_contributors=args.refresh_contributors,
|
| 1233 |
dashboard_window_days=args.dashboard_window_days,
|
| 1234 |
contributor_window_days=args.contributor_window_days,
|
|
@@ -1247,44 +1530,60 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
|
|
| 1247 |
)
|
| 1248 |
|
| 1249 |
|
| 1250 |
-
def
|
| 1251 |
del config_path
|
| 1252 |
-
from slop_farmer.app.
|
| 1253 |
|
| 1254 |
-
|
| 1255 |
-
|
|
|
|
| 1256 |
output_dir=args.output_dir,
|
| 1257 |
-
snapshot_dir=args.snapshot_dir,
|
| 1258 |
hf_repo_id=args.hf_repo_id,
|
| 1259 |
-
|
|
|
|
| 1260 |
)
|
| 1261 |
)
|
|
|
|
| 1262 |
|
| 1263 |
|
| 1264 |
-
def
|
| 1265 |
del config_path
|
| 1266 |
-
from slop_farmer.app.
|
| 1267 |
|
| 1268 |
print(
|
| 1269 |
-
|
| 1270 |
-
|
| 1271 |
-
|
| 1272 |
-
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1288 |
)
|
| 1289 |
)
|
| 1290 |
|
|
@@ -1296,6 +1595,7 @@ def main() -> None:
|
|
| 1296 |
|
| 1297 |
handlers: dict[str, CommandHandler] = {
|
| 1298 |
"scrape": _run_scrape,
|
|
|
|
| 1299 |
"analyze": _run_analyze,
|
| 1300 |
"markdown-report": _run_markdown_report,
|
| 1301 |
"duplicate-prs": _run_duplicate_prs,
|
|
@@ -1306,8 +1606,9 @@ def main() -> None:
|
|
| 1306 |
"new-contributor-report": _run_new_contributor_report,
|
| 1307 |
"dashboard-data": _run_dashboard_data,
|
| 1308 |
"deploy-dashboard": _run_deploy_dashboard,
|
| 1309 |
-
"
|
| 1310 |
-
"
|
|
|
|
| 1311 |
}
|
| 1312 |
handler = handlers.get(args.command)
|
| 1313 |
if handler is None:
|
|
|
|
| 13 |
AnalysisOptions,
|
| 14 |
CheckpointImportOptions,
|
| 15 |
DashboardDataOptions,
|
| 16 |
+
DatasetRefreshOptions,
|
| 17 |
+
DatasetStatusOptions,
|
| 18 |
DeployDashboardOptions,
|
|
|
|
| 19 |
MarkdownReportOptions,
|
| 20 |
NewContributorReportOptions,
|
| 21 |
PipelineOptions,
|
| 22 |
PrScopeOptions,
|
| 23 |
PrSearchRefreshOptions,
|
| 24 |
+
PublishAnalysisArtifactsOptions,
|
| 25 |
RepoRef,
|
| 26 |
+
SaveCacheOptions,
|
| 27 |
SnapshotAdoptOptions,
|
| 28 |
)
|
| 29 |
from slop_farmer.reports.duplicate_prs import DEFAULT_DUPLICATE_PR_MODEL
|
|
|
|
| 31 |
CommandHandler = Callable[[argparse.Namespace, Path | None], None]
|
| 32 |
|
| 33 |
|
| 34 |
+
def _int_at_least(minimum: int) -> Callable[[str], int]:
|
| 35 |
+
def parse(raw: str) -> int:
|
| 36 |
+
value = int(raw)
|
| 37 |
+
if value < minimum:
|
| 38 |
+
raise argparse.ArgumentTypeError(f"expected integer >= {minimum}")
|
| 39 |
+
return value
|
| 40 |
+
|
| 41 |
+
return parse
|
| 42 |
+
|
| 43 |
+
|
| 44 |
def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
| 45 |
defaults = _load_parser_defaults(config_path)
|
| 46 |
|
|
|
|
| 53 |
subparsers = parser.add_subparsers(dest="command", required=True)
|
| 54 |
|
| 55 |
_add_scrape_parser(subparsers, defaults["scrape"])
|
| 56 |
+
_add_refresh_dataset_parser(subparsers, defaults["refresh-dataset"])
|
| 57 |
_add_analyze_parser(subparsers, defaults["analyze"])
|
| 58 |
_add_pr_scope_parser(subparsers, defaults["pr-scope"])
|
| 59 |
_add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
|
|
|
|
| 63 |
_add_pr_search_parser(subparsers, defaults["pr-search"])
|
| 64 |
_add_new_contributor_report_parser(subparsers, defaults["new-contributor-report"])
|
| 65 |
_add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
|
| 66 |
+
_add_publish_analysis_artifacts_parser(subparsers, defaults["publish-analysis-artifacts"])
|
| 67 |
+
_add_save_cache_parser(subparsers, defaults["save-cache"])
|
| 68 |
_add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
|
| 69 |
+
_add_dataset_status_parser(subparsers, defaults["dataset-status"])
|
| 70 |
return parser
|
| 71 |
|
| 72 |
|
| 73 |
def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
|
| 74 |
commands = (
|
| 75 |
"scrape",
|
| 76 |
+
"refresh-dataset",
|
| 77 |
"analyze",
|
| 78 |
"import-hf-checkpoint",
|
| 79 |
"pr-scope",
|
|
|
|
| 81 |
"adopt-snapshot",
|
| 82 |
"new-contributor-report",
|
| 83 |
"dashboard-data",
|
| 84 |
+
"publish-analysis-artifacts",
|
| 85 |
+
"save-cache",
|
| 86 |
"deploy-dashboard",
|
| 87 |
+
"dataset-status",
|
| 88 |
)
|
| 89 |
return {command: command_defaults(command, config_path=config_path) for command in commands}
|
| 90 |
|
|
|
|
| 157 |
help="Fetch issue timeline events for linkage rows.",
|
| 158 |
)
|
| 159 |
scrape.add_argument(
|
| 160 |
+
"--new-contributor-report",
|
| 161 |
+
dest="new_contributor_report",
|
| 162 |
action="store_true",
|
| 163 |
+
default=defaults.get("new-contributor-report"),
|
| 164 |
+
help="Generate new contributor dataset/report artifacts for the local snapshot.",
|
| 165 |
+
)
|
| 166 |
+
scrape.add_argument(
|
| 167 |
+
"--no-new-contributor-report",
|
| 168 |
+
dest="new_contributor_report",
|
| 169 |
+
action="store_false",
|
| 170 |
+
help="Skip new contributor dataset/report generation.",
|
| 171 |
+
)
|
| 172 |
+
scrape.add_argument(
|
| 173 |
+
"--new-contributor-window-days",
|
| 174 |
+
type=int,
|
| 175 |
+
default=int(defaults.get("new-contributor-window-days", 42)),
|
| 176 |
+
help="Recent public activity window for contributor enrichment.",
|
| 177 |
)
|
| 178 |
scrape.add_argument(
|
| 179 |
+
"--new-contributor-max-authors",
|
| 180 |
+
type=int,
|
| 181 |
+
default=int(defaults.get("new-contributor-max-authors", 25)),
|
| 182 |
+
help="Maximum number of contributors to include in the new contributor report. Use 0 for no cap.",
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def _add_refresh_dataset_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 187 |
+
refresh = subparsers.add_parser(
|
| 188 |
+
"refresh-dataset",
|
| 189 |
+
help="Refresh the canonical Hugging Face dataset repo from remote watermark state.",
|
| 190 |
+
)
|
| 191 |
+
refresh.add_argument(
|
| 192 |
+
"--repo",
|
| 193 |
+
default=defaults.get("repo", "huggingface/transformers"),
|
| 194 |
+
help="GitHub repository in owner/name form.",
|
| 195 |
+
)
|
| 196 |
+
refresh.add_argument(
|
| 197 |
"--hf-repo-id",
|
| 198 |
default=defaults.get("hf-repo-id"),
|
| 199 |
+
required=defaults.get("hf-repo-id") is None,
|
| 200 |
+
help="Canonical Hugging Face dataset repo id to refresh.",
|
| 201 |
)
|
| 202 |
+
refresh.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
|
| 203 |
+
refresh.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
|
| 204 |
+
refresh.add_argument(
|
| 205 |
+
"--max-issue-comments", type=int, default=defaults.get("max-issue-comments")
|
| 206 |
+
)
|
| 207 |
+
refresh.add_argument(
|
| 208 |
+
"--max-reviews-per-pr", type=int, default=defaults.get("max-reviews-per-pr")
|
| 209 |
+
)
|
| 210 |
+
refresh.add_argument(
|
| 211 |
+
"--max-review-comments-per-pr",
|
| 212 |
+
type=int,
|
| 213 |
+
default=defaults.get("max-review-comments-per-pr"),
|
| 214 |
+
)
|
| 215 |
+
refresh.add_argument(
|
| 216 |
+
"--fetch-timeline",
|
| 217 |
action="store_true",
|
| 218 |
+
default=bool(defaults.get("fetch-timeline", False)),
|
|
|
|
| 219 |
)
|
| 220 |
+
refresh.add_argument(
|
| 221 |
"--new-contributor-report",
|
| 222 |
dest="new_contributor_report",
|
| 223 |
action="store_true",
|
| 224 |
+
default=bool(defaults.get("new-contributor-report", True)),
|
|
|
|
| 225 |
)
|
| 226 |
+
refresh.add_argument(
|
| 227 |
"--no-new-contributor-report",
|
| 228 |
dest="new_contributor_report",
|
| 229 |
action="store_false",
|
|
|
|
| 230 |
)
|
| 231 |
+
refresh.add_argument(
|
| 232 |
"--new-contributor-window-days",
|
| 233 |
type=int,
|
| 234 |
default=int(defaults.get("new-contributor-window-days", 42)),
|
|
|
|
| 235 |
)
|
| 236 |
+
refresh.add_argument(
|
| 237 |
"--new-contributor-max-authors",
|
| 238 |
type=int,
|
| 239 |
default=int(defaults.get("new-contributor-max-authors", 25)),
|
| 240 |
+
)
|
| 241 |
+
refresh.add_argument("--http-timeout", type=int, default=300)
|
| 242 |
+
refresh.add_argument("--http-max-retries", type=int, default=8)
|
| 243 |
+
refresh.add_argument("--checkpoint-every-comments", type=int, default=1000)
|
| 244 |
+
refresh.add_argument("--checkpoint-every-prs", type=int, default=25)
|
| 245 |
+
refresh.add_argument(
|
| 246 |
+
"--private-hf-repo",
|
| 247 |
+
dest="private_hf_repo",
|
| 248 |
+
action="store_true",
|
| 249 |
+
default=bool(defaults.get("private-hf-repo", False)),
|
| 250 |
+
help="Create the target dataset repo as private if needed.",
|
| 251 |
+
)
|
| 252 |
+
refresh.add_argument(
|
| 253 |
+
"--private",
|
| 254 |
+
dest="private_hf_repo",
|
| 255 |
+
action="store_true",
|
| 256 |
+
help=argparse.SUPPRESS,
|
| 257 |
)
|
| 258 |
|
| 259 |
|
| 260 |
def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 261 |
analyze = subparsers.add_parser(
|
| 262 |
+
"analyze",
|
| 263 |
+
help="Analyze a snapshot and write a local JSON report. Canonical publication is separate.",
|
| 264 |
)
|
| 265 |
analyze.add_argument(
|
| 266 |
"--snapshot-dir",
|
|
|
|
| 274 |
analyze.add_argument(
|
| 275 |
"--hf-repo-id",
|
| 276 |
default=defaults.get("hf-repo-id"),
|
| 277 |
+
help="Analyze a canonical Hugging Face dataset repo by materializing a self-consistent published snapshot locally.",
|
| 278 |
)
|
| 279 |
analyze.add_argument(
|
| 280 |
"--hf-revision",
|
|
|
|
| 297 |
)
|
| 298 |
analyze.add_argument(
|
| 299 |
"--model",
|
| 300 |
+
default=defaults.get("model", "gpt-5.4-mini?service_tier=flex"),
|
| 301 |
help="Model string used by fast-agent when enabled.",
|
| 302 |
)
|
| 303 |
analyze.add_argument(
|
|
|
|
| 306 |
default=int(defaults.get("max-clusters", 10)),
|
| 307 |
help="Maximum number of meta clusters to include in the report.",
|
| 308 |
)
|
| 309 |
+
analyze.add_argument(
|
| 310 |
+
"--hybrid-llm-concurrency",
|
| 311 |
+
type=_int_at_least(1),
|
| 312 |
+
default=int(defaults.get("hybrid-llm-concurrency", 1)),
|
| 313 |
+
help=(
|
| 314 |
+
"Maximum number of hybrid LLM review units to run at once. "
|
| 315 |
+
"Use 1 to minimize provider pressure."
|
| 316 |
+
),
|
| 317 |
+
)
|
| 318 |
analyze.add_argument(
|
| 319 |
"--open-prs-only",
|
| 320 |
action="store_true",
|
|
|
|
| 720 |
status.add_argument("--repo", help="Optional repo override.")
|
| 721 |
status.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 722 |
|
| 723 |
+
contributor = pr_search_subparsers.add_parser(
|
| 724 |
+
"contributor", help="Show indexed contributor summary for one author login."
|
| 725 |
+
)
|
| 726 |
+
contributor.add_argument("login", help="GitHub author login to query.")
|
| 727 |
+
contributor.add_argument(
|
| 728 |
+
"--db",
|
| 729 |
+
type=Path,
|
| 730 |
+
default=Path(defaults["db"]) if defaults.get("db") else None,
|
| 731 |
+
help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
|
| 732 |
+
)
|
| 733 |
+
contributor.add_argument(
|
| 734 |
+
"--output-dir",
|
| 735 |
+
type=Path,
|
| 736 |
+
default=Path(defaults.get("output-dir", "data")),
|
| 737 |
+
)
|
| 738 |
+
contributor.add_argument("--repo", help="Optional repo override.")
|
| 739 |
+
contributor.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 740 |
+
|
| 741 |
+
contributor_prs = pr_search_subparsers.add_parser(
|
| 742 |
+
"contributor-prs", help="List indexed PRs for one contributor login."
|
| 743 |
+
)
|
| 744 |
+
contributor_prs.add_argument("login", help="GitHub author login to query.")
|
| 745 |
+
contributor_prs.add_argument(
|
| 746 |
+
"--db",
|
| 747 |
+
type=Path,
|
| 748 |
+
default=Path(defaults["db"]) if defaults.get("db") else None,
|
| 749 |
+
help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
|
| 750 |
+
)
|
| 751 |
+
contributor_prs.add_argument(
|
| 752 |
+
"--output-dir",
|
| 753 |
+
type=Path,
|
| 754 |
+
default=Path(defaults.get("output-dir", "data")),
|
| 755 |
+
)
|
| 756 |
+
contributor_prs.add_argument("--repo", help="Optional repo override.")
|
| 757 |
+
contributor_prs.add_argument("--limit", type=int, default=20, help="Maximum rows to show.")
|
| 758 |
+
contributor_prs.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 759 |
+
|
| 760 |
+
pr_contributor = pr_search_subparsers.add_parser(
|
| 761 |
+
"pr-contributor", help="Show contributor summary for the author of one indexed PR."
|
| 762 |
+
)
|
| 763 |
+
pr_contributor.add_argument("pr_number", type=int, help="Pull request number to query.")
|
| 764 |
+
pr_contributor.add_argument(
|
| 765 |
+
"--db",
|
| 766 |
+
type=Path,
|
| 767 |
+
default=Path(defaults["db"]) if defaults.get("db") else None,
|
| 768 |
+
help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
|
| 769 |
+
)
|
| 770 |
+
pr_contributor.add_argument(
|
| 771 |
+
"--output-dir",
|
| 772 |
+
type=Path,
|
| 773 |
+
default=Path(defaults.get("output-dir", "data")),
|
| 774 |
+
)
|
| 775 |
+
pr_contributor.add_argument("--repo", help="Optional repo override.")
|
| 776 |
+
pr_contributor.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 777 |
+
|
| 778 |
|
| 779 |
def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 780 |
new_contributor = subparsers.add_parser(
|
|
|
|
| 797 |
new_contributor.add_argument(
|
| 798 |
"--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
|
| 799 |
)
|
| 800 |
+
new_contributor.add_argument(
|
| 801 |
+
"--hf-repo-id",
|
| 802 |
+
default=defaults.get("hf-repo-id"),
|
| 803 |
+
help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
|
| 804 |
+
)
|
| 805 |
+
new_contributor.add_argument(
|
| 806 |
+
"--hf-revision",
|
| 807 |
+
default=defaults.get("hf-revision"),
|
| 808 |
+
help="Optional Hub revision for metadata and README download.",
|
| 809 |
+
)
|
| 810 |
+
new_contributor.add_argument(
|
| 811 |
+
"--hf-materialize-dir",
|
| 812 |
+
type=Path,
|
| 813 |
+
default=Path(defaults["hf-materialize-dir"])
|
| 814 |
+
if defaults.get("hf-materialize-dir")
|
| 815 |
+
else None,
|
| 816 |
+
help="Optional local directory used when materializing an HF dataset snapshot.",
|
| 817 |
+
)
|
| 818 |
new_contributor.add_argument(
|
| 819 |
"--window-days",
|
| 820 |
type=int,
|
|
|
|
| 846 |
dashboard.add_argument(
|
| 847 |
"--analysis-input",
|
| 848 |
type=Path,
|
| 849 |
+
help="Optional analysis report JSON override. Defaults to canonical published current analysis when available, otherwise falls back to snapshot-local analysis files.",
|
| 850 |
)
|
| 851 |
dashboard.add_argument(
|
| 852 |
"--contributors-input",
|
| 853 |
type=Path,
|
| 854 |
+
help="Optional contributor report JSON override. Defaults to the materialized snapshot's new-contributors-report.json.",
|
| 855 |
)
|
| 856 |
dashboard.add_argument(
|
| 857 |
"--pr-scope-input",
|
| 858 |
type=Path,
|
| 859 |
+
help="Optional PR scope cluster JSON override. Defaults to the materialized snapshot's pr-scope-clusters.json.",
|
| 860 |
+
)
|
| 861 |
+
dashboard.add_argument(
|
| 862 |
+
"--hf-repo-id",
|
| 863 |
+
default=defaults.get("hf-repo-id"),
|
| 864 |
+
help="Materialize the canonical Hugging Face dataset repo instead of using the latest local snapshot.",
|
| 865 |
+
)
|
| 866 |
+
dashboard.add_argument(
|
| 867 |
+
"--hf-revision",
|
| 868 |
+
default=defaults.get("hf-revision"),
|
| 869 |
+
help="Optional Hub revision for metadata and README download.",
|
| 870 |
+
)
|
| 871 |
+
dashboard.add_argument(
|
| 872 |
+
"--hf-materialize-dir",
|
| 873 |
+
type=Path,
|
| 874 |
+
default=Path(defaults["hf-materialize-dir"])
|
| 875 |
+
if defaults.get("hf-materialize-dir")
|
| 876 |
+
else None,
|
| 877 |
+
help="Optional local directory used when materializing an HF dataset snapshot.",
|
| 878 |
)
|
| 879 |
dashboard.add_argument(
|
| 880 |
"--window-days",
|
|
|
|
| 884 |
)
|
| 885 |
|
| 886 |
|
| 887 |
+
def _add_publish_analysis_artifacts_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 888 |
+
publish_analysis = subparsers.add_parser(
|
| 889 |
+
"publish-analysis-artifacts",
|
| 890 |
+
help="Publish archived and optional canonical hybrid analysis artifacts to a dataset repo.",
|
| 891 |
)
|
| 892 |
+
publish_analysis.add_argument(
|
| 893 |
"--output-dir",
|
| 894 |
type=Path,
|
| 895 |
default=Path(defaults.get("output-dir", "data")),
|
| 896 |
help="Pipeline workspace root containing snapshots/latest.json.",
|
| 897 |
)
|
| 898 |
+
publish_analysis.add_argument(
|
| 899 |
+
"--snapshot-dir",
|
| 900 |
+
type=Path,
|
| 901 |
+
help="Optional explicit snapshot directory containing analysis-report-hybrid.json.",
|
| 902 |
)
|
| 903 |
+
publish_analysis.add_argument(
|
| 904 |
+
"--analysis-input",
|
| 905 |
+
type=Path,
|
| 906 |
+
help="Optional explicit hybrid analysis report JSON to publish instead of snapshot-dir discovery.",
|
| 907 |
+
)
|
| 908 |
+
publish_analysis.add_argument(
|
| 909 |
"--hf-repo-id",
|
| 910 |
default=defaults.get("hf-repo-id"),
|
| 911 |
required=defaults.get("hf-repo-id") is None,
|
| 912 |
help="Target Hugging Face dataset repo id.",
|
| 913 |
)
|
| 914 |
+
publish_analysis.add_argument("--analysis-id", required=True, help="Immutable analysis run id.")
|
| 915 |
+
publish_analysis.add_argument(
|
| 916 |
+
"--canonical",
|
| 917 |
+
action="store_true",
|
| 918 |
+
default=bool(defaults.get("canonical", False)),
|
| 919 |
+
help="Also update the stable analysis/current canonical alias.",
|
| 920 |
+
)
|
| 921 |
+
publish_analysis.add_argument(
|
| 922 |
+
"--save-cache",
|
| 923 |
+
action="store_true",
|
| 924 |
+
default=bool(defaults.get("save-cache", False)),
|
| 925 |
+
help="Also upload snapshot-local analysis-state/ as mutable operational cache at repo-root analysis-state/.",
|
| 926 |
+
)
|
| 927 |
+
publish_analysis.add_argument(
|
| 928 |
+
"--private-hf-repo",
|
| 929 |
+
action="store_true",
|
| 930 |
+
default=bool(defaults.get("private-hf-repo", False)),
|
| 931 |
+
help="Create the target dataset repo as private if needed.",
|
| 932 |
+
)
|
| 933 |
+
|
| 934 |
+
|
| 935 |
+
def _add_save_cache_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 936 |
+
save_cache = subparsers.add_parser(
|
| 937 |
+
"save-cache",
|
| 938 |
+
help="Upload snapshot-local analysis-state/ as mutable operational cache to a dataset repo.",
|
| 939 |
+
)
|
| 940 |
+
save_cache.add_argument(
|
| 941 |
+
"--output-dir",
|
| 942 |
+
type=Path,
|
| 943 |
+
default=Path(defaults.get("output-dir", "data")),
|
| 944 |
+
help="Pipeline workspace root containing snapshots/latest.json.",
|
| 945 |
+
)
|
| 946 |
+
save_cache.add_argument(
|
| 947 |
+
"--snapshot-dir",
|
| 948 |
+
type=Path,
|
| 949 |
+
help="Optional explicit snapshot directory containing analysis-state/.",
|
| 950 |
+
)
|
| 951 |
+
save_cache.add_argument(
|
| 952 |
+
"--hf-repo-id",
|
| 953 |
+
default=defaults.get("hf-repo-id"),
|
| 954 |
+
required=defaults.get("hf-repo-id") is None,
|
| 955 |
+
help="Target Hugging Face dataset repo id.",
|
| 956 |
+
)
|
| 957 |
+
save_cache.add_argument(
|
| 958 |
"--private-hf-repo",
|
| 959 |
action="store_true",
|
| 960 |
default=bool(defaults.get("private-hf-repo", False)),
|
|
|
|
| 964 |
|
| 965 |
def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 966 |
deploy_dashboard = subparsers.add_parser(
|
| 967 |
+
"deploy-dashboard",
|
| 968 |
+
help="Build and publish the static dashboard to a Hugging Face Space from a materialized dataset view.",
|
| 969 |
)
|
| 970 |
deploy_dashboard.add_argument(
|
| 971 |
"--pipeline-data-dir",
|
|
|
|
| 981 |
help="Optional snapshot directory to publish. Defaults to the latest snapshot in --pipeline-data-dir.",
|
| 982 |
)
|
| 983 |
deploy_dashboard.add_argument(
|
| 984 |
+
"--analysis-input",
|
| 985 |
+
type=Path,
|
| 986 |
+
help="Optional analysis report JSON override. Omit to prefer canonical published current analysis when available.",
|
| 987 |
+
)
|
| 988 |
+
deploy_dashboard.add_argument(
|
| 989 |
+
"--contributors-input",
|
| 990 |
+
type=Path,
|
| 991 |
+
help="Optional contributor report JSON override.",
|
| 992 |
+
)
|
| 993 |
+
deploy_dashboard.add_argument(
|
| 994 |
+
"--pr-scope-input",
|
| 995 |
+
type=Path,
|
| 996 |
+
help="Optional PR scope cluster JSON override.",
|
| 997 |
+
)
|
| 998 |
+
deploy_dashboard.add_argument(
|
| 999 |
+
"--hf-repo-id",
|
| 1000 |
+
default=defaults.get("hf-repo-id"),
|
| 1001 |
+
help="Materialize the canonical Hugging Face dataset repo instead of using the latest local snapshot.",
|
| 1002 |
)
|
| 1003 |
deploy_dashboard.add_argument(
|
| 1004 |
+
"--hf-revision",
|
| 1005 |
+
default=defaults.get("hf-revision"),
|
| 1006 |
+
help="Optional Hub revision for metadata and README download.",
|
| 1007 |
+
)
|
| 1008 |
+
deploy_dashboard.add_argument(
|
| 1009 |
+
"--hf-materialize-dir",
|
| 1010 |
+
type=Path,
|
| 1011 |
+
default=Path(defaults["hf-materialize-dir"])
|
| 1012 |
+
if defaults.get("hf-materialize-dir")
|
| 1013 |
+
else None,
|
| 1014 |
+
help="Optional local directory used when materializing an HF dataset snapshot.",
|
| 1015 |
)
|
| 1016 |
deploy_dashboard.add_argument(
|
| 1017 |
"--refresh-contributors",
|
|
|
|
| 1069 |
)
|
| 1070 |
|
| 1071 |
|
| 1072 |
+
def _add_dataset_status_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 1073 |
+
dataset_status = subparsers.add_parser(
|
| 1074 |
+
"dataset-status",
|
| 1075 |
+
help="Inspect canonical dataset freshness and the local latest pointer.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1076 |
)
|
| 1077 |
+
dataset_status.add_argument("--repo", default=defaults.get("repo"))
|
| 1078 |
+
dataset_status.add_argument(
|
| 1079 |
+
"--output-dir",
|
| 1080 |
type=Path,
|
| 1081 |
+
default=Path(defaults.get("output-dir", "data")),
|
| 1082 |
+
help="Local workspace root containing snapshots/latest.json.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1083 |
)
|
| 1084 |
+
dataset_status.add_argument(
|
| 1085 |
+
"--hf-repo-id",
|
| 1086 |
+
default=defaults.get("hf-repo-id"),
|
| 1087 |
+
help="Canonical Hugging Face dataset repo id to inspect.",
|
| 1088 |
)
|
| 1089 |
+
dataset_status.add_argument(
|
| 1090 |
+
"--hf-revision",
|
| 1091 |
+
default=defaults.get("hf-revision"),
|
| 1092 |
+
help="Optional Hub revision for metadata and README download.",
|
|
|
|
|
|
|
| 1093 |
)
|
| 1094 |
+
dataset_status.add_argument("--json", action="store_true", help="Emit machine-readable JSON.")
|
| 1095 |
|
| 1096 |
|
| 1097 |
# Dispatch helpers
|
|
|
|
| 1115 |
def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1116 |
from slop_farmer.app.pipeline import run_pipeline
|
| 1117 |
|
| 1118 |
+
new_contributor_report = bool(args.new_contributor_report)
|
|
|
|
|
|
|
| 1119 |
options = PipelineOptions(
|
| 1120 |
repo=RepoRef.parse(args.repo),
|
| 1121 |
output_dir=args.output_dir,
|
|
|
|
| 1129 |
max_reviews_per_pr=args.max_reviews_per_pr,
|
| 1130 |
max_review_comments_per_pr=args.max_review_comments_per_pr,
|
| 1131 |
fetch_timeline=args.fetch_timeline,
|
|
|
|
|
|
|
|
|
|
| 1132 |
new_contributor_report=new_contributor_report,
|
| 1133 |
new_contributor_window_days=args.new_contributor_window_days,
|
| 1134 |
new_contributor_max_authors=args.new_contributor_max_authors,
|
|
|
|
| 1138 |
print(run_pipeline(options))
|
| 1139 |
|
| 1140 |
|
| 1141 |
+
def _run_refresh_dataset(args: argparse.Namespace, config_path: Path | None) -> None:
    """Run the ``refresh-dataset`` command and print its result as indented JSON.

    Args:
        args: Parsed command-line namespace for the ``refresh-dataset`` subcommand.
        config_path: Config file location handed to ``command_defaults``; may be None.

    Cluster suppression rules are read from the command defaults under
    ``"cluster-suppression-rules"`` rather than from ``args``.
    """
    from slop_farmer.app.dataset_refresh import run_dataset_refresh

    config_defaults = command_defaults("refresh-dataset", config_path=config_path)

    # Namespace attributes forwarded unchanged to the option of the same name.
    forwarded = (
        "hf_repo_id",
        "private_hf_repo",
        "max_issues",
        "max_prs",
        "max_issue_comments",
        "max_reviews_per_pr",
        "max_review_comments_per_pr",
        "fetch_timeline",
        "new_contributor_report",
        "new_contributor_window_days",
        "new_contributor_max_authors",
        "http_timeout",
        "http_max_retries",
        "checkpoint_every_comments",
        "checkpoint_every_prs",
    )
    options = DatasetRefreshOptions(
        repo=RepoRef.parse(args.repo),
        **{field: getattr(args, field) for field in forwarded},
        cluster_suppression_rules=tuple(config_defaults.get("cluster-suppression-rules", ())),
    )
    outcome = run_dataset_refresh(options)
    print(json.dumps(outcome, indent=2))
|
| 1167 |
+
|
| 1168 |
+
|
| 1169 |
def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1170 |
from slop_farmer.reports.analysis import run_analysis
|
| 1171 |
|
|
|
|
| 1181 |
ranking_backend=args.ranking_backend,
|
| 1182 |
model=args.model,
|
| 1183 |
max_clusters=args.max_clusters,
|
| 1184 |
+
hybrid_llm_concurrency=args.hybrid_llm_concurrency,
|
| 1185 |
open_prs_only=args.open_prs_only,
|
| 1186 |
cached_analysis=bool(analyze_defaults.get("cached_analysis", False)),
|
| 1187 |
pr_template_cleanup_mode=str(
|
|
|
|
| 1275 |
explain_pr_search_pair,
|
| 1276 |
format_pr_search_candidate_clusters,
|
| 1277 |
format_pr_search_cluster,
|
| 1278 |
+
format_pr_search_contributor,
|
| 1279 |
+
format_pr_search_contributor_pulls,
|
| 1280 |
format_pr_search_pair,
|
| 1281 |
format_pr_search_probe,
|
| 1282 |
+
format_pr_search_pull_contributor,
|
| 1283 |
format_pr_search_similar,
|
| 1284 |
format_pr_search_status,
|
| 1285 |
get_pr_search_candidate_clusters,
|
| 1286 |
get_pr_search_cluster,
|
| 1287 |
+
get_pr_search_contributor,
|
| 1288 |
+
get_pr_search_contributor_pulls,
|
| 1289 |
+
get_pr_search_pull_contributor,
|
| 1290 |
get_pr_search_similar,
|
| 1291 |
get_pr_search_status,
|
| 1292 |
probe_pr_search_github,
|
|
|
|
| 1380 |
print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
|
| 1381 |
return
|
| 1382 |
|
| 1383 |
+
if args.pr_search_command == "contributor":
|
| 1384 |
+
result = get_pr_search_contributor(db_path, author_login=args.login, repo=args.repo)
|
| 1385 |
+
print(json.dumps(result, indent=2) if args.json else format_pr_search_contributor(result))
|
| 1386 |
+
return
|
| 1387 |
+
|
| 1388 |
+
if args.pr_search_command == "contributor-prs":
|
| 1389 |
+
result = get_pr_search_contributor_pulls(
|
| 1390 |
+
db_path,
|
| 1391 |
+
author_login=args.login,
|
| 1392 |
+
repo=args.repo,
|
| 1393 |
+
limit=args.limit,
|
| 1394 |
+
)
|
| 1395 |
+
print(
|
| 1396 |
+
json.dumps(result, indent=2)
|
| 1397 |
+
if args.json
|
| 1398 |
+
else format_pr_search_contributor_pulls(result)
|
| 1399 |
+
)
|
| 1400 |
+
return
|
| 1401 |
+
|
| 1402 |
+
if args.pr_search_command == "pr-contributor":
|
| 1403 |
+
result = get_pr_search_pull_contributor(
|
| 1404 |
+
db_path,
|
| 1405 |
+
pr_number=args.pr_number,
|
| 1406 |
+
repo=args.repo,
|
| 1407 |
+
)
|
| 1408 |
+
print(
|
| 1409 |
+
json.dumps(result, indent=2) if args.json else format_pr_search_pull_contributor(result)
|
| 1410 |
+
)
|
| 1411 |
+
return
|
| 1412 |
+
|
| 1413 |
raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
|
| 1414 |
|
| 1415 |
|
|
|
|
| 1451 |
del config_path
|
| 1452 |
from slop_farmer.reports.new_contributor_report import run_new_contributor_report
|
| 1453 |
|
| 1454 |
+
hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
|
| 1455 |
print(
|
| 1456 |
run_new_contributor_report(
|
| 1457 |
NewContributorReportOptions(
|
|
|
|
| 1459 |
output_dir=args.output_dir,
|
| 1460 |
output=args.output,
|
| 1461 |
json_output=args.json_output,
|
| 1462 |
+
hf_repo_id=hf_repo_id,
|
| 1463 |
+
hf_revision=hf_revision,
|
| 1464 |
+
hf_materialize_dir=hf_materialize_dir,
|
| 1465 |
window_days=args.window_days,
|
| 1466 |
max_authors=args.max_authors,
|
| 1467 |
)
|
|
|
|
| 1473 |
from slop_farmer.reports.dashboard import run_dashboard_data
|
| 1474 |
|
| 1475 |
dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
|
| 1476 |
+
hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
|
| 1477 |
print(
|
| 1478 |
run_dashboard_data(
|
| 1479 |
DashboardDataOptions(
|
|
|
|
| 1482 |
analysis_input=args.analysis_input,
|
| 1483 |
contributors_input=args.contributors_input,
|
| 1484 |
pr_scope_input=args.pr_scope_input,
|
| 1485 |
+
hf_repo_id=hf_repo_id,
|
| 1486 |
+
hf_revision=hf_revision,
|
| 1487 |
+
hf_materialize_dir=hf_materialize_dir,
|
| 1488 |
window_days=args.window_days,
|
| 1489 |
snapshot_root=(
|
| 1490 |
Path(dashboard_defaults["snapshot-root"])
|
|
|
|
| 1500 |
del config_path
|
| 1501 |
from slop_farmer.app.deploy import run_deploy_dashboard
|
| 1502 |
|
| 1503 |
+
hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
|
| 1504 |
run_deploy_dashboard(
|
| 1505 |
DeployDashboardOptions(
|
| 1506 |
pipeline_data_dir=args.pipeline_data_dir,
|
|
|
|
| 1508 |
snapshot_dir=args.snapshot_dir,
|
| 1509 |
analysis_input=args.analysis_input,
|
| 1510 |
contributors_input=args.contributors_input,
|
| 1511 |
+
pr_scope_input=args.pr_scope_input,
|
| 1512 |
+
hf_repo_id=hf_repo_id,
|
| 1513 |
+
hf_revision=hf_revision,
|
| 1514 |
+
hf_materialize_dir=hf_materialize_dir,
|
| 1515 |
refresh_contributors=args.refresh_contributors,
|
| 1516 |
dashboard_window_days=args.dashboard_window_days,
|
| 1517 |
contributor_window_days=args.contributor_window_days,
|
|
|
|
| 1530 |
)
|
| 1531 |
|
| 1532 |
|
| 1533 |
+
def _run_dataset_status(args: argparse.Namespace, config_path: Path | None) -> None:
    """Print the dataset status for the ``dataset-status`` command.

    Builds a ``DatasetStatusOptions`` from the parsed CLI arguments, fetches
    the status, and prints it either as indented JSON (when ``args.json`` is
    truthy) or as the human-readable text produced by
    ``format_dataset_status``.

    ``config_path`` is accepted for handler-signature compatibility and is
    discarded immediately.
    """
    del config_path
    # Imported lazily, inside the handler, as in the original block.
    from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status

    status_options = DatasetStatusOptions(
        repo=args.repo,
        output_dir=args.output_dir,
        hf_repo_id=args.hf_repo_id,
        hf_revision=args.hf_revision,
        json_output=args.json,
    )
    status = get_dataset_status(status_options)
    if args.json:
        rendered = json.dumps(status, indent=2)
    else:
        rendered = format_dataset_status(status)
    print(rendered)
|
| 1547 |
|
| 1548 |
|
| 1549 |
+
def _run_publish_analysis_artifacts(args: argparse.Namespace, config_path: Path | None) -> None:
    """Run the ``publish-analysis-artifacts`` command and print its result.

    The parsed CLI arguments are copied into a
    ``PublishAnalysisArtifactsOptions``; whatever
    ``run_publish_analysis_artifacts`` returns is printed as JSON with a
    two-space indent.

    ``config_path`` is accepted for handler-signature compatibility and is
    discarded immediately.
    """
    del config_path
    # Imported lazily, inside the handler, as in the original block.
    from slop_farmer.app.publish_analysis import run_publish_analysis_artifacts

    publish_options = PublishAnalysisArtifactsOptions(
        output_dir=args.output_dir,
        snapshot_dir=args.snapshot_dir,
        analysis_input=args.analysis_input,
        hf_repo_id=args.hf_repo_id,
        analysis_id=args.analysis_id,
        canonical=args.canonical,
        save_cache=args.save_cache,
        private_hf_repo=args.private_hf_repo,
    )
    outcome = run_publish_analysis_artifacts(publish_options)
    print(json.dumps(outcome, indent=2))
|
| 1570 |
+
|
| 1571 |
+
|
| 1572 |
+
def _run_save_cache(args: argparse.Namespace, config_path: Path | None) -> None:
    """Run the ``save-cache`` command and print its result as JSON.

    The parsed CLI arguments are copied into a ``SaveCacheOptions``; whatever
    ``run_save_cache`` returns is printed with a two-space indent.

    ``config_path`` is accepted for handler-signature compatibility and is
    discarded immediately.
    """
    del config_path
    # Imported lazily, inside the handler, as in the original block.
    from slop_farmer.app.save_cache import run_save_cache

    cache_options = SaveCacheOptions(
        output_dir=args.output_dir,
        snapshot_dir=args.snapshot_dir,
        hf_repo_id=args.hf_repo_id,
        private_hf_repo=args.private_hf_repo,
    )
    outcome = run_save_cache(cache_options)
    print(json.dumps(outcome, indent=2))
|
| 1589 |
|
|
|
|
| 1595 |
|
| 1596 |
handlers: dict[str, CommandHandler] = {
|
| 1597 |
"scrape": _run_scrape,
|
| 1598 |
+
"refresh-dataset": _run_refresh_dataset,
|
| 1599 |
"analyze": _run_analyze,
|
| 1600 |
"markdown-report": _run_markdown_report,
|
| 1601 |
"duplicate-prs": _run_duplicate_prs,
|
|
|
|
| 1606 |
"new-contributor-report": _run_new_contributor_report,
|
| 1607 |
"dashboard-data": _run_dashboard_data,
|
| 1608 |
"deploy-dashboard": _run_deploy_dashboard,
|
| 1609 |
+
"dataset-status": _run_dataset_status,
|
| 1610 |
+
"publish-analysis-artifacts": _run_publish_analysis_artifacts,
|
| 1611 |
+
"save-cache": _run_save_cache,
|
| 1612 |
}
|
| 1613 |
handler = handlers.get(args.command)
|
| 1614 |
if handler is None:
|
src/slop_farmer/app/dataset_refresh.py
CHANGED
|
@@ -17,6 +17,7 @@ from slop_farmer.app_config import command_defaults, extract_cli_config_path
|
|
| 17 |
from slop_farmer.config import (
|
| 18 |
DatasetRefreshOptions,
|
| 19 |
NewContributorReportOptions,
|
|
|
|
| 20 |
RepoRef,
|
| 21 |
resolve_github_token,
|
| 22 |
)
|
|
@@ -48,6 +49,7 @@ from slop_farmer.data.parquet_io import (
|
|
| 48 |
write_text,
|
| 49 |
)
|
| 50 |
from slop_farmer.reports.new_contributor_report import run_new_contributor_report
|
|
|
|
| 51 |
|
| 52 |
PRIMARY_KEYS: dict[str, tuple[str, ...]] = {
|
| 53 |
"issues": ("github_id",),
|
|
@@ -318,6 +320,9 @@ def _build_argument_parser(*, config_path: Path | None = None) -> argparse.Argum
|
|
| 318 |
default=bool(defaults.get("private-hf-repo", False)),
|
| 319 |
)
|
| 320 |
parser.add_argument("--private", dest="private_hf_repo", action="store_true")
|
|
|
|
|
|
|
|
|
|
| 321 |
return parser
|
| 322 |
|
| 323 |
|
|
@@ -872,7 +877,7 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
|
|
| 872 |
table_name: merge_rows(table_name, previous_tables[table_name], delta_rows)
|
| 873 |
for table_name, delta_rows in delta_tables.items()
|
| 874 |
}
|
| 875 |
-
manifest = {
|
| 876 |
"repo": repo_slug,
|
| 877 |
"snapshot_id": sid,
|
| 878 |
"crawl_started_at": crawl_started_at,
|
|
@@ -918,8 +923,27 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
|
|
| 918 |
)
|
| 919 |
write_parquet(issue_comment_rows, output_root / "issue_comments.parquet", "comments")
|
| 920 |
write_parquet(pr_comment_rows, output_root / "pr_comments.parquet", "comments")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 921 |
if options.new_contributor_report:
|
| 922 |
-
write_json(manifest, output_root / "manifest.json")
|
| 923 |
log("Generating new contributor dataset/report artifacts")
|
| 924 |
run_new_contributor_report(
|
| 925 |
NewContributorReportOptions(
|
|
@@ -937,11 +961,14 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
|
|
| 937 |
manifest["counts"]["new_contributors"] = len(
|
| 938 |
read_parquet_rows(output_root / "new_contributors.parquet")
|
| 939 |
)
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
|
|
|
|
|
|
|
|
|
| 945 |
manifest["watermark"].pop("previous_snapshot_dir", None)
|
| 946 |
write_json(manifest, output_root / "manifest.json")
|
| 947 |
write_text(
|
|
@@ -962,7 +989,7 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
|
|
| 962 |
},
|
| 963 |
output_root / "state" / "watermark.json",
|
| 964 |
)
|
| 965 |
-
write_json(manifest,
|
| 966 |
write_json(
|
| 967 |
{
|
| 968 |
"repo": repo_slug,
|
|
@@ -1012,6 +1039,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
| 1012 |
http_max_retries=args.http_max_retries,
|
| 1013 |
checkpoint_every_comments=args.checkpoint_every_comments,
|
| 1014 |
checkpoint_every_prs=args.checkpoint_every_prs,
|
|
|
|
| 1015 |
)
|
| 1016 |
)
|
| 1017 |
print(json.dumps(result, indent=2))
|
|
|
|
| 17 |
from slop_farmer.config import (
|
| 18 |
DatasetRefreshOptions,
|
| 19 |
NewContributorReportOptions,
|
| 20 |
+
PrScopeOptions,
|
| 21 |
RepoRef,
|
| 22 |
resolve_github_token,
|
| 23 |
)
|
|
|
|
| 49 |
write_text,
|
| 50 |
)
|
| 51 |
from slop_farmer.reports.new_contributor_report import run_new_contributor_report
|
| 52 |
+
from slop_farmer.reports.pr_scope import run_pr_scope_report
|
| 53 |
|
| 54 |
PRIMARY_KEYS: dict[str, tuple[str, ...]] = {
|
| 55 |
"issues": ("github_id",),
|
|
|
|
| 320 |
default=bool(defaults.get("private-hf-repo", False)),
|
| 321 |
)
|
| 322 |
parser.add_argument("--private", dest="private_hf_repo", action="store_true")
|
| 323 |
+
parser.set_defaults(
|
| 324 |
+
cluster_suppression_rules=tuple(defaults.get("cluster-suppression-rules", ()))
|
| 325 |
+
)
|
| 326 |
return parser
|
| 327 |
|
| 328 |
|
|
|
|
| 877 |
table_name: merge_rows(table_name, previous_tables[table_name], delta_rows)
|
| 878 |
for table_name, delta_rows in delta_tables.items()
|
| 879 |
}
|
| 880 |
+
manifest: dict[str, Any] = {
|
| 881 |
"repo": repo_slug,
|
| 882 |
"snapshot_id": sid,
|
| 883 |
"crawl_started_at": crawl_started_at,
|
|
|
|
| 923 |
)
|
| 924 |
write_parquet(issue_comment_rows, output_root / "issue_comments.parquet", "comments")
|
| 925 |
write_parquet(pr_comment_rows, output_root / "pr_comments.parquet", "comments")
|
| 926 |
+
archived_snapshot_dir = output_root / "snapshots" / sid
|
| 927 |
+
archived_snapshot_dir.mkdir(parents=True, exist_ok=True)
|
| 928 |
+
write_json(manifest, output_root / "manifest.json")
|
| 929 |
+
log("Generating PR scope clusters")
|
| 930 |
+
pr_scope_path = run_pr_scope_report(
|
| 931 |
+
PrScopeOptions(
|
| 932 |
+
snapshot_dir=output_root,
|
| 933 |
+
output_dir=output_root,
|
| 934 |
+
output=output_root / "pr-scope-clusters.json",
|
| 935 |
+
hf_repo_id=None,
|
| 936 |
+
hf_revision=None,
|
| 937 |
+
hf_materialize_dir=None,
|
| 938 |
+
cluster_suppression_rules=options.cluster_suppression_rules,
|
| 939 |
+
)
|
| 940 |
+
)
|
| 941 |
+
shutil.copy2(pr_scope_path, archived_snapshot_dir / pr_scope_path.name)
|
| 942 |
+
artifacts: dict[str, str] = {
|
| 943 |
+
"pr_scope_clusters_json": pr_scope_path.name,
|
| 944 |
+
"archived_pr_scope_clusters_json": f"snapshots/{sid}/{pr_scope_path.name}",
|
| 945 |
+
}
|
| 946 |
if options.new_contributor_report:
|
|
|
|
| 947 |
log("Generating new contributor dataset/report artifacts")
|
| 948 |
run_new_contributor_report(
|
| 949 |
NewContributorReportOptions(
|
|
|
|
| 961 |
manifest["counts"]["new_contributors"] = len(
|
| 962 |
read_parquet_rows(output_root / "new_contributors.parquet")
|
| 963 |
)
|
| 964 |
+
artifacts.update(
|
| 965 |
+
{
|
| 966 |
+
"new_contributors_parquet": "new_contributors.parquet",
|
| 967 |
+
"new_contributors_json": "new-contributors-report.json",
|
| 968 |
+
"new_contributors_markdown": "new-contributors-report.md",
|
| 969 |
+
}
|
| 970 |
+
)
|
| 971 |
+
manifest["artifacts"] = artifacts
|
| 972 |
manifest["watermark"].pop("previous_snapshot_dir", None)
|
| 973 |
write_json(manifest, output_root / "manifest.json")
|
| 974 |
write_text(
|
|
|
|
| 989 |
},
|
| 990 |
output_root / "state" / "watermark.json",
|
| 991 |
)
|
| 992 |
+
write_json(manifest, archived_snapshot_dir / "manifest.json")
|
| 993 |
write_json(
|
| 994 |
{
|
| 995 |
"repo": repo_slug,
|
|
|
|
| 1039 |
http_max_retries=args.http_max_retries,
|
| 1040 |
checkpoint_every_comments=args.checkpoint_every_comments,
|
| 1041 |
checkpoint_every_prs=args.checkpoint_every_prs,
|
| 1042 |
+
cluster_suppression_rules=tuple(args.cluster_suppression_rules),
|
| 1043 |
)
|
| 1044 |
)
|
| 1045 |
print(json.dumps(result, indent=2))
|
src/slop_farmer/app/dataset_status.py
CHANGED
|
@@ -15,6 +15,14 @@ from slop_farmer.data.hf_dataset_repo import (
|
|
| 15 |
stable_snapshot_candidates,
|
| 16 |
)
|
| 17 |
from slop_farmer.data.parquet_io import read_json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def _coerce_datetime(value: Any) -> datetime | None:
|
|
@@ -51,17 +59,41 @@ def _local_status(output_dir: Path) -> dict[str, Any] | None:
|
|
| 51 |
if not latest_path.exists():
|
| 52 |
return None
|
| 53 |
payload = read_json(latest_path)
|
| 54 |
-
|
| 55 |
-
manifest = {}
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
if manifest_path.exists():
|
| 59 |
manifest = read_json(manifest_path)
|
|
|
|
| 60 |
return {
|
| 61 |
"latest_path": str(latest_path),
|
| 62 |
"latest_pointer": payload,
|
| 63 |
-
"snapshot_dir":
|
| 64 |
"snapshot_id": manifest.get("snapshot_id") or payload.get("latest_snapshot_id"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
}
|
| 66 |
|
| 67 |
|
|
@@ -73,7 +105,7 @@ def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
|
|
| 73 |
latest_pointer = load_remote_json_file(
|
| 74 |
api,
|
| 75 |
repo_id,
|
| 76 |
-
|
| 77 |
root,
|
| 78 |
revision=revision,
|
| 79 |
)
|
|
@@ -98,27 +130,32 @@ def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
|
|
| 98 |
continue
|
| 99 |
manifest = read_json(downloaded)
|
| 100 |
break
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
if isinstance(latest_pointer, dict)
|
| 104 |
-
else
|
| 105 |
)
|
| 106 |
-
|
| 107 |
-
path
|
| 108 |
-
for path in
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
)
|
| 113 |
)
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
f"{snapshot_prefix}/new-contributors-report.md",
|
| 121 |
-
)
|
| 122 |
)
|
| 123 |
extracted_at = manifest.get("extracted_at") if manifest else None
|
| 124 |
return {
|
|
@@ -127,12 +164,75 @@ def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
|
|
| 127 |
"latest_pointer": latest_pointer,
|
| 128 |
"watermark": watermark,
|
| 129 |
"manifest": manifest,
|
| 130 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
"remote_path_count": len(remote_paths),
|
| 132 |
"age": _age_summary(extracted_at),
|
| 133 |
}
|
| 134 |
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def get_dataset_status(options: DatasetStatusOptions) -> dict[str, Any]:
|
| 137 |
remote = _remote_status(options.hf_repo_id, options.hf_revision) if options.hf_repo_id else None
|
| 138 |
local = _local_status(options.output_dir)
|
|
@@ -156,6 +256,9 @@ def format_dataset_status(status: dict[str, Any]) -> str:
|
|
| 156 |
watermark = remote.get("watermark") or {}
|
| 157 |
latest_pointer = remote.get("latest_pointer") or {}
|
| 158 |
age = remote.get("age") or {}
|
|
|
|
|
|
|
|
|
|
| 159 |
lines = [
|
| 160 |
f"Repo: {status.get('repo') or '?'}",
|
| 161 |
f"Dataset: {status.get('dataset_id') or 'not configured'}",
|
|
@@ -166,10 +269,32 @@ def format_dataset_status(status: dict[str, Any]) -> str:
|
|
| 166 |
f"Remote latest snapshot: {manifest.get('snapshot_id') or latest_pointer.get('latest_snapshot_id') or '?'}",
|
| 167 |
f"Remote extracted at: {manifest.get('extracted_at') or '?'}",
|
| 168 |
f"Remote next_since: {watermark.get('next_since') or latest_pointer.get('next_since') or '?'}",
|
| 169 |
-
f"
|
| 170 |
-
f"
|
| 171 |
]
|
| 172 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
if local:
|
| 174 |
lines.extend(
|
| 175 |
[
|
|
@@ -177,6 +302,15 @@ def format_dataset_status(status: dict[str, Any]) -> str:
|
|
| 177 |
f"Local snapshot id: {local.get('snapshot_id') or '?'}",
|
| 178 |
]
|
| 179 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
else:
|
| 181 |
lines.append("Local latest pointer: none")
|
| 182 |
return "\n".join(lines)
|
|
|
|
| 15 |
stable_snapshot_candidates,
|
| 16 |
)
|
| 17 |
from slop_farmer.data.parquet_io import read_json
|
| 18 |
+
from slop_farmer.data.snapshot_paths import (
|
| 19 |
+
CONTRIBUTOR_ARTIFACT_FILENAMES,
|
| 20 |
+
CURRENT_ANALYSIS_MANIFEST_PATH,
|
| 21 |
+
PR_SCOPE_CLUSTERS_FILENAME,
|
| 22 |
+
SNAPSHOTS_LATEST_PATH,
|
| 23 |
+
load_current_analysis_manifest,
|
| 24 |
+
repo_relative_path_to_local,
|
| 25 |
+
)
|
| 26 |
|
| 27 |
|
| 28 |
def _coerce_datetime(value: Any) -> datetime | None:
|
|
|
|
| 59 |
if not latest_path.exists():
|
| 60 |
return None
|
| 61 |
payload = read_json(latest_path)
|
| 62 |
+
snapshot_dir_raw = payload.get("snapshot_dir")
|
| 63 |
+
manifest: dict[str, Any] = {}
|
| 64 |
+
snapshot_dir: Path | None = None
|
| 65 |
+
if isinstance(snapshot_dir_raw, str) and snapshot_dir_raw:
|
| 66 |
+
snapshot_dir = Path(snapshot_dir_raw).resolve()
|
| 67 |
+
manifest_path = snapshot_dir / "manifest.json"
|
| 68 |
if manifest_path.exists():
|
| 69 |
manifest = read_json(manifest_path)
|
| 70 |
+
current_analysis = _local_current_analysis(snapshot_dir)
|
| 71 |
return {
|
| 72 |
"latest_path": str(latest_path),
|
| 73 |
"latest_pointer": payload,
|
| 74 |
+
"snapshot_dir": snapshot_dir_raw,
|
| 75 |
"snapshot_id": manifest.get("snapshot_id") or payload.get("latest_snapshot_id"),
|
| 76 |
+
"current_analysis": current_analysis,
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _local_current_analysis(snapshot_dir: Path | None) -> dict[str, Any]:
    """Summarize the current-analysis manifest stored under a local snapshot.

    Returns ``{"present": False}`` when no snapshot directory is given or the
    manifest file does not exist. When the manifest exists but loading it
    raises ``ValueError``, returns ``present=True``, ``valid=False`` and the
    error text under ``detail``. Otherwise returns ``present=True``,
    ``valid=True`` plus the manifest's ``snapshot_id``, ``analysis_id``,
    ``variant`` and ``published_at``.
    """
    absent: dict[str, Any] = {"present": False}
    if snapshot_dir is None:
        return absent
    manifest_file = repo_relative_path_to_local(snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH)
    if not manifest_file.exists():
        return absent
    try:
        loaded = load_current_analysis_manifest(manifest_file)
    except ValueError as exc:
        return {"present": True, "valid": False, "detail": str(exc)}
    else:
        summary: dict[str, Any] = {"present": True, "valid": True}
        # Copy the identifying fields straight from the manifest, in the same order.
        for field_name in ("snapshot_id", "analysis_id", "variant", "published_at"):
            summary[field_name] = loaded[field_name]
        return summary
|
| 98 |
|
| 99 |
|
|
|
|
| 105 |
latest_pointer = load_remote_json_file(
|
| 106 |
api,
|
| 107 |
repo_id,
|
| 108 |
+
SNAPSHOTS_LATEST_PATH,
|
| 109 |
root,
|
| 110 |
revision=revision,
|
| 111 |
)
|
|
|
|
| 130 |
continue
|
| 131 |
manifest = read_json(downloaded)
|
| 132 |
break
|
| 133 |
+
current_analysis = _remote_current_analysis(
|
| 134 |
+
api,
|
| 135 |
+
repo_id,
|
| 136 |
+
root,
|
| 137 |
+
revision=revision,
|
| 138 |
+
remote_paths=remote_paths,
|
| 139 |
+
latest_pointer=latest_pointer,
|
| 140 |
+
)
|
| 141 |
+
latest_snapshot_id = (
|
| 142 |
+
str(latest_pointer.get("latest_snapshot_id"))
|
| 143 |
if isinstance(latest_pointer, dict)
|
| 144 |
+
else None
|
| 145 |
)
|
| 146 |
+
archived_run_manifests = sorted(
|
| 147 |
+
path
|
| 148 |
+
for path in remote_paths
|
| 149 |
+
if path.startswith("snapshots/")
|
| 150 |
+
and "/analysis-runs/" in path
|
| 151 |
+
and path.endswith("/manifest.json")
|
|
|
|
| 152 |
)
|
| 153 |
+
current_snapshot_run_count = 0
|
| 154 |
+
if latest_snapshot_id:
|
| 155 |
+
current_snapshot_run_count = sum(
|
| 156 |
+
1
|
| 157 |
+
for path in archived_run_manifests
|
| 158 |
+
if path.startswith(f"snapshots/{latest_snapshot_id}/analysis-runs/")
|
|
|
|
|
|
|
| 159 |
)
|
| 160 |
extracted_at = manifest.get("extracted_at") if manifest else None
|
| 161 |
return {
|
|
|
|
| 164 |
"latest_pointer": latest_pointer,
|
| 165 |
"watermark": watermark,
|
| 166 |
"manifest": manifest,
|
| 167 |
+
"cheap_artifacts": {
|
| 168 |
+
"pr_scope_clusters": _remote_has_latest_artifact(
|
| 169 |
+
remote_paths,
|
| 170 |
+
latest_pointer,
|
| 171 |
+
PR_SCOPE_CLUSTERS_FILENAME,
|
| 172 |
+
),
|
| 173 |
+
"contributors": all(
|
| 174 |
+
_remote_has_latest_artifact(remote_paths, latest_pointer, filename)
|
| 175 |
+
for filename in CONTRIBUTOR_ARTIFACT_FILENAMES
|
| 176 |
+
),
|
| 177 |
+
},
|
| 178 |
+
"current_analysis": current_analysis,
|
| 179 |
+
"archived_analysis_runs": {
|
| 180 |
+
"count": len(archived_run_manifests),
|
| 181 |
+
"current_snapshot_count": current_snapshot_run_count,
|
| 182 |
+
},
|
| 183 |
"remote_path_count": len(remote_paths),
|
| 184 |
"age": _age_summary(extracted_at),
|
| 185 |
}
|
| 186 |
|
| 187 |
|
| 188 |
+
def _remote_current_analysis(
    api: HfApi,
    repo_id: str,
    root: Path,
    *,
    revision: str | None,
    remote_paths: set[str],
    latest_pointer: dict[str, Any] | None,
) -> dict[str, Any]:
    """Summarize the current-analysis manifest published in a remote repo.

    Args:
        api: Hub client passed through to ``load_remote_file``.
        repo_id: Remote repository identifier.
        root: Passed through to ``load_remote_file``.
        revision: Optional revision passed through to ``load_remote_file``.
        remote_paths: Paths known to exist in the remote repo.
        latest_pointer: Parsed latest-snapshot pointer, or ``None``.

    Returns:
        ``{"present": False}`` when the manifest path is not listed remotely or
        the download yields ``None``. ``present=True, valid=False`` with a
        ``detail`` message when loading the manifest raises ``ValueError``.
        Otherwise the manifest's identifying fields, the artifact count, and
        ``matches_latest_snapshot``.
    """
    if CURRENT_ANALYSIS_MANIFEST_PATH not in remote_paths:
        return {"present": False}
    downloaded = load_remote_file(
        api,
        repo_id,
        CURRENT_ANALYSIS_MANIFEST_PATH,
        root,
        revision=revision,
    )
    if downloaded is None:
        return {"present": False}
    try:
        manifest = load_current_analysis_manifest(downloaded)
    except ValueError as exc:
        return {"present": True, "valid": False, "detail": str(exc)}

    # Only stringify a real value: str(None) would produce the literal "None"
    # and be compared against the manifest's snapshot id as if it were an id.
    latest_snapshot_id: str | None = None
    if isinstance(latest_pointer, dict):
        raw_latest_id = latest_pointer.get("latest_snapshot_id")
        if raw_latest_id is not None:
            latest_snapshot_id = str(raw_latest_id)

    return {
        "present": True,
        "valid": True,
        "snapshot_id": manifest["snapshot_id"],
        "analysis_id": manifest["analysis_id"],
        "variant": manifest["variant"],
        "published_at": manifest["published_at"],
        # With no known latest snapshot there is nothing to match against.
        "matches_latest_snapshot": (
            latest_snapshot_id is not None and manifest["snapshot_id"] == latest_snapshot_id
        ),
        "artifact_count": len(manifest["artifacts"]),
    }
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _remote_has_latest_artifact(
    remote_paths: set[str],
    latest_pointer: dict[str, Any] | None,
    filename: str,
) -> bool:
    """Report whether any candidate path for ``filename`` exists remotely.

    Candidate paths come from ``stable_snapshot_candidates(latest_pointer,
    filename)``; the result is ``True`` as soon as one of them is found in
    ``remote_paths`` and ``False`` when none is.
    """
    for candidate_path in stable_snapshot_candidates(latest_pointer, filename):
        if candidate_path in remote_paths:
            return True
    return False
|
| 234 |
+
|
| 235 |
+
|
| 236 |
def get_dataset_status(options: DatasetStatusOptions) -> dict[str, Any]:
|
| 237 |
remote = _remote_status(options.hf_repo_id, options.hf_revision) if options.hf_repo_id else None
|
| 238 |
local = _local_status(options.output_dir)
|
|
|
|
| 256 |
watermark = remote.get("watermark") or {}
|
| 257 |
latest_pointer = remote.get("latest_pointer") or {}
|
| 258 |
age = remote.get("age") or {}
|
| 259 |
+
current_analysis = remote.get("current_analysis") or {}
|
| 260 |
+
cheap_artifacts = remote.get("cheap_artifacts") or {}
|
| 261 |
+
archived_runs = remote.get("archived_analysis_runs") or {}
|
| 262 |
lines = [
|
| 263 |
f"Repo: {status.get('repo') or '?'}",
|
| 264 |
f"Dataset: {status.get('dataset_id') or 'not configured'}",
|
|
|
|
| 269 |
f"Remote latest snapshot: {manifest.get('snapshot_id') or latest_pointer.get('latest_snapshot_id') or '?'}",
|
| 270 |
f"Remote extracted at: {manifest.get('extracted_at') or '?'}",
|
| 271 |
f"Remote next_since: {watermark.get('next_since') or latest_pointer.get('next_since') or '?'}",
|
| 272 |
+
f"PR scope artifact: {'yes' if cheap_artifacts.get('pr_scope_clusters') else 'no'}",
|
| 273 |
+
f"Contributor artifacts: {'yes' if cheap_artifacts.get('contributors') else 'no'}",
|
| 274 |
]
|
| 275 |
)
|
| 276 |
+
if current_analysis.get("present"):
|
| 277 |
+
if current_analysis.get("valid") is False:
|
| 278 |
+
lines.append(f"Current analysis: invalid ({current_analysis.get('detail')})")
|
| 279 |
+
else:
|
| 280 |
+
lines.append(
|
| 281 |
+
"Current analysis: "
|
| 282 |
+
f"snapshot={current_analysis.get('snapshot_id')} "
|
| 283 |
+
f"analysis_id={current_analysis.get('analysis_id')}"
|
| 284 |
+
)
|
| 285 |
+
lines.append(
|
| 286 |
+
"Current analysis matches latest snapshot: "
|
| 287 |
+
f"{'yes' if current_analysis.get('matches_latest_snapshot') else 'no'}"
|
| 288 |
+
)
|
| 289 |
+
else:
|
| 290 |
+
lines.append("Current analysis: none")
|
| 291 |
+
lines.append(
|
| 292 |
+
"Archived analysis runs: "
|
| 293 |
+
f"{archived_runs.get('count', 0)} total, {archived_runs.get('current_snapshot_count', 0)} for latest snapshot"
|
| 294 |
+
)
|
| 295 |
+
lines.append(
|
| 296 |
+
f"Freshness: {age.get('summary') or 'unknown'} ({age.get('staleness') or 'unknown'})"
|
| 297 |
+
)
|
| 298 |
if local:
|
| 299 |
lines.extend(
|
| 300 |
[
|
|
|
|
| 302 |
f"Local snapshot id: {local.get('snapshot_id') or '?'}",
|
| 303 |
]
|
| 304 |
)
|
| 305 |
+
local_current_analysis = local.get("current_analysis") or {}
|
| 306 |
+
if local_current_analysis.get("present"):
|
| 307 |
+
lines.append(
|
| 308 |
+
"Local current analysis: "
|
| 309 |
+
f"snapshot={local_current_analysis.get('snapshot_id')} "
|
| 310 |
+
f"analysis_id={local_current_analysis.get('analysis_id')}"
|
| 311 |
+
)
|
| 312 |
+
else:
|
| 313 |
+
lines.append("Local current analysis: none")
|
| 314 |
else:
|
| 315 |
lines.append("Local latest pointer: none")
|
| 316 |
return "\n".join(lines)
|
src/slop_farmer/app/deploy.py
CHANGED
|
@@ -5,6 +5,7 @@ import subprocess
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from slop_farmer.config import DeployDashboardOptions
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
@@ -17,6 +18,16 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
| 17 |
{
|
| 18 |
"PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
|
| 19 |
"WEB_DIR": str(options.web_dir),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
|
| 21 |
"CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
|
| 22 |
"CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
|
|
@@ -28,12 +39,12 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
| 28 |
"SPACE_SHORT_DESCRIPTION": options.space_short_description,
|
| 29 |
}
|
| 30 |
)
|
| 31 |
-
if options.snapshot_dir is not None:
|
| 32 |
-
env["SNAPSHOT_DIR"] = str(options.snapshot_dir)
|
| 33 |
if options.analysis_input is not None:
|
| 34 |
-
env["ANALYSIS_INPUT"] = str(options.analysis_input)
|
| 35 |
if options.contributors_input is not None:
|
| 36 |
-
env["CONTRIBUTORS_INPUT"] = str(options.contributors_input)
|
|
|
|
|
|
|
| 37 |
if options.refresh_contributors:
|
| 38 |
env["REFRESH_CONTRIBUTORS"] = "1"
|
| 39 |
if options.private_space:
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from slop_farmer.config import DeployDashboardOptions
|
| 8 |
+
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
| 9 |
|
| 10 |
|
| 11 |
def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
|
|
| 18 |
{
|
| 19 |
"PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
|
| 20 |
"WEB_DIR": str(options.web_dir),
|
| 21 |
+
"SNAPSHOT_DIR": str(
|
| 22 |
+
resolve_snapshot_source_dir(
|
| 23 |
+
snapshot_dir=options.snapshot_dir,
|
| 24 |
+
local_snapshots_root=options.pipeline_data_dir.resolve() / "snapshots",
|
| 25 |
+
hf_repo_id=options.hf_repo_id,
|
| 26 |
+
hf_revision=options.hf_revision,
|
| 27 |
+
hf_materialize_dir=options.hf_materialize_dir,
|
| 28 |
+
hf_output_dir=options.pipeline_data_dir,
|
| 29 |
+
)
|
| 30 |
+
),
|
| 31 |
"DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
|
| 32 |
"CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
|
| 33 |
"CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
|
|
|
|
| 39 |
"SPACE_SHORT_DESCRIPTION": options.space_short_description,
|
| 40 |
}
|
| 41 |
)
|
|
|
|
|
|
|
| 42 |
if options.analysis_input is not None:
|
| 43 |
+
env["ANALYSIS_INPUT"] = str(options.analysis_input.resolve())
|
| 44 |
if options.contributors_input is not None:
|
| 45 |
+
env["CONTRIBUTORS_INPUT"] = str(options.contributors_input.resolve())
|
| 46 |
+
if options.pr_scope_input is not None:
|
| 47 |
+
env["PR_SCOPE_INPUT"] = str(options.pr_scope_input.resolve())
|
| 48 |
if options.refresh_contributors:
|
| 49 |
env["REFRESH_CONTRIBUTORS"] = "1"
|
| 50 |
if options.private_space:
|
src/slop_farmer/app/hf_checkpoint_import.py
CHANGED
|
@@ -26,8 +26,9 @@ from typing import Any
|
|
| 26 |
|
| 27 |
from huggingface_hub import HfApi, hf_hub_download
|
| 28 |
|
| 29 |
-
from slop_farmer.app.
|
| 30 |
from slop_farmer.config import CheckpointImportOptions
|
|
|
|
| 31 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 32 |
from slop_farmer.data.parquet_io import (
|
| 33 |
SCHEMAS,
|
|
@@ -106,7 +107,9 @@ def import_hf_checkpoint(options: CheckpointImportOptions) -> Path:
|
|
| 106 |
force=options.force,
|
| 107 |
)
|
| 108 |
if options.publish_repo_id:
|
| 109 |
-
|
|
|
|
|
|
|
| 110 |
return snapshot_dir
|
| 111 |
|
| 112 |
|
|
@@ -455,76 +458,15 @@ def _viewer_comment_rows(
|
|
| 455 |
def _dataset_card(
|
| 456 |
repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
|
| 457 |
) -> str:
|
| 458 |
-
return
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
data_files:
|
| 468 |
-
- split: train
|
| 469 |
-
path: pull_requests.parquet
|
| 470 |
-
- config_name: issue_comments
|
| 471 |
-
data_files:
|
| 472 |
-
- split: train
|
| 473 |
-
path: issue_comments.parquet
|
| 474 |
-
- config_name: pr_comments
|
| 475 |
-
data_files:
|
| 476 |
-
- split: train
|
| 477 |
-
path: pr_comments.parquet
|
| 478 |
-
- config_name: pr_reviews
|
| 479 |
-
data_files:
|
| 480 |
-
- split: train
|
| 481 |
-
path: reviews.parquet
|
| 482 |
-
- config_name: pr_files
|
| 483 |
-
data_files:
|
| 484 |
-
- split: train
|
| 485 |
-
path: pr_files.parquet
|
| 486 |
-
- config_name: pr_diffs
|
| 487 |
-
data_files:
|
| 488 |
-
- split: train
|
| 489 |
-
path: pr_diffs.parquet
|
| 490 |
-
- config_name: review_comments
|
| 491 |
-
data_files:
|
| 492 |
-
- split: train
|
| 493 |
-
path: review_comments.parquet
|
| 494 |
-
- config_name: links
|
| 495 |
-
data_files:
|
| 496 |
-
- split: train
|
| 497 |
-
path: links.parquet
|
| 498 |
-
- config_name: events
|
| 499 |
-
data_files:
|
| 500 |
-
- split: train
|
| 501 |
-
path: events.parquet
|
| 502 |
-
---
|
| 503 |
-
---
|
| 504 |
-
|
| 505 |
-
# Transformers PR Slop Dataset
|
| 506 |
-
|
| 507 |
-
Imported checkpoint snapshot for `{repo_slug}`.
|
| 508 |
-
|
| 509 |
-
Files:
|
| 510 |
-
- `issues.parquet`
|
| 511 |
-
- `pull_requests.parquet`
|
| 512 |
-
- `comments.parquet`
|
| 513 |
-
- `issue_comments.parquet`
|
| 514 |
-
- `pr_comments.parquet`
|
| 515 |
-
- `reviews.parquet`
|
| 516 |
-
- `pr_files.parquet`
|
| 517 |
-
- `pr_diffs.parquet`
|
| 518 |
-
- `review_comments.parquet`
|
| 519 |
-
- `links.parquet`
|
| 520 |
-
- `events.parquet`
|
| 521 |
-
|
| 522 |
-
Notes:
|
| 523 |
-
- source HF dataset: `{source_repo_id}`
|
| 524 |
-
- source checkpoint root: `{checkpoint_root}`
|
| 525 |
-
- latest imported checkpoint: `{snapshot_id}`
|
| 526 |
-
- links were regenerated locally from text references and timeline events
|
| 527 |
-
"""
|
| 528 |
|
| 529 |
|
| 530 |
def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
|
|
|
|
| 26 |
|
| 27 |
from huggingface_hub import HfApi, hf_hub_download
|
| 28 |
|
| 29 |
+
from slop_farmer.app.publish_dataset_snapshot import publish_dataset_snapshot
|
| 30 |
from slop_farmer.config import CheckpointImportOptions
|
| 31 |
+
from slop_farmer.data.dataset_card import build_hf_dataset_card
|
| 32 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 33 |
from slop_farmer.data.parquet_io import (
|
| 34 |
SCHEMAS,
|
|
|
|
| 107 |
force=options.force,
|
| 108 |
)
|
| 109 |
if options.publish_repo_id:
|
| 110 |
+
publish_dataset_snapshot(
|
| 111 |
+
snapshot_dir, options.publish_repo_id, private=options.private_hf_repo
|
| 112 |
+
)
|
| 113 |
return snapshot_dir
|
| 114 |
|
| 115 |
|
|
|
|
| 458 |
def _dataset_card(
    repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
) -> str:
    """Build the dataset card text for an imported checkpoint snapshot.

    Delegates to ``build_hf_dataset_card`` with ``repo_slug`` and
    ``snapshot_id``, attaching notes that record the source HF dataset, the
    source checkpoint root, and that links were regenerated locally.
    """
    provenance_notes = [
        f"source HF dataset: `{source_repo_id}`",
        f"source checkpoint root: `{checkpoint_root}`",
        "links were regenerated locally from text references and timeline events",
    ]
    return build_hf_dataset_card(repo_slug, snapshot_id, notes=provenance_notes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
|
| 472 |
def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
|
src/slop_farmer/app/pipeline.py
CHANGED
|
@@ -7,8 +7,8 @@ from datetime import UTC, datetime, timedelta
|
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any, Protocol
|
| 9 |
|
| 10 |
-
from slop_farmer.app.publish import publish_snapshot
|
| 11 |
from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
|
|
|
|
| 12 |
from slop_farmer.data.github_api import GitHubClient
|
| 13 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 14 |
from slop_farmer.data.normalize import (
|
|
@@ -112,96 +112,14 @@ def _reference_time_for_age_caps(crawl_started_at: str) -> datetime:
|
|
| 112 |
def _dataset_card(
|
| 113 |
repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
|
| 114 |
) -> str:
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
new_contributor_file = """- `new_contributors.parquet`
|
| 124 |
-
- `new-contributors-report.json`
|
| 125 |
-
- `new-contributors-report.md`
|
| 126 |
-
"""
|
| 127 |
-
return f"""---
|
| 128 |
-
pretty_name: Transformers PR Slop Dataset
|
| 129 |
-
configs:
|
| 130 |
-
- config_name: issues
|
| 131 |
-
data_files:
|
| 132 |
-
- split: train
|
| 133 |
-
path: issues.parquet
|
| 134 |
-
default: true
|
| 135 |
-
- config_name: prs
|
| 136 |
-
data_files:
|
| 137 |
-
- split: train
|
| 138 |
-
path: pull_requests.parquet
|
| 139 |
-
- config_name: issue_comments
|
| 140 |
-
data_files:
|
| 141 |
-
- split: train
|
| 142 |
-
path: issue_comments.parquet
|
| 143 |
-
- config_name: pr_comments
|
| 144 |
-
data_files:
|
| 145 |
-
- split: train
|
| 146 |
-
path: pr_comments.parquet
|
| 147 |
-
- config_name: pr_reviews
|
| 148 |
-
data_files:
|
| 149 |
-
- split: train
|
| 150 |
-
path: reviews.parquet
|
| 151 |
-
- config_name: pr_files
|
| 152 |
-
data_files:
|
| 153 |
-
- split: train
|
| 154 |
-
path: pr_files.parquet
|
| 155 |
-
- config_name: pr_diffs
|
| 156 |
-
data_files:
|
| 157 |
-
- split: train
|
| 158 |
-
path: pr_diffs.parquet
|
| 159 |
-
- config_name: review_comments
|
| 160 |
-
data_files:
|
| 161 |
-
- split: train
|
| 162 |
-
path: review_comments.parquet
|
| 163 |
-
- config_name: links
|
| 164 |
-
data_files:
|
| 165 |
-
- split: train
|
| 166 |
-
path: links.parquet
|
| 167 |
-
- config_name: events
|
| 168 |
-
data_files:
|
| 169 |
-
- split: train
|
| 170 |
-
path: events.parquet
|
| 171 |
-
{new_contributor_config}---
|
| 172 |
-
---
|
| 173 |
-
|
| 174 |
-
# Transformers PR Slop Dataset
|
| 175 |
-
|
| 176 |
-
Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo}`.
|
| 177 |
-
|
| 178 |
-
Files:
|
| 179 |
-
- `issues.parquet`
|
| 180 |
-
- `pull_requests.parquet`
|
| 181 |
-
- `comments.parquet`
|
| 182 |
-
- `issue_comments.parquet` (derived view of issue discussion comments)
|
| 183 |
-
- `pr_comments.parquet` (derived view of pull request discussion comments)
|
| 184 |
-
- `reviews.parquet`
|
| 185 |
-
- `pr_files.parquet`
|
| 186 |
-
- `pr_diffs.parquet`
|
| 187 |
-
- `review_comments.parquet`
|
| 188 |
-
- `links.parquet`
|
| 189 |
-
- `events.parquet`
|
| 190 |
-
{new_contributor_file}
|
| 191 |
-
|
| 192 |
-
Use:
|
| 193 |
-
- duplicate PR and issue analysis
|
| 194 |
-
- triage and ranking experiments
|
| 195 |
-
- eval set creation
|
| 196 |
-
|
| 197 |
-
Notes:
|
| 198 |
-
- updated daily
|
| 199 |
-
- latest snapshot: `{snapshot_id}`
|
| 200 |
-
- raw data only; no labels or moderation decisions
|
| 201 |
-
- PR metadata, file-level patch hunks, and full unified diffs are included
|
| 202 |
-
- new contributor reviewer artifacts are included when generated for the snapshot
|
| 203 |
-
- full file contents for changed files are not included
|
| 204 |
-
"""
|
| 205 |
|
| 206 |
|
| 207 |
def _viewer_comment_rows(
|
|
@@ -982,9 +900,6 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
|
|
| 982 |
"issue_max_age_days": options.issue_max_age_days,
|
| 983 |
"pr_max_age_days": options.pr_max_age_days,
|
| 984 |
"fetch_timeline": options.fetch_timeline,
|
| 985 |
-
"publish": options.publish,
|
| 986 |
-
"hf_repo_id": options.hf_repo_id,
|
| 987 |
-
"private_hf_repo": options.private_hf_repo,
|
| 988 |
"new_contributor_report": options.new_contributor_report,
|
| 989 |
"new_contributor_window_days": options.new_contributor_window_days,
|
| 990 |
"new_contributor_max_authors": options.new_contributor_max_authors,
|
|
@@ -1045,6 +960,9 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
|
|
| 1045 |
output_dir=options.output_dir,
|
| 1046 |
output=None,
|
| 1047 |
json_output=None,
|
|
|
|
|
|
|
|
|
|
| 1048 |
window_days=options.new_contributor_window_days,
|
| 1049 |
max_authors=options.new_contributor_max_authors,
|
| 1050 |
)
|
|
@@ -1094,12 +1012,5 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
|
|
| 1094 |
_log(f"Updated watermark state: {_watermark_path(options.output_dir)}")
|
| 1095 |
|
| 1096 |
_clear_checkpoint(options.output_dir, snapshot_dir)
|
| 1097 |
-
if options.publish:
|
| 1098 |
-
if not options.hf_repo_id:
|
| 1099 |
-
raise ValueError("--publish requires --hf-repo-id")
|
| 1100 |
-
publish_snapshot(
|
| 1101 |
-
snapshot_dir, options.hf_repo_id, private=options.private_hf_repo, log=_log
|
| 1102 |
-
)
|
| 1103 |
-
|
| 1104 |
_log(f"Snapshot complete: {snapshot_dir}")
|
| 1105 |
return snapshot_dir
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any, Protocol
|
| 9 |
|
|
|
|
| 10 |
from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
|
| 11 |
+
from slop_farmer.data.dataset_card import build_hf_dataset_card
|
| 12 |
from slop_farmer.data.github_api import GitHubClient
|
| 13 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 14 |
from slop_farmer.data.normalize import (
|
|
|
|
| 112 |
def _dataset_card(
    repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
) -> str:
    """Build the dataset card text for a pipeline snapshot.

    Delegates to ``build_hf_dataset_card``. When ``include_new_contributors``
    is set, a note about the new contributor reviewer artifacts is attached;
    otherwise the notes list is empty.
    """
    # ``manifest`` stays in the signature but is not used by this function.
    del manifest
    if include_new_contributors:
        card_notes = ["new contributor reviewer artifacts are included"]
    else:
        card_notes = []
    return build_hf_dataset_card(
        repo,
        snapshot_id,
        include_new_contributors=include_new_contributors,
        notes=card_notes,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
def _viewer_comment_rows(
|
|
|
|
| 900 |
"issue_max_age_days": options.issue_max_age_days,
|
| 901 |
"pr_max_age_days": options.pr_max_age_days,
|
| 902 |
"fetch_timeline": options.fetch_timeline,
|
|
|
|
|
|
|
|
|
|
| 903 |
"new_contributor_report": options.new_contributor_report,
|
| 904 |
"new_contributor_window_days": options.new_contributor_window_days,
|
| 905 |
"new_contributor_max_authors": options.new_contributor_max_authors,
|
|
|
|
| 960 |
output_dir=options.output_dir,
|
| 961 |
output=None,
|
| 962 |
json_output=None,
|
| 963 |
+
hf_repo_id=None,
|
| 964 |
+
hf_revision=None,
|
| 965 |
+
hf_materialize_dir=None,
|
| 966 |
window_days=options.new_contributor_window_days,
|
| 967 |
max_authors=options.new_contributor_max_authors,
|
| 968 |
)
|
|
|
|
| 1012 |
_log(f"Updated watermark state: {_watermark_path(options.output_dir)}")
|
| 1013 |
|
| 1014 |
_clear_checkpoint(options.output_dir, snapshot_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1015 |
_log(f"Snapshot complete: {snapshot_dir}")
|
| 1016 |
return snapshot_dir
|
src/slop_farmer/app/pr_search.py
CHANGED
|
@@ -10,9 +10,12 @@ get_pr_search_status = pr_search_service.get_pr_search_status
|
|
| 10 |
get_pr_search_similar = pr_search_service.get_pr_search_similar
|
| 11 |
get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
|
| 12 |
get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
|
|
|
|
|
|
|
| 13 |
get_pr_search_clusters = pr_search_service.get_pr_search_clusters
|
| 14 |
list_pr_search_clusters = pr_search_service.list_pr_search_clusters
|
| 15 |
get_pr_search_cluster = pr_search_service.get_pr_search_cluster
|
|
|
|
| 16 |
explain_pr_search_pair = pr_search_service.explain_pr_search_pair
|
| 17 |
probe_pr_search_live = pr_search_service.probe_pr_search_live
|
| 18 |
probe_pr_search_github = pr_search_service.probe_pr_search_github
|
|
@@ -31,6 +34,7 @@ def format_pr_search_status(result: Mapping[str, Any]) -> str:
|
|
| 31 |
(
|
| 32 |
"Rows: "
|
| 33 |
f"documents={counts['documents']} "
|
|
|
|
| 34 |
f"features={counts['features']} "
|
| 35 |
f"neighbors={counts['neighbors']} "
|
| 36 |
f"clusters={counts['clusters']} "
|
|
@@ -245,3 +249,73 @@ def format_pr_search_probe(result: Mapping[str, Any]) -> str:
|
|
| 245 |
if row.get("reason"):
|
| 246 |
lines.append(f" reason: {row['reason']}")
|
| 247 |
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
get_pr_search_similar = pr_search_service.get_pr_search_similar
|
| 11 |
get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
|
| 12 |
get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
|
| 13 |
+
get_pr_search_contributor = pr_search_service.get_pr_search_contributor
|
| 14 |
+
get_pr_search_contributor_pulls = pr_search_service.get_pr_search_contributor_pulls
|
| 15 |
get_pr_search_clusters = pr_search_service.get_pr_search_clusters
|
| 16 |
list_pr_search_clusters = pr_search_service.list_pr_search_clusters
|
| 17 |
get_pr_search_cluster = pr_search_service.get_pr_search_cluster
|
| 18 |
+
get_pr_search_pull_contributor = pr_search_service.get_pr_search_pull_contributor
|
| 19 |
explain_pr_search_pair = pr_search_service.explain_pr_search_pair
|
| 20 |
probe_pr_search_live = pr_search_service.probe_pr_search_live
|
| 21 |
probe_pr_search_github = pr_search_service.probe_pr_search_github
|
|
|
|
| 34 |
(
|
| 35 |
"Rows: "
|
| 36 |
f"documents={counts['documents']} "
|
| 37 |
+
f"contributors={counts.get('contributors', 0)} "
|
| 38 |
f"features={counts['features']} "
|
| 39 |
f"neighbors={counts['neighbors']} "
|
| 40 |
f"clusters={counts['clusters']} "
|
|
|
|
| 249 |
if row.get("reason"):
|
| 250 |
lines.append(f" reason: {row['reason']}")
|
| 251 |
return "\n".join(lines)
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def format_pr_search_contributor(result: Mapping[str, Any]) -> str:
    """Render a contributor lookup result as a plain-text report.

    Args:
        result: Mapping with required ``contributor`` (which must contain
            ``author_login``), ``repo`` and ``snapshot_id`` keys, plus an
            optional ``pulls`` sequence of PR rows (each requiring
            ``pr_number``).

    Returns:
        Newline-joined text: the contributor header fields followed by a
        "Recent indexed PRs:" section listing each PR, or ``- none`` when
        there are no rows.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    contributor = result["contributor"]

    def score(key: str) -> Any:
        # A bare ``or '-'`` would render a legitimate score of 0 as missing;
        # only an absent or blank value falls back to the placeholder.
        value = contributor.get(key)
        return "-" if value is None or value == "" else value

    lines = [
        f"Contributor {contributor['author_login']}",
        f"Repo: {result['repo']}",
        f"Snapshot: {result['snapshot_id']}",
        f"Name: {contributor.get('name') or '-'}",
        f"Profile: {contributor.get('profile_url') or '-'}",
        f"Association: {contributor.get('repo_association') or '-'}",
        f"First seen in snapshot: {'yes' if contributor.get('first_seen_in_snapshot') else 'no'}",
        (
            "Scores: "
            f"follow-through={score('follow_through_score')} "
            f"breadth={score('breadth_score')} "
            f"risk={contributor.get('automation_risk_signal') or '-'}"
        ),
        f"Heuristic: {contributor.get('heuristic_note') or '-'}",
        f"Public orgs: {', '.join(contributor.get('public_orgs') or []) or '-'}",
        "",
        "Recent indexed PRs:",
    ]
    pulls = result.get("pulls") or []
    if not pulls:
        lines.append("- none")
        return "\n".join(lines)
    lines.extend(
        f"- PR #{row['pr_number']}: {row.get('title') or ''} "
        f"[state={row.get('state') or '-'} merged={'yes' if row.get('merged') else 'no'}]"
        for row in pulls
    )
    return "\n".join(lines)
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def format_pr_search_contributor_pulls(result: Mapping[str, Any]) -> str:
    """Render a contributor's indexed pull requests as plain text.

    The header shows the contributor login, repo, snapshot id and a pull
    request count (``pull_count`` when the key is present, otherwise the
    number of rows in ``pulls``). It is followed by one line per PR, or a
    fixed message when there are none.
    """
    contributor = result["contributor"]
    indexed_pulls = result.get("pulls") or []
    header = [
        f"Contributor PRs: {contributor['author_login']}",
        f"Repo: {result['repo']}",
        f"Snapshot: {result['snapshot_id']}",
        f"Pull requests: {result.get('pull_count', len(indexed_pulls))}",
        "",
    ]
    if indexed_pulls:
        body = [
            f"- PR #{pull['pr_number']}: {pull.get('title') or ''} "
            f"(updated={pull.get('updated_at') or '-'}, state={pull.get('state') or '-'})"
            for pull in indexed_pulls
        ]
    else:
        body = ["No indexed PRs found for that contributor."]
    return "\n".join([*header, *body])
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def format_pr_search_pull_contributor(result: Mapping[str, Any]) -> str:
    """Render the contributor summary for a single pull request.

    Args:
        result: Mapping with required ``pr`` (containing ``pr_number``) and
            ``contributor`` (containing ``author_login``) entries.

    Returns:
        Seven newline-joined lines: PR title, author, risk, follow-through,
        breadth, heuristic note and profile URL. Missing values are shown
        as ``-``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    pr = result["pr"]
    contributor = result["contributor"]

    def score(key: str) -> Any:
        # A bare ``or '-'`` would render a legitimate score of 0 as missing;
        # only an absent or blank value falls back to the placeholder.
        value = contributor.get(key)
        return "-" if value is None or value == "" else value

    return "\n".join(
        [
            f"PR #{pr['pr_number']}: {pr.get('title') or ''}",
            f"Author: {contributor['author_login']}",
            f"Risk: {contributor.get('automation_risk_signal') or '-'}",
            f"Follow-through: {score('follow_through_score')}",
            f"Breadth: {score('breadth_score')}",
            f"Heuristic: {contributor.get('heuristic_note') or '-'}",
            f"Profile: {contributor.get('profile_url') or '-'}",
        ]
    )
|
src/slop_farmer/app/pr_search_api.py
CHANGED
|
@@ -12,10 +12,23 @@ from fastapi.responses import JSONResponse
|
|
| 12 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 13 |
from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
|
| 14 |
from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
|
| 15 |
-
from slop_farmer.data.snapshot_paths import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from slop_farmer.reports.pr_search_service import (
|
| 17 |
get_pr_search_cluster,
|
| 18 |
get_pr_search_clusters,
|
|
|
|
|
|
|
| 19 |
get_pr_search_similar_lookup,
|
| 20 |
get_pr_search_status,
|
| 21 |
list_pr_search_clusters,
|
|
@@ -120,7 +133,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 120 |
app.state.startup_error = str(exc)
|
| 121 |
yield
|
| 122 |
|
| 123 |
-
app = FastAPI(title="slop PR search API", version="0.1.
|
| 124 |
|
| 125 |
@app.exception_handler(ValueError)
|
| 126 |
async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
|
|
@@ -156,11 +169,13 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 156 |
contributor_snapshot_dir = _surface_snapshot_dir(
|
| 157 |
settings, repo_slug, surface="contributors"
|
| 158 |
)
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
"
|
|
|
|
|
|
|
|
|
|
| 162 |
}
|
| 163 |
-
return {**status, "surfaces": surface_payload}
|
| 164 |
|
| 165 |
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
|
| 166 |
async def pr_similar(
|
|
@@ -238,6 +253,166 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 238 |
),
|
| 239 |
)
|
| 240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
@app.get("/v1/repos/{owner}/{repo}/issues/status")
|
| 242 |
async def issue_status(
|
| 243 |
owner: str,
|
|
@@ -364,7 +539,9 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 364 |
) -> dict[str, Any]:
|
| 365 |
settings = request.app.state.settings
|
| 366 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 367 |
-
return get_contributor_status(
|
|
|
|
|
|
|
| 368 |
|
| 369 |
@app.get("/v1/repos/{owner}/{repo}/contributors")
|
| 370 |
async def contributors(
|
|
@@ -531,7 +708,9 @@ def _surface_available(snapshot_dir: Path, *, surface: Literal["issues", "contri
|
|
| 531 |
if not snapshot_dir.exists():
|
| 532 |
return False
|
| 533 |
if surface == "issues":
|
| 534 |
-
return
|
|
|
|
|
|
|
| 535 |
return (snapshot_dir / "new-contributors-report.json").exists()
|
| 536 |
|
| 537 |
|
|
@@ -558,6 +737,10 @@ def _looks_not_found(exc: ValueError) -> bool:
|
|
| 558 |
message = str(exc).lower()
|
| 559 |
return (
|
| 560 |
"not found" in message
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
or "no active pr search run" in message
|
| 562 |
or "was not found in the active indexed universe" in message
|
| 563 |
)
|
|
|
|
| 12 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 13 |
from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
|
| 14 |
from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
|
| 15 |
+
from slop_farmer.data.snapshot_paths import (
|
| 16 |
+
CURRENT_ANALYSIS_MANIFEST_PATH,
|
| 17 |
+
default_hf_materialize_dir,
|
| 18 |
+
)
|
| 19 |
+
from slop_farmer.reports.analysis_service import (
|
| 20 |
+
get_analysis_best,
|
| 21 |
+
get_analysis_meta_bug,
|
| 22 |
+
get_analysis_status,
|
| 23 |
+
get_pr_analysis,
|
| 24 |
+
list_analysis_duplicate_prs,
|
| 25 |
+
list_analysis_meta_bugs,
|
| 26 |
+
)
|
| 27 |
from slop_farmer.reports.pr_search_service import (
|
| 28 |
get_pr_search_cluster,
|
| 29 |
get_pr_search_clusters,
|
| 30 |
+
get_pr_search_contributor_pulls,
|
| 31 |
+
get_pr_search_pull_contributor,
|
| 32 |
get_pr_search_similar_lookup,
|
| 33 |
get_pr_search_status,
|
| 34 |
list_pr_search_clusters,
|
|
|
|
| 133 |
app.state.startup_error = str(exc)
|
| 134 |
yield
|
| 135 |
|
| 136 |
+
app = FastAPI(title="slop PR search API", version="0.1.1", lifespan=lifespan)
|
| 137 |
|
| 138 |
@app.exception_handler(ValueError)
|
| 139 |
async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
|
|
|
|
| 169 |
contributor_snapshot_dir = _surface_snapshot_dir(
|
| 170 |
settings, repo_slug, surface="contributors"
|
| 171 |
)
|
| 172 |
+
return {
|
| 173 |
+
**status,
|
| 174 |
+
"surfaces": {
|
| 175 |
+
"issues": get_snapshot_surfaces(issue_snapshot_dir)["issues"],
|
| 176 |
+
"contributors": get_snapshot_surfaces(contributor_snapshot_dir)["contributors"],
|
| 177 |
+
},
|
| 178 |
}
|
|
|
|
| 179 |
|
| 180 |
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
|
| 181 |
async def pr_similar(
|
|
|
|
| 253 |
),
|
| 254 |
)
|
| 255 |
|
| 256 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors/{login}/pulls")
async def contributor_pulls(
    owner: str,
    repo: str,
    login: str,
    request: Request,
    limit: int | None = None,
) -> dict[str, Any]:
    # Look up one contributor's indexed PRs. The requested limit goes through
    # _limit together with the similar-search default and maximum settings.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    resolved_limit = _limit(
        limit,
        default=api_settings.similar_limit_default,
        maximum=api_settings.similar_limit_max,
    )
    return get_pr_search_contributor_pulls(
        api_settings.index_path, repo=slug, author_login=login, limit=resolved_limit
    )
|
| 274 |
+
|
| 275 |
+
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/contributor")
async def pull_contributor(
    owner: str,
    repo: str,
    number: int,
    request: Request,
) -> dict[str, Any]:
    # Delegate to get_pr_search_pull_contributor for the PR identified by the
    # path, using the index configured in the app settings.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    return get_pr_search_pull_contributor(
        api_settings.index_path,
        repo=slug,
        pr_number=number,
    )
|
| 285 |
+
|
| 286 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/status")
async def analysis_status(
    owner: str,
    repo: str,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    # Delegate to get_analysis_status, forwarding the variant / snapshot /
    # analysis-id selectors from the query string unchanged.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    selectors = {"variant": variant, "snapshot_id": snapshot_id, "analysis_id": analysis_id}
    return get_analysis_status(api_settings.index_path, repo=slug, **selectors)
|
| 304 |
+
|
| 305 |
+
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
async def pr_analysis(
    owner: str,
    repo: str,
    number: int,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    # Delegate to get_pr_analysis for the PR number in the path, forwarding
    # the variant / snapshot / analysis-id selectors unchanged.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    selectors = {"variant": variant, "snapshot_id": snapshot_id, "analysis_id": analysis_id}
    return get_pr_analysis(api_settings.index_path, repo=slug, pr_number=number, **selectors)
|
| 325 |
+
|
| 326 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
async def analysis_meta_bugs(
    owner: str,
    repo: str,
    request: Request,
    limit: int | None = None,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    # Delegate to list_analysis_meta_bugs. The requested limit goes through
    # _limit together with the cluster-list default and maximum settings.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    resolved_limit = _limit(
        limit,
        default=api_settings.cluster_list_limit_default,
        maximum=api_settings.cluster_list_limit_max,
    )
    return list_analysis_meta_bugs(
        api_settings.index_path,
        repo=slug,
        variant=variant,
        limit=resolved_limit,
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
|
| 350 |
+
|
| 351 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs/{cluster_id}")
async def analysis_meta_bug(
    owner: str,
    repo: str,
    cluster_id: str,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    # Delegate to get_analysis_meta_bug for the cluster id in the path,
    # forwarding the variant / snapshot / analysis-id selectors unchanged.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    selectors = {"variant": variant, "snapshot_id": snapshot_id, "analysis_id": analysis_id}
    return get_analysis_meta_bug(
        api_settings.index_path, repo=slug, cluster_id=cluster_id, **selectors
    )
|
| 371 |
+
|
| 372 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
async def analysis_duplicate_prs(
    owner: str,
    repo: str,
    request: Request,
    limit: int | None = None,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    # Delegate to list_analysis_duplicate_prs. The requested limit goes
    # through _limit together with the cluster-list default and maximum.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    resolved_limit = _limit(
        limit,
        default=api_settings.cluster_list_limit_default,
        maximum=api_settings.cluster_list_limit_max,
    )
    return list_analysis_duplicate_prs(
        api_settings.index_path,
        repo=slug,
        variant=variant,
        limit=resolved_limit,
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
|
| 396 |
+
|
| 397 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/best")
async def analysis_best(
    owner: str,
    repo: str,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    # Delegate to get_analysis_best, forwarding the variant / snapshot /
    # analysis-id selectors from the query string unchanged.
    api_settings = request.app.state.settings
    slug = _repo_slug(api_settings, owner, repo)
    selectors = {"variant": variant, "snapshot_id": snapshot_id, "analysis_id": analysis_id}
    return get_analysis_best(api_settings.index_path, repo=slug, **selectors)
|
| 415 |
+
|
| 416 |
@app.get("/v1/repos/{owner}/{repo}/issues/status")
|
| 417 |
async def issue_status(
|
| 418 |
owner: str,
|
|
|
|
| 539 |
) -> dict[str, Any]:
|
| 540 |
settings = request.app.state.settings
|
| 541 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 542 |
+
return get_contributor_status(
|
| 543 |
+
_surface_snapshot_dir(settings, repo_slug, surface="contributors")
|
| 544 |
+
)
|
| 545 |
|
| 546 |
@app.get("/v1/repos/{owner}/{repo}/contributors")
|
| 547 |
async def contributors(
|
|
|
|
| 708 |
if not snapshot_dir.exists():
|
| 709 |
return False
|
| 710 |
if surface == "issues":
|
| 711 |
+
return (snapshot_dir / CURRENT_ANALYSIS_MANIFEST_PATH).exists() or any(
|
| 712 |
+
snapshot_dir.glob("analysis-report*.json")
|
| 713 |
+
)
|
| 714 |
return (snapshot_dir / "new-contributors-report.json").exists()
|
| 715 |
|
| 716 |
|
|
|
|
| 737 |
message = str(exc).lower()
|
| 738 |
return (
|
| 739 |
"not found" in message
|
| 740 |
+
or "analysis report was not found" in message
|
| 741 |
+
or "no analysis report was found" in message
|
| 742 |
+
or "published analysis" in message
|
| 743 |
+
or "materialized snapshot" in message
|
| 744 |
or "no active pr search run" in message
|
| 745 |
or "was not found in the active indexed universe" in message
|
| 746 |
)
|
src/slop_farmer/app/publish_analysis.py
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from collections.abc import Callable, Iterable
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from datetime import UTC, datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Protocol, cast
|
| 9 |
+
|
| 10 |
+
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
|
| 11 |
+
|
| 12 |
+
from slop_farmer.app.save_cache import _save_analysis_cache_api
|
| 13 |
+
from slop_farmer.config import PublishAnalysisArtifactsOptions
|
| 14 |
+
from slop_farmer.data.parquet_io import read_json
|
| 15 |
+
from slop_farmer.data.snapshot_paths import (
|
| 16 |
+
ANALYSIS_REPORT_FILENAME_BY_VARIANT,
|
| 17 |
+
HYBRID_ANALYSIS_REVIEWS_FILENAME,
|
| 18 |
+
ROOT_MANIFEST_FILENAME,
|
| 19 |
+
analysis_run_artifact_path,
|
| 20 |
+
analysis_run_manifest_path,
|
| 21 |
+
archived_snapshot_manifest_path,
|
| 22 |
+
build_archived_analysis_run_manifest,
|
| 23 |
+
build_current_analysis_manifest,
|
| 24 |
+
current_analysis_artifact_path,
|
| 25 |
+
resolve_snapshot_dir_from_output,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class HubApiLike(Protocol):
    """Structural type for the Hub client methods this module calls.

    The public entry point passes ``HfApi()`` cast to this type; any object
    that offers these methods with compatible signatures satisfies it.
    """

    def create_repo(
        self, repo_id: str, *, repo_type: str, private: bool, exist_ok: bool
    ) -> None:
        """Create the repo ``repo_id``; every option is keyword-only."""

    def create_commit(
        self,
        repo_id: str,
        operations: Iterable[CommitOperationAdd],
        *,
        commit_message: str,
        repo_type: str,
    ) -> Any:
        """Apply ``operations`` to ``repo_id`` as one commit."""

    def upload_folder(
        self,
        *,
        repo_id: str,
        folder_path: Path,
        path_in_repo: str,
        repo_type: str,
        commit_message: str,
    ) -> None:
        """Upload ``folder_path`` into ``repo_id`` under ``path_in_repo``."""
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@dataclass(frozen=True, slots=True)
|
| 60 |
+
class PublishableAnalysisArtifacts:
|
| 61 |
+
repo: str
|
| 62 |
+
snapshot_id: str
|
| 63 |
+
model: str | None
|
| 64 |
+
report_path: Path
|
| 65 |
+
reviews_path: Path | None
|
| 66 |
+
report_payload: dict[str, Any]
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def run_publish_analysis_artifacts(options: PublishAnalysisArtifactsOptions) -> dict[str, Any]:
    """Publish analysis artifacts using the settings carried by ``options``.

    The snapshot directory is resolved from ``options.output_dir`` and
    ``options.snapshot_dir``; every other field is forwarded unchanged to
    ``publish_analysis_artifacts``, whose result is returned.
    """
    resolved_snapshot = resolve_snapshot_dir_from_output(
        options.output_dir,
        options.snapshot_dir,
    )
    publish_kwargs: dict[str, Any] = {
        "snapshot_dir": resolved_snapshot,
        "analysis_input": options.analysis_input,
        "hf_repo_id": options.hf_repo_id,
        "analysis_id": options.analysis_id,
        "canonical": options.canonical,
        "save_cache": options.save_cache,
        "private": options.private_hf_repo,
    }
    return publish_analysis_artifacts(**publish_kwargs)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def publish_analysis_artifacts(
    *,
    snapshot_dir: Path,
    analysis_input: Path | None,
    hf_repo_id: str,
    analysis_id: str,
    canonical: bool,
    private: bool,
    save_cache: bool = False,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Publish analysis artifacts with a freshly constructed ``HfApi`` client.

    Thin wrapper: all arguments are passed through to
    ``_publish_analysis_artifacts_api`` and its summary dict is returned.
    """
    # HfApi is only required to satisfy the structural HubApiLike interface.
    hub_client = cast("HubApiLike", HfApi())
    return _publish_analysis_artifacts_api(
        hub_client,
        snapshot_dir=snapshot_dir,
        analysis_input=analysis_input,
        hf_repo_id=hf_repo_id,
        analysis_id=analysis_id,
        canonical=canonical,
        private=private,
        save_cache=save_cache,
        log=log,
    )
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _publish_analysis_artifacts_api(
    api: HubApiLike,
    *,
    snapshot_dir: Path,
    analysis_input: Path | None = None,
    hf_repo_id: str,
    analysis_id: str,
    canonical: bool,
    private: bool,
    save_cache: bool = False,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Publish one hybrid analysis run to a Hub dataset repo in a single commit.

    Steps, in order: discover the local report, build the archived run
    manifest (plus the "current" manifest when ``canonical`` is true), merge
    the run into the snapshot manifest, ensure the repo exists, commit every
    file at once, and optionally upload the analysis cache.

    Args:
        api: Hub client used for repo creation, the commit and the cache upload.
        snapshot_dir: Local snapshot directory containing the manifest.
        analysis_input: Explicit report path; ``None`` selects the default
            hybrid report inside ``snapshot_dir``.
        hf_repo_id: Target dataset repo id.
        analysis_id: Identifier under which the run is archived.
        canonical: When true the run is published on the "canonical" channel
            and also written to the current-analysis location; otherwise the
            channel is "comparison".
        private: Passed to ``create_repo``.
        save_cache: When true the analysis cache is uploaded after the commit.
        log: Optional progress callback.

    Returns:
        Summary with ``repo``, ``dataset_id``, ``snapshot_id``, ``analysis_id``,
        ``canonical``, ``save_cache``, ``published_at`` and ``artifact_paths``,
        plus ``cache`` when a cache upload produced a result.
    """

    def emit(message: str) -> None:
        # Progress reporting is optional; stay silent without a callback.
        if log:
            log(message)

    found = _discover_publishable_analysis(snapshot_dir, analysis_input=analysis_input)
    timestamp = _iso_now()

    # Both manifest builders receive the same description of the run.
    manifest_fields: dict[str, Any] = {
        "repo": found.repo,
        "snapshot_id": found.snapshot_id,
        "analysis_id": analysis_id,
        "variant": "hybrid",
        "channel": "canonical" if canonical else "comparison",
        "model": found.model,
        "published_at": timestamp,
        "include_hybrid_reviews": found.reviews_path is not None,
    }
    run_manifest = build_archived_analysis_run_manifest(**manifest_fields)
    pointer_manifest = None
    if canonical:
        pointer_manifest = build_current_analysis_manifest(**manifest_fields)

    merged_snapshot_manifest = _updated_snapshot_manifest(
        snapshot_dir=snapshot_dir,
        hf_repo_id=hf_repo_id,
        snapshot_id=found.snapshot_id,
        analysis_id=analysis_id,
        archived_manifest=run_manifest,
        canonical=canonical,
    )
    operations = _commit_operations(
        artifacts=found,
        analysis_id=analysis_id,
        archived_manifest=run_manifest,
        current_manifest=pointer_manifest,
        snapshot_manifest=merged_snapshot_manifest,
    )

    emit(f"Ensuring Hub dataset repo exists: {hf_repo_id}")
    api.create_repo(hf_repo_id, repo_type="dataset", private=private, exist_ok=True)
    emit(f"Publishing analysis {analysis_id} for snapshot {found.snapshot_id}")
    api.create_commit(
        hf_repo_id,
        operations,
        commit_message=f"Publish analysis {analysis_id} for snapshot {found.snapshot_id}",
        repo_type="dataset",
    )

    # The cache upload is a separate step performed after the artifact commit.
    cache_result = None
    if save_cache:
        cache_result = _save_analysis_cache_api(
            api,
            snapshot_dir=snapshot_dir,
            hf_repo_id=hf_repo_id,
            private=private,
            log=log,
        )

    summary: dict[str, Any] = {
        "repo": found.repo,
        "dataset_id": hf_repo_id,
        "snapshot_id": found.snapshot_id,
        "analysis_id": analysis_id,
        "canonical": canonical,
        "save_cache": save_cache,
        "published_at": timestamp,
        "artifact_paths": [operation.path_in_repo for operation in operations],
    }
    if cache_result is not None:
        summary["cache"] = cache_result
    emit(f"Published analysis artifacts to {hf_repo_id}")
    return summary
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _discover_publishable_analysis(
    snapshot_dir: Path, *, analysis_input: Path | None
) -> PublishableAnalysisArtifacts:
    """Locate and validate the hybrid analysis report for a local snapshot.

    Args:
        snapshot_dir: Directory holding the snapshot manifest.
        analysis_input: Explicit report path (resolved to an absolute path);
            ``None`` selects the default hybrid report in ``snapshot_dir``.

    Returns:
        The report location, its parsed payload, and the optional reviews file
        that sits next to the report as ``<report stem>.llm-reviews.json``.

    Raises:
        FileNotFoundError: The manifest or the report does not exist.
        ValueError: The manifest or report is not a JSON object, the manifest
            defines no repo, or the report's ``snapshot_id``/``repo`` disagrees
            with the manifest.
    """
    manifest_path = snapshot_dir / ROOT_MANIFEST_FILENAME
    if not manifest_path.exists():
        raise FileNotFoundError(f"Snapshot manifest is missing: {manifest_path}")
    manifest = read_json(manifest_path)
    if not isinstance(manifest, dict):
        raise ValueError(f"Snapshot manifest at {manifest_path} must contain a JSON object.")

    # The directory name is the fallback snapshot id; the repo has no fallback.
    snapshot_id = str(manifest.get("snapshot_id") or snapshot_dir.name).strip()
    repo = str(manifest.get("repo") or "").strip()
    if not repo:
        raise ValueError(f"Snapshot manifest at {manifest_path} does not define repo.")

    if analysis_input is None:
        report_path = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]
    else:
        report_path = analysis_input.resolve()
    if not report_path.exists():
        raise FileNotFoundError(f"Hybrid analysis report is missing: {report_path}")
    report_payload = read_json(report_path)
    if not isinstance(report_payload, dict):
        raise ValueError(f"Hybrid analysis report at {report_path} must contain a JSON object.")

    # A report that omits snapshot_id or repo is taken to match the manifest.
    report_snapshot_id = str(report_payload.get("snapshot_id") or snapshot_id).strip()
    if report_snapshot_id != snapshot_id:
        raise ValueError(
            f"Hybrid analysis report snapshot_id {report_snapshot_id!r} does not match manifest snapshot_id {snapshot_id!r}."
        )
    report_repo = str(report_payload.get("repo") or repo).strip()
    if report_repo != repo:
        raise ValueError(
            f"Hybrid analysis report repo {report_repo!r} does not match manifest repo {repo!r}."
        )

    raw_model = report_payload.get("model")
    model = None if raw_model is None else str(raw_model)

    reviews_candidate = report_path.with_name(f"{report_path.stem}.llm-reviews.json")
    reviews_path = reviews_candidate if reviews_candidate.exists() else None
    return PublishableAnalysisArtifacts(
        repo=repo,
        snapshot_id=snapshot_id,
        model=model,
        report_path=report_path,
        reviews_path=reviews_path,
        report_payload={str(key): value for key, value in report_payload.items()},
    )
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def _updated_snapshot_manifest(
    *,
    snapshot_dir: Path,
    hf_repo_id: str,
    snapshot_id: str,
    analysis_id: str,
    archived_manifest: dict[str, Any],
    canonical: bool,
) -> dict[str, Any]:
    """Return the snapshot manifest with this analysis run registered in it.

    The manifest already stored in the Hub repo is the preferred base, so runs
    recorded there are carried over; the local manifest is used when no remote
    one is obtained. The run is stored under
    ``published_analysis["runs"][analysis_id]`` and, when ``canonical`` is
    true, ``published_analysis["canonical_analysis_id"]`` is set to it.

    Raises:
        ValueError: The base manifest is not a JSON object.
    """
    base = _load_remote_snapshot_manifest(hf_repo_id, snapshot_id)
    if not base:
        base = read_json(snapshot_dir / ROOT_MANIFEST_FILENAME)
    if not isinstance(base, dict):
        raise ValueError("Archived snapshot manifest must contain a JSON object.")

    updated = {str(key): value for key, value in base.items()}

    # Start from a clean section whenever the stored one has an unexpected shape.
    section: dict[str, Any] | Any = updated.get("published_analysis")
    if not isinstance(section, dict):
        section = {"schema_version": 1, "runs": {}}
    runs: dict[str, Any] | Any = section.get("runs")
    if not isinstance(runs, dict):
        runs = {}

    run_entry = {
        "analysis_id": analysis_id,
        "variant": archived_manifest["variant"],
        "channel": archived_manifest["channel"],
        "model": archived_manifest.get("model"),
        "published_at": archived_manifest["published_at"],
        "manifest_path": analysis_run_manifest_path(snapshot_id, analysis_id),
        "artifacts": archived_manifest["artifacts"],
    }
    runs[analysis_id] = run_entry

    section["schema_version"] = 1
    section["runs"] = runs
    if canonical:
        section["canonical_analysis_id"] = analysis_id
    updated["published_analysis"] = section
    return updated
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def _load_remote_snapshot_manifest(hf_repo_id: str, snapshot_id: str) -> dict[str, Any] | None:
    """Download the archived manifest for ``snapshot_id`` from the Hub dataset repo.

    Args:
        hf_repo_id: Dataset repo id to read from.
        snapshot_id: Snapshot whose archived manifest is requested.

    Returns:
        The parsed manifest, or ``None`` when the Hub reports that the repo,
        revision or manifest file does not exist, or when the file does not
        contain a JSON object.

    Raises:
        Exception: Any download failure other than "not found" (for example a
            network outage or server error) propagates. The caller overwrites
            the remote manifest with whatever base it ends up using, so
            treating a transient failure as "no manifest yet" could discard
            runs that were already published.
    """
    # Function-scope import keeps this change independent of the module's
    # top-level import block.
    from huggingface_hub.errors import (
        EntryNotFoundError,
        LocalEntryNotFoundError,
        RepositoryNotFoundError,
        RevisionNotFoundError,
    )

    try:
        downloaded = hf_hub_download(
            repo_id=hf_repo_id,
            repo_type="dataset",
            filename=archived_snapshot_manifest_path(snapshot_id),
        )
    except LocalEntryNotFoundError:
        # Raised when the Hub could not be consulted and nothing is cached:
        # that is an outage, not evidence that the manifest is absent.
        raise
    except (EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError):
        # Nothing has been published for this snapshot (or repo) yet.
        return None
    payload = json.loads(Path(downloaded).read_text(encoding="utf-8"))
    return payload if isinstance(payload, dict) else None
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _commit_operations(
    *,
    artifacts: PublishableAnalysisArtifacts,
    analysis_id: str,
    archived_manifest: dict[str, Any],
    current_manifest: dict[str, Any] | None,
    snapshot_manifest: dict[str, Any],
) -> list[CommitOperationAdd]:
    """Build the file additions that make up one publish commit.

    Resulting order: archived report, archived run manifest, snapshot
    manifest, archived reviews (when a reviews file exists), then, only when
    ``current_manifest`` is given, the current report, the current manifest
    and the current reviews (when a reviews file exists).
    """
    report_name = ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]
    snapshot_id = artifacts.snapshot_id
    reviews_file = artifacts.reviews_path

    def add(path_in_repo: str, content: Any) -> CommitOperationAdd:
        return CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=content)

    # Archived copies always go under the run-specific location.
    operations = [
        add(
            analysis_run_artifact_path(snapshot_id, analysis_id, report_name),
            artifacts.report_path,
        ),
        add(
            analysis_run_manifest_path(snapshot_id, analysis_id),
            _json_bytes(archived_manifest),
        ),
        add(
            archived_snapshot_manifest_path(snapshot_id),
            _json_bytes(snapshot_manifest),
        ),
    ]
    if reviews_file is not None:
        operations.append(
            add(
                analysis_run_artifact_path(
                    snapshot_id,
                    analysis_id,
                    HYBRID_ANALYSIS_REVIEWS_FILENAME,
                ),
                reviews_file,
            )
        )

    if current_manifest is None:
        return operations

    # A current manifest means the run also replaces the "current" analysis.
    operations.append(add(current_analysis_artifact_path(report_name), artifacts.report_path))
    operations.append(
        add(
            current_analysis_artifact_path(ROOT_MANIFEST_FILENAME),
            _json_bytes(current_manifest),
        )
    )
    if reviews_file is not None:
        operations.append(
            add(
                current_analysis_artifact_path(HYBRID_ANALYSIS_REVIEWS_FILENAME),
                reviews_file,
            )
        )
    return operations
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def _json_bytes(payload: dict[str, Any]) -> bytes:
    """Serialize ``payload`` as UTF-8 JSON bytes.

    Output uses a two-space indent, sorted keys and a single trailing newline.
    """
    text = json.dumps(payload, sort_keys=True, indent=2)
    return f"{text}\n".encode("utf-8")
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def _iso_now() -> str:
|
| 366 |
+
return datetime.now(tz=UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
src/slop_farmer/app/publish_dataset_snapshot.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from collections.abc import Callable
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Protocol, cast
|
| 6 |
+
|
| 7 |
+
from huggingface_hub import HfApi
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class HubApiLike(Protocol):
    """Structural type for the Hub client methods used to upload a snapshot.

    The public entry point passes ``HfApi()`` cast to this type; any object
    that offers these methods with compatible signatures satisfies it.
    """

    def create_repo(
        self,
        repo_id: str,
        *,
        repo_type: str,
        private: bool,
        exist_ok: bool,
    ) -> None:
        """Create the repo ``repo_id``; every option is keyword-only."""

    def upload_folder(
        self,
        *,
        repo_id: str,
        folder_path: Path,
        path_in_repo: str,
        repo_type: str,
        commit_message: str,
    ) -> None:
        """Upload ``folder_path`` into ``repo_id`` under ``path_in_repo``."""
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def publish_dataset_snapshot(
    snapshot_dir: Path,
    hf_repo_id: str,
    *,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> None:
    """Upload ``snapshot_dir`` to the Hub dataset repo ``hf_repo_id``.

    Thin wrapper that builds an ``HfApi`` client and delegates to
    ``_publish_dataset_snapshot_api``.
    """
    # HfApi is only required to satisfy the structural HubApiLike interface.
    hub_client = cast("HubApiLike", HfApi())
    _publish_dataset_snapshot_api(hub_client, snapshot_dir, hf_repo_id, private, log=log)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _publish_dataset_snapshot_api(
    api: HubApiLike,
    snapshot_dir: Path,
    hf_repo_id: str,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> None:
    """Ensure the dataset repo exists, then upload ``snapshot_dir`` to its root.

    Args:
        api: Hub client used for repo creation and the folder upload.
        snapshot_dir: Local folder to upload; its name appears in the commit
            message.
        hf_repo_id: Target dataset repo id.
        private: Passed to ``create_repo``.
        log: Optional progress callback.
    """

    def emit(message: str) -> None:
        # Progress reporting is optional; stay silent without a callback.
        if log:
            log(message)

    emit(f"Ensuring Hub dataset repo exists: {hf_repo_id}")
    api.create_repo(hf_repo_id, repo_type="dataset", private=private, exist_ok=True)

    emit(f"Uploading snapshot to Hub: {snapshot_dir}")
    upload_kwargs = {
        "repo_id": hf_repo_id,
        "folder_path": snapshot_dir,
        "path_in_repo": ".",
        "repo_type": "dataset",
        "commit_message": f"Add snapshot {snapshot_dir.name}",
    }
    api.upload_folder(**upload_kwargs)

    emit(f"Upload finished: {hf_repo_id}")
|
src/slop_farmer/app/save_cache.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from collections.abc import Callable
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Protocol, cast
|
| 6 |
+
|
| 7 |
+
from huggingface_hub import HfApi
|
| 8 |
+
|
| 9 |
+
from slop_farmer.config import SaveCacheOptions
|
| 10 |
+
from slop_farmer.data.parquet_io import read_json
|
| 11 |
+
from slop_farmer.data.snapshot_paths import ROOT_MANIFEST_FILENAME, resolve_snapshot_dir_from_output
|
| 12 |
+
|
| 13 |
+
ANALYSIS_STATE_DIRNAME = "analysis-state"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class HubApiLike(Protocol):
    """Structural type for the Hub client methods used to save the cache.

    The public entry point passes ``HfApi()`` cast to this type; any object
    that offers these methods with compatible signatures satisfies it.
    """

    def create_repo(
        self, repo_id: str, *, repo_type: str, private: bool, exist_ok: bool
    ) -> None:
        """Create the repo ``repo_id``; every option is keyword-only."""

    def upload_folder(
        self,
        *,
        repo_id: str,
        folder_path: Path,
        path_in_repo: str,
        repo_type: str,
        commit_message: str,
    ) -> None:
        """Upload ``folder_path`` into ``repo_id`` under ``path_in_repo``."""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def run_save_cache(options: SaveCacheOptions) -> dict[str, Any]:
    """Save the analysis cache using the settings carried by ``options``.

    The snapshot directory is resolved from ``options.output_dir`` and
    ``options.snapshot_dir``; the result of ``save_analysis_cache`` is returned.
    """
    resolved_snapshot = resolve_snapshot_dir_from_output(
        options.output_dir,
        options.snapshot_dir,
    )
    return save_analysis_cache(
        snapshot_dir=resolved_snapshot,
        hf_repo_id=options.hf_repo_id,
        private=options.private_hf_repo,
    )
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def save_analysis_cache(
    *,
    snapshot_dir: Path,
    hf_repo_id: str,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Upload a snapshot's analysis cache with a freshly constructed ``HfApi``.

    Thin wrapper: all arguments are passed through to
    ``_save_analysis_cache_api`` and its summary dict is returned.
    """
    # HfApi is only required to satisfy the structural HubApiLike interface.
    hub_client = cast("HubApiLike", HfApi())
    return _save_analysis_cache_api(
        hub_client,
        snapshot_dir=snapshot_dir,
        hf_repo_id=hf_repo_id,
        private=private,
        log=log,
    )
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _save_analysis_cache_api(
    api: HubApiLike,
    *,
    snapshot_dir: Path,
    hf_repo_id: str,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Upload the snapshot's analysis cache directory to a Hub dataset repo.

    Args:
        api: Hub client used for repo creation and the folder upload.
        snapshot_dir: Snapshot directory that contains the cache directory.
        hf_repo_id: Target dataset repo id.
        private: Passed to ``create_repo``.
        log: Optional progress callback.

    Returns:
        Summary with ``dataset_id``, ``snapshot_id`` and ``artifact_paths``
        (repo-relative paths of the cached files), plus ``repo`` when the
        snapshot manifest names one.

    Raises:
        FileNotFoundError: The cache directory does not exist.
        NotADirectoryError: The cache path exists but is not a directory.
        ValueError: The cache directory holds no files, or the snapshot
            manifest is not a JSON object.
    """

    def emit(message: str) -> None:
        # Progress reporting is optional; stay silent without a callback.
        if log:
            log(message)

    cache_dir = snapshot_dir / ANALYSIS_STATE_DIRNAME
    if not cache_dir.exists():
        raise FileNotFoundError(f"Analysis cache directory is missing: {cache_dir}")
    if not cache_dir.is_dir():
        raise NotADirectoryError(f"Analysis cache path is not a directory: {cache_dir}")
    cached_files = _cache_artifact_paths(cache_dir)
    if not cached_files:
        raise ValueError(f"Analysis cache directory is empty: {cache_dir}")

    # The manifest is optional here; without one the directory name is the id.
    manifest_path = snapshot_dir / ROOT_MANIFEST_FILENAME
    manifest = {}
    if manifest_path.exists():
        manifest = read_json(manifest_path)
    if not isinstance(manifest, dict):
        raise ValueError(f"Snapshot manifest at {manifest_path} must contain a JSON object.")
    snapshot_id = str(manifest.get("snapshot_id") or snapshot_dir.name).strip()
    repo = str(manifest.get("repo") or "").strip()

    emit(f"Ensuring Hub dataset repo exists: {hf_repo_id}")
    api.create_repo(hf_repo_id, repo_type="dataset", private=private, exist_ok=True)
    emit(f"Saving analysis cache for snapshot {snapshot_id}")
    api.upload_folder(
        repo_id=hf_repo_id,
        folder_path=cache_dir,
        path_in_repo=ANALYSIS_STATE_DIRNAME,
        repo_type="dataset",
        commit_message=f"Save analysis cache for snapshot {snapshot_id}",
    )

    summary = {
        "dataset_id": hf_repo_id,
        "snapshot_id": snapshot_id,
        "artifact_paths": [f"{ANALYSIS_STATE_DIRNAME}/{path}" for path in cached_files],
    }
    if repo:
        summary["repo"] = repo
    emit(f"Saved analysis cache to {hf_repo_id}")
    return summary
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _cache_artifact_paths(cache_dir: Path) -> list[str]:
    """Return the sorted POSIX-style relative paths of all files under ``cache_dir``.

    The search is recursive; directories themselves are not listed.
    """
    relative_files = [
        entry.relative_to(cache_dir).as_posix()
        for entry in cache_dir.rglob("*")
        if entry.is_file()
    ]
    relative_files.sort()
    return relative_files
|
src/slop_farmer/app_config.py
CHANGED
|
@@ -109,7 +109,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 109 |
dashboard = payload.get("dashboard")
|
| 110 |
analysis = payload.get("analysis")
|
| 111 |
scrape = payload.get("scrape")
|
| 112 |
-
full_pipeline = payload.get("full-pipeline")
|
| 113 |
pull_requests = payload.get("pull-requests")
|
| 114 |
if dashboard is None:
|
| 115 |
dashboard = {}
|
|
@@ -117,8 +116,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 117 |
analysis = {}
|
| 118 |
if scrape is None:
|
| 119 |
scrape = {}
|
| 120 |
-
if full_pipeline is None:
|
| 121 |
-
full_pipeline = {}
|
| 122 |
if pull_requests is None:
|
| 123 |
pull_requests = {}
|
| 124 |
if not isinstance(dashboard, dict):
|
|
@@ -127,8 +124,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 127 |
raise ValueError(f"Expected analysis mapping in config file: {config_path}")
|
| 128 |
if not isinstance(scrape, dict):
|
| 129 |
raise ValueError(f"Expected scrape mapping in config file: {config_path}")
|
| 130 |
-
if not isinstance(full_pipeline, dict):
|
| 131 |
-
raise ValueError(f"Expected full-pipeline mapping in config file: {config_path}")
|
| 132 |
if not isinstance(pull_requests, dict):
|
| 133 |
raise ValueError(f"Expected pull-requests mapping in config file: {config_path}")
|
| 134 |
|
|
@@ -184,12 +179,26 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 184 |
"new-contributor-window-days": contributor_window_days,
|
| 185 |
"new-contributor-max-authors": contributor_max_authors,
|
| 186 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
"analyze": {
|
| 188 |
"output-dir": str(data_dir) if data_dir else None,
|
| 189 |
"hf-repo-id": analysis.get("hf-repo-id", dataset_id),
|
| 190 |
"model": analysis.get("model"),
|
| 191 |
"ranking-backend": analysis.get("ranking_backend"),
|
| 192 |
"max-clusters": analysis.get("max_clusters"),
|
|
|
|
| 193 |
"cached_analysis": analysis.get("cached_analysis"),
|
| 194 |
"open-prs-only": analysis.get("open_prs_only"),
|
| 195 |
"pr-template-cleanup-mode": pr_template_cleanup_mode,
|
|
@@ -201,6 +210,7 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 201 |
},
|
| 202 |
"pr-scope": {
|
| 203 |
"output-dir": str(data_dir) if data_dir else None,
|
|
|
|
| 204 |
"cluster-suppression-rules": cluster_suppression_rules,
|
| 205 |
},
|
| 206 |
"pr-search": {
|
|
@@ -210,32 +220,28 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 210 |
},
|
| 211 |
"new-contributor-report": {
|
| 212 |
"output-dir": str(data_dir) if data_dir else None,
|
|
|
|
| 213 |
"window-days": contributor_window_days,
|
| 214 |
"max-authors": contributor_max_authors,
|
| 215 |
},
|
| 216 |
"dashboard-data": {
|
| 217 |
"output-dir": str(dashboard_dir) if dashboard_dir else None,
|
| 218 |
"snapshot-root": str(data_dir / "snapshots") if data_dir else None,
|
|
|
|
| 219 |
"window-days": dashboard_window_days,
|
| 220 |
},
|
| 221 |
-
"publish-
|
| 222 |
"output-dir": str(data_dir) if data_dir else None,
|
| 223 |
"hf-repo-id": dataset_id,
|
| 224 |
},
|
| 225 |
-
"
|
| 226 |
-
"
|
| 227 |
-
"
|
| 228 |
-
"workspace-root": str(workspace_path.parent) if workspace_path else None,
|
| 229 |
-
"model": analysis.get("model"),
|
| 230 |
-
"ranking-backend": analysis.get("ranking_backend"),
|
| 231 |
-
"max-clusters": analysis.get("max_clusters"),
|
| 232 |
-
"dashboard-window-days": dashboard_window_days,
|
| 233 |
-
"new-contributor-window-days": contributor_window_days,
|
| 234 |
-
"new-contributor-max-authors": contributor_max_authors,
|
| 235 |
},
|
| 236 |
"deploy-dashboard": {
|
| 237 |
"pipeline-data-dir": str(data_dir) if data_dir else None,
|
| 238 |
"web-dir": str(web_dir) if web_dir else None,
|
|
|
|
| 239 |
"dashboard-window-days": dashboard_window_days,
|
| 240 |
"contributor-window-days": contributor_window_days,
|
| 241 |
"contributor-max-authors": contributor_max_authors,
|
|
@@ -248,6 +254,11 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 248 |
"dataset-id": dataset_id,
|
| 249 |
"space-tags": tags_value,
|
| 250 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
}
|
| 252 |
for command, values in defaults.items():
|
| 253 |
defaults[command] = {key: value for key, value in values.items() if value is not None}
|
|
@@ -259,8 +270,8 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 259 |
defaults[command].update(_resolve_command_paths(config_path, values))
|
| 260 |
|
| 261 |
defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
|
|
|
|
| 262 |
defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
|
| 263 |
-
defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
|
| 264 |
return defaults
|
| 265 |
|
| 266 |
|
|
|
|
| 109 |
dashboard = payload.get("dashboard")
|
| 110 |
analysis = payload.get("analysis")
|
| 111 |
scrape = payload.get("scrape")
|
|
|
|
| 112 |
pull_requests = payload.get("pull-requests")
|
| 113 |
if dashboard is None:
|
| 114 |
dashboard = {}
|
|
|
|
| 116 |
analysis = {}
|
| 117 |
if scrape is None:
|
| 118 |
scrape = {}
|
|
|
|
|
|
|
| 119 |
if pull_requests is None:
|
| 120 |
pull_requests = {}
|
| 121 |
if not isinstance(dashboard, dict):
|
|
|
|
| 124 |
raise ValueError(f"Expected analysis mapping in config file: {config_path}")
|
| 125 |
if not isinstance(scrape, dict):
|
| 126 |
raise ValueError(f"Expected scrape mapping in config file: {config_path}")
|
|
|
|
|
|
|
| 127 |
if not isinstance(pull_requests, dict):
|
| 128 |
raise ValueError(f"Expected pull-requests mapping in config file: {config_path}")
|
| 129 |
|
|
|
|
| 179 |
"new-contributor-window-days": contributor_window_days,
|
| 180 |
"new-contributor-max-authors": contributor_max_authors,
|
| 181 |
},
|
| 182 |
+
"refresh-dataset": {
|
| 183 |
+
"repo": repo,
|
| 184 |
+
"hf-repo-id": dataset_id,
|
| 185 |
+
"fetch-timeline": scrape.get("fetch-timeline"),
|
| 186 |
+
"max-issues": scrape.get("max-issues"),
|
| 187 |
+
"max-prs": scrape.get("max-prs"),
|
| 188 |
+
"max-issue-comments": scrape.get("max-issue-comments"),
|
| 189 |
+
"max-reviews-per-pr": scrape.get("max-reviews-per-pr"),
|
| 190 |
+
"max-review-comments-per-pr": scrape.get("max-review-comments-per-pr"),
|
| 191 |
+
"new-contributor-window-days": contributor_window_days,
|
| 192 |
+
"new-contributor-max-authors": contributor_max_authors,
|
| 193 |
+
"cluster-suppression-rules": cluster_suppression_rules,
|
| 194 |
+
},
|
| 195 |
"analyze": {
|
| 196 |
"output-dir": str(data_dir) if data_dir else None,
|
| 197 |
"hf-repo-id": analysis.get("hf-repo-id", dataset_id),
|
| 198 |
"model": analysis.get("model"),
|
| 199 |
"ranking-backend": analysis.get("ranking_backend"),
|
| 200 |
"max-clusters": analysis.get("max_clusters"),
|
| 201 |
+
"hybrid-llm-concurrency": analysis.get("hybrid_llm_concurrency"),
|
| 202 |
"cached_analysis": analysis.get("cached_analysis"),
|
| 203 |
"open-prs-only": analysis.get("open_prs_only"),
|
| 204 |
"pr-template-cleanup-mode": pr_template_cleanup_mode,
|
|
|
|
| 210 |
},
|
| 211 |
"pr-scope": {
|
| 212 |
"output-dir": str(data_dir) if data_dir else None,
|
| 213 |
+
"hf-repo-id": dataset_id,
|
| 214 |
"cluster-suppression-rules": cluster_suppression_rules,
|
| 215 |
},
|
| 216 |
"pr-search": {
|
|
|
|
| 220 |
},
|
| 221 |
"new-contributor-report": {
|
| 222 |
"output-dir": str(data_dir) if data_dir else None,
|
| 223 |
+
"hf-repo-id": dataset_id,
|
| 224 |
"window-days": contributor_window_days,
|
| 225 |
"max-authors": contributor_max_authors,
|
| 226 |
},
|
| 227 |
"dashboard-data": {
|
| 228 |
"output-dir": str(dashboard_dir) if dashboard_dir else None,
|
| 229 |
"snapshot-root": str(data_dir / "snapshots") if data_dir else None,
|
| 230 |
+
"hf-repo-id": dataset_id,
|
| 231 |
"window-days": dashboard_window_days,
|
| 232 |
},
|
| 233 |
+
"publish-analysis-artifacts": {
|
| 234 |
"output-dir": str(data_dir) if data_dir else None,
|
| 235 |
"hf-repo-id": dataset_id,
|
| 236 |
},
|
| 237 |
+
"save-cache": {
|
| 238 |
+
"output-dir": str(data_dir) if data_dir else None,
|
| 239 |
+
"hf-repo-id": dataset_id,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
},
|
| 241 |
"deploy-dashboard": {
|
| 242 |
"pipeline-data-dir": str(data_dir) if data_dir else None,
|
| 243 |
"web-dir": str(web_dir) if web_dir else None,
|
| 244 |
+
"hf-repo-id": dataset_id,
|
| 245 |
"dashboard-window-days": dashboard_window_days,
|
| 246 |
"contributor-window-days": contributor_window_days,
|
| 247 |
"contributor-max-authors": contributor_max_authors,
|
|
|
|
| 254 |
"dataset-id": dataset_id,
|
| 255 |
"space-tags": tags_value,
|
| 256 |
},
|
| 257 |
+
"dataset-status": {
|
| 258 |
+
"repo": repo,
|
| 259 |
+
"output-dir": str(data_dir) if data_dir else None,
|
| 260 |
+
"hf-repo-id": dataset_id,
|
| 261 |
+
},
|
| 262 |
}
|
| 263 |
for command, values in defaults.items():
|
| 264 |
defaults[command] = {key: value for key, value in values.items() if value is not None}
|
|
|
|
| 270 |
defaults[command].update(_resolve_command_paths(config_path, values))
|
| 271 |
|
| 272 |
defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
|
| 273 |
+
defaults["refresh-dataset"].update(_resolve_command_paths(config_path, scrape))
|
| 274 |
defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
|
|
|
|
| 275 |
return defaults
|
| 276 |
|
| 277 |
|
src/slop_farmer/config.py
CHANGED
|
@@ -81,9 +81,6 @@ class PipelineOptions:
|
|
| 81 |
max_reviews_per_pr: int | None
|
| 82 |
max_review_comments_per_pr: int | None
|
| 83 |
fetch_timeline: bool
|
| 84 |
-
publish: bool
|
| 85 |
-
hf_repo_id: str | None
|
| 86 |
-
private_hf_repo: bool
|
| 87 |
new_contributor_report: bool
|
| 88 |
new_contributor_window_days: int
|
| 89 |
new_contributor_max_authors: int
|
|
@@ -102,6 +99,7 @@ class AnalysisOptions:
|
|
| 102 |
ranking_backend: str
|
| 103 |
model: str
|
| 104 |
max_clusters: int
|
|
|
|
| 105 |
open_prs_only: bool = False
|
| 106 |
cached_analysis: bool = False
|
| 107 |
pr_template_cleanup_mode: str = "merge_defaults"
|
|
@@ -111,6 +109,10 @@ class AnalysisOptions:
|
|
| 111 |
pr_template_line_patterns: tuple[str, ...] = ()
|
| 112 |
cluster_suppression_rules: tuple[dict[str, Any], ...] = ()
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
@dataclass(slots=True)
|
| 116 |
class MarkdownReportOptions:
|
|
@@ -127,6 +129,9 @@ class NewContributorReportOptions:
|
|
| 127 |
json_output: Path | None
|
| 128 |
window_days: int
|
| 129 |
max_authors: int
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
|
| 132 |
@dataclass(slots=True)
|
|
@@ -137,17 +142,12 @@ class DashboardDataOptions:
|
|
| 137 |
contributors_input: Path | None
|
| 138 |
pr_scope_input: Path | None
|
| 139 |
window_days: int
|
|
|
|
|
|
|
|
|
|
| 140 |
snapshot_root: Path | None = None
|
| 141 |
|
| 142 |
|
| 143 |
-
@dataclass(slots=True)
|
| 144 |
-
class PublishSnapshotOptions:
|
| 145 |
-
output_dir: Path
|
| 146 |
-
snapshot_dir: Path | None
|
| 147 |
-
hf_repo_id: str
|
| 148 |
-
private_hf_repo: bool
|
| 149 |
-
|
| 150 |
-
|
| 151 |
@dataclass(slots=True)
|
| 152 |
class DeployDashboardOptions:
|
| 153 |
pipeline_data_dir: Path
|
|
@@ -155,6 +155,10 @@ class DeployDashboardOptions:
|
|
| 155 |
snapshot_dir: Path | None
|
| 156 |
analysis_input: Path | None
|
| 157 |
contributors_input: Path | None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
refresh_contributors: bool
|
| 159 |
dashboard_window_days: int
|
| 160 |
contributor_window_days: int
|
|
@@ -216,20 +220,50 @@ class SnapshotAdoptOptions:
|
|
| 216 |
|
| 217 |
|
| 218 |
@dataclass(slots=True)
|
| 219 |
-
class
|
| 220 |
repo: RepoRef
|
| 221 |
-
|
| 222 |
-
model: str
|
| 223 |
-
workspace_root: Path
|
| 224 |
private_hf_repo: bool
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
| 227 |
fetch_timeline: bool
|
| 228 |
-
|
| 229 |
new_contributor_window_days: int
|
| 230 |
new_contributor_max_authors: int
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
max_reviews_per_pr: int | None
|
| 82 |
max_review_comments_per_pr: int | None
|
| 83 |
fetch_timeline: bool
|
|
|
|
|
|
|
|
|
|
| 84 |
new_contributor_report: bool
|
| 85 |
new_contributor_window_days: int
|
| 86 |
new_contributor_max_authors: int
|
|
|
|
| 99 |
ranking_backend: str
|
| 100 |
model: str
|
| 101 |
max_clusters: int
|
| 102 |
+
hybrid_llm_concurrency: int = 1
|
| 103 |
open_prs_only: bool = False
|
| 104 |
cached_analysis: bool = False
|
| 105 |
pr_template_cleanup_mode: str = "merge_defaults"
|
|
|
|
| 109 |
pr_template_line_patterns: tuple[str, ...] = ()
|
| 110 |
cluster_suppression_rules: tuple[dict[str, Any], ...] = ()
|
| 111 |
|
| 112 |
+
def __post_init__(self) -> None:
|
| 113 |
+
if self.hybrid_llm_concurrency < 1:
|
| 114 |
+
raise ValueError("hybrid_llm_concurrency must be >= 1")
|
| 115 |
+
|
| 116 |
|
| 117 |
@dataclass(slots=True)
|
| 118 |
class MarkdownReportOptions:
|
|
|
|
| 129 |
json_output: Path | None
|
| 130 |
window_days: int
|
| 131 |
max_authors: int
|
| 132 |
+
hf_repo_id: str | None = None
|
| 133 |
+
hf_revision: str | None = None
|
| 134 |
+
hf_materialize_dir: Path | None = None
|
| 135 |
|
| 136 |
|
| 137 |
@dataclass(slots=True)
|
|
|
|
| 142 |
contributors_input: Path | None
|
| 143 |
pr_scope_input: Path | None
|
| 144 |
window_days: int
|
| 145 |
+
hf_repo_id: str | None = None
|
| 146 |
+
hf_revision: str | None = None
|
| 147 |
+
hf_materialize_dir: Path | None = None
|
| 148 |
snapshot_root: Path | None = None
|
| 149 |
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
@dataclass(slots=True)
|
| 152 |
class DeployDashboardOptions:
|
| 153 |
pipeline_data_dir: Path
|
|
|
|
| 155 |
snapshot_dir: Path | None
|
| 156 |
analysis_input: Path | None
|
| 157 |
contributors_input: Path | None
|
| 158 |
+
pr_scope_input: Path | None
|
| 159 |
+
hf_repo_id: str | None
|
| 160 |
+
hf_revision: str | None
|
| 161 |
+
hf_materialize_dir: Path | None
|
| 162 |
refresh_contributors: bool
|
| 163 |
dashboard_window_days: int
|
| 164 |
contributor_window_days: int
|
|
|
|
| 220 |
|
| 221 |
|
| 222 |
@dataclass(slots=True)
|
| 223 |
+
class DatasetRefreshOptions:
|
| 224 |
repo: RepoRef
|
| 225 |
+
hf_repo_id: str
|
|
|
|
|
|
|
| 226 |
private_hf_repo: bool
|
| 227 |
+
max_issues: int | None
|
| 228 |
+
max_prs: int | None
|
| 229 |
+
max_issue_comments: int | None
|
| 230 |
+
max_reviews_per_pr: int | None
|
| 231 |
+
max_review_comments_per_pr: int | None
|
| 232 |
fetch_timeline: bool
|
| 233 |
+
new_contributor_report: bool
|
| 234 |
new_contributor_window_days: int
|
| 235 |
new_contributor_max_authors: int
|
| 236 |
+
http_timeout: int
|
| 237 |
+
http_max_retries: int
|
| 238 |
+
checkpoint_every_comments: int
|
| 239 |
+
checkpoint_every_prs: int
|
| 240 |
+
cluster_suppression_rules: tuple[dict[str, Any], ...] = ()
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@dataclass(slots=True)
|
| 244 |
+
class PublishAnalysisArtifactsOptions:
|
| 245 |
+
output_dir: Path
|
| 246 |
+
snapshot_dir: Path | None
|
| 247 |
+
analysis_input: Path | None
|
| 248 |
+
hf_repo_id: str
|
| 249 |
+
analysis_id: str
|
| 250 |
+
canonical: bool = False
|
| 251 |
+
save_cache: bool = False
|
| 252 |
+
private_hf_repo: bool = False
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
@dataclass(slots=True)
|
| 256 |
+
class SaveCacheOptions:
|
| 257 |
+
output_dir: Path
|
| 258 |
+
snapshot_dir: Path | None
|
| 259 |
+
hf_repo_id: str
|
| 260 |
+
private_hf_repo: bool = False
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
@dataclass(slots=True)
|
| 264 |
+
class DatasetStatusOptions:
|
| 265 |
+
output_dir: Path
|
| 266 |
+
hf_repo_id: str | None
|
| 267 |
+
hf_revision: str | None
|
| 268 |
+
repo: str | None = None
|
| 269 |
+
json_output: bool = False
|
src/slop_farmer/data/search_duckdb.py
CHANGED
|
@@ -31,6 +31,7 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
|
|
| 31 |
"repo",
|
| 32 |
"pr_number",
|
| 33 |
"github_id",
|
|
|
|
| 34 |
"state",
|
| 35 |
"draft",
|
| 36 |
"merged",
|
|
@@ -46,6 +47,48 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
|
|
| 46 |
"review_comments_count",
|
| 47 |
"html_url",
|
| 48 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"pr_scope_features": (
|
| 50 |
"run_id",
|
| 51 |
"repo",
|
|
@@ -144,6 +187,7 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
|
|
| 144 |
repo VARCHAR,
|
| 145 |
pr_number BIGINT,
|
| 146 |
github_id BIGINT,
|
|
|
|
| 147 |
state VARCHAR,
|
| 148 |
draft BOOLEAN,
|
| 149 |
merged BOOLEAN,
|
|
@@ -159,6 +203,48 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
|
|
| 159 |
review_comments_count BIGINT,
|
| 160 |
html_url VARCHAR
|
| 161 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
CREATE TABLE IF NOT EXISTS pr_scope_features (
|
| 163 |
run_id VARCHAR,
|
| 164 |
repo VARCHAR,
|
|
@@ -232,6 +318,8 @@ CREATE TABLE IF NOT EXISTS pr_scope_cluster_candidates (
|
|
| 232 |
CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
|
| 233 |
CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
|
| 234 |
CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
|
|
|
|
|
|
|
| 235 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
|
| 236 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
|
| 237 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
|
|
@@ -256,6 +344,9 @@ def connect_pr_search_db(path: Path, *, read_only: bool = False) -> duckdb.DuckD
|
|
| 256 |
|
| 257 |
def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
|
| 258 |
connection.execute(SCHEMA_SQL)
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
def insert_rows(
|
|
@@ -353,6 +444,7 @@ def resolve_active_run(
|
|
| 353 |
def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
|
| 354 |
return {
|
| 355 |
"documents": _count(connection, "pr_search_documents", run_id),
|
|
|
|
| 356 |
"features": _count(connection, "pr_scope_features", run_id),
|
| 357 |
"run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
|
| 358 |
"neighbors": _count(connection, "pr_scope_neighbors", run_id),
|
|
@@ -375,6 +467,60 @@ def get_document(
|
|
| 375 |
)
|
| 376 |
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
def get_feature(
|
| 379 |
connection: duckdb.DuckDBPyConnection,
|
| 380 |
*,
|
|
|
|
| 31 |
"repo",
|
| 32 |
"pr_number",
|
| 33 |
"github_id",
|
| 34 |
+
"author_login",
|
| 35 |
"state",
|
| 36 |
"draft",
|
| 37 |
"merged",
|
|
|
|
| 47 |
"review_comments_count",
|
| 48 |
"html_url",
|
| 49 |
),
|
| 50 |
+
"pr_search_contributors": (
|
| 51 |
+
"run_id",
|
| 52 |
+
"repo",
|
| 53 |
+
"snapshot_id",
|
| 54 |
+
"report_generated_at",
|
| 55 |
+
"window_days",
|
| 56 |
+
"author_login",
|
| 57 |
+
"name",
|
| 58 |
+
"profile_url",
|
| 59 |
+
"repo_pull_requests_url",
|
| 60 |
+
"repo_issues_url",
|
| 61 |
+
"repo_first_seen_at",
|
| 62 |
+
"repo_last_seen_at",
|
| 63 |
+
"repo_primary_artifact_count",
|
| 64 |
+
"repo_artifact_count",
|
| 65 |
+
"snapshot_issue_count",
|
| 66 |
+
"snapshot_pr_count",
|
| 67 |
+
"snapshot_comment_count",
|
| 68 |
+
"snapshot_review_count",
|
| 69 |
+
"snapshot_review_comment_count",
|
| 70 |
+
"repo_association",
|
| 71 |
+
"new_to_repo",
|
| 72 |
+
"first_seen_in_snapshot",
|
| 73 |
+
"report_reason",
|
| 74 |
+
"account_age_days",
|
| 75 |
+
"young_account",
|
| 76 |
+
"follow_through_score",
|
| 77 |
+
"breadth_score",
|
| 78 |
+
"automation_risk_signal",
|
| 79 |
+
"heuristic_note",
|
| 80 |
+
"public_orgs_json",
|
| 81 |
+
"visible_authored_pr_count",
|
| 82 |
+
"merged_pr_count",
|
| 83 |
+
"closed_unmerged_pr_count",
|
| 84 |
+
"open_pr_count",
|
| 85 |
+
"merged_pr_rate",
|
| 86 |
+
"closed_unmerged_pr_rate",
|
| 87 |
+
"still_open_pr_rate",
|
| 88 |
+
"distinct_repos_with_authored_prs",
|
| 89 |
+
"distinct_repos_with_open_prs",
|
| 90 |
+
"fetch_error",
|
| 91 |
+
),
|
| 92 |
"pr_scope_features": (
|
| 93 |
"run_id",
|
| 94 |
"repo",
|
|
|
|
| 187 |
repo VARCHAR,
|
| 188 |
pr_number BIGINT,
|
| 189 |
github_id BIGINT,
|
| 190 |
+
author_login VARCHAR,
|
| 191 |
state VARCHAR,
|
| 192 |
draft BOOLEAN,
|
| 193 |
merged BOOLEAN,
|
|
|
|
| 203 |
review_comments_count BIGINT,
|
| 204 |
html_url VARCHAR
|
| 205 |
);
|
| 206 |
+
CREATE TABLE IF NOT EXISTS pr_search_contributors (
|
| 207 |
+
run_id VARCHAR,
|
| 208 |
+
repo VARCHAR,
|
| 209 |
+
snapshot_id VARCHAR,
|
| 210 |
+
report_generated_at VARCHAR,
|
| 211 |
+
window_days BIGINT,
|
| 212 |
+
author_login VARCHAR,
|
| 213 |
+
name VARCHAR,
|
| 214 |
+
profile_url VARCHAR,
|
| 215 |
+
repo_pull_requests_url VARCHAR,
|
| 216 |
+
repo_issues_url VARCHAR,
|
| 217 |
+
repo_first_seen_at VARCHAR,
|
| 218 |
+
repo_last_seen_at VARCHAR,
|
| 219 |
+
repo_primary_artifact_count BIGINT,
|
| 220 |
+
repo_artifact_count BIGINT,
|
| 221 |
+
snapshot_issue_count BIGINT,
|
| 222 |
+
snapshot_pr_count BIGINT,
|
| 223 |
+
snapshot_comment_count BIGINT,
|
| 224 |
+
snapshot_review_count BIGINT,
|
| 225 |
+
snapshot_review_comment_count BIGINT,
|
| 226 |
+
repo_association VARCHAR,
|
| 227 |
+
new_to_repo BOOLEAN,
|
| 228 |
+
first_seen_in_snapshot BOOLEAN,
|
| 229 |
+
report_reason VARCHAR,
|
| 230 |
+
account_age_days BIGINT,
|
| 231 |
+
young_account BOOLEAN,
|
| 232 |
+
follow_through_score VARCHAR,
|
| 233 |
+
breadth_score VARCHAR,
|
| 234 |
+
automation_risk_signal VARCHAR,
|
| 235 |
+
heuristic_note VARCHAR,
|
| 236 |
+
public_orgs_json VARCHAR,
|
| 237 |
+
visible_authored_pr_count BIGINT,
|
| 238 |
+
merged_pr_count BIGINT,
|
| 239 |
+
closed_unmerged_pr_count BIGINT,
|
| 240 |
+
open_pr_count BIGINT,
|
| 241 |
+
merged_pr_rate DOUBLE,
|
| 242 |
+
closed_unmerged_pr_rate DOUBLE,
|
| 243 |
+
still_open_pr_rate DOUBLE,
|
| 244 |
+
distinct_repos_with_authored_prs BIGINT,
|
| 245 |
+
distinct_repos_with_open_prs BIGINT,
|
| 246 |
+
fetch_error VARCHAR
|
| 247 |
+
);
|
| 248 |
CREATE TABLE IF NOT EXISTS pr_scope_features (
|
| 249 |
run_id VARCHAR,
|
| 250 |
repo VARCHAR,
|
|
|
|
| 318 |
CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
|
| 319 |
CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
|
| 320 |
CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
|
| 321 |
+
CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_author ON pr_search_documents (run_id, author_login);
|
| 322 |
+
CREATE INDEX IF NOT EXISTS idx_pr_search_contributors_run_author ON pr_search_contributors (run_id, author_login);
|
| 323 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
|
| 324 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
|
| 325 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
|
|
|
|
| 344 |
|
| 345 |
def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
|
| 346 |
connection.execute(SCHEMA_SQL)
|
| 347 |
+
connection.execute(
|
| 348 |
+
"ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
|
| 349 |
+
)
|
| 350 |
|
| 351 |
|
| 352 |
def insert_rows(
|
|
|
|
| 444 |
def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
|
| 445 |
return {
|
| 446 |
"documents": _count(connection, "pr_search_documents", run_id),
|
| 447 |
+
"contributors": _count(connection, "pr_search_contributors", run_id),
|
| 448 |
"features": _count(connection, "pr_scope_features", run_id),
|
| 449 |
"run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
|
| 450 |
"neighbors": _count(connection, "pr_scope_neighbors", run_id),
|
|
|
|
| 467 |
)
|
| 468 |
|
| 469 |
|
| 470 |
+
def get_contributor(
|
| 471 |
+
connection: duckdb.DuckDBPyConnection,
|
| 472 |
+
*,
|
| 473 |
+
run_id: str,
|
| 474 |
+
author_login: str,
|
| 475 |
+
) -> dict[str, Any] | None:
|
| 476 |
+
return fetch_one(
|
| 477 |
+
connection,
|
| 478 |
+
"""
|
| 479 |
+
SELECT *
|
| 480 |
+
FROM pr_search_contributors
|
| 481 |
+
WHERE run_id = ? AND lower(author_login) = lower(?)
|
| 482 |
+
""",
|
| 483 |
+
[run_id, author_login],
|
| 484 |
+
)
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def get_contributor_pulls(
|
| 488 |
+
connection: duckdb.DuckDBPyConnection,
|
| 489 |
+
*,
|
| 490 |
+
run_id: str,
|
| 491 |
+
author_login: str,
|
| 492 |
+
limit: int,
|
| 493 |
+
) -> list[dict[str, Any]]:
|
| 494 |
+
return fetch_rows(
|
| 495 |
+
connection,
|
| 496 |
+
"""
|
| 497 |
+
SELECT
|
| 498 |
+
pr_number,
|
| 499 |
+
github_id,
|
| 500 |
+
author_login,
|
| 501 |
+
state,
|
| 502 |
+
draft,
|
| 503 |
+
merged,
|
| 504 |
+
title,
|
| 505 |
+
base_ref,
|
| 506 |
+
created_at,
|
| 507 |
+
updated_at,
|
| 508 |
+
merged_at,
|
| 509 |
+
additions,
|
| 510 |
+
deletions,
|
| 511 |
+
changed_files,
|
| 512 |
+
comments_count,
|
| 513 |
+
review_comments_count,
|
| 514 |
+
html_url
|
| 515 |
+
FROM pr_search_documents
|
| 516 |
+
WHERE run_id = ? AND lower(author_login) = lower(?)
|
| 517 |
+
ORDER BY updated_at DESC NULLS LAST, pr_number DESC
|
| 518 |
+
LIMIT ?
|
| 519 |
+
""",
|
| 520 |
+
[run_id, author_login, limit],
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
|
| 524 |
def get_feature(
|
| 525 |
connection: duckdb.DuckDBPyConnection,
|
| 526 |
*,
|
src/slop_farmer/data/snapshot_materialize.py
CHANGED
|
@@ -5,13 +5,27 @@ import shutil
|
|
| 5 |
import urllib.parse
|
| 6 |
import urllib.request
|
| 7 |
from datetime import UTC, datetime
|
| 8 |
-
from pathlib import Path
|
| 9 |
from typing import Any
|
| 10 |
|
| 11 |
from huggingface_hub import HfApi, hf_hub_download
|
| 12 |
|
| 13 |
from slop_farmer.data.http import urlopen_with_retry
|
| 14 |
from slop_farmer.data.parquet_io import read_json, write_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def materialize_hf_dataset_snapshot(
|
|
@@ -22,11 +36,13 @@ def materialize_hf_dataset_snapshot(
|
|
| 22 |
) -> Path:
|
| 23 |
info = _hf_dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
|
| 24 |
remote_paths = {sibling.rfilename for sibling in info.siblings}
|
| 25 |
-
|
|
|
|
| 26 |
return _materialize_hf_snapshot_repo_snapshot(
|
| 27 |
repo_id=repo_id,
|
| 28 |
local_dir=local_dir,
|
| 29 |
-
revision=
|
|
|
|
| 30 |
hf_sha=info.sha,
|
| 31 |
remote_paths=remote_paths,
|
| 32 |
)
|
|
@@ -34,14 +50,16 @@ def materialize_hf_dataset_snapshot(
|
|
| 34 |
return _materialize_hf_root_snapshot(
|
| 35 |
repo_id=repo_id,
|
| 36 |
local_dir=local_dir,
|
| 37 |
-
revision=
|
|
|
|
| 38 |
hf_sha=info.sha,
|
| 39 |
remote_paths=remote_paths,
|
| 40 |
)
|
| 41 |
return _materialize_hf_dataset_viewer_snapshot(
|
| 42 |
repo_id=repo_id,
|
| 43 |
local_dir=local_dir,
|
| 44 |
-
revision=
|
|
|
|
| 45 |
hf_sha=info.sha,
|
| 46 |
)
|
| 47 |
|
|
@@ -50,84 +68,101 @@ def _materialize_hf_snapshot_repo_snapshot(
|
|
| 50 |
*,
|
| 51 |
repo_id: str,
|
| 52 |
local_dir: Path,
|
| 53 |
-
revision: str
|
|
|
|
| 54 |
hf_sha: str | None,
|
| 55 |
remote_paths: set[str],
|
| 56 |
) -> Path:
|
| 57 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
| 63 |
)
|
| 64 |
-
latest_payload = json.loads(
|
| 65 |
downloaded_files: set[str] = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
for filename in (
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
"pr_files.parquet",
|
| 73 |
-
"pr_diffs.parquet",
|
| 74 |
-
"links.parquet",
|
| 75 |
-
"events.parquet",
|
| 76 |
-
"manifest.json",
|
| 77 |
-
"analysis-report.json",
|
| 78 |
-
"analysis-report-hybrid.json",
|
| 79 |
-
"analysis-report-deterministic.json",
|
| 80 |
-
"new_contributors.parquet",
|
| 81 |
-
"new-contributors-report.json",
|
| 82 |
-
"new-contributors-report.md",
|
| 83 |
):
|
| 84 |
-
|
| 85 |
repo_id=repo_id,
|
| 86 |
revision=revision,
|
| 87 |
filenames=_hf_latest_snapshot_candidates(latest_payload, filename),
|
| 88 |
)
|
| 89 |
-
if
|
| 90 |
continue
|
| 91 |
-
|
| 92 |
downloaded_files.add(filename)
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
repo_id=repo_id,
|
| 96 |
revision=revision,
|
| 97 |
local_dir=local_dir,
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
prefixes=_hf_latest_snapshot_prefixes(latest_payload),
|
| 101 |
-
),
|
| 102 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
)
|
| 104 |
|
| 105 |
-
|
| 106 |
repo_id=repo_id,
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
| 110 |
)
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
manifest = (
|
| 113 |
-
read_json(local_dir /
|
|
|
|
|
|
|
| 114 |
)
|
| 115 |
manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
|
| 116 |
manifest.setdefault(
|
| 117 |
-
"snapshot_id",
|
|
|
|
| 118 |
)
|
| 119 |
manifest.update(
|
| 120 |
{
|
| 121 |
"source_type": "hf_snapshot_repo",
|
| 122 |
"hf_repo_id": repo_id,
|
| 123 |
-
"hf_revision":
|
|
|
|
| 124 |
"hf_sha": hf_sha,
|
| 125 |
"materialized_at": _iso_now(),
|
| 126 |
"downloaded_files": sorted(downloaded_files),
|
| 127 |
"hf_latest_pointer": latest_payload,
|
| 128 |
}
|
| 129 |
)
|
| 130 |
-
write_text(json.dumps(manifest, indent=2) + "\n", local_dir /
|
| 131 |
return local_dir
|
| 132 |
|
| 133 |
|
|
@@ -135,60 +170,53 @@ def _materialize_hf_root_snapshot(
|
|
| 135 |
*,
|
| 136 |
repo_id: str,
|
| 137 |
local_dir: Path,
|
| 138 |
-
revision: str
|
|
|
|
| 139 |
hf_sha: str | None,
|
| 140 |
remote_paths: set[str],
|
| 141 |
) -> Path:
|
| 142 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 143 |
downloaded_files: set[str] = set()
|
| 144 |
-
for
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
"events.parquet",
|
| 154 |
-
"manifest.json",
|
| 155 |
-
"analysis-report.json",
|
| 156 |
-
"analysis-report-hybrid.json",
|
| 157 |
-
"analysis-report-deterministic.json",
|
| 158 |
-
"new_contributors.parquet",
|
| 159 |
-
"new-contributors-report.json",
|
| 160 |
-
"new-contributors-report.md",
|
| 161 |
):
|
| 162 |
-
if
|
| 163 |
continue
|
| 164 |
-
|
| 165 |
-
repo_id=repo_id,
|
| 166 |
-
repo_type="dataset",
|
| 167 |
-
filename=filename,
|
| 168 |
-
revision=revision,
|
| 169 |
-
)
|
| 170 |
-
shutil.copy2(downloaded_path, local_dir / filename)
|
| 171 |
-
downloaded_files.add(filename)
|
| 172 |
-
downloaded_files.update(
|
| 173 |
-
_download_hf_analysis_state_files(
|
| 174 |
repo_id=repo_id,
|
| 175 |
revision=revision,
|
| 176 |
local_dir=local_dir,
|
| 177 |
-
|
|
|
|
| 178 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
shutil.copy2(readme_path, local_dir / "README.md")
|
| 189 |
|
| 190 |
manifest = (
|
| 191 |
-
read_json(local_dir /
|
|
|
|
|
|
|
| 192 |
)
|
| 193 |
manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
|
| 194 |
manifest.setdefault("snapshot_id", hf_sha or local_dir.name)
|
|
@@ -196,13 +224,14 @@ def _materialize_hf_root_snapshot(
|
|
| 196 |
{
|
| 197 |
"source_type": "hf_root_snapshot",
|
| 198 |
"hf_repo_id": repo_id,
|
| 199 |
-
"hf_revision":
|
|
|
|
| 200 |
"hf_sha": hf_sha,
|
| 201 |
"materialized_at": _iso_now(),
|
| 202 |
"downloaded_files": sorted(downloaded_files),
|
| 203 |
}
|
| 204 |
)
|
| 205 |
-
write_text(json.dumps(manifest, indent=2) + "\n", local_dir /
|
| 206 |
return local_dir
|
| 207 |
|
| 208 |
|
|
@@ -210,7 +239,8 @@ def _materialize_hf_dataset_viewer_snapshot(
|
|
| 210 |
*,
|
| 211 |
repo_id: str,
|
| 212 |
local_dir: Path,
|
| 213 |
-
revision: str
|
|
|
|
| 214 |
hf_sha: str | None,
|
| 215 |
) -> Path:
|
| 216 |
local_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -225,24 +255,165 @@ def _materialize_hf_dataset_viewer_snapshot(
|
|
| 225 |
readme_path = hf_hub_download(
|
| 226 |
repo_id=repo_id,
|
| 227 |
repo_type="dataset",
|
| 228 |
-
filename=
|
| 229 |
-
revision=revision
|
| 230 |
)
|
| 231 |
-
shutil.copy2(readme_path, local_dir /
|
|
|
|
| 232 |
manifest = {
|
| 233 |
"repo": _infer_repo_from_materialized_snapshot(local_dir),
|
| 234 |
"snapshot_id": hf_sha or local_dir.name,
|
| 235 |
"source_type": "hf_dataset_viewer",
|
| 236 |
"hf_repo_id": repo_id,
|
| 237 |
-
"hf_revision":
|
|
|
|
| 238 |
"hf_sha": hf_sha,
|
| 239 |
"materialized_at": _iso_now(),
|
| 240 |
"downloaded_files": sorted(downloaded_files),
|
| 241 |
}
|
| 242 |
-
write_text(json.dumps(manifest, indent=2) + "\n", local_dir /
|
| 243 |
return local_dir
|
| 244 |
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
def _hf_dataset_info(repo_id: str, revision: str | None, *, files_metadata: bool) -> Any:
|
| 247 |
api = HfApi()
|
| 248 |
try:
|
|
@@ -270,7 +441,7 @@ def _hf_dataset_parquet_urls(repo_id: str, revision: str | None = None) -> list[
|
|
| 270 |
def _download_first_available_hf_file(
|
| 271 |
*,
|
| 272 |
repo_id: str,
|
| 273 |
-
revision: str
|
| 274 |
filenames: list[str],
|
| 275 |
) -> Path | None:
|
| 276 |
for filename in filenames:
|
|
@@ -290,65 +461,24 @@ def _download_first_available_hf_file(
|
|
| 290 |
return None
|
| 291 |
|
| 292 |
|
| 293 |
-
def _download_hf_analysis_state_files(
|
| 294 |
-
*,
|
| 295 |
-
repo_id: str,
|
| 296 |
-
revision: str | None,
|
| 297 |
-
local_dir: Path,
|
| 298 |
-
path_pairs: list[tuple[str, str]],
|
| 299 |
-
) -> set[str]:
|
| 300 |
-
downloaded_files: set[str] = set()
|
| 301 |
-
for remote_path, relative_path in path_pairs:
|
| 302 |
-
downloaded_path = Path(
|
| 303 |
-
hf_hub_download(
|
| 304 |
-
repo_id=repo_id,
|
| 305 |
-
repo_type="dataset",
|
| 306 |
-
filename=remote_path,
|
| 307 |
-
revision=revision,
|
| 308 |
-
)
|
| 309 |
-
)
|
| 310 |
-
destination = local_dir / "analysis-state" / relative_path
|
| 311 |
-
destination.parent.mkdir(parents=True, exist_ok=True)
|
| 312 |
-
shutil.copy2(downloaded_path, destination)
|
| 313 |
-
downloaded_files.add(str(Path("analysis-state") / relative_path))
|
| 314 |
-
return downloaded_files
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
def _hf_analysis_state_path_pairs(
|
| 318 |
-
remote_paths: set[str],
|
| 319 |
-
*,
|
| 320 |
-
prefixes: list[str],
|
| 321 |
-
) -> list[tuple[str, str]]:
|
| 322 |
-
pairs: list[tuple[str, str]] = []
|
| 323 |
-
seen_relative_paths: set[str] = set()
|
| 324 |
-
for prefix in prefixes:
|
| 325 |
-
base = f"{prefix.strip('/')}/analysis-state/" if prefix else "analysis-state/"
|
| 326 |
-
for remote_path in sorted(remote_paths):
|
| 327 |
-
if not remote_path.startswith(base):
|
| 328 |
-
continue
|
| 329 |
-
relative_path = remote_path.removeprefix(base)
|
| 330 |
-
if not relative_path or relative_path in seen_relative_paths:
|
| 331 |
-
continue
|
| 332 |
-
seen_relative_paths.add(relative_path)
|
| 333 |
-
pairs.append((remote_path, relative_path))
|
| 334 |
-
return pairs
|
| 335 |
-
|
| 336 |
-
|
| 337 |
def _hf_latest_snapshot_candidates(latest_payload: dict[str, Any], filename: str) -> list[str]:
|
| 338 |
candidates: list[str] = []
|
| 339 |
manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
|
| 340 |
snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
|
| 341 |
latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
|
|
|
|
| 342 |
|
| 343 |
-
if filename ==
|
| 344 |
candidates.append(manifest_path)
|
| 345 |
if snapshot_dir and snapshot_dir not in {".", "/"}:
|
| 346 |
candidates.append(f"{snapshot_dir}/{filename}")
|
|
|
|
|
|
|
| 347 |
if manifest_path and "/" in manifest_path:
|
| 348 |
manifest_dir = manifest_path.rsplit("/", 1)[0]
|
| 349 |
candidates.append(f"{manifest_dir}/{filename}")
|
| 350 |
if latest_snapshot_id:
|
| 351 |
-
candidates.append(
|
| 352 |
candidates.append(filename)
|
| 353 |
|
| 354 |
seen: set[str] = set()
|
|
@@ -362,31 +492,6 @@ def _hf_latest_snapshot_candidates(latest_payload: dict[str, Any], filename: str
|
|
| 362 |
return deduped
|
| 363 |
|
| 364 |
|
| 365 |
-
def _hf_latest_snapshot_prefixes(latest_payload: dict[str, Any]) -> list[str]:
|
| 366 |
-
prefixes: list[str] = []
|
| 367 |
-
manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
|
| 368 |
-
snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
|
| 369 |
-
latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
|
| 370 |
-
|
| 371 |
-
if snapshot_dir and snapshot_dir not in {".", "/"}:
|
| 372 |
-
prefixes.append(snapshot_dir)
|
| 373 |
-
if manifest_path and "/" in manifest_path:
|
| 374 |
-
prefixes.append(manifest_path.rsplit("/", 1)[0])
|
| 375 |
-
if latest_snapshot_id:
|
| 376 |
-
prefixes.append(f"snapshots/{latest_snapshot_id}")
|
| 377 |
-
prefixes.append("")
|
| 378 |
-
|
| 379 |
-
seen: set[str] = set()
|
| 380 |
-
deduped: list[str] = []
|
| 381 |
-
for prefix in prefixes:
|
| 382 |
-
normalized = prefix.lstrip("./")
|
| 383 |
-
if normalized in seen:
|
| 384 |
-
continue
|
| 385 |
-
seen.add(normalized)
|
| 386 |
-
deduped.append(normalized)
|
| 387 |
-
return deduped
|
| 388 |
-
|
| 389 |
-
|
| 390 |
def _download_url_to_path(url: str, destination: Path) -> None:
|
| 391 |
destination.parent.mkdir(parents=True, exist_ok=True)
|
| 392 |
urllib.request.urlretrieve(url, destination)
|
|
@@ -420,18 +525,8 @@ def _parquet_table_name(path: Path) -> str:
|
|
| 420 |
def _infer_repo_from_materialized_snapshot(local_dir: Path) -> str:
|
| 421 |
import pyarrow.parquet as pq
|
| 422 |
|
| 423 |
-
for
|
| 424 |
-
|
| 425 |
-
"pull_requests",
|
| 426 |
-
"comments",
|
| 427 |
-
"reviews",
|
| 428 |
-
"review_comments",
|
| 429 |
-
"pr_files",
|
| 430 |
-
"pr_diffs",
|
| 431 |
-
"links",
|
| 432 |
-
"events",
|
| 433 |
-
):
|
| 434 |
-
path = local_dir / f"{table_name}.parquet"
|
| 435 |
if not path.exists():
|
| 436 |
continue
|
| 437 |
rows = pq.read_table(path).slice(0, 1).to_pylist()
|
|
|
|
| 5 |
import urllib.parse
|
| 6 |
import urllib.request
|
| 7 |
from datetime import UTC, datetime
|
| 8 |
+
from pathlib import Path, PurePosixPath
|
| 9 |
from typing import Any
|
| 10 |
|
| 11 |
from huggingface_hub import HfApi, hf_hub_download
|
| 12 |
|
| 13 |
from slop_farmer.data.http import urlopen_with_retry
|
| 14 |
from slop_farmer.data.parquet_io import read_json, write_text
|
| 15 |
+
from slop_farmer.data.snapshot_paths import (
|
| 16 |
+
CONTRIBUTOR_ARTIFACT_FILENAMES,
|
| 17 |
+
CURRENT_ANALYSIS_MANIFEST_PATH,
|
| 18 |
+
LEGACY_ANALYSIS_FILENAMES,
|
| 19 |
+
PR_SCOPE_CLUSTERS_FILENAME,
|
| 20 |
+
RAW_TABLE_FILENAMES,
|
| 21 |
+
README_FILENAME,
|
| 22 |
+
ROOT_MANIFEST_FILENAME,
|
| 23 |
+
SNAPSHOTS_LATEST_PATH,
|
| 24 |
+
STATE_WATERMARK_PATH,
|
| 25 |
+
load_archived_analysis_run_manifest,
|
| 26 |
+
load_current_analysis_manifest,
|
| 27 |
+
repo_relative_path_to_local,
|
| 28 |
+
)
|
| 29 |
|
| 30 |
|
| 31 |
def materialize_hf_dataset_snapshot(
|
|
|
|
| 36 |
) -> Path:
|
| 37 |
info = _hf_dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
|
| 38 |
remote_paths = {sibling.rfilename for sibling in info.siblings}
|
| 39 |
+
resolved_revision = str(info.sha or revision or "main")
|
| 40 |
+
if SNAPSHOTS_LATEST_PATH in remote_paths:
|
| 41 |
return _materialize_hf_snapshot_repo_snapshot(
|
| 42 |
repo_id=repo_id,
|
| 43 |
local_dir=local_dir,
|
| 44 |
+
revision=resolved_revision,
|
| 45 |
+
requested_revision=revision,
|
| 46 |
hf_sha=info.sha,
|
| 47 |
remote_paths=remote_paths,
|
| 48 |
)
|
|
|
|
| 50 |
return _materialize_hf_root_snapshot(
|
| 51 |
repo_id=repo_id,
|
| 52 |
local_dir=local_dir,
|
| 53 |
+
revision=resolved_revision,
|
| 54 |
+
requested_revision=revision,
|
| 55 |
hf_sha=info.sha,
|
| 56 |
remote_paths=remote_paths,
|
| 57 |
)
|
| 58 |
return _materialize_hf_dataset_viewer_snapshot(
|
| 59 |
repo_id=repo_id,
|
| 60 |
local_dir=local_dir,
|
| 61 |
+
revision=resolved_revision,
|
| 62 |
+
requested_revision=revision,
|
| 63 |
hf_sha=info.sha,
|
| 64 |
)
|
| 65 |
|
|
|
|
| 68 |
*,
|
| 69 |
repo_id: str,
|
| 70 |
local_dir: Path,
|
| 71 |
+
revision: str,
|
| 72 |
+
requested_revision: str | None,
|
| 73 |
hf_sha: str | None,
|
| 74 |
remote_paths: set[str],
|
| 75 |
) -> Path:
|
| 76 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 77 |
+
latest_download = Path(
|
| 78 |
+
hf_hub_download(
|
| 79 |
+
repo_id=repo_id,
|
| 80 |
+
repo_type="dataset",
|
| 81 |
+
filename=SNAPSHOTS_LATEST_PATH,
|
| 82 |
+
revision=revision,
|
| 83 |
+
)
|
| 84 |
)
|
| 85 |
+
latest_payload = json.loads(latest_download.read_text(encoding="utf-8"))
|
| 86 |
downloaded_files: set[str] = set()
|
| 87 |
+
_copy_downloaded_file(
|
| 88 |
+
latest_download, repo_relative_path_to_local(local_dir, SNAPSHOTS_LATEST_PATH)
|
| 89 |
+
)
|
| 90 |
+
downloaded_files.add(SNAPSHOTS_LATEST_PATH)
|
| 91 |
+
|
| 92 |
for filename in (
|
| 93 |
+
*RAW_TABLE_FILENAMES,
|
| 94 |
+
ROOT_MANIFEST_FILENAME,
|
| 95 |
+
PR_SCOPE_CLUSTERS_FILENAME,
|
| 96 |
+
*CONTRIBUTOR_ARTIFACT_FILENAMES,
|
| 97 |
+
*LEGACY_ANALYSIS_FILENAMES,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
):
|
| 99 |
+
downloaded = _download_first_available_hf_file(
|
| 100 |
repo_id=repo_id,
|
| 101 |
revision=revision,
|
| 102 |
filenames=_hf_latest_snapshot_candidates(latest_payload, filename),
|
| 103 |
)
|
| 104 |
+
if downloaded is None:
|
| 105 |
continue
|
| 106 |
+
_copy_downloaded_file(downloaded, local_dir / filename)
|
| 107 |
downloaded_files.add(filename)
|
| 108 |
+
|
| 109 |
+
if STATE_WATERMARK_PATH in remote_paths:
|
| 110 |
+
_download_repo_file(
|
| 111 |
repo_id=repo_id,
|
| 112 |
revision=revision,
|
| 113 |
local_dir=local_dir,
|
| 114 |
+
repo_path=STATE_WATERMARK_PATH,
|
| 115 |
+
downloaded_files=downloaded_files,
|
|
|
|
|
|
|
| 116 |
)
|
| 117 |
+
|
| 118 |
+
_download_analysis_state_files(
|
| 119 |
+
repo_id=repo_id,
|
| 120 |
+
revision=revision,
|
| 121 |
+
local_dir=local_dir,
|
| 122 |
+
remote_paths=remote_paths,
|
| 123 |
+
downloaded_files=downloaded_files,
|
| 124 |
)
|
| 125 |
|
| 126 |
+
_download_published_analysis_files(
|
| 127 |
repo_id=repo_id,
|
| 128 |
+
revision=revision,
|
| 129 |
+
local_dir=local_dir,
|
| 130 |
+
remote_paths=remote_paths,
|
| 131 |
+
downloaded_files=downloaded_files,
|
| 132 |
)
|
| 133 |
+
|
| 134 |
+
_download_repo_file(
|
| 135 |
+
repo_id=repo_id,
|
| 136 |
+
revision=revision,
|
| 137 |
+
local_dir=local_dir,
|
| 138 |
+
repo_path=README_FILENAME,
|
| 139 |
+
downloaded_files=downloaded_files,
|
| 140 |
+
required=False,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
manifest = (
|
| 144 |
+
read_json(local_dir / ROOT_MANIFEST_FILENAME)
|
| 145 |
+
if (local_dir / ROOT_MANIFEST_FILENAME).exists()
|
| 146 |
+
else {}
|
| 147 |
)
|
| 148 |
manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
|
| 149 |
manifest.setdefault(
|
| 150 |
+
"snapshot_id",
|
| 151 |
+
str(latest_payload.get("latest_snapshot_id") or hf_sha or local_dir.name),
|
| 152 |
)
|
| 153 |
manifest.update(
|
| 154 |
{
|
| 155 |
"source_type": "hf_snapshot_repo",
|
| 156 |
"hf_repo_id": repo_id,
|
| 157 |
+
"hf_revision": requested_revision,
|
| 158 |
+
"hf_resolved_revision": revision,
|
| 159 |
"hf_sha": hf_sha,
|
| 160 |
"materialized_at": _iso_now(),
|
| 161 |
"downloaded_files": sorted(downloaded_files),
|
| 162 |
"hf_latest_pointer": latest_payload,
|
| 163 |
}
|
| 164 |
)
|
| 165 |
+
write_text(json.dumps(manifest, indent=2) + "\n", local_dir / ROOT_MANIFEST_FILENAME)
|
| 166 |
return local_dir
|
| 167 |
|
| 168 |
|
|
|
|
| 170 |
*,
|
| 171 |
repo_id: str,
|
| 172 |
local_dir: Path,
|
| 173 |
+
revision: str,
|
| 174 |
+
requested_revision: str | None,
|
| 175 |
hf_sha: str | None,
|
| 176 |
remote_paths: set[str],
|
| 177 |
) -> Path:
|
| 178 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 179 |
downloaded_files: set[str] = set()
|
| 180 |
+
for repo_path in (
|
| 181 |
+
*RAW_TABLE_FILENAMES,
|
| 182 |
+
ROOT_MANIFEST_FILENAME,
|
| 183 |
+
PR_SCOPE_CLUSTERS_FILENAME,
|
| 184 |
+
*CONTRIBUTOR_ARTIFACT_FILENAMES,
|
| 185 |
+
*LEGACY_ANALYSIS_FILENAMES,
|
| 186 |
+
SNAPSHOTS_LATEST_PATH,
|
| 187 |
+
STATE_WATERMARK_PATH,
|
| 188 |
+
README_FILENAME,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
):
|
| 190 |
+
if repo_path not in remote_paths:
|
| 191 |
continue
|
| 192 |
+
_download_repo_file(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
repo_id=repo_id,
|
| 194 |
revision=revision,
|
| 195 |
local_dir=local_dir,
|
| 196 |
+
repo_path=repo_path,
|
| 197 |
+
downloaded_files=downloaded_files,
|
| 198 |
)
|
| 199 |
+
|
| 200 |
+
_download_analysis_state_files(
|
| 201 |
+
repo_id=repo_id,
|
| 202 |
+
revision=revision,
|
| 203 |
+
local_dir=local_dir,
|
| 204 |
+
remote_paths=remote_paths,
|
| 205 |
+
downloaded_files=downloaded_files,
|
| 206 |
)
|
| 207 |
|
| 208 |
+
_download_published_analysis_files(
|
| 209 |
+
repo_id=repo_id,
|
| 210 |
+
revision=revision,
|
| 211 |
+
local_dir=local_dir,
|
| 212 |
+
remote_paths=remote_paths,
|
| 213 |
+
downloaded_files=downloaded_files,
|
| 214 |
+
)
|
|
|
|
| 215 |
|
| 216 |
manifest = (
|
| 217 |
+
read_json(local_dir / ROOT_MANIFEST_FILENAME)
|
| 218 |
+
if (local_dir / ROOT_MANIFEST_FILENAME).exists()
|
| 219 |
+
else {}
|
| 220 |
)
|
| 221 |
manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
|
| 222 |
manifest.setdefault("snapshot_id", hf_sha or local_dir.name)
|
|
|
|
| 224 |
{
|
| 225 |
"source_type": "hf_root_snapshot",
|
| 226 |
"hf_repo_id": repo_id,
|
| 227 |
+
"hf_revision": requested_revision,
|
| 228 |
+
"hf_resolved_revision": revision,
|
| 229 |
"hf_sha": hf_sha,
|
| 230 |
"materialized_at": _iso_now(),
|
| 231 |
"downloaded_files": sorted(downloaded_files),
|
| 232 |
}
|
| 233 |
)
|
| 234 |
+
write_text(json.dumps(manifest, indent=2) + "\n", local_dir / ROOT_MANIFEST_FILENAME)
|
| 235 |
return local_dir
|
| 236 |
|
| 237 |
|
|
|
|
| 239 |
*,
|
| 240 |
repo_id: str,
|
| 241 |
local_dir: Path,
|
| 242 |
+
revision: str,
|
| 243 |
+
requested_revision: str | None,
|
| 244 |
hf_sha: str | None,
|
| 245 |
) -> Path:
|
| 246 |
local_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 255 |
readme_path = hf_hub_download(
|
| 256 |
repo_id=repo_id,
|
| 257 |
repo_type="dataset",
|
| 258 |
+
filename=README_FILENAME,
|
| 259 |
+
revision=revision,
|
| 260 |
)
|
| 261 |
+
shutil.copy2(readme_path, local_dir / README_FILENAME)
|
| 262 |
+
downloaded_files.add(README_FILENAME)
|
| 263 |
manifest = {
|
| 264 |
"repo": _infer_repo_from_materialized_snapshot(local_dir),
|
| 265 |
"snapshot_id": hf_sha or local_dir.name,
|
| 266 |
"source_type": "hf_dataset_viewer",
|
| 267 |
"hf_repo_id": repo_id,
|
| 268 |
+
"hf_revision": requested_revision,
|
| 269 |
+
"hf_resolved_revision": revision,
|
| 270 |
"hf_sha": hf_sha,
|
| 271 |
"materialized_at": _iso_now(),
|
| 272 |
"downloaded_files": sorted(downloaded_files),
|
| 273 |
}
|
| 274 |
+
write_text(json.dumps(manifest, indent=2) + "\n", local_dir / ROOT_MANIFEST_FILENAME)
|
| 275 |
return local_dir
|
| 276 |
|
| 277 |
|
| 278 |
+
def _download_published_analysis_files(
    *,
    repo_id: str,
    revision: str,
    local_dir: Path,
    remote_paths: set[str],
    downloaded_files: set[str],
) -> None:
    """Download published analysis manifests and the artifacts they list.

    The current analysis manifest is fetched when it is present in
    ``remote_paths``, followed by every artifact (current and archived) it
    names. Afterwards each archived analysis-run manifest found in
    ``remote_paths`` is fetched, in sorted order, together with its artifacts.
    Manifest entries that are not present in ``remote_paths`` are skipped.
    """

    def fetch(path_in_repo: str) -> Path:
        # Single download entry point shared by manifests and artifacts.
        return _download_repo_file(
            repo_id=repo_id,
            revision=revision,
            local_dir=local_dir,
            repo_path=path_in_repo,
            downloaded_files=downloaded_files,
        )

    if CURRENT_ANALYSIS_MANIFEST_PATH in remote_paths:
        current_manifest = load_current_analysis_manifest(fetch(CURRENT_ANALYSIS_MANIFEST_PATH))
        for listed_path in _manifest_artifact_paths(current_manifest, include_archived=True):
            if listed_path in remote_paths:
                fetch(listed_path)

    archived_manifest_paths = sorted(
        candidate for candidate in remote_paths if _is_archived_analysis_manifest_path(candidate)
    )
    for archived_manifest_repo_path in archived_manifest_paths:
        archived_manifest = load_archived_analysis_run_manifest(fetch(archived_manifest_repo_path))
        for listed_path in _manifest_artifact_paths(archived_manifest, include_archived=False):
            if listed_path in remote_paths:
                fetch(listed_path)
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _download_analysis_state_files(
|
| 330 |
+
*,
|
| 331 |
+
repo_id: str,
|
| 332 |
+
revision: str,
|
| 333 |
+
local_dir: Path,
|
| 334 |
+
remote_paths: set[str],
|
| 335 |
+
downloaded_files: set[str],
|
| 336 |
+
) -> None:
|
| 337 |
+
for repo_path in sorted(
|
| 338 |
+
path for path in remote_paths if PurePosixPath(path).parts[:1] == ("analysis-state",)
|
| 339 |
+
):
|
| 340 |
+
_download_repo_file(
|
| 341 |
+
repo_id=repo_id,
|
| 342 |
+
revision=revision,
|
| 343 |
+
local_dir=local_dir,
|
| 344 |
+
repo_path=repo_path,
|
| 345 |
+
downloaded_files=downloaded_files,
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def _manifest_artifact_paths(
|
| 350 |
+
payload: dict[str, Any],
|
| 351 |
+
*,
|
| 352 |
+
include_archived: bool,
|
| 353 |
+
) -> list[str]:
|
| 354 |
+
paths = [
|
| 355 |
+
str(value) for value in (payload.get("artifacts") or {}).values() if isinstance(value, str)
|
| 356 |
+
]
|
| 357 |
+
if include_archived:
|
| 358 |
+
paths.extend(
|
| 359 |
+
str(value)
|
| 360 |
+
for value in (payload.get("archived_artifacts") or {}).values()
|
| 361 |
+
if isinstance(value, str)
|
| 362 |
+
)
|
| 363 |
+
deduped: list[str] = []
|
| 364 |
+
seen: set[str] = set()
|
| 365 |
+
for repo_path in paths:
|
| 366 |
+
normalized = repo_path.lstrip("./")
|
| 367 |
+
if not normalized or normalized in seen:
|
| 368 |
+
continue
|
| 369 |
+
seen.add(normalized)
|
| 370 |
+
deduped.append(normalized)
|
| 371 |
+
return deduped
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def _is_archived_analysis_manifest_path(repo_path: str) -> bool:
|
| 375 |
+
parts = PurePosixPath(repo_path).parts
|
| 376 |
+
return (
|
| 377 |
+
len(parts) == 5
|
| 378 |
+
and parts[0] == "snapshots"
|
| 379 |
+
and parts[2] == "analysis-runs"
|
| 380 |
+
and parts[4] == ROOT_MANIFEST_FILENAME
|
| 381 |
+
)
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
def _download_repo_file(
    *,
    repo_id: str,
    revision: str,
    local_dir: Path,
    repo_path: str,
    downloaded_files: set[str],
    required: bool = True,
) -> Path:
    """Fetch one dataset-repo file via ``hf_hub_download`` and copy it under ``local_dir``.

    On success the file is copied to the local path that mirrors
    ``repo_path``, ``repo_path`` is added to ``downloaded_files``, and the
    destination path is returned.

    When the download raises and ``required`` is false, the error is
    swallowed and ``local_dir / repo_path`` is returned without copying
    anything or recording the file. When ``required`` is true the error
    propagates.
    """
    try:
        cached_copy = Path(
            hf_hub_download(
                repo_id=repo_id,
                repo_type="dataset",
                filename=repo_path,
                revision=revision,
            )
        )
    except Exception:
        # Best-effort mode: optional files may be absent from the repo.
        if not required:
            return local_dir / repo_path
        raise
    target = repo_relative_path_to_local(local_dir, repo_path)
    _copy_downloaded_file(cached_copy, target)
    downloaded_files.add(repo_path)
    return target
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
def _copy_downloaded_file(downloaded_path: Path, destination: Path) -> None:
|
| 413 |
+
destination.parent.mkdir(parents=True, exist_ok=True)
|
| 414 |
+
shutil.copy2(downloaded_path, destination)
|
| 415 |
+
|
| 416 |
+
|
| 417 |
def _hf_dataset_info(repo_id: str, revision: str | None, *, files_metadata: bool) -> Any:
|
| 418 |
api = HfApi()
|
| 419 |
try:
|
|
|
|
| 441 |
def _download_first_available_hf_file(
|
| 442 |
*,
|
| 443 |
repo_id: str,
|
| 444 |
+
revision: str,
|
| 445 |
filenames: list[str],
|
| 446 |
) -> Path | None:
|
| 447 |
for filename in filenames:
|
|
|
|
| 461 |
return None
|
| 462 |
|
| 463 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
def _hf_latest_snapshot_candidates(latest_payload: dict[str, Any], filename: str) -> list[str]:
|
| 465 |
candidates: list[str] = []
|
| 466 |
manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
|
| 467 |
snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
|
| 468 |
latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
|
| 469 |
+
archived_manifest_path = str(latest_payload.get("archived_manifest_path") or "").strip("/")
|
| 470 |
|
| 471 |
+
if filename == ROOT_MANIFEST_FILENAME and manifest_path:
|
| 472 |
candidates.append(manifest_path)
|
| 473 |
if snapshot_dir and snapshot_dir not in {".", "/"}:
|
| 474 |
candidates.append(f"{snapshot_dir}/{filename}")
|
| 475 |
+
if filename == ROOT_MANIFEST_FILENAME and archived_manifest_path:
|
| 476 |
+
candidates.append(archived_manifest_path)
|
| 477 |
if manifest_path and "/" in manifest_path:
|
| 478 |
manifest_dir = manifest_path.rsplit("/", 1)[0]
|
| 479 |
candidates.append(f"{manifest_dir}/{filename}")
|
| 480 |
if latest_snapshot_id:
|
| 481 |
+
candidates.append(str(PurePosixPath("snapshots") / latest_snapshot_id / filename))
|
| 482 |
candidates.append(filename)
|
| 483 |
|
| 484 |
seen: set[str] = set()
|
|
|
|
| 492 |
return deduped
|
| 493 |
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
def _download_url_to_path(url: str, destination: Path) -> None:
|
| 496 |
destination.parent.mkdir(parents=True, exist_ok=True)
|
| 497 |
urllib.request.urlretrieve(url, destination)
|
|
|
|
| 525 |
def _infer_repo_from_materialized_snapshot(local_dir: Path) -> str:
|
| 526 |
import pyarrow.parquet as pq
|
| 527 |
|
| 528 |
+
for table_filename in RAW_TABLE_FILENAMES:
|
| 529 |
+
path = local_dir / table_filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
if not path.exists():
|
| 531 |
continue
|
| 532 |
rows = pq.read_table(path).slice(0, 1).to_pylist()
|
src/slop_farmer/data/snapshot_paths.py
CHANGED
|
@@ -1,9 +1,63 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
from slop_farmer.data.parquet_io import read_json
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def default_hf_materialize_dir(output_dir: Path, repo_id: str, revision: str | None) -> Path:
|
| 9 |
suffix = repo_id.replace("/", "--")
|
|
@@ -12,14 +66,241 @@ def default_hf_materialize_dir(output_dir: Path, repo_id: str, revision: str | N
|
|
| 12 |
return output_dir.resolve() / "snapshots" / f"hf-{suffix}"
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def load_latest_snapshot_pointer(snapshots_root: Path) -> Path | None:
|
| 16 |
-
|
|
|
|
| 17 |
if not latest_path.exists():
|
| 18 |
return None
|
| 19 |
payload = read_json(latest_path)
|
| 20 |
snapshot_dir = payload.get("snapshot_dir")
|
| 21 |
if isinstance(snapshot_dir, str) and snapshot_dir:
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
return None
|
| 24 |
|
| 25 |
|
|
@@ -46,3 +327,104 @@ def resolve_snapshot_dir_from_snapshots_root(
|
|
| 46 |
if snapshot_dirs:
|
| 47 |
return snapshot_dirs[-1].resolve()
|
| 48 |
raise FileNotFoundError(f"Could not resolve a snapshot directory from {latest_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from pathlib import Path, PurePosixPath
|
| 6 |
+
from typing import Any
|
| 7 |
|
| 8 |
from slop_farmer.data.parquet_io import read_json
|
| 9 |
|
| 10 |
+
# Parquet file names of the raw tables stored in a snapshot directory.
RAW_TABLE_FILENAMES: tuple[str, ...] = (
    "issues.parquet",
    "pull_requests.parquet",
    "comments.parquet",
    "reviews.parquet",
    "review_comments.parquet",
    "pr_files.parquet",
    "pr_diffs.parquet",
    "links.parquet",
    "events.parquet",
)
# Additional parquet file names (presumably dataset-viewer splits; confirm against callers).
VIEWER_SPLIT_FILENAMES: tuple[str, ...] = (
    "issue_comments.parquet",
    "pr_comments.parquet",
)

# Well-known file names and repo-relative paths.
ROOT_MANIFEST_FILENAME = "manifest.json"
README_FILENAME = "README.md"
STATE_WATERMARK_PATH = "state/watermark.json"
SNAPSHOTS_LATEST_PATH = "snapshots/latest.json"
PR_SCOPE_CLUSTERS_FILENAME = "pr-scope-clusters.json"

# New-contributor artifacts.
NEW_CONTRIBUTORS_PARQUET_FILENAME = "new_contributors.parquet"
NEW_CONTRIBUTORS_REPORT_JSON_FILENAME = "new-contributors-report.json"
NEW_CONTRIBUTORS_REPORT_MARKDOWN_FILENAME = "new-contributors-report.md"
CONTRIBUTOR_ARTIFACT_FILENAMES: tuple[str, ...] = (
    NEW_CONTRIBUTORS_PARQUET_FILENAME,
    NEW_CONTRIBUTORS_REPORT_JSON_FILENAME,
    NEW_CONTRIBUTORS_REPORT_MARKDOWN_FILENAME,
)

# Analysis report file name for each analysis variant.
ANALYSIS_REPORT_FILENAME_BY_VARIANT: dict[str, str] = {
    "deterministic": "analysis-report.json",
    "hybrid": "analysis-report-hybrid.json",
}
HYBRID_ANALYSIS_REVIEWS_FILENAME = "analysis-report-hybrid.llm-reviews.json"
# Deterministic report, hybrid report, then the hybrid reviews file.
LEGACY_ANALYSIS_FILENAMES: tuple[str, ...] = (
    *ANALYSIS_REPORT_FILENAME_BY_VARIANT.values(),
    HYBRID_ANALYSIS_REVIEWS_FILENAME,
)

# Location of the published "current" analysis and its manifest.
CURRENT_ANALYSIS_DIR = PurePosixPath("analysis/current")
CURRENT_ANALYSIS_MANIFEST_PATH = (CURRENT_ANALYSIS_DIR / ROOT_MANIFEST_FILENAME).as_posix()
ANALYSIS_MANIFEST_SCHEMA_VERSION = 1
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass(frozen=True, slots=True)
|
| 54 |
+
class ResolvedAnalysisReportPath:
|
| 55 |
+
path: Path
|
| 56 |
+
variant: str
|
| 57 |
+
source: str
|
| 58 |
+
snapshot_id: str | None = None
|
| 59 |
+
analysis_id: str | None = None
|
| 60 |
+
|
| 61 |
|
| 62 |
def default_hf_materialize_dir(output_dir: Path, repo_id: str, revision: str | None) -> Path:
|
| 63 |
suffix = repo_id.replace("/", "--")
|
|
|
|
| 66 |
return output_dir.resolve() / "snapshots" / f"hf-{suffix}"
|
| 67 |
|
| 68 |
|
| 69 |
+
def repo_relative_path_to_local(base_dir: Path, repo_relative_path: str) -> Path:
    """Map a POSIX-style repo-relative path onto ``base_dir``.

    The repo path is split into its POSIX components, which are then joined
    onto ``base_dir`` one by one.
    """
    local_path = base_dir
    for segment in PurePosixPath(repo_relative_path).parts:
        local_path = local_path / segment
    return local_path
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def snapshot_artifact_path(snapshot_id: str, filename: str) -> str:
    """Return the repo-relative path ``snapshots/<snapshot_id>/<filename>`` as a string."""
    return PurePosixPath("snapshots", snapshot_id, filename).as_posix()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def archived_snapshot_manifest_path(snapshot_id: str) -> str:
    """Return the repo-relative path of the manifest stored under ``snapshots/<snapshot_id>``."""
    manifest_name = ROOT_MANIFEST_FILENAME
    return snapshot_artifact_path(snapshot_id, manifest_name)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def analysis_run_artifact_path(snapshot_id: str, analysis_id: str, filename: str) -> str:
    """Return ``snapshots/<snapshot_id>/analysis-runs/<analysis_id>/<filename>`` as a string."""
    return PurePosixPath(
        "snapshots",
        snapshot_id,
        "analysis-runs",
        analysis_id,
        filename,
    ).as_posix()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def analysis_run_manifest_path(snapshot_id: str, analysis_id: str) -> str:
    """Return the repo-relative path of the manifest belonging to one analysis run."""
    manifest_name = ROOT_MANIFEST_FILENAME
    return analysis_run_artifact_path(snapshot_id, analysis_id, manifest_name)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def current_analysis_artifact_path(filename: str) -> str:
    """Return the repo-relative path of ``filename`` inside ``CURRENT_ANALYSIS_DIR``."""
    return (CURRENT_ANALYSIS_DIR / filename).as_posix()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def repo_key(repo_slug: str) -> str:
    """Return the key derived from ``repo_slug`` by the module's ``_path_key`` helper."""
    key = _path_key(repo_slug)
    return key
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def model_key(model: str) -> str:
    """Return the key derived from ``model`` by the module's ``_path_key`` helper."""
    key = _path_key(model)
    return key
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def build_current_analysis_manifest(
    *,
    repo: str,
    snapshot_id: str,
    analysis_id: str,
    variant: str,
    channel: str,
    model: str | None,
    published_at: str,
    include_hybrid_reviews: bool,
) -> dict[str, Any]:
    """Build and validate the manifest describing the published current analysis.

    ``artifacts`` points at the files under the current-analysis directory and
    ``archived_artifacts`` at the matching files of the analysis run. Both
    always contain the ``"hybrid"`` report and, when
    ``include_hybrid_reviews`` is true, a ``"hybrid_reviews"`` entry.

    Returns:
        The payload as returned by ``validate_current_analysis_manifest``.
    """
    published_filenames = {"hybrid": ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]}
    if include_hybrid_reviews:
        published_filenames["hybrid_reviews"] = HYBRID_ANALYSIS_REVIEWS_FILENAME

    artifacts = {
        key: current_analysis_artifact_path(name) for key, name in published_filenames.items()
    }
    archived_artifacts = {
        key: analysis_run_artifact_path(snapshot_id, analysis_id, name)
        for key, name in published_filenames.items()
    }
    return validate_current_analysis_manifest(
        {
            "schema_version": ANALYSIS_MANIFEST_SCHEMA_VERSION,
            "repo": repo,
            "snapshot_id": snapshot_id,
            "analysis_id": analysis_id,
            "variant": variant,
            "channel": channel,
            "model": model,
            "published_at": published_at,
            "artifacts": artifacts,
            "archived_artifacts": archived_artifacts,
        }
    )
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def build_archived_analysis_run_manifest(
    *,
    repo: str,
    snapshot_id: str,
    analysis_id: str,
    variant: str,
    channel: str,
    model: str | None,
    published_at: str,
    include_hybrid_reviews: bool,
) -> dict[str, Any]:
    """Build and validate the manifest stored alongside an archived analysis run.

    ``artifacts`` always contains the ``"hybrid"`` report path inside the
    analysis-run directory and, when ``include_hybrid_reviews`` is true, a
    ``"hybrid_reviews"`` entry as well.

    Returns:
        The payload as returned by ``validate_archived_analysis_run_manifest``.
    """
    archived_filenames = {"hybrid": ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]}
    if include_hybrid_reviews:
        archived_filenames["hybrid_reviews"] = HYBRID_ANALYSIS_REVIEWS_FILENAME

    artifacts = {
        key: analysis_run_artifact_path(snapshot_id, analysis_id, name)
        for key, name in archived_filenames.items()
    }
    return validate_archived_analysis_run_manifest(
        {
            "schema_version": ANALYSIS_MANIFEST_SCHEMA_VERSION,
            "repo": repo,
            "snapshot_id": snapshot_id,
            "analysis_id": analysis_id,
            "variant": variant,
            "channel": channel,
            "model": model,
            "published_at": published_at,
            "artifacts": artifacts,
        }
    )
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def load_current_analysis_manifest(path: Path) -> dict[str, Any]:
    """Read the current analysis manifest at ``path`` and validate it.

    Raises:
        ValueError: If the file content is not a JSON object. Validation
            errors from ``validate_current_analysis_manifest`` propagate.
    """
    payload = read_json(path)
    if isinstance(payload, dict):
        return validate_current_analysis_manifest(payload)
    raise ValueError(f"Current analysis manifest at {path} must contain a JSON object.")
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def load_archived_analysis_run_manifest(path: Path) -> dict[str, Any]:
    """Read the archived analysis-run manifest at ``path`` and validate it.

    Raises:
        ValueError: If the file content is not a JSON object. Validation
            errors from ``validate_archived_analysis_run_manifest`` propagate.
    """
    payload = read_json(path)
    if isinstance(payload, dict):
        return validate_archived_analysis_run_manifest(payload)
    raise ValueError(f"Archived analysis manifest at {path} must contain a JSON object.")
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def resolve_default_dashboard_analysis_report(
    snapshot_dir: Path,
) -> ResolvedAnalysisReportPath | None:
    """Pick the analysis report to use by default for ``snapshot_dir``.

    The published current analysis is returned when it resolves and
    ``_analysis_matches_snapshot`` accepts it; otherwise the snapshot-local
    report lookup (variant ``"auto"``) is used.
    """
    published = resolve_current_analysis_report(snapshot_dir)
    if published is None or not _analysis_matches_snapshot(snapshot_dir, published):
        return resolve_snapshot_local_analysis_report(snapshot_dir, variant="auto")
    return published
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def resolve_current_analysis_report(
    snapshot_dir: Path,
    *,
    variant: str = "auto",
) -> ResolvedAnalysisReportPath | None:
    """Resolve the report referenced by the current analysis manifest under ``snapshot_dir``.

    Returns:
        ``None`` when no current analysis manifest exists locally; otherwise
        the resolved report with ``source="current"`` and the manifest's
        snapshot and analysis ids.

    Raises:
        ValueError: If the manifest has no artifact for the requested variant
            or the referenced artifact file is missing locally.
    """
    normalized = _normalize_analysis_variant(variant)
    manifest_path = repo_relative_path_to_local(snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH)
    if not manifest_path.exists():
        return None

    manifest = load_current_analysis_manifest(manifest_path)
    artifact_key = _analysis_artifact_key_for_variant(normalized, manifest_kind="current")
    artifact_path = manifest.get("artifacts", {}).get(artifact_key)
    if not (isinstance(artifact_path, str) and artifact_path):
        if normalized == "auto":
            raise ValueError(
                "Published current analysis manifest does not provide the canonical hybrid artifact."
            )
        raise ValueError(
            f"Published current analysis manifest does not provide the {normalized} artifact."
        )

    report_path = repo_relative_path_to_local(snapshot_dir, artifact_path)
    if not report_path.exists():
        raise ValueError(
            f"Published current analysis artifact {artifact_path!r} is missing from the materialized snapshot."
        )

    # The "hybrid" artifact key always reports the hybrid variant, even for "auto".
    resolved_variant = normalized if artifact_key != "hybrid" else "hybrid"
    return ResolvedAnalysisReportPath(
        path=report_path,
        variant=resolved_variant,
        source="current",
        snapshot_id=str(manifest["snapshot_id"]),
        analysis_id=str(manifest["analysis_id"]),
    )
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def resolve_snapshot_local_analysis_report(
    snapshot_dir: Path,
    *,
    variant: str = "auto",
) -> ResolvedAnalysisReportPath | None:
    """Find an analysis report stored directly inside ``snapshot_dir``.

    With variant ``"auto"`` the hybrid report is preferred and the
    deterministic report is the fallback; any other variant is looked up
    on its own. Returns ``None`` when no matching file exists.
    """
    normalized = _normalize_analysis_variant(variant)
    candidates = ("hybrid", "deterministic") if normalized == "auto" else (normalized,)
    for candidate in candidates:
        report_path = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT[candidate]
        if report_path.exists():
            return ResolvedAnalysisReportPath(
                path=report_path,
                variant=candidate,
                source="snapshot",
            )
    return None
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def validate_current_analysis_manifest(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate a current analysis manifest and return the validated mapping.

    On top of the shared manifest validation, ``archived_artifacts`` is
    validated against the analysis-run path of the manifest's snapshot and
    analysis ids, and its keys must equal the keys of ``artifacts``.

    Raises:
        ValueError: If the two artifact mappings have different keys.
            Errors from the underlying validators propagate.
    """
    validated = _validate_analysis_manifest(payload, require_archived_artifacts=True)
    archived_input = dict(validated["archived_artifacts"])
    archive_prefix = analysis_run_artifact_path(
        str(validated["snapshot_id"]),
        str(validated["analysis_id"]),
        "",
    )
    archived_artifacts = _validate_artifacts(archived_input, expected_prefix=archive_prefix)
    if set(archived_artifacts) != set(validated["artifacts"]):
        raise ValueError("Current analysis manifest artifacts and archived_artifacts must match.")
    validated["archived_artifacts"] = archived_artifacts
    return validated
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def validate_archived_analysis_run_manifest(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate an archived analysis-run manifest, which carries no ``archived_artifacts``."""
    validated = _validate_analysis_manifest(payload, require_archived_artifacts=False)
    return validated
|
| 290 |
+
|
| 291 |
+
|
| 292 |
def load_latest_snapshot_pointer(snapshots_root: Path) -> Path | None:
|
| 293 |
+
resolved_snapshots_root = snapshots_root.resolve()
|
| 294 |
+
latest_path = resolved_snapshots_root / "latest.json"
|
| 295 |
if not latest_path.exists():
|
| 296 |
return None
|
| 297 |
payload = read_json(latest_path)
|
| 298 |
snapshot_dir = payload.get("snapshot_dir")
|
| 299 |
if isinstance(snapshot_dir, str) and snapshot_dir:
|
| 300 |
+
path = Path(snapshot_dir)
|
| 301 |
+
if path.is_absolute():
|
| 302 |
+
return path.resolve()
|
| 303 |
+
return (resolved_snapshots_root.parent / path).resolve()
|
| 304 |
return None
|
| 305 |
|
| 306 |
|
|
|
|
| 327 |
if snapshot_dirs:
|
| 328 |
return snapshot_dirs[-1].resolve()
|
| 329 |
raise FileNotFoundError(f"Could not resolve a snapshot directory from {latest_path}")
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def _validate_analysis_manifest(
    payload: dict[str, Any],
    *,
    require_archived_artifacts: bool,
) -> dict[str, Any]:
    """Validate the fields shared by analysis manifests and return a copy.

    Checks, in order: the schema version, the required non-empty string
    fields (stored back stripped), the optional ``model`` string, and the
    ``artifacts`` mapping.  When ``require_archived_artifacts`` is true the
    artifacts are expected under the current-analysis prefix and an
    ``archived_artifacts`` object must be present (its keys are stringified
    but its values are not validated here); otherwise the artifacts are
    expected under the per-run prefix built from ``snapshot_id`` and
    ``analysis_id``.

    Raises:
        ValueError: on the first failed check.
    """
    manifest = {str(name): item for name, item in payload.items()}

    schema_version = manifest.get("schema_version")
    if schema_version != ANALYSIS_MANIFEST_SCHEMA_VERSION:
        raise ValueError(
            f"Unsupported analysis manifest schema version: {manifest.get('schema_version')!r}"
        )

    required_fields = ("repo", "snapshot_id", "analysis_id", "variant", "channel", "published_at")
    for field in required_fields:
        raw = manifest.get(field)
        if not (isinstance(raw, str) and str(raw).strip()):
            raise ValueError(f"Analysis manifest field {field!r} must be a non-empty string.")
        manifest[field] = str(raw).strip()

    model = manifest.get("model")
    if not (model is None or isinstance(model, str)):
        raise ValueError("Analysis manifest field 'model' must be a string when present.")

    artifacts = manifest.get("artifacts")
    if not isinstance(artifacts, dict):
        raise ValueError("Analysis manifest field 'artifacts' must be an object.")

    if require_archived_artifacts:
        expected_prefix = current_analysis_artifact_path("")
    else:
        expected_prefix = analysis_run_artifact_path(
            str(manifest["snapshot_id"]),
            str(manifest["analysis_id"]),
            "",
        )
    manifest["artifacts"] = _validate_artifacts(dict(artifacts), expected_prefix=expected_prefix)

    if not require_archived_artifacts:
        return manifest

    archived = manifest.get("archived_artifacts")
    if not isinstance(archived, dict):
        raise ValueError(
            "Current analysis manifest field 'archived_artifacts' must be an object."
        )
    manifest["archived_artifacts"] = {str(name): item for name, item in archived.items()}
    return manifest
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def _validate_artifacts(artifacts: dict[str, Any], *, expected_prefix: str) -> dict[str, str]:
|
| 375 |
+
normalized = {str(key): value for key, value in artifacts.items()}
|
| 376 |
+
hybrid_path = normalized.get("hybrid")
|
| 377 |
+
if not isinstance(hybrid_path, str) or not hybrid_path:
|
| 378 |
+
raise ValueError("Analysis manifest must include artifacts.hybrid.")
|
| 379 |
+
validated = {"hybrid": hybrid_path}
|
| 380 |
+
hybrid_reviews_path = normalized.get("hybrid_reviews")
|
| 381 |
+
if hybrid_reviews_path is not None:
|
| 382 |
+
if not isinstance(hybrid_reviews_path, str) or not hybrid_reviews_path:
|
| 383 |
+
raise ValueError(
|
| 384 |
+
"Analysis manifest artifacts.hybrid_reviews must be a non-empty string."
|
| 385 |
+
)
|
| 386 |
+
validated["hybrid_reviews"] = hybrid_reviews_path
|
| 387 |
+
for key, value in validated.items():
|
| 388 |
+
if not value.startswith(expected_prefix):
|
| 389 |
+
raise ValueError(
|
| 390 |
+
f"Analysis manifest artifact {key!r} must live under {expected_prefix!r}, got {value!r}."
|
| 391 |
+
)
|
| 392 |
+
return validated
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def _path_key(value: str) -> str:
|
| 396 |
+
normalized = re.sub(r"[^a-z0-9]+", "-", value.strip().lower())
|
| 397 |
+
normalized = re.sub(r"-+", "-", normalized).strip("-")
|
| 398 |
+
if not normalized:
|
| 399 |
+
raise ValueError("Expected a non-empty path key value.")
|
| 400 |
+
return normalized
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def _analysis_matches_snapshot(
    snapshot_dir: Path,
    analysis_path: ResolvedAnalysisReportPath,
) -> bool:
    """Report whether ``analysis_path`` belongs to the snapshot in ``snapshot_dir``.

    The snapshot id is taken from the root manifest inside ``snapshot_dir``
    when that file exists and carries a non-null ``snapshot_id``; otherwise
    the directory name is used.  Both sides are compared as strings.
    """
    wanted_id = str(analysis_path.snapshot_id)
    manifest_file = snapshot_dir / ROOT_MANIFEST_FILENAME
    if manifest_file.exists():
        declared_id = read_json(manifest_file).get("snapshot_id")
        if declared_id is not None:
            return str(declared_id) == wanted_id
    # No usable manifest id: fall back to the directory name.
    return snapshot_dir.name == wanted_id
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def _normalize_analysis_variant(variant: str) -> str:
|
| 417 |
+
normalized = variant.strip().lower()
|
| 418 |
+
if normalized not in {"auto", "deterministic", "hybrid"}:
|
| 419 |
+
raise ValueError(
|
| 420 |
+
f"Unsupported analysis variant {variant!r}; expected auto, hybrid, or deterministic."
|
| 421 |
+
)
|
| 422 |
+
return normalized
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def _analysis_artifact_key_for_variant(variant: str, *, manifest_kind: str) -> str:
|
| 426 |
+
if variant in {"auto", "hybrid"}:
|
| 427 |
+
return "hybrid"
|
| 428 |
+
raise ValueError(
|
| 429 |
+
f"Published {manifest_kind} analysis only serves canonical hybrid artifacts; requested {variant!r}."
|
| 430 |
+
)
|
src/slop_farmer/reports/analysis.py
CHANGED
|
@@ -19,15 +19,12 @@ from rank_bm25 import BM25Okapi
|
|
| 19 |
from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
|
| 20 |
from slop_farmer.data.links import build_text_link_rows
|
| 21 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
|
| 22 |
-
from slop_farmer.data.
|
| 23 |
-
from slop_farmer.data.snapshot_paths import (
|
| 24 |
-
default_hf_materialize_dir,
|
| 25 |
-
resolve_snapshot_dir_from_output,
|
| 26 |
-
)
|
| 27 |
from slop_farmer.reports.analysis_cache import (
|
| 28 |
HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
|
| 29 |
PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
|
| 30 |
HybridReviewCacheEntry,
|
|
|
|
| 31 |
HybridReviewCacheManifest,
|
| 32 |
HybridReviewCacheStore,
|
| 33 |
HybridReviewSettingsFingerprint,
|
|
@@ -89,12 +86,12 @@ LLM_PROVIDER_ENV_VARS = (
|
|
| 89 |
"DEEPSEEK_API_KEY",
|
| 90 |
)
|
| 91 |
LLM_PACKET_CHARS_PER_TOKEN = 4
|
| 92 |
-
LLM_MAX_INPUT_TOKENS =
|
| 93 |
-
LLM_MAX_NODES_PER_PACKET =
|
| 94 |
-
LLM_MAX_SOFT_PAIRS_PER_PACKET =
|
| 95 |
-
LLM_MAX_DIFF_CHARS_PER_ITEM =
|
| 96 |
-
LLM_MAX_FILENAMES_PER_ITEM =
|
| 97 |
-
LLM_SKIP_EVALUATOR_ABOVE_TOKENS =
|
| 98 |
LLM_OVERFLOW_POLICY = "truncate_then_skip"
|
| 99 |
LLM_SHARED_TARGET_MAX_NEIGHBORS_PER_PR = 3
|
| 100 |
LLM_SHARED_TARGET_MAX_EXTRA_PAIRS_PER_TARGET = 18
|
|
@@ -311,6 +308,42 @@ class AnalysisBuildResult:
|
|
| 311 |
llm_reviews: list[dict[str, Any]]
|
| 312 |
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
def _hybrid_review_cache_manifest() -> HybridReviewCacheManifest:
|
| 315 |
return HybridReviewCacheManifest(
|
| 316 |
cache_schema_version=HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
|
|
@@ -766,18 +799,14 @@ def _artifact_suffix(row: dict[str, Any] | None, kind: str) -> str:
|
|
| 766 |
|
| 767 |
|
| 768 |
def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
local_dir=materialize_dir,
|
| 778 |
-
revision=options.hf_revision,
|
| 779 |
-
).resolve()
|
| 780 |
-
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
| 781 |
|
| 782 |
|
| 783 |
def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
|
|
@@ -927,39 +956,46 @@ async def _build_report(snapshot: SnapshotData, options: AnalysisOptions) -> Ana
|
|
| 927 |
review_comment_map=review_comment_map,
|
| 928 |
)
|
| 929 |
issue_soft_candidates = _issue_soft_candidates(issue_map, features, issue_hard_pairs)
|
| 930 |
-
|
| 931 |
options=options,
|
| 932 |
snapshot=snapshot,
|
|
|
|
|
|
|
| 933 |
features=features,
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 939 |
)
|
| 940 |
issue_pairs = dict(issue_hard_pairs)
|
| 941 |
for pair, detail in accepted_issue_pairs.items():
|
| 942 |
issue_pairs.setdefault(pair, set()).update(
|
| 943 |
detail.get("evidence_types") or {"soft_similarity"}
|
| 944 |
)
|
| 945 |
-
|
| 946 |
-
pr_soft_candidates, pr_pair_target_issues = _pr_duplicate_candidates(
|
| 947 |
-
options=options,
|
| 948 |
-
snapshot=snapshot,
|
| 949 |
-
issue_map=issue_map,
|
| 950 |
-
pr_map=pr_map,
|
| 951 |
-
features=features,
|
| 952 |
-
)
|
| 953 |
-
accepted_pr_pairs, pr_llm_enabled, pr_llm_reviews = await _accepted_soft_pairs(
|
| 954 |
-
options=options,
|
| 955 |
-
snapshot=snapshot,
|
| 956 |
-
features=features,
|
| 957 |
-
hard_pairs={},
|
| 958 |
-
soft_candidates=pr_soft_candidates,
|
| 959 |
-
label="pull_request",
|
| 960 |
-
hybrid_review_cache=hybrid_review_cache,
|
| 961 |
-
llm_available=llm_available,
|
| 962 |
-
)
|
| 963 |
pr_pairs: dict[tuple[str, str], set[str]] = {}
|
| 964 |
for pair, detail in accepted_pr_pairs.items():
|
| 965 |
pr_pairs.setdefault(pair, set()).update(detail.get("evidence_types") or {"soft_similarity"})
|
|
@@ -1873,28 +1909,21 @@ def _review_subpacket(packet: dict[str, Any], soft_pairs: list[dict[str, Any]])
|
|
| 1873 |
}
|
| 1874 |
|
| 1875 |
|
| 1876 |
-
def _split_packet_for_review(packet: dict[str, Any]) -> list[dict[str, Any]]:
|
| 1877 |
-
if
|
| 1878 |
-
|
| 1879 |
-
|
| 1880 |
-
):
|
| 1881 |
return [packet]
|
| 1882 |
batches: list[list[dict[str, Any]]] = []
|
| 1883 |
current_batch: list[dict[str, Any]] = []
|
| 1884 |
-
current_nodes: set[str] = set()
|
| 1885 |
for soft_pair in sorted(packet["soft_pairs"], key=_soft_pair_review_sort_key):
|
| 1886 |
-
|
| 1887 |
-
|
| 1888 |
-
if current_batch and (
|
| 1889 |
-
len(current_batch) >= LLM_MAX_SOFT_PAIRS_PER_PACKET
|
| 1890 |
-
or len(next_nodes) > LLM_MAX_NODES_PER_PACKET
|
| 1891 |
-
):
|
| 1892 |
batches.append(current_batch)
|
| 1893 |
current_batch = [soft_pair]
|
| 1894 |
-
current_nodes = set(pair_nodes)
|
| 1895 |
continue
|
| 1896 |
-
current_batch
|
| 1897 |
-
current_nodes = next_nodes
|
| 1898 |
if current_batch:
|
| 1899 |
batches.append(current_batch)
|
| 1900 |
return [_review_subpacket(packet, batch) for batch in batches]
|
|
@@ -1985,7 +2014,8 @@ def _should_run_evaluator(
|
|
| 1985 |
aggressively_trimmed: bool,
|
| 1986 |
analyst_result: ClusterAnalystResponse,
|
| 1987 |
) -> bool:
|
| 1988 |
-
|
|
|
|
| 1989 |
return False
|
| 1990 |
if budget.estimated_eval_tokens > LLM_SKIP_EVALUATOR_ABOVE_TOKENS:
|
| 1991 |
return False
|
|
@@ -2020,6 +2050,166 @@ def _packet_soft_pair_ids(packet: dict[str, Any]) -> list[str]:
|
|
| 2020 |
]
|
| 2021 |
|
| 2022 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2023 |
async def _accepted_soft_pairs(
|
| 2024 |
*,
|
| 2025 |
options: AnalysisOptions,
|
|
@@ -2030,6 +2220,7 @@ async def _accepted_soft_pairs(
|
|
| 2030 |
label: str,
|
| 2031 |
hybrid_review_cache: HybridReviewCacheStore,
|
| 2032 |
llm_available: bool,
|
|
|
|
| 2033 |
) -> tuple[dict[tuple[str, str], dict[str, Any]], bool, list[dict[str, Any]]]:
|
| 2034 |
del snapshot
|
| 2035 |
if not soft_candidates:
|
|
@@ -2048,6 +2239,8 @@ async def _accepted_soft_pairs(
|
|
| 2048 |
for pair in soft_candidates:
|
| 2049 |
candidate_graph.setdefault(pair, set()).add("soft_similarity")
|
| 2050 |
component_payloads = _component_packets(features, candidate_graph, soft_candidates)
|
|
|
|
|
|
|
| 2051 |
accepted: dict[tuple[str, str], dict[str, Any]] = dict(deterministic_accepts)
|
| 2052 |
llm_used = False
|
| 2053 |
review_records: list[dict[str, Any]] = []
|
|
@@ -2055,7 +2248,7 @@ async def _accepted_soft_pairs(
|
|
| 2055 |
for index, payload in enumerate(component_payloads, start=1):
|
| 2056 |
component_budget = _estimate_packet_size(payload, options.model)
|
| 2057 |
cluster_id = _cluster_id_from_nodes(payload["nodes"])
|
| 2058 |
-
review_units = _split_packet_for_review(payload)
|
| 2059 |
if len(review_units) > 1:
|
| 2060 |
_analysis_log(
|
| 2061 |
f"LLM {label} soft-edge review {index}/{total_components}: "
|
|
@@ -2064,10 +2257,6 @@ async def _accepted_soft_pairs(
|
|
| 2064 |
f"est_tokens={component_budget.estimated_input_tokens})"
|
| 2065 |
)
|
| 2066 |
for unit_index, review_unit in enumerate(review_units, start=1):
|
| 2067 |
-
prefix = (
|
| 2068 |
-
f"LLM {label} soft-edge review {index}/{total_components}"
|
| 2069 |
-
f" unit {unit_index}/{len(review_units)}"
|
| 2070 |
-
)
|
| 2071 |
prepared = _prepare_packet_for_llm(
|
| 2072 |
review_unit,
|
| 2073 |
options.model,
|
|
@@ -2075,41 +2264,29 @@ async def _accepted_soft_pairs(
|
|
| 2075 |
)
|
| 2076 |
if prepared is None:
|
| 2077 |
unit_budget = _estimate_packet_size(review_unit, options.model)
|
| 2078 |
-
|
| 2079 |
-
|
| 2080 |
-
|
| 2081 |
-
|
| 2082 |
-
|
| 2083 |
-
|
| 2084 |
-
|
| 2085 |
-
|
| 2086 |
-
|
| 2087 |
-
|
| 2088 |
-
|
| 2089 |
-
|
| 2090 |
-
|
| 2091 |
-
|
| 2092 |
-
|
| 2093 |
-
|
| 2094 |
-
|
| 2095 |
-
|
| 2096 |
-
"
|
| 2097 |
-
"
|
| 2098 |
-
|
| 2099 |
-
|
| 2100 |
-
|
| 2101 |
-
"overflow_policy": LLM_OVERFLOW_POLICY,
|
| 2102 |
-
"trimmed": True,
|
| 2103 |
-
"aggressively_trimmed": True,
|
| 2104 |
-
"split": len(review_units) > 1,
|
| 2105 |
-
"analyst_result": None,
|
| 2106 |
-
"evaluator_result": None,
|
| 2107 |
-
"evaluator_used": False,
|
| 2108 |
-
"retried": False,
|
| 2109 |
-
"accepted_nontrivial_soft_edge": False,
|
| 2110 |
-
"error_kind": None,
|
| 2111 |
-
"error_message": None,
|
| 2112 |
-
}
|
| 2113 |
)
|
| 2114 |
continue
|
| 2115 |
prepared_review_unit = _prepared_review_unit_payload(prepared)
|
|
@@ -2118,85 +2295,113 @@ async def _accepted_soft_pairs(
|
|
| 2118 |
model=options.model,
|
| 2119 |
prepared_review_unit=prepared_review_unit,
|
| 2120 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2121 |
cached_entry = hybrid_review_cache.get(cache_key)
|
| 2122 |
-
cache_hit = cached_entry is not None
|
| 2123 |
if cached_entry is not None:
|
| 2124 |
-
|
| 2125 |
-
|
| 2126 |
-
|
| 2127 |
-
|
| 2128 |
-
|
| 2129 |
-
|
| 2130 |
-
|
| 2131 |
-
|
| 2132 |
-
|
| 2133 |
-
|
| 2134 |
-
"
|
|
|
|
| 2135 |
)
|
| 2136 |
-
|
| 2137 |
-
|
| 2138 |
-
|
| 2139 |
-
|
| 2140 |
-
|
| 2141 |
-
|
| 2142 |
-
|
| 2143 |
-
|
| 2144 |
-
|
| 2145 |
-
|
| 2146 |
-
|
| 2147 |
-
"model": options.model,
|
| 2148 |
-
"cluster_id": cluster_id,
|
| 2149 |
-
"nodes": list(prepared.packet["nodes"]),
|
| 2150 |
-
"soft_pairs": _packet_soft_pair_ids(prepared.packet),
|
| 2151 |
-
"prepared_review_unit_hash": cache_key.prepared_review_unit_hash,
|
| 2152 |
-
"component_budget": _packet_budget_json(component_budget),
|
| 2153 |
-
"budget": _packet_budget_json(prepared.budget),
|
| 2154 |
-
"overflow_policy": LLM_OVERFLOW_POLICY,
|
| 2155 |
-
"trimmed": prepared.trimmed,
|
| 2156 |
-
"aggressively_trimmed": prepared.aggressively_trimmed,
|
| 2157 |
-
"split": prepared.split,
|
| 2158 |
-
"analyst_result": None,
|
| 2159 |
-
"evaluator_result": None,
|
| 2160 |
-
"evaluator_used": False,
|
| 2161 |
-
"retried": False,
|
| 2162 |
-
"accepted_nontrivial_soft_edge": False,
|
| 2163 |
-
"error_kind": None,
|
| 2164 |
-
"error_message": None,
|
| 2165 |
-
}
|
| 2166 |
)
|
| 2167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2168 |
_analysis_log(
|
| 2169 |
-
f"{prefix}:
|
| 2170 |
-
f"
|
| 2171 |
-
f"{
|
| 2172 |
-
f"{' split' if prepared.split else ''}"
|
| 2173 |
)
|
| 2174 |
-
|
| 2175 |
-
if
|
| 2176 |
-
hybrid_review_cache.put(
|
| 2177 |
-
HybridReviewCacheEntry(
|
| 2178 |
-
key=cache_key,
|
| 2179 |
-
result=_cluster_analysis_call_result_payload(result),
|
| 2180 |
-
cached_at=_iso_now(),
|
| 2181 |
-
nodes=tuple(prepared.packet["nodes"]),
|
| 2182 |
-
soft_pairs=tuple(_packet_soft_pair_ids(prepared.packet)),
|
| 2183 |
-
budget=_packet_budget_json(prepared.budget),
|
| 2184 |
-
split=prepared.split,
|
| 2185 |
-
trimmed=prepared.trimmed,
|
| 2186 |
-
aggressively_trimmed=prepared.aggressively_trimmed,
|
| 2187 |
-
)
|
| 2188 |
-
)
|
| 2189 |
-
accepted_nontrivial = False
|
| 2190 |
-
if result.analyst_result is None:
|
| 2191 |
-
if result.error_kind is not None:
|
| 2192 |
_analysis_log(
|
| 2193 |
-
f"{prefix}: {result.error_kind}"
|
| 2194 |
-
f" (nodes={
|
| 2195 |
-
f"est_tokens={
|
| 2196 |
f"overflow_policy={LLM_OVERFLOW_POLICY})"
|
| 2197 |
)
|
| 2198 |
else:
|
| 2199 |
-
_analysis_log(f"{prefix}: no result")
|
| 2200 |
else:
|
| 2201 |
llm_used = True
|
| 2202 |
verdicts = {
|
|
@@ -2205,18 +2410,28 @@ async def _accepted_soft_pairs(
|
|
| 2205 |
}
|
| 2206 |
accepted_count = sum(1 for verdict in verdicts.values() if verdict.accept)
|
| 2207 |
rejected_count = sum(1 for verdict in verdicts.values() if not verdict.accept)
|
| 2208 |
-
accepted_nontrivial =
|
| 2209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2210 |
)
|
| 2211 |
evaluator_status = "used" if result.evaluator_used else "skipped"
|
| 2212 |
_analysis_log(
|
| 2213 |
-
f"{prefix}: {accepted_count} accepted, {rejected_count} rejected, "
|
| 2214 |
-
f"evaluator={evaluator_status}, source={
|
| 2215 |
)
|
| 2216 |
if result.error_kind is not None:
|
| 2217 |
-
_analysis_log(
|
| 2218 |
-
|
| 2219 |
-
|
|
|
|
|
|
|
| 2220 |
verdict = verdicts.get(normalized_pair)
|
| 2221 |
if verdict is None:
|
| 2222 |
continue
|
|
@@ -2224,45 +2439,31 @@ async def _accepted_soft_pairs(
|
|
| 2224 |
accepted[normalized_pair] = soft_candidates[normalized_pair]
|
| 2225 |
else:
|
| 2226 |
accepted.pop(normalized_pair, None)
|
| 2227 |
-
|
| 2228 |
-
|
| 2229 |
-
"
|
| 2230 |
-
|
| 2231 |
-
|
| 2232 |
-
|
| 2233 |
-
|
| 2234 |
-
|
| 2235 |
-
|
| 2236 |
-
|
| 2237 |
-
|
| 2238 |
-
|
| 2239 |
-
|
| 2240 |
-
|
| 2241 |
-
|
| 2242 |
-
|
| 2243 |
-
|
| 2244 |
-
|
| 2245 |
-
|
| 2246 |
-
|
| 2247 |
-
|
| 2248 |
-
|
| 2249 |
-
|
| 2250 |
-
None
|
| 2251 |
-
if result.analyst_result is None
|
| 2252 |
-
else result.analyst_result.model_dump(mode="json")
|
| 2253 |
-
),
|
| 2254 |
-
"evaluator_result": (
|
| 2255 |
-
None
|
| 2256 |
-
if result.evaluator_result is None
|
| 2257 |
-
else result.evaluator_result.model_dump(mode="json")
|
| 2258 |
-
),
|
| 2259 |
-
"evaluator_used": result.evaluator_used,
|
| 2260 |
-
"retried": result.retried,
|
| 2261 |
-
"accepted_nontrivial_soft_edge": accepted_nontrivial,
|
| 2262 |
-
"error_kind": result.error_kind,
|
| 2263 |
-
"error_message": result.error_message,
|
| 2264 |
-
}
|
| 2265 |
)
|
|
|
|
| 2266 |
return accepted, llm_used, review_records
|
| 2267 |
|
| 2268 |
|
|
|
|
| 19 |
from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
|
| 20 |
from slop_farmer.data.links import build_text_link_rows
|
| 21 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
|
| 22 |
+
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
from slop_farmer.reports.analysis_cache import (
|
| 24 |
HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
|
| 25 |
PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
|
| 26 |
HybridReviewCacheEntry,
|
| 27 |
+
HybridReviewCacheKey,
|
| 28 |
HybridReviewCacheManifest,
|
| 29 |
HybridReviewCacheStore,
|
| 30 |
HybridReviewSettingsFingerprint,
|
|
|
|
| 86 |
"DEEPSEEK_API_KEY",
|
| 87 |
)
|
| 88 |
LLM_PACKET_CHARS_PER_TOKEN = 4
|
| 89 |
+
LLM_MAX_INPUT_TOKENS = 60_000
|
| 90 |
+
LLM_MAX_NODES_PER_PACKET = 48
|
| 91 |
+
LLM_MAX_SOFT_PAIRS_PER_PACKET = 72
|
| 92 |
+
LLM_MAX_DIFF_CHARS_PER_ITEM = 1_200
|
| 93 |
+
LLM_MAX_FILENAMES_PER_ITEM = 16
|
| 94 |
+
LLM_SKIP_EVALUATOR_ABOVE_TOKENS = 60_000
|
| 95 |
LLM_OVERFLOW_POLICY = "truncate_then_skip"
|
| 96 |
LLM_SHARED_TARGET_MAX_NEIGHBORS_PER_PR = 3
|
| 97 |
LLM_SHARED_TARGET_MAX_EXTRA_PAIRS_PER_TARGET = 18
|
|
|
|
| 308 |
llm_reviews: list[dict[str, Any]]
|
| 309 |
|
| 310 |
|
| 311 |
+
@dataclass(frozen=True, slots=True)
|
| 312 |
+
class SoftPairReviewUnitMeta:
|
| 313 |
+
label: str
|
| 314 |
+
component_index: int
|
| 315 |
+
component_count: int
|
| 316 |
+
review_unit_index: int
|
| 317 |
+
review_unit_count: int
|
| 318 |
+
cluster_id: str
|
| 319 |
+
prefix: str
|
| 320 |
+
nodes: tuple[str, ...]
|
| 321 |
+
soft_pairs: tuple[str, ...]
|
| 322 |
+
component_budget: PacketBudget
|
| 323 |
+
budget: PacketBudget
|
| 324 |
+
prepared_review_unit_hash: str | None
|
| 325 |
+
trimmed: bool
|
| 326 |
+
aggressively_trimmed: bool
|
| 327 |
+
split: bool
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
@dataclass(frozen=True, slots=True)
|
| 331 |
+
class PendingSoftPairReview:
|
| 332 |
+
meta: SoftPairReviewUnitMeta
|
| 333 |
+
prepared: PreparedLlmPacket
|
| 334 |
+
cache_key: HybridReviewCacheKey
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
@dataclass(frozen=True, slots=True)
|
| 338 |
+
class CompletedSoftPairReview:
|
| 339 |
+
meta: SoftPairReviewUnitMeta
|
| 340 |
+
result: ClusterAnalysisCallResult | None
|
| 341 |
+
status: str
|
| 342 |
+
reason: str | None
|
| 343 |
+
source: str | None
|
| 344 |
+
cache_hit: bool
|
| 345 |
+
|
| 346 |
+
|
| 347 |
def _hybrid_review_cache_manifest() -> HybridReviewCacheManifest:
|
| 348 |
return HybridReviewCacheManifest(
|
| 349 |
cache_schema_version=HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
|
|
|
|
| 799 |
|
| 800 |
|
| 801 |
def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
    """Resolve the snapshot directory to analyse from ``options``.

    Forwards the explicit snapshot directory, the local ``snapshots`` root
    under the resolved output directory, and the Hugging Face repo settings
    to ``resolve_snapshot_source_dir``.
    """
    source_settings = {
        "snapshot_dir": options.snapshot_dir,
        "local_snapshots_root": options.output_dir.resolve() / "snapshots",
        "hf_repo_id": options.hf_repo_id,
        "hf_revision": options.hf_revision,
        "hf_materialize_dir": options.hf_materialize_dir,
        "hf_output_dir": options.output_dir,
    }
    return resolve_snapshot_source_dir(**source_settings)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
|
| 811 |
|
| 812 |
def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
|
|
|
|
| 956 |
review_comment_map=review_comment_map,
|
| 957 |
)
|
| 958 |
issue_soft_candidates = _issue_soft_candidates(issue_map, features, issue_hard_pairs)
|
| 959 |
+
pr_soft_candidates, pr_pair_target_issues = _pr_duplicate_candidates(
|
| 960 |
options=options,
|
| 961 |
snapshot=snapshot,
|
| 962 |
+
issue_map=issue_map,
|
| 963 |
+
pr_map=pr_map,
|
| 964 |
features=features,
|
| 965 |
+
)
|
| 966 |
+
review_semaphore = asyncio.Semaphore(options.hybrid_llm_concurrency)
|
| 967 |
+
(
|
| 968 |
+
(accepted_issue_pairs, issue_llm_enabled, issue_llm_reviews),
|
| 969 |
+
(accepted_pr_pairs, pr_llm_enabled, pr_llm_reviews),
|
| 970 |
+
) = await asyncio.gather(
|
| 971 |
+
_accepted_soft_pairs(
|
| 972 |
+
options=options,
|
| 973 |
+
snapshot=snapshot,
|
| 974 |
+
features=features,
|
| 975 |
+
hard_pairs=issue_hard_pairs,
|
| 976 |
+
soft_candidates=issue_soft_candidates,
|
| 977 |
+
label="issue",
|
| 978 |
+
hybrid_review_cache=hybrid_review_cache,
|
| 979 |
+
llm_available=llm_available,
|
| 980 |
+
review_semaphore=review_semaphore,
|
| 981 |
+
),
|
| 982 |
+
_accepted_soft_pairs(
|
| 983 |
+
options=options,
|
| 984 |
+
snapshot=snapshot,
|
| 985 |
+
features=features,
|
| 986 |
+
hard_pairs={},
|
| 987 |
+
soft_candidates=pr_soft_candidates,
|
| 988 |
+
label="pull_request",
|
| 989 |
+
hybrid_review_cache=hybrid_review_cache,
|
| 990 |
+
llm_available=llm_available,
|
| 991 |
+
review_semaphore=review_semaphore,
|
| 992 |
+
),
|
| 993 |
)
|
| 994 |
issue_pairs = dict(issue_hard_pairs)
|
| 995 |
for pair, detail in accepted_issue_pairs.items():
|
| 996 |
issue_pairs.setdefault(pair, set()).update(
|
| 997 |
detail.get("evidence_types") or {"soft_similarity"}
|
| 998 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 999 |
pr_pairs: dict[tuple[str, str], set[str]] = {}
|
| 1000 |
for pair, detail in accepted_pr_pairs.items():
|
| 1001 |
pr_pairs.setdefault(pair, set()).update(detail.get("evidence_types") or {"soft_similarity"})
|
|
|
|
| 1909 |
}
|
| 1910 |
|
| 1911 |
|
| 1912 |
+
def _split_packet_for_review(packet: dict[str, Any], model: str) -> list[dict[str, Any]]:
|
| 1913 |
+
if not packet["soft_pairs"]:
|
| 1914 |
+
return [packet]
|
| 1915 |
+
if not _packet_over_budget(_estimate_packet_size(packet, model)):
|
|
|
|
| 1916 |
return [packet]
|
| 1917 |
batches: list[list[dict[str, Any]]] = []
|
| 1918 |
current_batch: list[dict[str, Any]] = []
|
|
|
|
| 1919 |
for soft_pair in sorted(packet["soft_pairs"], key=_soft_pair_review_sort_key):
|
| 1920 |
+
candidate_batch = [*current_batch, soft_pair]
|
| 1921 |
+
candidate_packet = _review_subpacket(packet, candidate_batch)
|
| 1922 |
+
if current_batch and _packet_over_budget(_estimate_packet_size(candidate_packet, model)):
|
|
|
|
|
|
|
|
|
|
| 1923 |
batches.append(current_batch)
|
| 1924 |
current_batch = [soft_pair]
|
|
|
|
| 1925 |
continue
|
| 1926 |
+
current_batch = candidate_batch
|
|
|
|
| 1927 |
if current_batch:
|
| 1928 |
batches.append(current_batch)
|
| 1929 |
return [_review_subpacket(packet, batch) for batch in batches]
|
|
|
|
| 2014 |
aggressively_trimmed: bool,
|
| 2015 |
analyst_result: ClusterAnalystResponse,
|
| 2016 |
) -> bool:
|
| 2017 |
+
del split
|
| 2018 |
+
if aggressively_trimmed:
|
| 2019 |
return False
|
| 2020 |
if budget.estimated_eval_tokens > LLM_SKIP_EVALUATOR_ABOVE_TOKENS:
|
| 2021 |
return False
|
|
|
|
| 2050 |
]
|
| 2051 |
|
| 2052 |
|
| 2053 |
+
def _soft_pair_review_meta(
    *,
    label: str,
    component_index: int,
    component_count: int,
    review_unit_index: int,
    review_unit_count: int,
    cluster_id: str,
    component_budget: PacketBudget,
    budget: PacketBudget,
    prepared_review_unit_hash: str | None,
    trimmed: bool,
    aggressively_trimmed: bool,
    split: bool,
    packet: dict[str, Any],
) -> SoftPairReviewUnitMeta:
    """Build the metadata record for one review unit.

    Besides copying the keyword arguments through, this derives the log
    prefix from the label and the component/unit positions, and captures the
    packet's node ids (stringified) and soft-pair ids as tuples.
    """
    component_part = f"LLM {label} soft-edge review {component_index}/{component_count}"
    unit_part = f" unit {review_unit_index}/{review_unit_count}"
    node_ids = tuple(str(node) for node in packet["nodes"])
    pair_ids = tuple(_packet_soft_pair_ids(packet))
    return SoftPairReviewUnitMeta(
        label=label,
        component_index=component_index,
        component_count=component_count,
        review_unit_index=review_unit_index,
        review_unit_count=review_unit_count,
        cluster_id=cluster_id,
        prefix=component_part + unit_part,
        nodes=node_ids,
        soft_pairs=pair_ids,
        component_budget=component_budget,
        budget=budget,
        prepared_review_unit_hash=prepared_review_unit_hash,
        trimmed=trimmed,
        aggressively_trimmed=aggressively_trimmed,
        split=split,
    )
|
| 2090 |
+
|
| 2091 |
+
|
| 2092 |
+
def _completed_soft_pair_review_sort_key(review: CompletedSoftPairReview) -> tuple[int, int]:
|
| 2093 |
+
return (
|
| 2094 |
+
review.meta.component_index,
|
| 2095 |
+
review.meta.review_unit_index,
|
| 2096 |
+
)
|
| 2097 |
+
|
| 2098 |
+
|
| 2099 |
+
def _soft_pair_review_record(
    *,
    review: CompletedSoftPairReview,
    model: str,
    accepted_nontrivial_soft_edge: bool,
) -> dict[str, Any]:
    """Flatten a completed review into a plain dict record.

    Metadata fields are copied from ``review.meta``, budgets are converted
    with ``_packet_budget_json``, and analyst/evaluator results are dumped in
    JSON mode.  When the review carries no result object, the result-derived
    fields fall back to ``None`` / ``False``.
    """
    meta = review.meta
    outcome = review.result
    has_outcome = outcome is not None

    record: dict[str, Any] = {
        "label": meta.label,
        "component_index": meta.component_index,
        "component_count": meta.component_count,
        "review_unit_index": meta.review_unit_index,
        "review_unit_count": meta.review_unit_count,
        "status": review.status,
        "reason": review.reason,
        "source": review.source,
        "cache_hit": review.cache_hit,
        "model": model,
        "cluster_id": meta.cluster_id,
        "nodes": list(meta.nodes),
        "soft_pairs": list(meta.soft_pairs),
        "prepared_review_unit_hash": meta.prepared_review_unit_hash,
        "component_budget": _packet_budget_json(meta.component_budget),
        "budget": _packet_budget_json(meta.budget),
        "overflow_policy": LLM_OVERFLOW_POLICY,
        "trimmed": meta.trimmed,
        "aggressively_trimmed": meta.aggressively_trimmed,
        "split": meta.split,
    }

    analyst = outcome.analyst_result if has_outcome else None
    evaluator = outcome.evaluator_result if has_outcome else None
    # update() appends in order, so the key order of the record is unchanged.
    record.update(
        {
            "analyst_result": None if analyst is None else analyst.model_dump(mode="json"),
            "evaluator_result": None if evaluator is None else evaluator.model_dump(mode="json"),
            "evaluator_used": outcome.evaluator_used if has_outcome else False,
            "retried": outcome.retried if has_outcome else False,
            "accepted_nontrivial_soft_edge": accepted_nontrivial_soft_edge,
            "error_kind": outcome.error_kind if has_outcome else None,
            "error_message": outcome.error_message if has_outcome else None,
        }
    )
    return record
|
| 2143 |
+
|
| 2144 |
+
|
| 2145 |
+
def _completed_soft_pair_review_from_result(
    pending: PendingSoftPairReview,
    result: ClusterAnalysisCallResult,
) -> CompletedSoftPairReview:
    """Wrap a fresh LLM call result as a completed review.

    The status is ``"reviewed"`` when the call produced an analyst result and
    ``"error"`` otherwise; the source is always ``"llm"`` and the review is
    never marked as a cache hit.
    """
    if result.analyst_result is not None:
        status = "reviewed"
    else:
        status = "error"
    return CompletedSoftPairReview(
        meta=pending.meta,
        result=result,
        status=status,
        reason=None,
        source="llm",
        cache_hit=False,
    )
|
| 2157 |
+
|
| 2158 |
+
|
| 2159 |
+
async def _run_pending_soft_pair_review(
    pending: PendingSoftPairReview,
    *,
    model: str,
    review_semaphore: asyncio.Semaphore,
) -> CompletedSoftPairReview:
    """Run the cluster-analysis call for one pending review under the semaphore.

    Any ``Exception`` raised by the call is converted into an error-only
    ``ClusterAnalysisCallResult`` (classified and summarised) so a failing
    unit yields a completed review instead of propagating.
    """
    async with review_semaphore:
        try:
            outcome = await _fast_agent_cluster_analysis(pending.prepared, model)
        except Exception as error:
            # Deliberate catch-all: record the failure rather than abort the run.
            outcome = ClusterAnalysisCallResult(
                analyst_result=None,
                evaluator_result=None,
                error_kind=_classify_llm_error(error),
                error_message=_summarize_llm_error(error),
                evaluator_used=False,
                retried=False,
            )
        return _completed_soft_pair_review_from_result(pending, outcome)
|
| 2178 |
+
|
| 2179 |
+
|
| 2180 |
+
async def _run_pending_soft_pair_reviews(
|
| 2181 |
+
pending_reviews: list[PendingSoftPairReview],
|
| 2182 |
+
*,
|
| 2183 |
+
concurrency: int,
|
| 2184 |
+
model: str,
|
| 2185 |
+
review_semaphore: asyncio.Semaphore,
|
| 2186 |
+
) -> list[CompletedSoftPairReview]:
|
| 2187 |
+
if not pending_reviews:
|
| 2188 |
+
return []
|
| 2189 |
+
if concurrency <= 1:
|
| 2190 |
+
completed: list[CompletedSoftPairReview] = []
|
| 2191 |
+
for pending in pending_reviews:
|
| 2192 |
+
completed.append(
|
| 2193 |
+
await _run_pending_soft_pair_review(
|
| 2194 |
+
pending,
|
| 2195 |
+
model=model,
|
| 2196 |
+
review_semaphore=review_semaphore,
|
| 2197 |
+
)
|
| 2198 |
+
)
|
| 2199 |
+
return completed
|
| 2200 |
+
tasks = [
|
| 2201 |
+
asyncio.create_task(
|
| 2202 |
+
_run_pending_soft_pair_review(
|
| 2203 |
+
pending,
|
| 2204 |
+
model=model,
|
| 2205 |
+
review_semaphore=review_semaphore,
|
| 2206 |
+
)
|
| 2207 |
+
)
|
| 2208 |
+
for pending in pending_reviews
|
| 2209 |
+
]
|
| 2210 |
+
return await asyncio.gather(*tasks)
|
| 2211 |
+
|
| 2212 |
+
|
| 2213 |
async def _accepted_soft_pairs(
|
| 2214 |
*,
|
| 2215 |
options: AnalysisOptions,
|
|
|
|
| 2220 |
label: str,
|
| 2221 |
hybrid_review_cache: HybridReviewCacheStore,
|
| 2222 |
llm_available: bool,
|
| 2223 |
+
review_semaphore: asyncio.Semaphore,
|
| 2224 |
) -> tuple[dict[tuple[str, str], dict[str, Any]], bool, list[dict[str, Any]]]:
|
| 2225 |
del snapshot
|
| 2226 |
if not soft_candidates:
|
|
|
|
| 2239 |
for pair in soft_candidates:
|
| 2240 |
candidate_graph.setdefault(pair, set()).add("soft_similarity")
|
| 2241 |
component_payloads = _component_packets(features, candidate_graph, soft_candidates)
|
| 2242 |
+
pending_reviews: list[PendingSoftPairReview] = []
|
| 2243 |
+
completed_reviews: list[CompletedSoftPairReview] = []
|
| 2244 |
accepted: dict[tuple[str, str], dict[str, Any]] = dict(deterministic_accepts)
|
| 2245 |
llm_used = False
|
| 2246 |
review_records: list[dict[str, Any]] = []
|
|
|
|
| 2248 |
for index, payload in enumerate(component_payloads, start=1):
|
| 2249 |
component_budget = _estimate_packet_size(payload, options.model)
|
| 2250 |
cluster_id = _cluster_id_from_nodes(payload["nodes"])
|
| 2251 |
+
review_units = _split_packet_for_review(payload, options.model)
|
| 2252 |
if len(review_units) > 1:
|
| 2253 |
_analysis_log(
|
| 2254 |
f"LLM {label} soft-edge review {index}/{total_components}: "
|
|
|
|
| 2257 |
f"est_tokens={component_budget.estimated_input_tokens})"
|
| 2258 |
)
|
| 2259 |
for unit_index, review_unit in enumerate(review_units, start=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2260 |
prepared = _prepare_packet_for_llm(
|
| 2261 |
review_unit,
|
| 2262 |
options.model,
|
|
|
|
| 2264 |
)
|
| 2265 |
if prepared is None:
|
| 2266 |
unit_budget = _estimate_packet_size(review_unit, options.model)
|
| 2267 |
+
completed_reviews.append(
|
| 2268 |
+
CompletedSoftPairReview(
|
| 2269 |
+
meta=_soft_pair_review_meta(
|
| 2270 |
+
label=label,
|
| 2271 |
+
component_index=index,
|
| 2272 |
+
component_count=total_components,
|
| 2273 |
+
review_unit_index=unit_index,
|
| 2274 |
+
review_unit_count=len(review_units),
|
| 2275 |
+
cluster_id=cluster_id,
|
| 2276 |
+
component_budget=component_budget,
|
| 2277 |
+
budget=unit_budget,
|
| 2278 |
+
prepared_review_unit_hash=None,
|
| 2279 |
+
trimmed=True,
|
| 2280 |
+
aggressively_trimmed=True,
|
| 2281 |
+
split=len(review_units) > 1,
|
| 2282 |
+
packet=review_unit,
|
| 2283 |
+
),
|
| 2284 |
+
result=None,
|
| 2285 |
+
status="skipped",
|
| 2286 |
+
reason="over_budget_after_truncate",
|
| 2287 |
+
source=None,
|
| 2288 |
+
cache_hit=False,
|
| 2289 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2290 |
)
|
| 2291 |
continue
|
| 2292 |
prepared_review_unit = _prepared_review_unit_payload(prepared)
|
|
|
|
| 2295 |
model=options.model,
|
| 2296 |
prepared_review_unit=prepared_review_unit,
|
| 2297 |
)
|
| 2298 |
+
meta = _soft_pair_review_meta(
|
| 2299 |
+
label=label,
|
| 2300 |
+
component_index=index,
|
| 2301 |
+
component_count=total_components,
|
| 2302 |
+
review_unit_index=unit_index,
|
| 2303 |
+
review_unit_count=len(review_units),
|
| 2304 |
+
cluster_id=cluster_id,
|
| 2305 |
+
component_budget=component_budget,
|
| 2306 |
+
budget=prepared.budget,
|
| 2307 |
+
prepared_review_unit_hash=cache_key.prepared_review_unit_hash,
|
| 2308 |
+
trimmed=prepared.trimmed,
|
| 2309 |
+
aggressively_trimmed=prepared.aggressively_trimmed,
|
| 2310 |
+
split=prepared.split,
|
| 2311 |
+
packet=prepared.packet,
|
| 2312 |
+
)
|
| 2313 |
cached_entry = hybrid_review_cache.get(cache_key)
|
|
|
|
| 2314 |
if cached_entry is not None:
|
| 2315 |
+
completed_reviews.append(
|
| 2316 |
+
CompletedSoftPairReview(
|
| 2317 |
+
meta=meta,
|
| 2318 |
+
result=_cluster_analysis_call_result_from_payload(cached_entry.result),
|
| 2319 |
+
status=(
|
| 2320 |
+
"reviewed"
|
| 2321 |
+
if cached_entry.result.get("analyst_result") is not None
|
| 2322 |
+
else "error"
|
| 2323 |
+
),
|
| 2324 |
+
reason=None,
|
| 2325 |
+
source="cache",
|
| 2326 |
+
cache_hit=True,
|
| 2327 |
)
|
| 2328 |
+
)
|
| 2329 |
+
continue
|
| 2330 |
+
if not llm_available:
|
| 2331 |
+
completed_reviews.append(
|
| 2332 |
+
CompletedSoftPairReview(
|
| 2333 |
+
meta=meta,
|
| 2334 |
+
result=None,
|
| 2335 |
+
status="skipped",
|
| 2336 |
+
reason="llm_unavailable_cache_miss",
|
| 2337 |
+
source=None,
|
| 2338 |
+
cache_hit=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2339 |
)
|
| 2340 |
+
)
|
| 2341 |
+
continue
|
| 2342 |
+
pending_reviews.append(
|
| 2343 |
+
PendingSoftPairReview(
|
| 2344 |
+
meta=meta,
|
| 2345 |
+
prepared=prepared,
|
| 2346 |
+
cache_key=cache_key,
|
| 2347 |
+
)
|
| 2348 |
+
)
|
| 2349 |
+
reviewed_from_cache = sum(1 for review in completed_reviews if review.cache_hit)
|
| 2350 |
+
skipped_reviews = sum(1 for review in completed_reviews if review.status == "skipped")
|
| 2351 |
+
_analysis_log(
|
| 2352 |
+
f"LLM {label} soft-edge review scheduling: "
|
| 2353 |
+
f"units={len(pending_reviews) + len(completed_reviews)}, "
|
| 2354 |
+
f"cache_hits={reviewed_from_cache}, "
|
| 2355 |
+
f"cache_misses={len(pending_reviews)}, "
|
| 2356 |
+
f"skipped={skipped_reviews}, "
|
| 2357 |
+
f"concurrency={options.hybrid_llm_concurrency}"
|
| 2358 |
+
)
|
| 2359 |
+
completed_reviews.extend(
|
| 2360 |
+
await _run_pending_soft_pair_reviews(
|
| 2361 |
+
pending_reviews,
|
| 2362 |
+
concurrency=options.hybrid_llm_concurrency,
|
| 2363 |
+
model=options.model,
|
| 2364 |
+
review_semaphore=review_semaphore,
|
| 2365 |
+
)
|
| 2366 |
+
)
|
| 2367 |
+
pending_by_position = {
|
| 2368 |
+
(pending.meta.component_index, pending.meta.review_unit_index): pending
|
| 2369 |
+
for pending in pending_reviews
|
| 2370 |
+
}
|
| 2371 |
+
for review in sorted(completed_reviews, key=_completed_soft_pair_review_sort_key):
|
| 2372 |
+
accepted_nontrivial = False
|
| 2373 |
+
pending = pending_by_position.get(
|
| 2374 |
+
(review.meta.component_index, review.meta.review_unit_index)
|
| 2375 |
+
)
|
| 2376 |
+
result = review.result
|
| 2377 |
+
if review.reason == "over_budget_after_truncate":
|
| 2378 |
+
_analysis_log(
|
| 2379 |
+
f"{review.meta.prefix}: skipped over-budget packet "
|
| 2380 |
+
f"(nodes={review.meta.budget.node_count}, soft_pairs={review.meta.budget.soft_pair_count}, "
|
| 2381 |
+
f"est_tokens={review.meta.budget.estimated_input_tokens}, overflow_policy={LLM_OVERFLOW_POLICY})"
|
| 2382 |
+
)
|
| 2383 |
+
elif review.reason == "llm_unavailable_cache_miss":
|
| 2384 |
+
_analysis_log(
|
| 2385 |
+
f"{review.meta.prefix}: cache miss with fast-agent unavailable; "
|
| 2386 |
+
"keeping deterministic-only soft edges"
|
| 2387 |
+
)
|
| 2388 |
+
else:
|
| 2389 |
+
if review.cache_hit:
|
| 2390 |
_analysis_log(
|
| 2391 |
+
f"{review.meta.prefix}: cache hit "
|
| 2392 |
+
f"(nodes={review.meta.budget.node_count}, soft_pairs={review.meta.budget.soft_pair_count}, "
|
| 2393 |
+
f"est_tokens={review.meta.budget.estimated_input_tokens}, model={options.model})"
|
|
|
|
| 2394 |
)
|
| 2395 |
+
if result is None or result.analyst_result is None:
|
| 2396 |
+
if result is not None and result.error_kind is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2397 |
_analysis_log(
|
| 2398 |
+
f"{review.meta.prefix}: {result.error_kind}"
|
| 2399 |
+
f" (nodes={review.meta.budget.node_count}, soft_pairs={review.meta.budget.soft_pair_count}, "
|
| 2400 |
+
f"est_tokens={review.meta.budget.estimated_input_tokens}, "
|
| 2401 |
f"overflow_policy={LLM_OVERFLOW_POLICY})"
|
| 2402 |
)
|
| 2403 |
else:
|
| 2404 |
+
_analysis_log(f"{review.meta.prefix}: no result")
|
| 2405 |
else:
|
| 2406 |
llm_used = True
|
| 2407 |
verdicts = {
|
|
|
|
| 2410 |
}
|
| 2411 |
accepted_count = sum(1 for verdict in verdicts.values() if verdict.accept)
|
| 2412 |
rejected_count = sum(1 for verdict in verdicts.values() if not verdict.accept)
|
| 2413 |
+
accepted_nontrivial = any(
|
| 2414 |
+
verdicts.get(_pair_key(*pair_id.split("|", 1))) is not None
|
| 2415 |
+
and verdicts[_pair_key(*pair_id.split("|", 1))].accept
|
| 2416 |
+
and not bool(
|
| 2417 |
+
soft_candidates[_pair_key(*pair_id.split("|", 1))].get(
|
| 2418 |
+
"deterministic_accept",
|
| 2419 |
+
True,
|
| 2420 |
+
)
|
| 2421 |
+
)
|
| 2422 |
+
for pair_id in review.meta.soft_pairs
|
| 2423 |
)
|
| 2424 |
evaluator_status = "used" if result.evaluator_used else "skipped"
|
| 2425 |
_analysis_log(
|
| 2426 |
+
f"{review.meta.prefix}: {accepted_count} accepted, {rejected_count} rejected, "
|
| 2427 |
+
f"evaluator={evaluator_status}, source={review.source}"
|
| 2428 |
)
|
| 2429 |
if result.error_kind is not None:
|
| 2430 |
+
_analysis_log(
|
| 2431 |
+
f"{review.meta.prefix}: {result.error_kind}; keeping analyst result"
|
| 2432 |
+
)
|
| 2433 |
+
for pair_id in review.meta.soft_pairs:
|
| 2434 |
+
normalized_pair = _pair_key(*pair_id.split("|", 1))
|
| 2435 |
verdict = verdicts.get(normalized_pair)
|
| 2436 |
if verdict is None:
|
| 2437 |
continue
|
|
|
|
| 2439 |
accepted[normalized_pair] = soft_candidates[normalized_pair]
|
| 2440 |
else:
|
| 2441 |
accepted.pop(normalized_pair, None)
|
| 2442 |
+
if (
|
| 2443 |
+
pending is not None
|
| 2444 |
+
and review.source == "llm"
|
| 2445 |
+
and _cacheable_cluster_analysis_result(result)
|
| 2446 |
+
):
|
| 2447 |
+
hybrid_review_cache.put(
|
| 2448 |
+
HybridReviewCacheEntry(
|
| 2449 |
+
key=pending.cache_key,
|
| 2450 |
+
result=_cluster_analysis_call_result_payload(result),
|
| 2451 |
+
cached_at=_iso_now(),
|
| 2452 |
+
nodes=tuple(pending.prepared.packet["nodes"]),
|
| 2453 |
+
soft_pairs=tuple(_packet_soft_pair_ids(pending.prepared.packet)),
|
| 2454 |
+
budget=_packet_budget_json(pending.prepared.budget),
|
| 2455 |
+
split=pending.prepared.split,
|
| 2456 |
+
trimmed=pending.prepared.trimmed,
|
| 2457 |
+
aggressively_trimmed=pending.prepared.aggressively_trimmed,
|
| 2458 |
+
)
|
| 2459 |
+
)
|
| 2460 |
+
review_records.append(
|
| 2461 |
+
_soft_pair_review_record(
|
| 2462 |
+
review=review,
|
| 2463 |
+
model=options.model,
|
| 2464 |
+
accepted_nontrivial_soft_edge=accepted_nontrivial,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2465 |
)
|
| 2466 |
+
)
|
| 2467 |
return accepted, llm_used, review_records
|
| 2468 |
|
| 2469 |
|
src/slop_farmer/reports/analysis_service.py
CHANGED
|
@@ -6,12 +6,16 @@ from typing import Any
|
|
| 6 |
|
| 7 |
from slop_farmer.data.parquet_io import read_json
|
| 8 |
from slop_farmer.data.search_duckdb import connect_pr_search_db, resolve_active_run
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
ANALYSIS_VARIANTS = {"auto", "deterministic", "hybrid"}
|
| 11 |
-
ANALYSIS_REPORT_FILENAMES = {
|
| 12 |
-
"deterministic": "analysis-report.json",
|
| 13 |
-
"hybrid": "analysis-report-hybrid.json",
|
| 14 |
-
}
|
| 15 |
|
| 16 |
|
| 17 |
@dataclass(frozen=True, slots=True)
|
|
@@ -28,6 +32,7 @@ class AnalysisContext:
|
|
| 28 |
report_source: str
|
| 29 |
variant_requested: str
|
| 30 |
variant_used: str
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def get_analysis_status(
|
|
@@ -35,14 +40,15 @@ def get_analysis_status(
|
|
| 35 |
*,
|
| 36 |
repo: str | None = None,
|
| 37 |
variant: str = "auto",
|
| 38 |
-
|
|
|
|
| 39 |
) -> dict[str, Any]:
|
| 40 |
active = _resolve_active_snapshot_context(db_path, repo=repo)
|
| 41 |
-
report_path, variant_used, report_source = _resolve_analysis_report_path(
|
| 42 |
active.snapshot_dir,
|
| 43 |
-
str(active.active_run["repo"]),
|
| 44 |
variant,
|
| 45 |
-
|
|
|
|
| 46 |
required=False,
|
| 47 |
)
|
| 48 |
payload = {
|
|
@@ -55,7 +61,7 @@ def get_analysis_status(
|
|
| 55 |
if report_path is None or variant_used is None or report_source is None:
|
| 56 |
return payload
|
| 57 |
report = _load_report(report_path)
|
| 58 |
-
|
| 59 |
**payload,
|
| 60 |
"snapshot_id": str(report.get("snapshot_id") or active.active_run["snapshot_id"]),
|
| 61 |
"variant_used": variant_used,
|
|
@@ -64,6 +70,9 @@ def get_analysis_status(
|
|
| 64 |
"generated_at": report.get("generated_at"),
|
| 65 |
"counts": _analysis_counts(report),
|
| 66 |
}
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
def get_pr_analysis(
|
|
@@ -72,13 +81,15 @@ def get_pr_analysis(
|
|
| 72 |
pr_number: int,
|
| 73 |
repo: str | None = None,
|
| 74 |
variant: str = "auto",
|
| 75 |
-
|
|
|
|
| 76 |
) -> dict[str, Any]:
|
| 77 |
context = _load_analysis_context(
|
| 78 |
db_path,
|
| 79 |
repo=repo,
|
| 80 |
variant=variant,
|
| 81 |
-
|
|
|
|
| 82 |
)
|
| 83 |
meta_bug, rank = _find_meta_bug_for_pr(context.report, pr_number)
|
| 84 |
duplicate_pr = _find_duplicate_pr_for_pr(context.report, pr_number)
|
|
@@ -97,13 +108,15 @@ def list_analysis_meta_bugs(
|
|
| 97 |
repo: str | None = None,
|
| 98 |
variant: str = "auto",
|
| 99 |
limit: int = 50,
|
| 100 |
-
|
|
|
|
| 101 |
) -> dict[str, Any]:
|
| 102 |
context = _load_analysis_context(
|
| 103 |
db_path,
|
| 104 |
repo=repo,
|
| 105 |
variant=variant,
|
| 106 |
-
|
|
|
|
| 107 |
)
|
| 108 |
meta_bugs = [
|
| 109 |
_meta_bug_payload(cluster, rank=index)
|
|
@@ -122,13 +135,15 @@ def get_analysis_meta_bug(
|
|
| 122 |
cluster_id: str,
|
| 123 |
repo: str | None = None,
|
| 124 |
variant: str = "auto",
|
| 125 |
-
|
|
|
|
| 126 |
) -> dict[str, Any]:
|
| 127 |
context = _load_analysis_context(
|
| 128 |
db_path,
|
| 129 |
repo=repo,
|
| 130 |
variant=variant,
|
| 131 |
-
|
|
|
|
| 132 |
)
|
| 133 |
for index, cluster in enumerate(context.report.get("meta_bugs", []), start=1):
|
| 134 |
if str(cluster.get("cluster_id")) != cluster_id:
|
|
@@ -147,13 +162,15 @@ def list_analysis_duplicate_prs(
|
|
| 147 |
repo: str | None = None,
|
| 148 |
variant: str = "auto",
|
| 149 |
limit: int = 50,
|
| 150 |
-
|
|
|
|
| 151 |
) -> dict[str, Any]:
|
| 152 |
context = _load_analysis_context(
|
| 153 |
db_path,
|
| 154 |
repo=repo,
|
| 155 |
variant=variant,
|
| 156 |
-
|
|
|
|
| 157 |
)
|
| 158 |
duplicate_prs = [
|
| 159 |
{"rank": index, **dict(entry)}
|
|
@@ -171,13 +188,15 @@ def get_analysis_best(
|
|
| 171 |
*,
|
| 172 |
repo: str | None = None,
|
| 173 |
variant: str = "auto",
|
| 174 |
-
|
|
|
|
| 175 |
) -> dict[str, Any]:
|
| 176 |
context = _load_analysis_context(
|
| 177 |
db_path,
|
| 178 |
repo=repo,
|
| 179 |
variant=variant,
|
| 180 |
-
|
|
|
|
| 181 |
)
|
| 182 |
return {
|
| 183 |
**_analysis_base_payload(context),
|
|
@@ -217,14 +236,15 @@ def _load_analysis_context(
|
|
| 217 |
*,
|
| 218 |
repo: str | None,
|
| 219 |
variant: str,
|
| 220 |
-
|
|
|
|
| 221 |
) -> AnalysisContext:
|
| 222 |
active = _resolve_active_snapshot_context(db_path, repo=repo)
|
| 223 |
-
report_path, variant_used, report_source = _resolve_analysis_report_path(
|
| 224 |
active.snapshot_dir,
|
| 225 |
-
str(active.active_run["repo"]),
|
| 226 |
variant,
|
| 227 |
-
|
|
|
|
| 228 |
required=True,
|
| 229 |
)
|
| 230 |
assert report_path is not None
|
|
@@ -237,59 +257,143 @@ def _load_analysis_context(
|
|
| 237 |
report_source=report_source,
|
| 238 |
variant_requested=_normalize_analysis_variant(variant),
|
| 239 |
variant_used=variant_used,
|
|
|
|
| 240 |
)
|
| 241 |
|
| 242 |
|
| 243 |
def _resolve_analysis_report_path(
|
| 244 |
snapshot_dir: Path,
|
| 245 |
-
repo: str,
|
| 246 |
variant: str,
|
| 247 |
*,
|
| 248 |
-
|
|
|
|
| 249 |
required: bool,
|
| 250 |
-
) -> tuple[Path | None, str | None, str | None]:
|
| 251 |
normalized = _normalize_analysis_variant(variant)
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
if deterministic_path.exists():
|
| 264 |
-
return deterministic_path, "deterministic", source
|
| 265 |
if not required:
|
| 266 |
-
return None, None, None
|
| 267 |
raise ValueError(
|
| 268 |
-
"
|
| 269 |
)
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
if not required:
|
| 275 |
-
return None, None, None
|
| 276 |
raise ValueError(
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
)
|
|
|
|
| 279 |
|
| 280 |
|
| 281 |
-
def
|
|
|
|
|
|
|
| 282 |
*,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
snapshot_dir: Path,
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
) ->
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
|
| 295 |
def _normalize_analysis_variant(variant: str) -> str:
|
|
@@ -304,7 +408,7 @@ def _normalize_analysis_variant(variant: str) -> str:
|
|
| 304 |
def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
|
| 305 |
active_snapshot_id = str(context.active_run["snapshot_id"])
|
| 306 |
snapshot_id = str(context.report.get("snapshot_id") or active_snapshot_id)
|
| 307 |
-
|
| 308 |
"repo": str(context.active_run["repo"]),
|
| 309 |
"snapshot_id": snapshot_id,
|
| 310 |
"active_snapshot_id": active_snapshot_id,
|
|
@@ -315,6 +419,9 @@ def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
|
|
| 315 |
"llm_enrichment": bool(context.report.get("llm_enrichment")),
|
| 316 |
"generated_at": context.report.get("generated_at"),
|
| 317 |
}
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
|
| 320 |
def _analysis_counts(report: dict[str, Any]) -> dict[str, int]:
|
|
|
|
| 6 |
|
| 7 |
from slop_farmer.data.parquet_io import read_json
|
| 8 |
from slop_farmer.data.search_duckdb import connect_pr_search_db, resolve_active_run
|
| 9 |
+
from slop_farmer.data.snapshot_paths import (
|
| 10 |
+
ANALYSIS_REPORT_FILENAME_BY_VARIANT,
|
| 11 |
+
CURRENT_ANALYSIS_MANIFEST_PATH,
|
| 12 |
+
analysis_run_manifest_path,
|
| 13 |
+
load_archived_analysis_run_manifest,
|
| 14 |
+
load_current_analysis_manifest,
|
| 15 |
+
repo_relative_path_to_local,
|
| 16 |
+
)
|
| 17 |
|
| 18 |
ANALYSIS_VARIANTS = {"auto", "deterministic", "hybrid"}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
@dataclass(frozen=True, slots=True)
|
|
|
|
| 32 |
report_source: str
|
| 33 |
variant_requested: str
|
| 34 |
variant_used: str
|
| 35 |
+
analysis_id: str | None
|
| 36 |
|
| 37 |
|
| 38 |
def get_analysis_status(
|
|
|
|
| 40 |
*,
|
| 41 |
repo: str | None = None,
|
| 42 |
variant: str = "auto",
|
| 43 |
+
snapshot_id: str | None = None,
|
| 44 |
+
analysis_id: str | None = None,
|
| 45 |
) -> dict[str, Any]:
|
| 46 |
active = _resolve_active_snapshot_context(db_path, repo=repo)
|
| 47 |
+
report_path, variant_used, report_source, resolved_analysis_id = _resolve_analysis_report_path(
|
| 48 |
active.snapshot_dir,
|
|
|
|
| 49 |
variant,
|
| 50 |
+
snapshot_id=snapshot_id,
|
| 51 |
+
analysis_id=analysis_id,
|
| 52 |
required=False,
|
| 53 |
)
|
| 54 |
payload = {
|
|
|
|
| 61 |
if report_path is None or variant_used is None or report_source is None:
|
| 62 |
return payload
|
| 63 |
report = _load_report(report_path)
|
| 64 |
+
status = {
|
| 65 |
**payload,
|
| 66 |
"snapshot_id": str(report.get("snapshot_id") or active.active_run["snapshot_id"]),
|
| 67 |
"variant_used": variant_used,
|
|
|
|
| 70 |
"generated_at": report.get("generated_at"),
|
| 71 |
"counts": _analysis_counts(report),
|
| 72 |
}
|
| 73 |
+
if resolved_analysis_id is not None:
|
| 74 |
+
status["analysis_id"] = resolved_analysis_id
|
| 75 |
+
return status
|
| 76 |
|
| 77 |
|
| 78 |
def get_pr_analysis(
|
|
|
|
| 81 |
pr_number: int,
|
| 82 |
repo: str | None = None,
|
| 83 |
variant: str = "auto",
|
| 84 |
+
snapshot_id: str | None = None,
|
| 85 |
+
analysis_id: str | None = None,
|
| 86 |
) -> dict[str, Any]:
|
| 87 |
context = _load_analysis_context(
|
| 88 |
db_path,
|
| 89 |
repo=repo,
|
| 90 |
variant=variant,
|
| 91 |
+
snapshot_id=snapshot_id,
|
| 92 |
+
analysis_id=analysis_id,
|
| 93 |
)
|
| 94 |
meta_bug, rank = _find_meta_bug_for_pr(context.report, pr_number)
|
| 95 |
duplicate_pr = _find_duplicate_pr_for_pr(context.report, pr_number)
|
|
|
|
| 108 |
repo: str | None = None,
|
| 109 |
variant: str = "auto",
|
| 110 |
limit: int = 50,
|
| 111 |
+
snapshot_id: str | None = None,
|
| 112 |
+
analysis_id: str | None = None,
|
| 113 |
) -> dict[str, Any]:
|
| 114 |
context = _load_analysis_context(
|
| 115 |
db_path,
|
| 116 |
repo=repo,
|
| 117 |
variant=variant,
|
| 118 |
+
snapshot_id=snapshot_id,
|
| 119 |
+
analysis_id=analysis_id,
|
| 120 |
)
|
| 121 |
meta_bugs = [
|
| 122 |
_meta_bug_payload(cluster, rank=index)
|
|
|
|
| 135 |
cluster_id: str,
|
| 136 |
repo: str | None = None,
|
| 137 |
variant: str = "auto",
|
| 138 |
+
snapshot_id: str | None = None,
|
| 139 |
+
analysis_id: str | None = None,
|
| 140 |
) -> dict[str, Any]:
|
| 141 |
context = _load_analysis_context(
|
| 142 |
db_path,
|
| 143 |
repo=repo,
|
| 144 |
variant=variant,
|
| 145 |
+
snapshot_id=snapshot_id,
|
| 146 |
+
analysis_id=analysis_id,
|
| 147 |
)
|
| 148 |
for index, cluster in enumerate(context.report.get("meta_bugs", []), start=1):
|
| 149 |
if str(cluster.get("cluster_id")) != cluster_id:
|
|
|
|
| 162 |
repo: str | None = None,
|
| 163 |
variant: str = "auto",
|
| 164 |
limit: int = 50,
|
| 165 |
+
snapshot_id: str | None = None,
|
| 166 |
+
analysis_id: str | None = None,
|
| 167 |
) -> dict[str, Any]:
|
| 168 |
context = _load_analysis_context(
|
| 169 |
db_path,
|
| 170 |
repo=repo,
|
| 171 |
variant=variant,
|
| 172 |
+
snapshot_id=snapshot_id,
|
| 173 |
+
analysis_id=analysis_id,
|
| 174 |
)
|
| 175 |
duplicate_prs = [
|
| 176 |
{"rank": index, **dict(entry)}
|
|
|
|
| 188 |
*,
|
| 189 |
repo: str | None = None,
|
| 190 |
variant: str = "auto",
|
| 191 |
+
snapshot_id: str | None = None,
|
| 192 |
+
analysis_id: str | None = None,
|
| 193 |
) -> dict[str, Any]:
|
| 194 |
context = _load_analysis_context(
|
| 195 |
db_path,
|
| 196 |
repo=repo,
|
| 197 |
variant=variant,
|
| 198 |
+
snapshot_id=snapshot_id,
|
| 199 |
+
analysis_id=analysis_id,
|
| 200 |
)
|
| 201 |
return {
|
| 202 |
**_analysis_base_payload(context),
|
|
|
|
| 236 |
*,
|
| 237 |
repo: str | None,
|
| 238 |
variant: str,
|
| 239 |
+
snapshot_id: str | None,
|
| 240 |
+
analysis_id: str | None,
|
| 241 |
) -> AnalysisContext:
|
| 242 |
active = _resolve_active_snapshot_context(db_path, repo=repo)
|
| 243 |
+
report_path, variant_used, report_source, resolved_analysis_id = _resolve_analysis_report_path(
|
| 244 |
active.snapshot_dir,
|
|
|
|
| 245 |
variant,
|
| 246 |
+
snapshot_id=snapshot_id,
|
| 247 |
+
analysis_id=analysis_id,
|
| 248 |
required=True,
|
| 249 |
)
|
| 250 |
assert report_path is not None
|
|
|
|
| 257 |
report_source=report_source,
|
| 258 |
variant_requested=_normalize_analysis_variant(variant),
|
| 259 |
variant_used=variant_used,
|
| 260 |
+
analysis_id=resolved_analysis_id,
|
| 261 |
)
|
| 262 |
|
| 263 |
|
| 264 |
def _resolve_analysis_report_path(
    snapshot_dir: Path,
    variant: str,
    *,
    snapshot_id: str | None,
    analysis_id: str | None,
    required: bool,
) -> tuple[Path | None, str | None, str | None, str | None]:
    """Locate the analysis report to serve for ``variant``.

    Resolution order:

    1. When both ``snapshot_id`` and ``analysis_id`` are given, only the
       archived run manifest for that pair is consulted.
    2. For the ``deterministic`` variant, a report file directly inside
       ``snapshot_dir`` wins if it exists.
    3. Otherwise the current-analysis manifest is used when it exists.
    4. Finally a report file directly inside ``snapshot_dir`` is tried.

    Returns:
        ``(report_path, variant_used, report_source, analysis_id)``.
        ``report_source`` is ``"archived"``, ``"current"`` or ``"snapshot"``;
        ``analysis_id`` is ``None`` for snapshot-local reports. All four
        entries are ``None`` when nothing was found and ``required`` is false.

    Raises:
        ValueError: if only one of ``snapshot_id``/``analysis_id`` is given,
            if nothing was found and ``required`` is true, or if a manifest
            cannot provide the requested artifact.
    """
    normalized = _normalize_analysis_variant(variant)
    if (snapshot_id is None) != (analysis_id is None):
        raise ValueError("snapshot_id and analysis_id must be provided together.")
    if snapshot_id is not None and analysis_id is not None:
        selection = _resolve_archived_analysis_report_path(
            snapshot_dir,
            snapshot_id=snapshot_id,
            analysis_id=analysis_id,
            variant=normalized,
        )
        if selection is not None:
            return (*selection, analysis_id)
        if not required:
            return None, None, None, None
        raise ValueError(
            f"Published analysis run {analysis_id!r} for snapshot {snapshot_id!r} was not found."
        )

    current_manifest_path = repo_relative_path_to_local(
        snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH
    )
    if normalized == "deterministic":
        selection = _resolve_snapshot_local_report_path(snapshot_dir, variant=normalized)
        if selection is not None:
            return (*selection, None)

    if current_manifest_path.exists():
        # Load the manifest once so the artifact path and the analysis id are
        # guaranteed to come from the same read of the file.
        current_manifest = load_current_analysis_manifest(current_manifest_path)
        report_path, variant_used = _resolve_manifest_report_path(
            snapshot_dir,
            current_manifest,
            variant=normalized,
            manifest_kind="current",
        )
        return (
            report_path,
            variant_used,
            "current",
            str(current_manifest["analysis_id"]),
        )

    selection = _resolve_snapshot_local_report_path(snapshot_dir, variant=normalized)
    if selection is not None:
        return (*selection, None)
    if not required:
        return None, None, None, None
    raise ValueError(
        "No analysis report was found for the current analysis view or active snapshot."
    )
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def _resolve_archived_analysis_report_path(
    snapshot_dir: Path,
    *,
    snapshot_id: str,
    analysis_id: str,
    variant: str,
) -> tuple[Path, str, str] | None:
    """Resolve the report of an archived analysis run, if its manifest exists.

    Returns ``(report_path, variant_used, "archived")``, or ``None`` when the
    run manifest for ``snapshot_id``/``analysis_id`` is not present locally.
    Errors raised while resolving the artifact from the manifest propagate.
    """
    relative_manifest = analysis_run_manifest_path(snapshot_id, analysis_id)
    local_manifest = repo_relative_path_to_local(snapshot_dir, relative_manifest)
    if local_manifest.exists():
        archived_manifest = load_archived_analysis_run_manifest(local_manifest)
        resolved_path, resolved_variant = _resolve_manifest_report_path(
            snapshot_dir,
            archived_manifest,
            variant=variant,
            manifest_kind="archived",
        )
        return resolved_path, resolved_variant, "archived"
    return None
|
| 342 |
|
| 343 |
|
| 344 |
+
def _resolve_manifest_report_path(
    snapshot_dir: Path,
    manifest: dict[str, Any],
    *,
    variant: str,
    manifest_kind: str,
) -> tuple[Path, str]:
    """Map a published analysis manifest to a local report path.

    Looks up the artifact for ``variant`` under the manifest's ``"artifacts"``
    mapping and converts it to a path below ``snapshot_dir``.

    Returns:
        ``(report_path, variant_used)``.

    Raises:
        ValueError: if the variant is not served by published manifests, the
            manifest has no non-empty string entry for the artifact, or the
            referenced file does not exist locally.
    """
    key = _artifact_key_for_variant(variant, manifest_kind=manifest_kind)
    relative_artifact = (manifest.get("artifacts") or {}).get(key)
    if not (isinstance(relative_artifact, str) and relative_artifact):
        if variant == "auto":
            raise ValueError(
                f"Published {manifest_kind} analysis manifest does not provide the canonical hybrid artifact."
            )
        raise ValueError(
            f"Published {manifest_kind} analysis manifest does not provide the {variant} artifact."
        )
    local_artifact = repo_relative_path_to_local(snapshot_dir, relative_artifact)
    if local_artifact.exists():
        return local_artifact, ("hybrid" if key == "hybrid" else variant)
    raise ValueError(
        f"Published {manifest_kind} analysis artifact {relative_artifact!r} is missing from the materialized snapshot."
    )
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def _artifact_key_for_variant(variant: str, *, manifest_kind: str) -> str:
|
| 371 |
+
if variant == "auto":
|
| 372 |
+
return "hybrid"
|
| 373 |
+
if variant == "hybrid":
|
| 374 |
+
return "hybrid"
|
| 375 |
+
raise ValueError(
|
| 376 |
+
f"Published {manifest_kind} analysis only serves canonical hybrid artifacts; requested {variant!r}."
|
| 377 |
+
)
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def _resolve_snapshot_local_report_path(
    snapshot_dir: Path,
    *,
    variant: str,
) -> tuple[Path, str, str] | None:
    """Find a report file stored directly inside ``snapshot_dir``.

    For ``"auto"`` the hybrid report is preferred and the deterministic report
    is the fallback; any other variant is looked up by its own filename.

    Returns ``(report_path, variant_used, "snapshot")``, or ``None`` when no
    matching file exists.
    """
    if variant != "auto":
        candidate = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT[variant]
        return (candidate, variant, "snapshot") if candidate.exists() else None
    # "auto": first existing report wins, hybrid before deterministic.
    for concrete_variant in ("hybrid", "deterministic"):
        candidate = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT[concrete_variant]
        if candidate.exists():
            return candidate, concrete_variant, "snapshot"
    return None
|
| 397 |
|
| 398 |
|
| 399 |
def _normalize_analysis_variant(variant: str) -> str:
|
|
|
|
| 408 |
def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
|
| 409 |
active_snapshot_id = str(context.active_run["snapshot_id"])
|
| 410 |
snapshot_id = str(context.report.get("snapshot_id") or active_snapshot_id)
|
| 411 |
+
payload = {
|
| 412 |
"repo": str(context.active_run["repo"]),
|
| 413 |
"snapshot_id": snapshot_id,
|
| 414 |
"active_snapshot_id": active_snapshot_id,
|
|
|
|
| 419 |
"llm_enrichment": bool(context.report.get("llm_enrichment")),
|
| 420 |
"generated_at": context.report.get("generated_at"),
|
| 421 |
}
|
| 422 |
+
if context.analysis_id is not None:
|
| 423 |
+
payload["analysis_id"] = context.analysis_id
|
| 424 |
+
return payload
|
| 425 |
|
| 426 |
|
| 427 |
def _analysis_counts(report: dict[str, Any]) -> dict[str, int]:
|
src/slop_farmer/reports/dashboard.py
CHANGED
|
@@ -8,7 +8,11 @@ from typing import Any
|
|
| 8 |
|
| 9 |
from slop_farmer.config import DashboardDataOptions
|
| 10 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 11 |
-
from slop_farmer.data.snapshot_paths import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def run_dashboard_data(options: DashboardDataOptions) -> Path:
|
|
@@ -16,7 +20,8 @@ def run_dashboard_data(options: DashboardDataOptions) -> Path:
|
|
| 16 |
manifest = _read_optional_json(snapshot_dir / "manifest.json")
|
| 17 |
issues = read_parquet_rows(snapshot_dir / "issues.parquet")
|
| 18 |
pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 19 |
-
|
|
|
|
| 20 |
contributor_report = _read_optional_json(
|
| 21 |
options.contributors_input or snapshot_dir / "new-contributors-report.json"
|
| 22 |
)
|
|
@@ -67,6 +72,21 @@ def run_dashboard_data(options: DashboardDataOptions) -> Path:
|
|
| 67 |
"clustered_pr_count": sum(1 for row in prs if row["cluster_id"]),
|
| 68 |
"contributor_count": len(contributors),
|
| 69 |
"analysis_available": bool(analysis),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
"contributors_available": bool(contributor_report),
|
| 71 |
"pr_scope_available": bool(pr_scope_report),
|
| 72 |
"pr_scope_cluster_count": len(pr_scope_clusters),
|
|
@@ -88,7 +108,29 @@ def _resolve_snapshot_dir(options: DashboardDataOptions) -> Path:
|
|
| 88 |
if options.snapshot_root is not None
|
| 89 |
else (Path("data") / "snapshots").resolve()
|
| 90 |
)
|
| 91 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
def _read_optional_json(path: Path) -> dict[str, Any]:
|
|
@@ -153,6 +195,14 @@ def _excerpt(value: Any, limit: int = 240) -> str | None:
|
|
| 153 |
return compact[: limit - 1].rstrip() + "…"
|
| 154 |
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def _cluster_rows(
|
| 157 |
analysis: dict[str, Any],
|
| 158 |
issue_map: dict[int, dict[str, Any]],
|
|
|
|
| 8 |
|
| 9 |
from slop_farmer.config import DashboardDataOptions
|
| 10 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 11 |
+
from slop_farmer.data.snapshot_paths import (
|
| 12 |
+
ResolvedAnalysisReportPath,
|
| 13 |
+
resolve_default_dashboard_analysis_report,
|
| 14 |
+
)
|
| 15 |
+
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
| 16 |
|
| 17 |
|
| 18 |
def run_dashboard_data(options: DashboardDataOptions) -> Path:
|
|
|
|
| 20 |
manifest = _read_optional_json(snapshot_dir / "manifest.json")
|
| 21 |
issues = read_parquet_rows(snapshot_dir / "issues.parquet")
|
| 22 |
pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 23 |
+
analysis_path = _resolve_analysis_input(snapshot_dir, options.analysis_input)
|
| 24 |
+
analysis = _read_optional_json(analysis_path.path) if analysis_path is not None else {}
|
| 25 |
contributor_report = _read_optional_json(
|
| 26 |
options.contributors_input or snapshot_dir / "new-contributors-report.json"
|
| 27 |
)
|
|
|
|
| 72 |
"clustered_pr_count": sum(1 for row in prs if row["cluster_id"]),
|
| 73 |
"contributor_count": len(contributors),
|
| 74 |
"analysis_available": bool(analysis),
|
| 75 |
+
"analysis_source": None if analysis_path is None else analysis_path.source,
|
| 76 |
+
"analysis_variant": None if analysis_path is None else analysis_path.variant,
|
| 77 |
+
"analysis_snapshot_id": (
|
| 78 |
+
None
|
| 79 |
+
if analysis_path is None
|
| 80 |
+
else (
|
| 81 |
+
analysis_path.snapshot_id
|
| 82 |
+
or (
|
| 83 |
+
str(analysis.get("snapshot_id")).strip()
|
| 84 |
+
if analysis.get("snapshot_id") is not None
|
| 85 |
+
else None
|
| 86 |
+
)
|
| 87 |
+
)
|
| 88 |
+
),
|
| 89 |
+
"analysis_id": None if analysis_path is None else analysis_path.analysis_id,
|
| 90 |
"contributors_available": bool(contributor_report),
|
| 91 |
"pr_scope_available": bool(pr_scope_report),
|
| 92 |
"pr_scope_cluster_count": len(pr_scope_clusters),
|
|
|
|
| 108 |
if options.snapshot_root is not None
|
| 109 |
else (Path("data") / "snapshots").resolve()
|
| 110 |
)
|
| 111 |
+
return resolve_snapshot_source_dir(
|
| 112 |
+
snapshot_dir=options.snapshot_dir,
|
| 113 |
+
local_snapshots_root=snapshots_root,
|
| 114 |
+
hf_repo_id=options.hf_repo_id,
|
| 115 |
+
hf_revision=options.hf_revision,
|
| 116 |
+
hf_materialize_dir=options.hf_materialize_dir,
|
| 117 |
+
hf_output_dir=snapshots_root.parent,
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _resolve_analysis_input(
|
| 122 |
+
snapshot_dir: Path, override_path: Path | None
|
| 123 |
+
) -> ResolvedAnalysisReportPath | None:
|
| 124 |
+
if override_path is not None:
|
| 125 |
+
resolved = override_path.resolve()
|
| 126 |
+
if not resolved.exists():
|
| 127 |
+
raise FileNotFoundError(f"Dashboard analysis input not found: {resolved}")
|
| 128 |
+
return ResolvedAnalysisReportPath(
|
| 129 |
+
path=resolved,
|
| 130 |
+
variant=_analysis_variant_for_path(resolved),
|
| 131 |
+
source="override",
|
| 132 |
+
)
|
| 133 |
+
return resolve_default_dashboard_analysis_report(snapshot_dir)
|
| 134 |
|
| 135 |
|
| 136 |
def _read_optional_json(path: Path) -> dict[str, Any]:
|
|
|
|
| 195 |
return compact[: limit - 1].rstrip() + "…"
|
| 196 |
|
| 197 |
|
| 198 |
+
def _analysis_variant_for_path(path: Path) -> str:
|
| 199 |
+
if path.name == "analysis-report-hybrid.json":
|
| 200 |
+
return "hybrid"
|
| 201 |
+
if path.name == "analysis-report.json":
|
| 202 |
+
return "deterministic"
|
| 203 |
+
return "override"
|
| 204 |
+
|
| 205 |
+
|
| 206 |
def _cluster_rows(
|
| 207 |
analysis: dict[str, Any],
|
| 208 |
issue_map: dict[int, dict[str, Any]],
|
src/slop_farmer/reports/new_contributor_report.py
CHANGED
|
@@ -12,7 +12,7 @@ from typing import Any
|
|
| 12 |
from slop_farmer.config import NewContributorReportOptions, resolve_github_token
|
| 13 |
from slop_farmer.data.http import urlopen_with_retry
|
| 14 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
|
| 15 |
-
from slop_farmer.data.
|
| 16 |
from slop_farmer.reports.user_activity import summarize_user
|
| 17 |
|
| 18 |
GRAPHQL_URL = "https://api.github.com/graphql"
|
|
@@ -131,7 +131,14 @@ def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
|
|
| 131 |
|
| 132 |
|
| 133 |
def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
|
| 134 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
@@ -244,7 +251,6 @@ def _report_contributors(
|
|
| 244 |
previous_report_reusable
|
| 245 |
and previous_entry is not None
|
| 246 |
and not previous_entry.get("fetch_error")
|
| 247 |
-
and not known_via_prior_merged_pr
|
| 248 |
):
|
| 249 |
contributors.append(
|
| 250 |
_reused_previous_report_entry(
|
|
@@ -256,6 +262,8 @@ def _report_contributors(
|
|
| 256 |
)
|
| 257 |
)
|
| 258 |
reused_previous_report += 1
|
|
|
|
|
|
|
| 259 |
continue
|
| 260 |
try:
|
| 261 |
summary = summarize_user(row["author_login"], options.window_days, None)
|
|
|
|
| 12 |
from slop_farmer.config import NewContributorReportOptions, resolve_github_token
|
| 13 |
from slop_farmer.data.http import urlopen_with_retry
|
| 14 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
|
| 15 |
+
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
| 16 |
from slop_farmer.reports.user_activity import summarize_user
|
| 17 |
|
| 18 |
GRAPHQL_URL = "https://api.github.com/graphql"
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
|
| 134 |
+
return resolve_snapshot_source_dir(
|
| 135 |
+
snapshot_dir=options.snapshot_dir,
|
| 136 |
+
local_snapshots_root=options.output_dir.resolve() / "snapshots",
|
| 137 |
+
hf_repo_id=options.hf_repo_id,
|
| 138 |
+
hf_revision=options.hf_revision,
|
| 139 |
+
hf_materialize_dir=options.hf_materialize_dir,
|
| 140 |
+
hf_output_dir=options.output_dir,
|
| 141 |
+
)
|
| 142 |
|
| 143 |
|
| 144 |
def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
|
|
| 251 |
previous_report_reusable
|
| 252 |
and previous_entry is not None
|
| 253 |
and not previous_entry.get("fetch_error")
|
|
|
|
| 254 |
):
|
| 255 |
contributors.append(
|
| 256 |
_reused_previous_report_entry(
|
|
|
|
| 262 |
)
|
| 263 |
)
|
| 264 |
reused_previous_report += 1
|
| 265 |
+
if known_via_prior_merged_pr:
|
| 266 |
+
reused_known_merged += 1
|
| 267 |
continue
|
| 268 |
try:
|
| 269 |
summary = summarize_user(row["author_login"], options.window_days, None)
|
src/slop_farmer/reports/pr_scope.py
CHANGED
|
@@ -42,11 +42,7 @@ from typing import Any
|
|
| 42 |
from pydantic import BaseModel, Field
|
| 43 |
|
| 44 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 45 |
-
from slop_farmer.data.
|
| 46 |
-
from slop_farmer.data.snapshot_paths import (
|
| 47 |
-
default_hf_materialize_dir,
|
| 48 |
-
resolve_snapshot_dir_from_output,
|
| 49 |
-
)
|
| 50 |
from slop_farmer.reports.pr_heuristics import (
|
| 51 |
compile_cluster_suppression_rules,
|
| 52 |
suppressed_pull_request_reasons,
|
|
@@ -260,17 +256,14 @@ def run_pr_scope_report(options: Any) -> Path:
|
|
| 260 |
|
| 261 |
|
| 262 |
def _resolve_snapshot_dir(options: Any) -> Path:
|
| 263 |
-
|
| 264 |
-
snapshot_dir
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
)
|
| 272 |
-
return snapshot_dir.resolve()
|
| 273 |
-
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
| 274 |
|
| 275 |
|
| 276 |
def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
|
|
|
|
| 42 |
from pydantic import BaseModel, Field
|
| 43 |
|
| 44 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 45 |
+
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
from slop_farmer.reports.pr_heuristics import (
|
| 47 |
compile_cluster_suppression_rules,
|
| 48 |
suppressed_pull_request_reasons,
|
|
|
|
| 256 |
|
| 257 |
|
| 258 |
def _resolve_snapshot_dir(options: Any) -> Path:
|
| 259 |
+
return resolve_snapshot_source_dir(
|
| 260 |
+
snapshot_dir=options.snapshot_dir,
|
| 261 |
+
local_snapshots_root=options.output_dir.resolve() / "snapshots",
|
| 262 |
+
hf_repo_id=options.hf_repo_id,
|
| 263 |
+
hf_revision=options.hf_revision,
|
| 264 |
+
hf_materialize_dir=options.hf_materialize_dir,
|
| 265 |
+
hf_output_dir=options.output_dir,
|
| 266 |
+
)
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
|
| 269 |
def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
|
src/slop_farmer/reports/pr_search_scope.py
CHANGED
|
@@ -10,11 +10,7 @@ from typing import Any
|
|
| 10 |
|
| 11 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 12 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 13 |
-
from slop_farmer.data.
|
| 14 |
-
from slop_farmer.data.snapshot_paths import (
|
| 15 |
-
default_hf_materialize_dir,
|
| 16 |
-
resolve_snapshot_dir_from_output,
|
| 17 |
-
)
|
| 18 |
from slop_farmer.reports.pr_heuristics import (
|
| 19 |
compile_cluster_suppression_rules,
|
| 20 |
suppressed_pull_request_reasons,
|
|
@@ -36,17 +32,14 @@ DEFAULT_CANDIDATE_LIMIT = 5
|
|
| 36 |
|
| 37 |
|
| 38 |
def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
|
| 39 |
-
|
| 40 |
-
snapshot_dir
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
)
|
| 48 |
-
return snapshot_dir.resolve()
|
| 49 |
-
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
| 50 |
|
| 51 |
|
| 52 |
def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
@@ -54,6 +47,7 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
| 54 |
manifest = read_json(manifest_path) if manifest_path.exists() else {}
|
| 55 |
pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 56 |
pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
|
|
|
|
| 57 |
repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
|
| 58 |
snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
|
| 59 |
return {
|
|
@@ -62,6 +56,7 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
| 62 |
"manifest": manifest,
|
| 63 |
"pull_requests": pull_requests,
|
| 64 |
"pr_files": pr_files,
|
|
|
|
| 65 |
}
|
| 66 |
|
| 67 |
|
|
@@ -412,6 +407,7 @@ def _document_row(row: Mapping[str, Any]) -> dict[str, Any]:
|
|
| 412 |
return {
|
| 413 |
"pr_number": int(row["number"]),
|
| 414 |
"github_id": row.get("github_id"),
|
|
|
|
| 415 |
"state": row.get("state"),
|
| 416 |
"draft": bool(row.get("draft")),
|
| 417 |
"merged": bool(row.get("merged")),
|
|
|
|
| 10 |
|
| 11 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 12 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 13 |
+
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from slop_farmer.reports.pr_heuristics import (
|
| 15 |
compile_cluster_suppression_rules,
|
| 16 |
suppressed_pull_request_reasons,
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
|
| 35 |
+
return resolve_snapshot_source_dir(
|
| 36 |
+
snapshot_dir=options.snapshot_dir,
|
| 37 |
+
local_snapshots_root=options.output_dir.resolve() / "snapshots",
|
| 38 |
+
hf_repo_id=options.hf_repo_id,
|
| 39 |
+
hf_revision=options.hf_revision,
|
| 40 |
+
hf_materialize_dir=options.hf_materialize_dir,
|
| 41 |
+
hf_output_dir=options.output_dir,
|
| 42 |
+
)
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
|
|
| 47 |
manifest = read_json(manifest_path) if manifest_path.exists() else {}
|
| 48 |
pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 49 |
pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
|
| 50 |
+
contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")
|
| 51 |
repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
|
| 52 |
snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
|
| 53 |
return {
|
|
|
|
| 56 |
"manifest": manifest,
|
| 57 |
"pull_requests": pull_requests,
|
| 58 |
"pr_files": pr_files,
|
| 59 |
+
"contributors": contributors,
|
| 60 |
}
|
| 61 |
|
| 62 |
|
|
|
|
| 407 |
return {
|
| 408 |
"pr_number": int(row["number"]),
|
| 409 |
"github_id": row.get("github_id"),
|
| 410 |
+
"author_login": row.get("author_login"),
|
| 411 |
"state": row.get("state"),
|
| 412 |
"draft": bool(row.get("draft")),
|
| 413 |
"merged": bool(row.get("merged")),
|
src/slop_farmer/reports/pr_search_service.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
-
from collections.abc import Iterable, Mapping
|
| 5 |
from contextlib import suppress
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Any, Protocol
|
|
@@ -17,6 +17,8 @@ from slop_farmer.data.search_duckdb import (
|
|
| 17 |
get_cluster,
|
| 18 |
get_cluster_ids_for_prs,
|
| 19 |
get_cluster_members,
|
|
|
|
|
|
|
| 20 |
get_document,
|
| 21 |
get_feature,
|
| 22 |
get_pair_neighbor_row,
|
|
@@ -99,6 +101,16 @@ def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
|
|
| 99 |
"pr_search_documents",
|
| 100 |
_scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
|
| 101 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
insert_rows(
|
| 103 |
connection,
|
| 104 |
"pr_scope_features",
|
|
@@ -290,6 +302,85 @@ def get_pr_search_candidate_clusters(
|
|
| 290 |
connection.close()
|
| 291 |
|
| 292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
def get_pr_search_similar_lookup(
|
| 294 |
db_path: Path,
|
| 295 |
*,
|
|
@@ -801,6 +892,15 @@ def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[st
|
|
| 801 |
return feature
|
| 802 |
|
| 803 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
def _json_list(raw: Any) -> list[str]:
|
| 805 |
if isinstance(raw, list):
|
| 806 |
return [str(item) for item in raw]
|
|
@@ -838,6 +938,71 @@ def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
|
|
| 838 |
return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
|
| 839 |
|
| 840 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
def _normalize_lookup_mode(mode: str) -> str:
|
| 842 |
normalized = mode.strip().lower()
|
| 843 |
if normalized not in {"auto", "indexed", "live"}:
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
+
from collections.abc import Iterable, Mapping, Sequence
|
| 5 |
from contextlib import suppress
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Any, Protocol
|
|
|
|
| 17 |
get_cluster,
|
| 18 |
get_cluster_ids_for_prs,
|
| 19 |
get_cluster_members,
|
| 20 |
+
get_contributor,
|
| 21 |
+
get_contributor_pulls,
|
| 22 |
get_document,
|
| 23 |
get_feature,
|
| 24 |
get_pair_neighbor_row,
|
|
|
|
| 101 |
"pr_search_documents",
|
| 102 |
_scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
|
| 103 |
)
|
| 104 |
+
insert_rows(
|
| 105 |
+
connection,
|
| 106 |
+
"pr_search_contributors",
|
| 107 |
+
_contributor_rows(
|
| 108 |
+
snapshot["contributors"],
|
| 109 |
+
run_id=run_id,
|
| 110 |
+
repo=repo,
|
| 111 |
+
snapshot_id=str(snapshot["snapshot_id"]),
|
| 112 |
+
),
|
| 113 |
+
)
|
| 114 |
insert_rows(
|
| 115 |
connection,
|
| 116 |
"pr_scope_features",
|
|
|
|
| 302 |
connection.close()
|
| 303 |
|
| 304 |
|
| 305 |
+
def get_pr_search_contributor(
|
| 306 |
+
db_path: Path,
|
| 307 |
+
*,
|
| 308 |
+
author_login: str,
|
| 309 |
+
repo: str | None = None,
|
| 310 |
+
) -> dict[str, Any]:
|
| 311 |
+
connection = connect_pr_search_db(db_path, read_only=True)
|
| 312 |
+
try:
|
| 313 |
+
active_run = resolve_active_run(connection, repo=repo)
|
| 314 |
+
run_id = str(active_run["id"])
|
| 315 |
+
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
|
| 316 |
+
pulls = _document_rows(
|
| 317 |
+
get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
|
| 318 |
+
)
|
| 319 |
+
return {
|
| 320 |
+
"repo": active_run["repo"],
|
| 321 |
+
"snapshot_id": active_run["snapshot_id"],
|
| 322 |
+
"run_id": run_id,
|
| 323 |
+
"contributor": contributor,
|
| 324 |
+
"pulls": pulls,
|
| 325 |
+
"pull_count": len(pulls),
|
| 326 |
+
}
|
| 327 |
+
finally:
|
| 328 |
+
connection.close()
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def get_pr_search_contributor_pulls(
|
| 332 |
+
db_path: Path,
|
| 333 |
+
*,
|
| 334 |
+
author_login: str,
|
| 335 |
+
repo: str | None = None,
|
| 336 |
+
limit: int = 20,
|
| 337 |
+
) -> dict[str, Any]:
|
| 338 |
+
connection = connect_pr_search_db(db_path, read_only=True)
|
| 339 |
+
try:
|
| 340 |
+
active_run = resolve_active_run(connection, repo=repo)
|
| 341 |
+
run_id = str(active_run["id"])
|
| 342 |
+
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
|
| 343 |
+
pulls = _document_rows(
|
| 344 |
+
get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
|
| 345 |
+
)
|
| 346 |
+
return {
|
| 347 |
+
"repo": active_run["repo"],
|
| 348 |
+
"snapshot_id": active_run["snapshot_id"],
|
| 349 |
+
"run_id": run_id,
|
| 350 |
+
"contributor": contributor,
|
| 351 |
+
"pulls": pulls,
|
| 352 |
+
"pull_count": len(pulls),
|
| 353 |
+
}
|
| 354 |
+
finally:
|
| 355 |
+
connection.close()
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def get_pr_search_pull_contributor(
|
| 359 |
+
db_path: Path,
|
| 360 |
+
*,
|
| 361 |
+
pr_number: int,
|
| 362 |
+
repo: str | None = None,
|
| 363 |
+
) -> dict[str, Any]:
|
| 364 |
+
connection = connect_pr_search_db(db_path, read_only=True)
|
| 365 |
+
try:
|
| 366 |
+
active_run = resolve_active_run(connection, repo=repo)
|
| 367 |
+
run_id = str(active_run["id"])
|
| 368 |
+
document = _require_document(connection, run_id=run_id, pr_number=pr_number)
|
| 369 |
+
author_login = str(document.get("author_login") or "").strip()
|
| 370 |
+
if not author_login:
|
| 371 |
+
raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
|
| 372 |
+
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
|
| 373 |
+
return {
|
| 374 |
+
"repo": active_run["repo"],
|
| 375 |
+
"snapshot_id": active_run["snapshot_id"],
|
| 376 |
+
"run_id": run_id,
|
| 377 |
+
"pr": _without_json_fields(document),
|
| 378 |
+
"contributor": contributor,
|
| 379 |
+
}
|
| 380 |
+
finally:
|
| 381 |
+
connection.close()
|
| 382 |
+
|
| 383 |
+
|
| 384 |
def get_pr_search_similar_lookup(
|
| 385 |
db_path: Path,
|
| 386 |
*,
|
|
|
|
| 892 |
return feature
|
| 893 |
|
| 894 |
|
| 895 |
+
def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
|
| 896 |
+
contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
|
| 897 |
+
if contributor is None:
|
| 898 |
+
raise ValueError(
|
| 899 |
+
f"Contributor {author_login!r} was not found in the active indexed universe."
|
| 900 |
+
)
|
| 901 |
+
return _contributor_row(contributor)
|
| 902 |
+
|
| 903 |
+
|
| 904 |
def _json_list(raw: Any) -> list[str]:
|
| 905 |
if isinstance(raw, list):
|
| 906 |
return [str(item) for item in raw]
|
|
|
|
| 938 |
return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
|
| 939 |
|
| 940 |
|
| 941 |
+
def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
|
| 942 |
+
return [_without_json_fields(row) for row in rows]
|
| 943 |
+
|
| 944 |
+
|
| 945 |
+
def _contributor_rows(
|
| 946 |
+
rows: list[Mapping[str, Any]],
|
| 947 |
+
*,
|
| 948 |
+
run_id: str,
|
| 949 |
+
repo: str,
|
| 950 |
+
snapshot_id: str,
|
| 951 |
+
) -> list[dict[str, Any]]:
|
| 952 |
+
return [
|
| 953 |
+
{
|
| 954 |
+
"run_id": run_id,
|
| 955 |
+
"repo": repo,
|
| 956 |
+
"snapshot_id": snapshot_id,
|
| 957 |
+
"report_generated_at": row.get("report_generated_at"),
|
| 958 |
+
"window_days": row.get("window_days"),
|
| 959 |
+
"author_login": row.get("author_login"),
|
| 960 |
+
"name": row.get("name"),
|
| 961 |
+
"profile_url": row.get("profile_url"),
|
| 962 |
+
"repo_pull_requests_url": row.get("repo_pull_requests_url"),
|
| 963 |
+
"repo_issues_url": row.get("repo_issues_url"),
|
| 964 |
+
"repo_first_seen_at": row.get("repo_first_seen_at"),
|
| 965 |
+
"repo_last_seen_at": row.get("repo_last_seen_at"),
|
| 966 |
+
"repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
|
| 967 |
+
"repo_artifact_count": row.get("repo_artifact_count"),
|
| 968 |
+
"snapshot_issue_count": row.get("snapshot_issue_count"),
|
| 969 |
+
"snapshot_pr_count": row.get("snapshot_pr_count"),
|
| 970 |
+
"snapshot_comment_count": row.get("snapshot_comment_count"),
|
| 971 |
+
"snapshot_review_count": row.get("snapshot_review_count"),
|
| 972 |
+
"snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
|
| 973 |
+
"repo_association": row.get("repo_association"),
|
| 974 |
+
"new_to_repo": row.get("new_to_repo"),
|
| 975 |
+
"first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
|
| 976 |
+
"report_reason": row.get("report_reason"),
|
| 977 |
+
"account_age_days": row.get("account_age_days"),
|
| 978 |
+
"young_account": row.get("young_account"),
|
| 979 |
+
"follow_through_score": row.get("follow_through_score"),
|
| 980 |
+
"breadth_score": row.get("breadth_score"),
|
| 981 |
+
"automation_risk_signal": row.get("automation_risk_signal"),
|
| 982 |
+
"heuristic_note": row.get("heuristic_note"),
|
| 983 |
+
"public_orgs_json": row.get("public_orgs"),
|
| 984 |
+
"visible_authored_pr_count": row.get("visible_authored_pr_count"),
|
| 985 |
+
"merged_pr_count": row.get("merged_pr_count"),
|
| 986 |
+
"closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
|
| 987 |
+
"open_pr_count": row.get("open_pr_count"),
|
| 988 |
+
"merged_pr_rate": row.get("merged_pr_rate"),
|
| 989 |
+
"closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
|
| 990 |
+
"still_open_pr_rate": row.get("still_open_pr_rate"),
|
| 991 |
+
"distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
|
| 992 |
+
"distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
|
| 993 |
+
"fetch_error": row.get("fetch_error"),
|
| 994 |
+
}
|
| 995 |
+
for row in rows
|
| 996 |
+
]
|
| 997 |
+
|
| 998 |
+
|
| 999 |
+
def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
|
| 1000 |
+
return {
|
| 1001 |
+
**_without_json_fields(row),
|
| 1002 |
+
"public_orgs": _json_list(row.get("public_orgs_json")),
|
| 1003 |
+
}
|
| 1004 |
+
|
| 1005 |
+
|
| 1006 |
def _normalize_lookup_mode(mode: str) -> str:
|
| 1007 |
normalized = mode.strip().lower()
|
| 1008 |
if normalized not in {"auto", "indexed", "live"}:
|
src/slop_farmer/reports/read_views.py
CHANGED
|
@@ -5,6 +5,11 @@ from pathlib import Path
|
|
| 5 |
from typing import Any, Literal
|
| 6 |
|
| 7 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
|
| 10 |
|
|
@@ -252,7 +257,8 @@ def get_issue_best(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str,
|
|
| 252 |
def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
|
| 253 |
metadata = _snapshot_metadata(snapshot_dir)
|
| 254 |
report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
|
| 255 |
-
|
|
|
|
| 256 |
return {
|
| 257 |
"repo": str(report.get("repo") or metadata.repo),
|
| 258 |
"snapshot_id": str(report.get("snapshot_id") or metadata.snapshot_id),
|
|
@@ -321,7 +327,12 @@ def _analysis_context(
|
|
| 321 |
snapshot_dir: Path,
|
| 322 |
*,
|
| 323 |
variant: AnalysisVariant,
|
| 324 |
-
) -> tuple[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
metadata = _snapshot_metadata(snapshot_dir)
|
| 326 |
selection = _select_analysis_report(_analysis_candidates(snapshot_dir), variant=variant)
|
| 327 |
issue_map, pr_map = _artifact_maps(snapshot_dir)
|
|
@@ -395,16 +406,38 @@ def _select_analysis_report(
|
|
| 395 |
|
| 396 |
|
| 397 |
def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
|
| 398 |
-
ordered = [
|
| 399 |
-
|
| 400 |
-
snapshot_dir
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
ordered.extend(
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
)
|
| 407 |
-
return [path for path in
|
| 408 |
|
| 409 |
|
| 410 |
def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
|
|
@@ -448,15 +481,25 @@ def _analysis_counts(payload: dict[str, Any]) -> dict[str, int]:
|
|
| 448 |
}
|
| 449 |
|
| 450 |
|
| 451 |
-
def _artifact_maps(
|
| 452 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
pr_rows = (
|
| 454 |
read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 455 |
if (snapshot_dir / "pull_requests.parquet").exists()
|
| 456 |
else []
|
| 457 |
)
|
| 458 |
-
issue_map = {
|
| 459 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
return issue_map, pr_map
|
| 461 |
|
| 462 |
|
|
@@ -474,7 +517,9 @@ def _issue_cluster_summary(
|
|
| 474 |
return {
|
| 475 |
"rank": rank,
|
| 476 |
"cluster_id": str(cluster.get("cluster_id") or f"cluster-{rank or 0}"),
|
| 477 |
-
"title": _cluster_title(
|
|
|
|
|
|
|
| 478 |
"summary": cluster.get("summary"),
|
| 479 |
"status": cluster.get("status"),
|
| 480 |
"confidence": _coerce_float(cluster.get("confidence")),
|
|
@@ -518,7 +563,9 @@ def _cluster_url(
|
|
| 518 |
issue_map: dict[int, dict[str, Any]],
|
| 519 |
pr_map: dict[int, dict[str, Any]],
|
| 520 |
) -> str | None:
|
| 521 |
-
return _url_for_issue(canonical_issue_number, issue_map) or _url_for_pr(
|
|
|
|
|
|
|
| 522 |
|
| 523 |
|
| 524 |
def _duplicate_pr_summary(
|
|
@@ -605,7 +652,8 @@ def _pr_member_row(number: int, row: dict[str, Any] | None, *, role: str) -> dic
|
|
| 605 |
|
| 606 |
|
| 607 |
def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
|
| 608 |
-
|
|
|
|
| 609 |
return {
|
| 610 |
"rank": rank,
|
| 611 |
"author_login": contributor.get("author_login"),
|
|
@@ -629,7 +677,8 @@ def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None
|
|
| 629 |
|
| 630 |
|
| 631 |
def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
|
| 632 |
-
|
|
|
|
| 633 |
return {
|
| 634 |
"automation_risk_signal": contributor.get("automation_risk_signal"),
|
| 635 |
"heuristic_note": contributor.get("heuristic_note"),
|
|
|
|
| 5 |
from typing import Any, Literal
|
| 6 |
|
| 7 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 8 |
+
from slop_farmer.data.snapshot_paths import (
|
| 9 |
+
CURRENT_ANALYSIS_MANIFEST_PATH,
|
| 10 |
+
load_current_analysis_manifest,
|
| 11 |
+
repo_relative_path_to_local,
|
| 12 |
+
)
|
| 13 |
|
| 14 |
AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
|
| 15 |
|
|
|
|
| 257 |
def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
|
| 258 |
metadata = _snapshot_metadata(snapshot_dir)
|
| 259 |
report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
|
| 260 |
+
raw_contributors = report.get("contributors")
|
| 261 |
+
contributors: list[Any] = raw_contributors if isinstance(raw_contributors, list) else []
|
| 262 |
return {
|
| 263 |
"repo": str(report.get("repo") or metadata.repo),
|
| 264 |
"snapshot_id": str(report.get("snapshot_id") or metadata.snapshot_id),
|
|
|
|
| 327 |
snapshot_dir: Path,
|
| 328 |
*,
|
| 329 |
variant: AnalysisVariant,
|
| 330 |
+
) -> tuple[
|
| 331 |
+
_SnapshotMetadata,
|
| 332 |
+
_AnalysisSelection | None,
|
| 333 |
+
dict[int, dict[str, Any]],
|
| 334 |
+
dict[int, dict[str, Any]],
|
| 335 |
+
]:
|
| 336 |
metadata = _snapshot_metadata(snapshot_dir)
|
| 337 |
selection = _select_analysis_report(_analysis_candidates(snapshot_dir), variant=variant)
|
| 338 |
issue_map, pr_map = _artifact_maps(snapshot_dir)
|
|
|
|
| 406 |
|
| 407 |
|
| 408 |
def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
|
| 409 |
+
ordered: list[Path] = []
|
| 410 |
+
current_manifest_path = repo_relative_path_to_local(
|
| 411 |
+
snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH
|
| 412 |
+
)
|
| 413 |
+
if current_manifest_path.exists():
|
| 414 |
+
try:
|
| 415 |
+
current_manifest = load_current_analysis_manifest(current_manifest_path)
|
| 416 |
+
except ValueError:
|
| 417 |
+
current_manifest = None
|
| 418 |
+
if current_manifest is not None:
|
| 419 |
+
for artifact_path in (current_manifest.get("artifacts") or {}).values():
|
| 420 |
+
if not isinstance(artifact_path, str):
|
| 421 |
+
continue
|
| 422 |
+
ordered.append(repo_relative_path_to_local(snapshot_dir, artifact_path))
|
| 423 |
ordered.extend(
|
| 424 |
+
[
|
| 425 |
+
snapshot_dir / "analysis-report-hybrid.json",
|
| 426 |
+
snapshot_dir / "analysis-report-deterministic.json",
|
| 427 |
+
snapshot_dir / "analysis-report.json",
|
| 428 |
+
]
|
| 429 |
+
)
|
| 430 |
+
seen: set[Path] = set()
|
| 431 |
+
deduped: list[Path] = []
|
| 432 |
+
for path in ordered:
|
| 433 |
+
if path in seen:
|
| 434 |
+
continue
|
| 435 |
+
seen.add(path)
|
| 436 |
+
deduped.append(path)
|
| 437 |
+
deduped.extend(
|
| 438 |
+
path for path in sorted(snapshot_dir.glob("analysis-report*.json")) if path not in seen
|
| 439 |
)
|
| 440 |
+
return [path for path in deduped if path.exists()]
|
| 441 |
|
| 442 |
|
| 443 |
def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
|
|
|
|
| 481 |
}
|
| 482 |
|
| 483 |
|
| 484 |
+
def _artifact_maps(
    snapshot_dir: Path,
) -> tuple[dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
    """Load issue and pull-request rows from a snapshot, keyed by number.

    Reads ``issues.parquet`` and ``pull_requests.parquet`` from
    ``snapshot_dir``; a missing file is treated as having no rows. Rows whose
    ``"number"`` does not pass ``_coerce_int`` are left out. When several rows
    share a number, the last one read wins.

    Returns:
        ``(issue_map, pr_map)``, each mapping the integer number to its row.
    """

    def _load_rows(filename: str) -> Any:
        # Only touch the parquet reader when the file is actually present.
        parquet_path = snapshot_dir / filename
        if parquet_path.exists():
            return read_parquet_rows(parquet_path)
        return []

    def _index_by_number(rows: Any) -> dict[int, dict[str, Any]]:
        indexed: dict[int, dict[str, Any]] = {}
        for row in rows:
            if _coerce_int(row.get("number")) is None:
                continue
            indexed[int(row["number"])] = row
        return indexed

    # Both files are read before either map is built.
    issue_rows = _load_rows("issues.parquet")
    pr_rows = _load_rows("pull_requests.parquet")
    return _index_by_number(issue_rows), _index_by_number(pr_rows)
|
| 504 |
|
| 505 |
|
|
|
|
| 517 |
return {
|
| 518 |
"rank": rank,
|
| 519 |
"cluster_id": str(cluster.get("cluster_id") or f"cluster-{rank or 0}"),
|
| 520 |
+
"title": _cluster_title(
|
| 521 |
+
cluster, issue_map, pr_map, canonical_issue_number, canonical_pr_number
|
| 522 |
+
),
|
| 523 |
"summary": cluster.get("summary"),
|
| 524 |
"status": cluster.get("status"),
|
| 525 |
"confidence": _coerce_float(cluster.get("confidence")),
|
|
|
|
| 563 |
issue_map: dict[int, dict[str, Any]],
|
| 564 |
pr_map: dict[int, dict[str, Any]],
|
| 565 |
) -> str | None:
|
| 566 |
+
return _url_for_issue(canonical_issue_number, issue_map) or _url_for_pr(
|
| 567 |
+
canonical_pr_number, pr_map
|
| 568 |
+
)
|
| 569 |
|
| 570 |
|
| 571 |
def _duplicate_pr_summary(
|
|
|
|
| 652 |
|
| 653 |
|
| 654 |
def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
|
| 655 |
+
raw_activity = contributor.get("activity")
|
| 656 |
+
activity: dict[str, Any] = raw_activity if isinstance(raw_activity, dict) else {}
|
| 657 |
return {
|
| 658 |
"rank": rank,
|
| 659 |
"author_login": contributor.get("author_login"),
|
|
|
|
| 677 |
|
| 678 |
|
| 679 |
def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
|
| 680 |
+
raw_activity = contributor.get("activity")
|
| 681 |
+
activity: dict[str, Any] = raw_activity if isinstance(raw_activity, dict) else {}
|
| 682 |
return {
|
| 683 |
"automation_risk_signal": contributor.get("automation_risk_signal"),
|
| 684 |
"heuristic_note": contributor.get("heuristic_note"),
|
uv.lock
CHANGED
|
@@ -561,7 +561,7 @@ wheels = [
|
|
| 561 |
|
| 562 |
[[package]]
|
| 563 |
name = "fast-agent-mcp"
|
| 564 |
-
version = "0.6.
|
| 565 |
source = { registry = "https://pypi.org/simple" }
|
| 566 |
dependencies = [
|
| 567 |
{ name = "a2a-sdk" },
|
|
@@ -598,9 +598,9 @@ dependencies = [
|
|
| 598 |
{ name = "uvloop", marker = "sys_platform != 'win32'" },
|
| 599 |
{ name = "watchfiles" },
|
| 600 |
]
|
| 601 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 602 |
wheels = [
|
| 603 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 604 |
]
|
| 605 |
|
| 606 |
[[package]]
|
|
@@ -820,34 +820,34 @@ wheels = [
|
|
| 820 |
|
| 821 |
[[package]]
|
| 822 |
name = "hf-xet"
|
| 823 |
-
version = "1.4.
|
| 824 |
-
source = { registry = "https://pypi.org/simple" }
|
| 825 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 826 |
-
wheels = [
|
| 827 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 828 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 829 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 830 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 831 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 832 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 833 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 834 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 835 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 836 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 837 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 838 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 839 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 840 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 841 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 842 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 843 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 844 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 845 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 846 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 847 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 848 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 849 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 850 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 851 |
]
|
| 852 |
|
| 853 |
[[package]]
|
|
@@ -902,7 +902,7 @@ wheels = [
|
|
| 902 |
|
| 903 |
[[package]]
|
| 904 |
name = "huggingface-hub"
|
| 905 |
-
version = "1.
|
| 906 |
source = { registry = "https://pypi.org/simple" }
|
| 907 |
dependencies = [
|
| 908 |
{ name = "filelock" },
|
|
@@ -915,9 +915,9 @@ dependencies = [
|
|
| 915 |
{ name = "typer" },
|
| 916 |
{ name = "typing-extensions" },
|
| 917 |
]
|
| 918 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 919 |
wheels = [
|
| 920 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 921 |
]
|
| 922 |
|
| 923 |
[[package]]
|
|
@@ -2366,7 +2366,7 @@ wheels = [
|
|
| 2366 |
|
| 2367 |
[[package]]
|
| 2368 |
name = "slop-farmer"
|
| 2369 |
-
version = "0.1.
|
| 2370 |
source = { editable = "." }
|
| 2371 |
dependencies = [
|
| 2372 |
{ name = "duckdb" },
|
|
@@ -2398,7 +2398,7 @@ requires-dist = [
|
|
| 2398 |
{ name = "fast-agent-mcp", marker = "python_full_version >= '3.13.5' and extra == 'llm'", specifier = ">=0.6.16" },
|
| 2399 |
{ name = "fastapi", specifier = ">=0.115.0" },
|
| 2400 |
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
|
| 2401 |
-
{ name = "huggingface-hub", specifier = ">=
|
| 2402 |
{ name = "pyarrow", specifier = ">=18.0.0" },
|
| 2403 |
{ name = "pydantic", specifier = ">=2.11" },
|
| 2404 |
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.0" },
|
|
|
|
| 561 |
|
| 562 |
[[package]]
|
| 563 |
name = "fast-agent-mcp"
|
| 564 |
+
version = "0.6.17"
|
| 565 |
source = { registry = "https://pypi.org/simple" }
|
| 566 |
dependencies = [
|
| 567 |
{ name = "a2a-sdk" },
|
|
|
|
| 598 |
{ name = "uvloop", marker = "sys_platform != 'win32'" },
|
| 599 |
{ name = "watchfiles" },
|
| 600 |
]
|
| 601 |
+
sdist = { url = "https://files.pythonhosted.org/packages/8c/a1/b6b1045345d38b342da3def7723a2dc6a44faff9c01fee6d81afbd272d62/fast_agent_mcp-0.6.17.tar.gz", hash = "sha256:a920113d47ef2ab82be1bd63b77d3bf78f8f862a5a6e91f1fd0aa931850fb25f", size = 2091401, upload-time = "2026-04-16T21:48:43.334Z" }
|
| 602 |
wheels = [
|
| 603 |
+
{ url = "https://files.pythonhosted.org/packages/b4/ef/47e05d6fa95e04ed8ad60afac3ae29d8205894fb220ffde193bd33578f3a/fast_agent_mcp-0.6.17-py3-none-any.whl", hash = "sha256:a23c5a5ed8924e38809dabd31f994e5cc81b8c084e84632bb1eb246b257c4752", size = 1573794, upload-time = "2026-04-16T21:48:38.999Z" },
|
| 604 |
]
|
| 605 |
|
| 606 |
[[package]]
|
|
|
|
| 820 |
|
| 821 |
[[package]]
|
| 822 |
name = "hf-xet"
|
| 823 |
+
version = "1.4.3"
|
| 824 |
+
source = { registry = "https://pypi.org/simple" }
|
| 825 |
+
sdist = { url = "https://files.pythonhosted.org/packages/53/92/ec9ad04d0b5728dca387a45af7bc98fbb0d73b2118759f5f6038b61a57e8/hf_xet-1.4.3.tar.gz", hash = "sha256:8ddedb73c8c08928c793df2f3401ec26f95be7f7e516a7bee2fbb546f6676113", size = 670477, upload-time = "2026-03-31T22:40:07.874Z" }
|
| 826 |
+
wheels = [
|
| 827 |
+
{ url = "https://files.pythonhosted.org/packages/72/43/724d307b34e353da0abd476e02f72f735cdd2bc86082dee1b32ea0bfee1d/hf_xet-1.4.3-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7551659ba4f1e1074e9623996f28c3873682530aee0a846b7f2f066239228144", size = 3800935, upload-time = "2026-03-31T22:39:49.618Z" },
|
| 828 |
+
{ url = "https://files.pythonhosted.org/packages/2b/d2/8bee5996b699262edb87dbb54118d287c0e1b2fc78af7cdc41857ba5e3c4/hf_xet-1.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:bee693ada985e7045997f05f081d0e12c4c08bd7626dc397f8a7c487e6c04f7f", size = 3558942, upload-time = "2026-03-31T22:39:47.938Z" },
|
| 829 |
+
{ url = "https://files.pythonhosted.org/packages/c3/a1/e993d09cbe251196fb60812b09a58901c468127b7259d2bf0f68bf6088eb/hf_xet-1.4.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21644b404bb0100fe3857892f752c4d09642586fd988e61501c95bbf44b393a3", size = 4207657, upload-time = "2026-03-31T22:39:39.69Z" },
|
| 830 |
+
{ url = "https://files.pythonhosted.org/packages/64/44/9eb6d21e5c34c63e5e399803a6932fa983cabdf47c0ecbcfe7ea97684b8c/hf_xet-1.4.3-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:987f09cfe418237812896a6736b81b1af02a3a6dcb4b4944425c4c4fca7a7cf8", size = 3986765, upload-time = "2026-03-31T22:39:37.936Z" },
|
| 831 |
+
{ url = "https://files.pythonhosted.org/packages/ea/7b/8ad6f16fdb82f5f7284a34b5ec48645bd575bdcd2f6f0d1644775909c486/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:60cf7fc43a99da0a853345cf86d23738c03983ee5249613a6305d3e57a5dca74", size = 4188162, upload-time = "2026-03-31T22:39:58.382Z" },
|
| 832 |
+
{ url = "https://files.pythonhosted.org/packages/1b/c4/39d6e136cbeea9ca5a23aad4b33024319222adbdc059ebcda5fc7d9d5ff4/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2815a49a7a59f3e2edf0cf113ae88e8cb2ca2a221bf353fb60c609584f4884d4", size = 4424525, upload-time = "2026-03-31T22:40:00.225Z" },
|
| 833 |
+
{ url = "https://files.pythonhosted.org/packages/46/f2/adc32dae6bdbc367853118b9878139ac869419a4ae7ba07185dc31251b76/hf_xet-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:42ee323265f1e6a81b0e11094564fb7f7e0ec75b5105ffd91ae63f403a11931b", size = 3671610, upload-time = "2026-03-31T22:40:10.42Z" },
|
| 834 |
+
{ url = "https://files.pythonhosted.org/packages/e2/19/25d897dcc3f81953e0c2cde9ec186c7a0fee413eb0c9a7a9130d87d94d3a/hf_xet-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:27c976ba60079fb8217f485b9c5c7fcd21c90b0367753805f87cb9f3cdc4418a", size = 3528529, upload-time = "2026-03-31T22:40:09.106Z" },
|
| 835 |
+
{ url = "https://files.pythonhosted.org/packages/ec/36/3e8f85ca9fe09b8de2b2e10c63b3b3353d7dda88a0b3d426dffbe7b8313b/hf_xet-1.4.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:5251d5ece3a81815bae9abab41cf7ddb7bcb8f56411bce0827f4a3071c92fdc6", size = 3801019, upload-time = "2026-03-31T22:39:56.651Z" },
|
| 836 |
+
{ url = "https://files.pythonhosted.org/packages/b5/9c/defb6cb1de28bccb7bd8d95f6e60f72a3d3fa4cb3d0329c26fb9a488bfe7/hf_xet-1.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1feb0f3abeacee143367c326a128a2e2b60868ec12a36c225afb1d6c5a05e6d2", size = 3558746, upload-time = "2026-03-31T22:39:54.766Z" },
|
| 837 |
+
{ url = "https://files.pythonhosted.org/packages/c1/bd/8d001191893178ff8e826e46ad5299446e62b93cd164e17b0ffea08832ec/hf_xet-1.4.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b301fc150290ca90b4fccd079829b84bb4786747584ae08b94b4577d82fb791", size = 4207692, upload-time = "2026-03-31T22:39:46.246Z" },
|
| 838 |
+
{ url = "https://files.pythonhosted.org/packages/ce/48/6790b402803250e9936435613d3a78b9aaeee7973439f0918848dde58309/hf_xet-1.4.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:d972fbe95ddc0d3c0fc49b31a8a69f47db35c1e3699bf316421705741aab6653", size = 3986281, upload-time = "2026-03-31T22:39:44.648Z" },
|
| 839 |
+
{ url = "https://files.pythonhosted.org/packages/51/56/ea62552fe53db652a9099eda600b032d75554d0e86c12a73824bfedef88b/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c5b48db1ee344a805a1b9bd2cda9b6b65fe77ed3787bd6e87ad5521141d317cd", size = 4187414, upload-time = "2026-03-31T22:40:04.951Z" },
|
| 840 |
+
{ url = "https://files.pythonhosted.org/packages/7d/f5/bc1456d4638061bea997e6d2db60a1a613d7b200e0755965ec312dc1ef79/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:22bdc1f5fb8b15bf2831440b91d1c9bbceeb7e10c81a12e8d75889996a5c9da8", size = 4424368, upload-time = "2026-03-31T22:40:06.347Z" },
|
| 841 |
+
{ url = "https://files.pythonhosted.org/packages/e4/76/ab597bae87e1f06d18d3ecb8ed7f0d3c9a37037fc32ce76233d369273c64/hf_xet-1.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:0392c79b7cf48418cd61478c1a925246cf10639f4cd9d94368d8ca1e8df9ea07", size = 3672280, upload-time = "2026-03-31T22:40:16.401Z" },
|
| 842 |
+
{ url = "https://files.pythonhosted.org/packages/62/05/2e462d34e23a09a74d73785dbed71cc5dbad82a72eee2ad60a72a554155d/hf_xet-1.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:681c92a07796325778a79d76c67011764ecc9042a8c3579332b61b63ae512075", size = 3528945, upload-time = "2026-03-31T22:40:14.995Z" },
|
| 843 |
+
{ url = "https://files.pythonhosted.org/packages/ac/9f/9c23e4a447b8f83120798f9279d0297a4d1360bdbf59ef49ebec78fe2545/hf_xet-1.4.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d0da85329eaf196e03e90b84c2d0aca53bd4573d097a75f99609e80775f98025", size = 3805048, upload-time = "2026-03-31T22:39:53.105Z" },
|
| 844 |
+
{ url = "https://files.pythonhosted.org/packages/0b/f8/7aacb8e5f4a7899d39c787b5984e912e6c18b11be136ef13947d7a66d265/hf_xet-1.4.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e23717ce4186b265f69afa66e6f0069fe7efbf331546f5c313d00e123dc84583", size = 3562178, upload-time = "2026-03-31T22:39:51.295Z" },
|
| 845 |
+
{ url = "https://files.pythonhosted.org/packages/df/9a/a24b26dc8a65f0ecc0fe5be981a19e61e7ca963b85e062c083f3a9100529/hf_xet-1.4.3-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc360b70c815bf340ed56c7b8c63aacf11762a4b099b2fe2c9bd6d6068668c08", size = 4212320, upload-time = "2026-03-31T22:39:42.922Z" },
|
| 846 |
+
{ url = "https://files.pythonhosted.org/packages/53/60/46d493db155d2ee2801b71fb1b0fd67696359047fdd8caee2c914cc50c79/hf_xet-1.4.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:39f2d2e9654cd9b4319885733993807aab6de9dfbd34c42f0b78338d6617421f", size = 3991546, upload-time = "2026-03-31T22:39:41.335Z" },
|
| 847 |
+
{ url = "https://files.pythonhosted.org/packages/bc/f5/067363e1c96c6b17256910830d1b54099d06287e10f4ec6ec4e7e08371fc/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:49ad8a8cead2b56051aa84d7fce3e1335efe68df3cf6c058f22a65513885baac", size = 4193200, upload-time = "2026-03-31T22:40:01.936Z" },
|
| 848 |
+
{ url = "https://files.pythonhosted.org/packages/42/4b/53951592882d9c23080c7644542fda34a3813104e9e11fa1a7d82d419cb8/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7716d62015477a70ea272d2d68cd7cad140f61c52ee452e133e139abfe2c17ba", size = 4429392, upload-time = "2026-03-31T22:40:03.492Z" },
|
| 849 |
+
{ url = "https://files.pythonhosted.org/packages/8a/21/75a6c175b4e79662ad8e62f46a40ce341d8d6b206b06b4320d07d55b188c/hf_xet-1.4.3-cp37-abi3-win_amd64.whl", hash = "sha256:6b591fcad34e272a5b02607485e4f2a1334aebf1bc6d16ce8eb1eb8978ac2021", size = 3677359, upload-time = "2026-03-31T22:40:13.619Z" },
|
| 850 |
+
{ url = "https://files.pythonhosted.org/packages/8a/7c/44314ecd0e89f8b2b51c9d9e5e7a60a9c1c82024ac471d415860557d3cd8/hf_xet-1.4.3-cp37-abi3-win_arm64.whl", hash = "sha256:7c2c7e20bcfcc946dc67187c203463f5e932e395845d098cc2a93f5b67ca0b47", size = 3533664, upload-time = "2026-03-31T22:40:12.152Z" },
|
| 851 |
]
|
| 852 |
|
| 853 |
[[package]]
|
|
|
|
| 902 |
|
| 903 |
[[package]]
|
| 904 |
name = "huggingface-hub"
|
| 905 |
+
version = "1.11.0"
|
| 906 |
source = { registry = "https://pypi.org/simple" }
|
| 907 |
dependencies = [
|
| 908 |
{ name = "filelock" },
|
|
|
|
| 915 |
{ name = "typer" },
|
| 916 |
{ name = "typing-extensions" },
|
| 917 |
]
|
| 918 |
+
sdist = { url = "https://files.pythonhosted.org/packages/dc/89/e7aa12d8a6b9259bed10671abb25ae6fa437c0f88a86ecbf59617bae7759/huggingface_hub-1.11.0.tar.gz", hash = "sha256:15fb3713c7f9cdff7b808a94fd91664f661ab142796bb48c9cd9493e8d166278", size = 761749, upload-time = "2026-04-16T13:07:39.73Z" }
|
| 919 |
wheels = [
|
| 920 |
+
{ url = "https://files.pythonhosted.org/packages/37/02/4f3f8997d1ea7fe0146b343e5e14bd065fa87af790d07e5576d31b31cc18/huggingface_hub-1.11.0-py3-none-any.whl", hash = "sha256:42a6de0afbfeb5e022222d36398f029679db4eb4778801aafda32257ae9131ab", size = 645499, upload-time = "2026-04-16T13:07:37.716Z" },
|
| 921 |
]
|
| 922 |
|
| 923 |
[[package]]
|
|
|
|
| 2366 |
|
| 2367 |
[[package]]
|
| 2368 |
name = "slop-farmer"
|
| 2369 |
+
version = "0.1.1"
|
| 2370 |
source = { editable = "." }
|
| 2371 |
dependencies = [
|
| 2372 |
{ name = "duckdb" },
|
|
|
|
| 2398 |
{ name = "fast-agent-mcp", marker = "python_full_version >= '3.13.5' and extra == 'llm'", specifier = ">=0.6.16" },
|
| 2399 |
{ name = "fastapi", specifier = ">=0.115.0" },
|
| 2400 |
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
|
| 2401 |
+
{ name = "huggingface-hub", specifier = ">=1.11.0" },
|
| 2402 |
{ name = "pyarrow", specifier = ">=18.0.0" },
|
| 2403 |
{ name = "pydantic", specifier = ">=2.11" },
|
| 2404 |
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.0" },
|