Spaces:
Sleeping
Sleeping
Deploy PR search API with issues/contributors routes
Browse files
- README.md +5 -4
- pyproject.toml +8 -1
- src/slop_farmer.egg-info/PKG-INFO +42 -1
- src/slop_farmer.egg-info/SOURCES.txt +1 -2
- src/slop_farmer.egg-info/entry_points.txt +0 -1
- src/slop_farmer.egg-info/requires.txt +1 -1
- src/slop_farmer/__init__.py +1 -1
- src/slop_farmer/app/cli.py +0 -307
- src/slop_farmer/app/deploy.py +2 -11
- src/slop_farmer/app/hf_checkpoint_import.py +70 -10
- src/slop_farmer/app/pipeline.py +90 -12
- src/slop_farmer/app/pr_search.py +0 -74
- src/slop_farmer/app/pr_search_api.py +133 -83
- src/slop_farmer/app/workflow.py +0 -3
- src/slop_farmer/app_config.py +0 -22
- src/slop_farmer/config.py +0 -38
- src/slop_farmer/data/search_duckdb.py +0 -146
- src/slop_farmer/data/snapshot_materialize.py +6 -0
- src/slop_farmer/reports/analysis.py +17 -9
- src/slop_farmer/reports/dashboard.py +2 -9
- src/slop_farmer/reports/new_contributor_report.py +3 -11
- src/slop_farmer/reports/pr_scope.py +16 -9
- src/slop_farmer/reports/pr_search_scope.py +16 -12
- src/slop_farmer/reports/pr_search_service.py +1 -166
- src/slop_farmer/reports/read_views.py +742 -0
- uv.lock +4 -4
README.md
CHANGED
|
@@ -29,8 +29,9 @@ Defaults for this deployment:
|
|
| 29 |
CLI examples:
|
| 30 |
|
| 31 |
```bash
|
| 32 |
-
pr-search
|
| 33 |
-
pr-search
|
| 34 |
-
pr-search
|
| 35 |
-
pr-search --
|
|
|
|
| 36 |
```
|
|
|
|
| 29 |
CLI examples:
|
| 30 |
|
| 31 |
```bash
|
| 32 |
+
pr-search status
|
| 33 |
+
pr-search code similar 67096
|
| 34 |
+
pr-search code clusters for-pr 67096
|
| 35 |
+
pr-search issues list --limit 5
|
| 36 |
+
pr-search contributors list --limit 10
|
| 37 |
```
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "slop-farmer"
|
| 7 |
-
version = "0.1.
|
| 8 |
description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.13.5"
|
|
@@ -60,6 +60,13 @@ select = [
|
|
| 60 |
]
|
| 61 |
ignore = ["E501"]
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
[tool.slop-farmer.dashboard-data]
|
| 64 |
output-dir = "web/public/data"
|
| 65 |
window-days = 14
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "slop-farmer"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.13.5"
|
|
|
|
| 60 |
]
|
| 61 |
ignore = ["E501"]
|
| 62 |
|
| 63 |
+
[tool.slop-farmer.analyze]
|
| 64 |
+
output-dir = "eval_data"
|
| 65 |
+
hf-repo-id = "evalstate/transformers-pr"
|
| 66 |
+
ranking-backend = "hybrid"
|
| 67 |
+
model = "gpt-5.4-mini"
|
| 68 |
+
max-clusters = 10
|
| 69 |
+
|
| 70 |
[tool.slop-farmer.dashboard-data]
|
| 71 |
output-dir = "web/public/data"
|
| 72 |
window-days = 14
|
src/slop_farmer.egg-info/PKG-INFO
CHANGED
|
@@ -11,7 +11,7 @@ Requires-Dist: huggingface_hub>=0.30.0
|
|
| 11 |
Requires-Dist: pydantic>=2.11
|
| 12 |
Requires-Dist: PyYAML>=6.0.2
|
| 13 |
Requires-Dist: rank-bm25>=0.2.2
|
| 14 |
-
Requires-Dist: fast-agent-mcp>=0.6.
|
| 15 |
Requires-Dist: uvicorn>=0.34.0
|
| 16 |
Provides-Extra: dev
|
| 17 |
Requires-Dist: httpx>=0.28.0; extra == "dev"
|
|
@@ -409,3 +409,44 @@ Or use the CLI wrapper with a YAML config:
|
|
| 409 |
```bash
|
| 410 |
uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
|
| 411 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
Requires-Dist: pydantic>=2.11
|
| 12 |
Requires-Dist: PyYAML>=6.0.2
|
| 13 |
Requires-Dist: rank-bm25>=0.2.2
|
| 14 |
+
Requires-Dist: fast-agent-mcp>=0.6.17
|
| 15 |
Requires-Dist: uvicorn>=0.34.0
|
| 16 |
Provides-Extra: dev
|
| 17 |
Requires-Dist: httpx>=0.28.0; extra == "dev"
|
|
|
|
| 409 |
```bash
|
| 410 |
uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
|
| 411 |
```
|
| 412 |
+
|
| 413 |
+
## Deploy the PR similarity API to a Hugging Face Docker Space
|
| 414 |
+
|
| 415 |
+
The repo includes the FastAPI service for the read-oriented PR similarity surface.
|
| 416 |
+
The standalone `pr-search` client now lives in the downstream `pr-search-cli`
|
| 417 |
+
package.
|
| 418 |
+
|
| 419 |
+
Deploy the OpenClaw API Space with:
|
| 420 |
+
|
| 421 |
+
```bash
|
| 422 |
+
scripts/update_openclaw_pr_search_api.sh
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
Or use the generic deploy script directly:
|
| 426 |
+
|
| 427 |
+
```bash
|
| 428 |
+
SPACE_ID=evalstate/openclaw-pr-api \
|
| 429 |
+
SPACE_TITLE="OpenClaw PR API" \
|
| 430 |
+
DEFAULT_REPO=openclaw/openclaw \
|
| 431 |
+
GHR_BASE_URL=https://ghreplica.dutiful.dev \
|
| 432 |
+
HF_REPO_ID=evalstate/openclaw-pr \
|
| 433 |
+
BUCKET_ID=evalstate/openclaw-pr-api-data \
|
| 434 |
+
scripts/deploy_pr_search_space.sh
|
| 435 |
+
```
|
| 436 |
+
|
| 437 |
+
This deploy flow:
|
| 438 |
+
|
| 439 |
+
- creates or updates a Docker Space
|
| 440 |
+
- uploads a minimal app bundle with a generated Space `README.md`
|
| 441 |
+
- sets runtime variables for the API
|
| 442 |
+
- mounts the configured HF bucket at `/data`
|
| 443 |
+
|
| 444 |
+
After the Space is live, you can query it either through the in-repo admin CLI:
|
| 445 |
+
|
| 446 |
+
```bash
|
| 447 |
+
uv run slop-farmer pr-search status --repo openclaw/openclaw
|
| 448 |
+
uv run slop-farmer pr-search similar 67096 --repo openclaw/openclaw
|
| 449 |
+
```
|
| 450 |
+
|
| 451 |
+
Or through the downstream `pr-search-cli` package, which owns the standalone
|
| 452 |
+
`pr-search` executable.
|
src/slop_farmer.egg-info/SOURCES.txt
CHANGED
|
@@ -17,7 +17,6 @@ src/slop_farmer/app/hf_checkpoint_import.py
|
|
| 17 |
src/slop_farmer/app/pipeline.py
|
| 18 |
src/slop_farmer/app/pr_search.py
|
| 19 |
src/slop_farmer/app/pr_search_api.py
|
| 20 |
-
src/slop_farmer/app/pr_search_client.py
|
| 21 |
src/slop_farmer/app/publish.py
|
| 22 |
src/slop_farmer/app/snapshot_state.py
|
| 23 |
src/slop_farmer/app/workflow.py
|
|
@@ -42,6 +41,7 @@ src/slop_farmer/reports/pr_heuristics.py
|
|
| 42 |
src/slop_farmer/reports/pr_scope.py
|
| 43 |
src/slop_farmer/reports/pr_search_scope.py
|
| 44 |
src/slop_farmer/reports/pr_search_service.py
|
|
|
|
| 45 |
src/slop_farmer/reports/user_activity.py
|
| 46 |
tests/test_analysis.py
|
| 47 |
tests/test_analysis_cache.py
|
|
@@ -61,7 +61,6 @@ tests/test_pipeline_checkpoint_resume.py
|
|
| 61 |
tests/test_pr_scope.py
|
| 62 |
tests/test_pr_search.py
|
| 63 |
tests/test_pr_search_api.py
|
| 64 |
-
tests/test_pr_search_client.py
|
| 65 |
tests/test_publish.py
|
| 66 |
tests/test_snapshot_state.py
|
| 67 |
tests/test_update_transformers_dataset.py
|
|
|
|
| 17 |
src/slop_farmer/app/pipeline.py
|
| 18 |
src/slop_farmer/app/pr_search.py
|
| 19 |
src/slop_farmer/app/pr_search_api.py
|
|
|
|
| 20 |
src/slop_farmer/app/publish.py
|
| 21 |
src/slop_farmer/app/snapshot_state.py
|
| 22 |
src/slop_farmer/app/workflow.py
|
|
|
|
| 41 |
src/slop_farmer/reports/pr_scope.py
|
| 42 |
src/slop_farmer/reports/pr_search_scope.py
|
| 43 |
src/slop_farmer/reports/pr_search_service.py
|
| 44 |
+
src/slop_farmer/reports/read_views.py
|
| 45 |
src/slop_farmer/reports/user_activity.py
|
| 46 |
tests/test_analysis.py
|
| 47 |
tests/test_analysis_cache.py
|
|
|
|
| 61 |
tests/test_pr_scope.py
|
| 62 |
tests/test_pr_search.py
|
| 63 |
tests/test_pr_search_api.py
|
|
|
|
| 64 |
tests/test_publish.py
|
| 65 |
tests/test_snapshot_state.py
|
| 66 |
tests/test_update_transformers_dataset.py
|
src/slop_farmer.egg-info/entry_points.txt
CHANGED
|
@@ -1,3 +1,2 @@
|
|
| 1 |
[console_scripts]
|
| 2 |
-
pr-search = slop_farmer.app.pr_search_client:main
|
| 3 |
slop-farmer = slop_farmer.app.cli:main
|
|
|
|
| 1 |
[console_scripts]
|
|
|
|
| 2 |
slop-farmer = slop_farmer.app.cli:main
|
src/slop_farmer.egg-info/requires.txt
CHANGED
|
@@ -5,7 +5,7 @@ huggingface_hub>=0.30.0
|
|
| 5 |
pydantic>=2.11
|
| 6 |
PyYAML>=6.0.2
|
| 7 |
rank-bm25>=0.2.2
|
| 8 |
-
fast-agent-mcp>=0.6.
|
| 9 |
uvicorn>=0.34.0
|
| 10 |
|
| 11 |
[dev]
|
|
|
|
| 5 |
pydantic>=2.11
|
| 6 |
PyYAML>=6.0.2
|
| 7 |
rank-bm25>=0.2.2
|
| 8 |
+
fast-agent-mcp>=0.6.17
|
| 9 |
uvicorn>=0.34.0
|
| 10 |
|
| 11 |
[dev]
|
src/slop_farmer/__init__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
__all__ = ["__version__"]
|
| 2 |
|
| 3 |
-
__version__ = "0.1.
|
|
|
|
| 1 |
__all__ = ["__version__"]
|
| 2 |
|
| 3 |
+
__version__ = "0.1.0"
|
src/slop_farmer/app/cli.py
CHANGED
|
@@ -13,8 +13,6 @@ from slop_farmer.config import (
|
|
| 13 |
AnalysisOptions,
|
| 14 |
CheckpointImportOptions,
|
| 15 |
DashboardDataOptions,
|
| 16 |
-
DatasetRefreshOptions,
|
| 17 |
-
DatasetStatusOptions,
|
| 18 |
DeployDashboardOptions,
|
| 19 |
FullPipelineOptions,
|
| 20 |
MarkdownReportOptions,
|
|
@@ -43,7 +41,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
|
| 43 |
subparsers = parser.add_subparsers(dest="command", required=True)
|
| 44 |
|
| 45 |
_add_scrape_parser(subparsers, defaults["scrape"])
|
| 46 |
-
_add_refresh_dataset_parser(subparsers, defaults["refresh-dataset"])
|
| 47 |
_add_analyze_parser(subparsers, defaults["analyze"])
|
| 48 |
_add_pr_scope_parser(subparsers, defaults["pr-scope"])
|
| 49 |
_add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
|
|
@@ -55,7 +52,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
|
| 55 |
_add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
|
| 56 |
_add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
|
| 57 |
_add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
|
| 58 |
-
_add_dataset_status_parser(subparsers, defaults["dataset-status"])
|
| 59 |
_add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
|
| 60 |
return parser
|
| 61 |
|
|
@@ -63,7 +59,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
|
|
| 63 |
def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
|
| 64 |
commands = (
|
| 65 |
"scrape",
|
| 66 |
-
"refresh-dataset",
|
| 67 |
"analyze",
|
| 68 |
"import-hf-checkpoint",
|
| 69 |
"pr-scope",
|
|
@@ -73,7 +68,6 @@ def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]
|
|
| 73 |
"dashboard-data",
|
| 74 |
"publish-snapshot",
|
| 75 |
"deploy-dashboard",
|
| 76 |
-
"dataset-status",
|
| 77 |
"full-pipeline",
|
| 78 |
)
|
| 79 |
return {command: command_defaults(command, config_path=config_path) for command in commands}
|
|
@@ -190,80 +184,6 @@ def _add_scrape_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 190 |
)
|
| 191 |
|
| 192 |
|
| 193 |
-
def _add_refresh_dataset_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 194 |
-
refresh = subparsers.add_parser(
|
| 195 |
-
"refresh-dataset",
|
| 196 |
-
help="Refresh the canonical Hugging Face dataset repo from remote watermark state.",
|
| 197 |
-
)
|
| 198 |
-
refresh.add_argument(
|
| 199 |
-
"--repo",
|
| 200 |
-
default=defaults.get("repo", "huggingface/transformers"),
|
| 201 |
-
help="GitHub repository in owner/name form.",
|
| 202 |
-
)
|
| 203 |
-
refresh.add_argument(
|
| 204 |
-
"--hf-repo-id",
|
| 205 |
-
default=defaults.get("hf-repo-id"),
|
| 206 |
-
required=defaults.get("hf-repo-id") is None,
|
| 207 |
-
help="Canonical Hugging Face dataset repo id to refresh.",
|
| 208 |
-
)
|
| 209 |
-
refresh.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
|
| 210 |
-
refresh.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
|
| 211 |
-
refresh.add_argument(
|
| 212 |
-
"--max-issue-comments", type=int, default=defaults.get("max-issue-comments")
|
| 213 |
-
)
|
| 214 |
-
refresh.add_argument(
|
| 215 |
-
"--max-reviews-per-pr", type=int, default=defaults.get("max-reviews-per-pr")
|
| 216 |
-
)
|
| 217 |
-
refresh.add_argument(
|
| 218 |
-
"--max-review-comments-per-pr",
|
| 219 |
-
type=int,
|
| 220 |
-
default=defaults.get("max-review-comments-per-pr"),
|
| 221 |
-
)
|
| 222 |
-
refresh.add_argument(
|
| 223 |
-
"--fetch-timeline",
|
| 224 |
-
action="store_true",
|
| 225 |
-
default=bool(defaults.get("fetch-timeline", False)),
|
| 226 |
-
)
|
| 227 |
-
refresh.add_argument(
|
| 228 |
-
"--new-contributor-report",
|
| 229 |
-
dest="new_contributor_report",
|
| 230 |
-
action="store_true",
|
| 231 |
-
default=bool(defaults.get("new-contributor-report", True)),
|
| 232 |
-
)
|
| 233 |
-
refresh.add_argument(
|
| 234 |
-
"--no-new-contributor-report",
|
| 235 |
-
dest="new_contributor_report",
|
| 236 |
-
action="store_false",
|
| 237 |
-
)
|
| 238 |
-
refresh.add_argument(
|
| 239 |
-
"--new-contributor-window-days",
|
| 240 |
-
type=int,
|
| 241 |
-
default=int(defaults.get("new-contributor-window-days", 42)),
|
| 242 |
-
)
|
| 243 |
-
refresh.add_argument(
|
| 244 |
-
"--new-contributor-max-authors",
|
| 245 |
-
type=int,
|
| 246 |
-
default=int(defaults.get("new-contributor-max-authors", 25)),
|
| 247 |
-
)
|
| 248 |
-
refresh.add_argument("--http-timeout", type=int, default=300)
|
| 249 |
-
refresh.add_argument("--http-max-retries", type=int, default=8)
|
| 250 |
-
refresh.add_argument("--checkpoint-every-comments", type=int, default=1000)
|
| 251 |
-
refresh.add_argument("--checkpoint-every-prs", type=int, default=25)
|
| 252 |
-
refresh.add_argument(
|
| 253 |
-
"--private-hf-repo",
|
| 254 |
-
dest="private_hf_repo",
|
| 255 |
-
action="store_true",
|
| 256 |
-
default=bool(defaults.get("private-hf-repo", False)),
|
| 257 |
-
help="Create the target dataset repo as private if needed.",
|
| 258 |
-
)
|
| 259 |
-
refresh.add_argument(
|
| 260 |
-
"--private",
|
| 261 |
-
dest="private_hf_repo",
|
| 262 |
-
action="store_true",
|
| 263 |
-
help=argparse.SUPPRESS,
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
|
| 267 |
def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 268 |
analyze = subparsers.add_parser(
|
| 269 |
"analyze", help="Analyze a local snapshot and write a shortlist JSON report."
|
|
@@ -717,61 +637,6 @@ def _add_pr_search_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
|
| 717 |
status.add_argument("--repo", help="Optional repo override.")
|
| 718 |
status.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 719 |
|
| 720 |
-
contributor = pr_search_subparsers.add_parser(
|
| 721 |
-
"contributor", help="Show indexed contributor summary for one author login."
|
| 722 |
-
)
|
| 723 |
-
contributor.add_argument("login", help="GitHub author login to query.")
|
| 724 |
-
contributor.add_argument(
|
| 725 |
-
"--db",
|
| 726 |
-
type=Path,
|
| 727 |
-
default=Path(defaults["db"]) if defaults.get("db") else None,
|
| 728 |
-
help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
|
| 729 |
-
)
|
| 730 |
-
contributor.add_argument(
|
| 731 |
-
"--output-dir",
|
| 732 |
-
type=Path,
|
| 733 |
-
default=Path(defaults.get("output-dir", "data")),
|
| 734 |
-
)
|
| 735 |
-
contributor.add_argument("--repo", help="Optional repo override.")
|
| 736 |
-
contributor.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 737 |
-
|
| 738 |
-
contributor_prs = pr_search_subparsers.add_parser(
|
| 739 |
-
"contributor-prs", help="List indexed PRs for one contributor login."
|
| 740 |
-
)
|
| 741 |
-
contributor_prs.add_argument("login", help="GitHub author login to query.")
|
| 742 |
-
contributor_prs.add_argument(
|
| 743 |
-
"--db",
|
| 744 |
-
type=Path,
|
| 745 |
-
default=Path(defaults["db"]) if defaults.get("db") else None,
|
| 746 |
-
help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
|
| 747 |
-
)
|
| 748 |
-
contributor_prs.add_argument(
|
| 749 |
-
"--output-dir",
|
| 750 |
-
type=Path,
|
| 751 |
-
default=Path(defaults.get("output-dir", "data")),
|
| 752 |
-
)
|
| 753 |
-
contributor_prs.add_argument("--repo", help="Optional repo override.")
|
| 754 |
-
contributor_prs.add_argument("--limit", type=int, default=20, help="Maximum rows to show.")
|
| 755 |
-
contributor_prs.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 756 |
-
|
| 757 |
-
pr_contributor = pr_search_subparsers.add_parser(
|
| 758 |
-
"pr-contributor", help="Show contributor summary for the author of one indexed PR."
|
| 759 |
-
)
|
| 760 |
-
pr_contributor.add_argument("pr_number", type=int, help="Pull request number to query.")
|
| 761 |
-
pr_contributor.add_argument(
|
| 762 |
-
"--db",
|
| 763 |
-
type=Path,
|
| 764 |
-
default=Path(defaults["db"]) if defaults.get("db") else None,
|
| 765 |
-
help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
|
| 766 |
-
)
|
| 767 |
-
pr_contributor.add_argument(
|
| 768 |
-
"--output-dir",
|
| 769 |
-
type=Path,
|
| 770 |
-
default=Path(defaults.get("output-dir", "data")),
|
| 771 |
-
)
|
| 772 |
-
pr_contributor.add_argument("--repo", help="Optional repo override.")
|
| 773 |
-
pr_contributor.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 774 |
-
|
| 775 |
|
| 776 |
def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 777 |
new_contributor = subparsers.add_parser(
|
|
@@ -794,24 +659,6 @@ def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]
|
|
| 794 |
new_contributor.add_argument(
|
| 795 |
"--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
|
| 796 |
)
|
| 797 |
-
new_contributor.add_argument(
|
| 798 |
-
"--hf-repo-id",
|
| 799 |
-
default=defaults.get("hf-repo-id"),
|
| 800 |
-
help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
|
| 801 |
-
)
|
| 802 |
-
new_contributor.add_argument(
|
| 803 |
-
"--hf-revision",
|
| 804 |
-
default=defaults.get("hf-revision"),
|
| 805 |
-
help="Optional Hub revision for metadata and README download.",
|
| 806 |
-
)
|
| 807 |
-
new_contributor.add_argument(
|
| 808 |
-
"--hf-materialize-dir",
|
| 809 |
-
type=Path,
|
| 810 |
-
default=Path(defaults["hf-materialize-dir"])
|
| 811 |
-
if defaults.get("hf-materialize-dir")
|
| 812 |
-
else None,
|
| 813 |
-
help="Optional local directory used when materializing an HF dataset snapshot.",
|
| 814 |
-
)
|
| 815 |
new_contributor.add_argument(
|
| 816 |
"--window-days",
|
| 817 |
type=int,
|
|
@@ -855,24 +702,6 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
|
|
| 855 |
type=Path,
|
| 856 |
help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
|
| 857 |
)
|
| 858 |
-
dashboard.add_argument(
|
| 859 |
-
"--hf-repo-id",
|
| 860 |
-
default=defaults.get("hf-repo-id"),
|
| 861 |
-
help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
|
| 862 |
-
)
|
| 863 |
-
dashboard.add_argument(
|
| 864 |
-
"--hf-revision",
|
| 865 |
-
default=defaults.get("hf-revision"),
|
| 866 |
-
help="Optional Hub revision for metadata and README download.",
|
| 867 |
-
)
|
| 868 |
-
dashboard.add_argument(
|
| 869 |
-
"--hf-materialize-dir",
|
| 870 |
-
type=Path,
|
| 871 |
-
default=Path(defaults["hf-materialize-dir"])
|
| 872 |
-
if defaults.get("hf-materialize-dir")
|
| 873 |
-
else None,
|
| 874 |
-
help="Optional local directory used when materializing an HF dataset snapshot.",
|
| 875 |
-
)
|
| 876 |
dashboard.add_argument(
|
| 877 |
"--window-days",
|
| 878 |
type=int,
|
|
@@ -932,24 +761,6 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
|
|
| 932 |
deploy_dashboard.add_argument(
|
| 933 |
"--contributors-input", type=Path, help="Optional contributor report JSON override."
|
| 934 |
)
|
| 935 |
-
deploy_dashboard.add_argument(
|
| 936 |
-
"--hf-repo-id",
|
| 937 |
-
default=defaults.get("hf-repo-id"),
|
| 938 |
-
help="Materialize a Hugging Face dataset repo instead of using the latest local snapshot.",
|
| 939 |
-
)
|
| 940 |
-
deploy_dashboard.add_argument(
|
| 941 |
-
"--hf-revision",
|
| 942 |
-
default=defaults.get("hf-revision"),
|
| 943 |
-
help="Optional Hub revision for metadata and README download.",
|
| 944 |
-
)
|
| 945 |
-
deploy_dashboard.add_argument(
|
| 946 |
-
"--hf-materialize-dir",
|
| 947 |
-
type=Path,
|
| 948 |
-
default=Path(defaults["hf-materialize-dir"])
|
| 949 |
-
if defaults.get("hf-materialize-dir")
|
| 950 |
-
else None,
|
| 951 |
-
help="Optional local directory used when materializing an HF dataset snapshot.",
|
| 952 |
-
)
|
| 953 |
deploy_dashboard.add_argument(
|
| 954 |
"--refresh-contributors",
|
| 955 |
action="store_true",
|
|
@@ -1006,31 +817,6 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
|
|
| 1006 |
)
|
| 1007 |
|
| 1008 |
|
| 1009 |
-
def _add_dataset_status_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 1010 |
-
dataset_status = subparsers.add_parser(
|
| 1011 |
-
"dataset-status",
|
| 1012 |
-
help="Inspect canonical dataset freshness and the local latest pointer.",
|
| 1013 |
-
)
|
| 1014 |
-
dataset_status.add_argument("--repo", default=defaults.get("repo"))
|
| 1015 |
-
dataset_status.add_argument(
|
| 1016 |
-
"--output-dir",
|
| 1017 |
-
type=Path,
|
| 1018 |
-
default=Path(defaults.get("output-dir", "data")),
|
| 1019 |
-
help="Local workspace root containing snapshots/latest.json.",
|
| 1020 |
-
)
|
| 1021 |
-
dataset_status.add_argument(
|
| 1022 |
-
"--hf-repo-id",
|
| 1023 |
-
default=defaults.get("hf-repo-id"),
|
| 1024 |
-
help="Canonical Hugging Face dataset repo id to inspect.",
|
| 1025 |
-
)
|
| 1026 |
-
dataset_status.add_argument(
|
| 1027 |
-
"--hf-revision",
|
| 1028 |
-
default=defaults.get("hf-revision"),
|
| 1029 |
-
help="Optional Hub revision for metadata and README download.",
|
| 1030 |
-
)
|
| 1031 |
-
dataset_status.add_argument("--json", action="store_true", help="Emit machine-readable JSON.")
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 1035 |
full_pipeline = subparsers.add_parser(
|
| 1036 |
"full-pipeline",
|
|
@@ -1147,33 +933,6 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 1147 |
print(run_pipeline(options))
|
| 1148 |
|
| 1149 |
|
| 1150 |
-
def _run_refresh_dataset(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1151 |
-
del config_path
|
| 1152 |
-
from slop_farmer.app.dataset_refresh import run_dataset_refresh
|
| 1153 |
-
|
| 1154 |
-
result = run_dataset_refresh(
|
| 1155 |
-
DatasetRefreshOptions(
|
| 1156 |
-
repo=RepoRef.parse(args.repo),
|
| 1157 |
-
hf_repo_id=args.hf_repo_id,
|
| 1158 |
-
private_hf_repo=args.private_hf_repo,
|
| 1159 |
-
max_issues=args.max_issues,
|
| 1160 |
-
max_prs=args.max_prs,
|
| 1161 |
-
max_issue_comments=args.max_issue_comments,
|
| 1162 |
-
max_reviews_per_pr=args.max_reviews_per_pr,
|
| 1163 |
-
max_review_comments_per_pr=args.max_review_comments_per_pr,
|
| 1164 |
-
fetch_timeline=args.fetch_timeline,
|
| 1165 |
-
new_contributor_report=args.new_contributor_report,
|
| 1166 |
-
new_contributor_window_days=args.new_contributor_window_days,
|
| 1167 |
-
new_contributor_max_authors=args.new_contributor_max_authors,
|
| 1168 |
-
http_timeout=args.http_timeout,
|
| 1169 |
-
http_max_retries=args.http_max_retries,
|
| 1170 |
-
checkpoint_every_comments=args.checkpoint_every_comments,
|
| 1171 |
-
checkpoint_every_prs=args.checkpoint_every_prs,
|
| 1172 |
-
)
|
| 1173 |
-
)
|
| 1174 |
-
print(json.dumps(result, indent=2))
|
| 1175 |
-
|
| 1176 |
-
|
| 1177 |
def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1178 |
from slop_farmer.reports.analysis import run_analysis
|
| 1179 |
|
|
@@ -1282,18 +1041,12 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 1282 |
explain_pr_search_pair,
|
| 1283 |
format_pr_search_candidate_clusters,
|
| 1284 |
format_pr_search_cluster,
|
| 1285 |
-
format_pr_search_contributor,
|
| 1286 |
-
format_pr_search_contributor_pulls,
|
| 1287 |
format_pr_search_pair,
|
| 1288 |
format_pr_search_probe,
|
| 1289 |
-
format_pr_search_pull_contributor,
|
| 1290 |
format_pr_search_similar,
|
| 1291 |
format_pr_search_status,
|
| 1292 |
get_pr_search_candidate_clusters,
|
| 1293 |
get_pr_search_cluster,
|
| 1294 |
-
get_pr_search_contributor,
|
| 1295 |
-
get_pr_search_contributor_pulls,
|
| 1296 |
-
get_pr_search_pull_contributor,
|
| 1297 |
get_pr_search_similar,
|
| 1298 |
get_pr_search_status,
|
| 1299 |
probe_pr_search_github,
|
|
@@ -1387,36 +1140,6 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
|
|
| 1387 |
print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
|
| 1388 |
return
|
| 1389 |
|
| 1390 |
-
if args.pr_search_command == "contributor":
|
| 1391 |
-
result = get_pr_search_contributor(db_path, author_login=args.login, repo=args.repo)
|
| 1392 |
-
print(json.dumps(result, indent=2) if args.json else format_pr_search_contributor(result))
|
| 1393 |
-
return
|
| 1394 |
-
|
| 1395 |
-
if args.pr_search_command == "contributor-prs":
|
| 1396 |
-
result = get_pr_search_contributor_pulls(
|
| 1397 |
-
db_path,
|
| 1398 |
-
author_login=args.login,
|
| 1399 |
-
repo=args.repo,
|
| 1400 |
-
limit=args.limit,
|
| 1401 |
-
)
|
| 1402 |
-
print(
|
| 1403 |
-
json.dumps(result, indent=2)
|
| 1404 |
-
if args.json
|
| 1405 |
-
else format_pr_search_contributor_pulls(result)
|
| 1406 |
-
)
|
| 1407 |
-
return
|
| 1408 |
-
|
| 1409 |
-
if args.pr_search_command == "pr-contributor":
|
| 1410 |
-
result = get_pr_search_pull_contributor(
|
| 1411 |
-
db_path,
|
| 1412 |
-
pr_number=args.pr_number,
|
| 1413 |
-
repo=args.repo,
|
| 1414 |
-
)
|
| 1415 |
-
print(
|
| 1416 |
-
json.dumps(result, indent=2) if args.json else format_pr_search_pull_contributor(result)
|
| 1417 |
-
)
|
| 1418 |
-
return
|
| 1419 |
-
|
| 1420 |
raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
|
| 1421 |
|
| 1422 |
|
|
@@ -1458,7 +1181,6 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
|
|
| 1458 |
del config_path
|
| 1459 |
from slop_farmer.reports.new_contributor_report import run_new_contributor_report
|
| 1460 |
|
| 1461 |
-
hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
|
| 1462 |
print(
|
| 1463 |
run_new_contributor_report(
|
| 1464 |
NewContributorReportOptions(
|
|
@@ -1466,9 +1188,6 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
|
|
| 1466 |
output_dir=args.output_dir,
|
| 1467 |
output=args.output,
|
| 1468 |
json_output=args.json_output,
|
| 1469 |
-
hf_repo_id=hf_repo_id,
|
| 1470 |
-
hf_revision=hf_revision,
|
| 1471 |
-
hf_materialize_dir=hf_materialize_dir,
|
| 1472 |
window_days=args.window_days,
|
| 1473 |
max_authors=args.max_authors,
|
| 1474 |
)
|
|
@@ -1480,7 +1199,6 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
|
|
| 1480 |
from slop_farmer.reports.dashboard import run_dashboard_data
|
| 1481 |
|
| 1482 |
dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
|
| 1483 |
-
hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
|
| 1484 |
print(
|
| 1485 |
run_dashboard_data(
|
| 1486 |
DashboardDataOptions(
|
|
@@ -1489,9 +1207,6 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
|
|
| 1489 |
analysis_input=args.analysis_input,
|
| 1490 |
contributors_input=args.contributors_input,
|
| 1491 |
pr_scope_input=args.pr_scope_input,
|
| 1492 |
-
hf_repo_id=hf_repo_id,
|
| 1493 |
-
hf_revision=hf_revision,
|
| 1494 |
-
hf_materialize_dir=hf_materialize_dir,
|
| 1495 |
window_days=args.window_days,
|
| 1496 |
snapshot_root=(
|
| 1497 |
Path(dashboard_defaults["snapshot-root"])
|
|
@@ -1507,7 +1222,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
|
|
| 1507 |
del config_path
|
| 1508 |
from slop_farmer.app.deploy import run_deploy_dashboard
|
| 1509 |
|
| 1510 |
-
hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
|
| 1511 |
run_deploy_dashboard(
|
| 1512 |
DeployDashboardOptions(
|
| 1513 |
pipeline_data_dir=args.pipeline_data_dir,
|
|
@@ -1515,9 +1229,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
|
|
| 1515 |
snapshot_dir=args.snapshot_dir,
|
| 1516 |
analysis_input=args.analysis_input,
|
| 1517 |
contributors_input=args.contributors_input,
|
| 1518 |
-
hf_repo_id=hf_repo_id,
|
| 1519 |
-
hf_revision=hf_revision,
|
| 1520 |
-
hf_materialize_dir=hf_materialize_dir,
|
| 1521 |
refresh_contributors=args.refresh_contributors,
|
| 1522 |
dashboard_window_days=args.dashboard_window_days,
|
| 1523 |
contributor_window_days=args.contributor_window_days,
|
|
@@ -1536,22 +1247,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
|
|
| 1536 |
)
|
| 1537 |
|
| 1538 |
|
| 1539 |
-
def _run_dataset_status(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1540 |
-
del config_path
|
| 1541 |
-
from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status
|
| 1542 |
-
|
| 1543 |
-
result = get_dataset_status(
|
| 1544 |
-
DatasetStatusOptions(
|
| 1545 |
-
repo=args.repo,
|
| 1546 |
-
output_dir=args.output_dir,
|
| 1547 |
-
hf_repo_id=args.hf_repo_id,
|
| 1548 |
-
hf_revision=args.hf_revision,
|
| 1549 |
-
json_output=args.json,
|
| 1550 |
-
)
|
| 1551 |
-
)
|
| 1552 |
-
print(json.dumps(result, indent=2) if args.json else format_dataset_status(result))
|
| 1553 |
-
|
| 1554 |
-
|
| 1555 |
def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1556 |
del config_path
|
| 1557 |
from slop_farmer.app.publish import run_publish_snapshot
|
|
@@ -1601,7 +1296,6 @@ def main() -> None:
|
|
| 1601 |
|
| 1602 |
handlers: dict[str, CommandHandler] = {
|
| 1603 |
"scrape": _run_scrape,
|
| 1604 |
-
"refresh-dataset": _run_refresh_dataset,
|
| 1605 |
"analyze": _run_analyze,
|
| 1606 |
"markdown-report": _run_markdown_report,
|
| 1607 |
"duplicate-prs": _run_duplicate_prs,
|
|
@@ -1612,7 +1306,6 @@ def main() -> None:
|
|
| 1612 |
"new-contributor-report": _run_new_contributor_report,
|
| 1613 |
"dashboard-data": _run_dashboard_data,
|
| 1614 |
"deploy-dashboard": _run_deploy_dashboard,
|
| 1615 |
-
"dataset-status": _run_dataset_status,
|
| 1616 |
"publish-snapshot": _run_publish_snapshot,
|
| 1617 |
"full-pipeline": _run_full_pipeline,
|
| 1618 |
}
|
|
|
|
| 13 |
AnalysisOptions,
|
| 14 |
CheckpointImportOptions,
|
| 15 |
DashboardDataOptions,
|
|
|
|
|
|
|
| 16 |
DeployDashboardOptions,
|
| 17 |
FullPipelineOptions,
|
| 18 |
MarkdownReportOptions,
|
|
|
|
| 41 |
subparsers = parser.add_subparsers(dest="command", required=True)
|
| 42 |
|
| 43 |
_add_scrape_parser(subparsers, defaults["scrape"])
|
|
|
|
| 44 |
_add_analyze_parser(subparsers, defaults["analyze"])
|
| 45 |
_add_pr_scope_parser(subparsers, defaults["pr-scope"])
|
| 46 |
_add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
|
|
|
|
| 52 |
_add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
|
| 53 |
_add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
|
| 54 |
_add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
|
|
|
|
| 55 |
_add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
|
| 56 |
return parser
|
| 57 |
|
|
|
|
| 59 |
def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
|
| 60 |
commands = (
|
| 61 |
"scrape",
|
|
|
|
| 62 |
"analyze",
|
| 63 |
"import-hf-checkpoint",
|
| 64 |
"pr-scope",
|
|
|
|
| 68 |
"dashboard-data",
|
| 69 |
"publish-snapshot",
|
| 70 |
"deploy-dashboard",
|
|
|
|
| 71 |
"full-pipeline",
|
| 72 |
)
|
| 73 |
return {command: command_defaults(command, config_path=config_path) for command in commands}
|
|
|
|
| 184 |
)
|
| 185 |
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 188 |
analyze = subparsers.add_parser(
|
| 189 |
"analyze", help="Analyze a local snapshot and write a shortlist JSON report."
|
|
|
|
| 637 |
status.add_argument("--repo", help="Optional repo override.")
|
| 638 |
status.add_argument("--json", action="store_true", help="Emit JSON.")
|
| 639 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
|
| 641 |
def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 642 |
new_contributor = subparsers.add_parser(
|
|
|
|
| 659 |
new_contributor.add_argument(
|
| 660 |
"--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
|
| 661 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
new_contributor.add_argument(
|
| 663 |
"--window-days",
|
| 664 |
type=int,
|
|
|
|
| 702 |
type=Path,
|
| 703 |
help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
|
| 704 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
dashboard.add_argument(
|
| 706 |
"--window-days",
|
| 707 |
type=int,
|
|
|
|
| 761 |
deploy_dashboard.add_argument(
|
| 762 |
"--contributors-input", type=Path, help="Optional contributor report JSON override."
|
| 763 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
deploy_dashboard.add_argument(
|
| 765 |
"--refresh-contributors",
|
| 766 |
action="store_true",
|
|
|
|
| 817 |
)
|
| 818 |
|
| 819 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
|
| 821 |
full_pipeline = subparsers.add_parser(
|
| 822 |
"full-pipeline",
|
|
|
|
| 933 |
print(run_pipeline(options))
|
| 934 |
|
| 935 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 937 |
from slop_farmer.reports.analysis import run_analysis
|
| 938 |
|
|
|
|
| 1041 |
explain_pr_search_pair,
|
| 1042 |
format_pr_search_candidate_clusters,
|
| 1043 |
format_pr_search_cluster,
|
|
|
|
|
|
|
| 1044 |
format_pr_search_pair,
|
| 1045 |
format_pr_search_probe,
|
|
|
|
| 1046 |
format_pr_search_similar,
|
| 1047 |
format_pr_search_status,
|
| 1048 |
get_pr_search_candidate_clusters,
|
| 1049 |
get_pr_search_cluster,
|
|
|
|
|
|
|
|
|
|
| 1050 |
get_pr_search_similar,
|
| 1051 |
get_pr_search_status,
|
| 1052 |
probe_pr_search_github,
|
|
|
|
| 1140 |
print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
|
| 1141 |
return
|
| 1142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
|
| 1144 |
|
| 1145 |
|
|
|
|
| 1181 |
del config_path
|
| 1182 |
from slop_farmer.reports.new_contributor_report import run_new_contributor_report
|
| 1183 |
|
|
|
|
| 1184 |
print(
|
| 1185 |
run_new_contributor_report(
|
| 1186 |
NewContributorReportOptions(
|
|
|
|
| 1188 |
output_dir=args.output_dir,
|
| 1189 |
output=args.output,
|
| 1190 |
json_output=args.json_output,
|
|
|
|
|
|
|
|
|
|
| 1191 |
window_days=args.window_days,
|
| 1192 |
max_authors=args.max_authors,
|
| 1193 |
)
|
|
|
|
| 1199 |
from slop_farmer.reports.dashboard import run_dashboard_data
|
| 1200 |
|
| 1201 |
dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
|
|
|
|
| 1202 |
print(
|
| 1203 |
run_dashboard_data(
|
| 1204 |
DashboardDataOptions(
|
|
|
|
| 1207 |
analysis_input=args.analysis_input,
|
| 1208 |
contributors_input=args.contributors_input,
|
| 1209 |
pr_scope_input=args.pr_scope_input,
|
|
|
|
|
|
|
|
|
|
| 1210 |
window_days=args.window_days,
|
| 1211 |
snapshot_root=(
|
| 1212 |
Path(dashboard_defaults["snapshot-root"])
|
|
|
|
| 1222 |
del config_path
|
| 1223 |
from slop_farmer.app.deploy import run_deploy_dashboard
|
| 1224 |
|
|
|
|
| 1225 |
run_deploy_dashboard(
|
| 1226 |
DeployDashboardOptions(
|
| 1227 |
pipeline_data_dir=args.pipeline_data_dir,
|
|
|
|
| 1229 |
snapshot_dir=args.snapshot_dir,
|
| 1230 |
analysis_input=args.analysis_input,
|
| 1231 |
contributors_input=args.contributors_input,
|
|
|
|
|
|
|
|
|
|
| 1232 |
refresh_contributors=args.refresh_contributors,
|
| 1233 |
dashboard_window_days=args.dashboard_window_days,
|
| 1234 |
contributor_window_days=args.contributor_window_days,
|
|
|
|
| 1247 |
)
|
| 1248 |
|
| 1249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1250 |
def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
|
| 1251 |
del config_path
|
| 1252 |
from slop_farmer.app.publish import run_publish_snapshot
|
|
|
|
| 1296 |
|
| 1297 |
handlers: dict[str, CommandHandler] = {
|
| 1298 |
"scrape": _run_scrape,
|
|
|
|
| 1299 |
"analyze": _run_analyze,
|
| 1300 |
"markdown-report": _run_markdown_report,
|
| 1301 |
"duplicate-prs": _run_duplicate_prs,
|
|
|
|
| 1306 |
"new-contributor-report": _run_new_contributor_report,
|
| 1307 |
"dashboard-data": _run_dashboard_data,
|
| 1308 |
"deploy-dashboard": _run_deploy_dashboard,
|
|
|
|
| 1309 |
"publish-snapshot": _run_publish_snapshot,
|
| 1310 |
"full-pipeline": _run_full_pipeline,
|
| 1311 |
}
|
src/slop_farmer/app/deploy.py
CHANGED
|
@@ -5,7 +5,6 @@ import subprocess
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from slop_farmer.config import DeployDashboardOptions
|
| 8 |
-
from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
|
| 9 |
|
| 10 |
|
| 11 |
def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
@@ -18,16 +17,6 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
| 18 |
{
|
| 19 |
"PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
|
| 20 |
"WEB_DIR": str(options.web_dir),
|
| 21 |
-
"SNAPSHOT_DIR": str(
|
| 22 |
-
resolve_snapshot_source_dir(
|
| 23 |
-
snapshot_dir=options.snapshot_dir,
|
| 24 |
-
local_snapshots_root=options.pipeline_data_dir.resolve() / "snapshots",
|
| 25 |
-
hf_repo_id=options.hf_repo_id,
|
| 26 |
-
hf_revision=options.hf_revision,
|
| 27 |
-
hf_materialize_dir=options.hf_materialize_dir,
|
| 28 |
-
hf_output_dir=options.pipeline_data_dir,
|
| 29 |
-
)
|
| 30 |
-
),
|
| 31 |
"DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
|
| 32 |
"CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
|
| 33 |
"CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
|
|
@@ -39,6 +28,8 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
| 39 |
"SPACE_SHORT_DESCRIPTION": options.space_short_description,
|
| 40 |
}
|
| 41 |
)
|
|
|
|
|
|
|
| 42 |
if options.analysis_input is not None:
|
| 43 |
env["ANALYSIS_INPUT"] = str(options.analysis_input)
|
| 44 |
if options.contributors_input is not None:
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from slop_farmer.config import DeployDashboardOptions
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
|
|
|
|
| 17 |
{
|
| 18 |
"PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
|
| 19 |
"WEB_DIR": str(options.web_dir),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
|
| 21 |
"CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
|
| 22 |
"CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
|
|
|
|
| 28 |
"SPACE_SHORT_DESCRIPTION": options.space_short_description,
|
| 29 |
}
|
| 30 |
)
|
| 31 |
+
if options.snapshot_dir is not None:
|
| 32 |
+
env["SNAPSHOT_DIR"] = str(options.snapshot_dir)
|
| 33 |
if options.analysis_input is not None:
|
| 34 |
env["ANALYSIS_INPUT"] = str(options.analysis_input)
|
| 35 |
if options.contributors_input is not None:
|
src/slop_farmer/app/hf_checkpoint_import.py
CHANGED
|
@@ -28,7 +28,6 @@ from huggingface_hub import HfApi, hf_hub_download
|
|
| 28 |
|
| 29 |
from slop_farmer.app.publish import publish_snapshot
|
| 30 |
from slop_farmer.config import CheckpointImportOptions
|
| 31 |
-
from slop_farmer.data.dataset_card import build_hf_dataset_card
|
| 32 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 33 |
from slop_farmer.data.parquet_io import (
|
| 34 |
SCHEMAS,
|
|
@@ -456,15 +455,76 @@ def _viewer_comment_rows(
|
|
| 456 |
def _dataset_card(
|
| 457 |
repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
|
| 458 |
) -> str:
|
| 459 |
-
return
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
|
| 470 |
def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
|
|
|
|
| 28 |
|
| 29 |
from slop_farmer.app.publish import publish_snapshot
|
| 30 |
from slop_farmer.config import CheckpointImportOptions
|
|
|
|
| 31 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 32 |
from slop_farmer.data.parquet_io import (
|
| 33 |
SCHEMAS,
|
|
|
|
| 455 |
def _dataset_card(
|
| 456 |
repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
|
| 457 |
) -> str:
|
| 458 |
+
return f"""---
|
| 459 |
+
pretty_name: Transformers PR Slop Dataset
|
| 460 |
+
configs:
|
| 461 |
+
- config_name: issues
|
| 462 |
+
data_files:
|
| 463 |
+
- split: train
|
| 464 |
+
path: issues.parquet
|
| 465 |
+
default: true
|
| 466 |
+
- config_name: prs
|
| 467 |
+
data_files:
|
| 468 |
+
- split: train
|
| 469 |
+
path: pull_requests.parquet
|
| 470 |
+
- config_name: issue_comments
|
| 471 |
+
data_files:
|
| 472 |
+
- split: train
|
| 473 |
+
path: issue_comments.parquet
|
| 474 |
+
- config_name: pr_comments
|
| 475 |
+
data_files:
|
| 476 |
+
- split: train
|
| 477 |
+
path: pr_comments.parquet
|
| 478 |
+
- config_name: pr_reviews
|
| 479 |
+
data_files:
|
| 480 |
+
- split: train
|
| 481 |
+
path: reviews.parquet
|
| 482 |
+
- config_name: pr_files
|
| 483 |
+
data_files:
|
| 484 |
+
- split: train
|
| 485 |
+
path: pr_files.parquet
|
| 486 |
+
- config_name: pr_diffs
|
| 487 |
+
data_files:
|
| 488 |
+
- split: train
|
| 489 |
+
path: pr_diffs.parquet
|
| 490 |
+
- config_name: review_comments
|
| 491 |
+
data_files:
|
| 492 |
+
- split: train
|
| 493 |
+
path: review_comments.parquet
|
| 494 |
+
- config_name: links
|
| 495 |
+
data_files:
|
| 496 |
+
- split: train
|
| 497 |
+
path: links.parquet
|
| 498 |
+
- config_name: events
|
| 499 |
+
data_files:
|
| 500 |
+
- split: train
|
| 501 |
+
path: events.parquet
|
| 502 |
+
---
|
| 503 |
+
---
|
| 504 |
+
|
| 505 |
+
# Transformers PR Slop Dataset
|
| 506 |
+
|
| 507 |
+
Imported checkpoint snapshot for `{repo_slug}`.
|
| 508 |
+
|
| 509 |
+
Files:
|
| 510 |
+
- `issues.parquet`
|
| 511 |
+
- `pull_requests.parquet`
|
| 512 |
+
- `comments.parquet`
|
| 513 |
+
- `issue_comments.parquet`
|
| 514 |
+
- `pr_comments.parquet`
|
| 515 |
+
- `reviews.parquet`
|
| 516 |
+
- `pr_files.parquet`
|
| 517 |
+
- `pr_diffs.parquet`
|
| 518 |
+
- `review_comments.parquet`
|
| 519 |
+
- `links.parquet`
|
| 520 |
+
- `events.parquet`
|
| 521 |
+
|
| 522 |
+
Notes:
|
| 523 |
+
- source HF dataset: `{source_repo_id}`
|
| 524 |
+
- source checkpoint root: `{checkpoint_root}`
|
| 525 |
+
- latest imported checkpoint: `{snapshot_id}`
|
| 526 |
+
- links were regenerated locally from text references and timeline events
|
| 527 |
+
"""
|
| 528 |
|
| 529 |
|
| 530 |
def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
|
src/slop_farmer/app/pipeline.py
CHANGED
|
@@ -9,7 +9,6 @@ from typing import Any, Protocol
|
|
| 9 |
|
| 10 |
from slop_farmer.app.publish import publish_snapshot
|
| 11 |
from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
|
| 12 |
-
from slop_farmer.data.dataset_card import build_hf_dataset_card
|
| 13 |
from slop_farmer.data.github_api import GitHubClient
|
| 14 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 15 |
from slop_farmer.data.normalize import (
|
|
@@ -113,14 +112,96 @@ def _reference_time_for_age_caps(crawl_started_at: str) -> datetime:
|
|
| 113 |
def _dataset_card(
|
| 114 |
repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
|
| 115 |
) -> str:
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def _viewer_comment_rows(
|
|
@@ -964,9 +1045,6 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
|
|
| 964 |
output_dir=options.output_dir,
|
| 965 |
output=None,
|
| 966 |
json_output=None,
|
| 967 |
-
hf_repo_id=None,
|
| 968 |
-
hf_revision=None,
|
| 969 |
-
hf_materialize_dir=None,
|
| 970 |
window_days=options.new_contributor_window_days,
|
| 971 |
max_authors=options.new_contributor_max_authors,
|
| 972 |
)
|
|
|
|
| 9 |
|
| 10 |
from slop_farmer.app.publish import publish_snapshot
|
| 11 |
from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
|
|
|
|
| 12 |
from slop_farmer.data.github_api import GitHubClient
|
| 13 |
from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
|
| 14 |
from slop_farmer.data.normalize import (
|
|
|
|
| 112 |
def _dataset_card(
|
| 113 |
repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
|
| 114 |
) -> str:
|
| 115 |
+
new_contributor_config = ""
|
| 116 |
+
new_contributor_file = ""
|
| 117 |
+
if include_new_contributors:
|
| 118 |
+
new_contributor_config = """- config_name: new_contributors
|
| 119 |
+
data_files:
|
| 120 |
+
- split: train
|
| 121 |
+
path: new_contributors.parquet
|
| 122 |
+
"""
|
| 123 |
+
new_contributor_file = """- `new_contributors.parquet`
|
| 124 |
+
- `new-contributors-report.json`
|
| 125 |
+
- `new-contributors-report.md`
|
| 126 |
+
"""
|
| 127 |
+
return f"""---
|
| 128 |
+
pretty_name: Transformers PR Slop Dataset
|
| 129 |
+
configs:
|
| 130 |
+
- config_name: issues
|
| 131 |
+
data_files:
|
| 132 |
+
- split: train
|
| 133 |
+
path: issues.parquet
|
| 134 |
+
default: true
|
| 135 |
+
- config_name: prs
|
| 136 |
+
data_files:
|
| 137 |
+
- split: train
|
| 138 |
+
path: pull_requests.parquet
|
| 139 |
+
- config_name: issue_comments
|
| 140 |
+
data_files:
|
| 141 |
+
- split: train
|
| 142 |
+
path: issue_comments.parquet
|
| 143 |
+
- config_name: pr_comments
|
| 144 |
+
data_files:
|
| 145 |
+
- split: train
|
| 146 |
+
path: pr_comments.parquet
|
| 147 |
+
- config_name: pr_reviews
|
| 148 |
+
data_files:
|
| 149 |
+
- split: train
|
| 150 |
+
path: reviews.parquet
|
| 151 |
+
- config_name: pr_files
|
| 152 |
+
data_files:
|
| 153 |
+
- split: train
|
| 154 |
+
path: pr_files.parquet
|
| 155 |
+
- config_name: pr_diffs
|
| 156 |
+
data_files:
|
| 157 |
+
- split: train
|
| 158 |
+
path: pr_diffs.parquet
|
| 159 |
+
- config_name: review_comments
|
| 160 |
+
data_files:
|
| 161 |
+
- split: train
|
| 162 |
+
path: review_comments.parquet
|
| 163 |
+
- config_name: links
|
| 164 |
+
data_files:
|
| 165 |
+
- split: train
|
| 166 |
+
path: links.parquet
|
| 167 |
+
- config_name: events
|
| 168 |
+
data_files:
|
| 169 |
+
- split: train
|
| 170 |
+
path: events.parquet
|
| 171 |
+
{new_contributor_config}---
|
| 172 |
+
---
|
| 173 |
+
|
| 174 |
+
# Transformers PR Slop Dataset
|
| 175 |
+
|
| 176 |
+
Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo}`.
|
| 177 |
+
|
| 178 |
+
Files:
|
| 179 |
+
- `issues.parquet`
|
| 180 |
+
- `pull_requests.parquet`
|
| 181 |
+
- `comments.parquet`
|
| 182 |
+
- `issue_comments.parquet` (derived view of issue discussion comments)
|
| 183 |
+
- `pr_comments.parquet` (derived view of pull request discussion comments)
|
| 184 |
+
- `reviews.parquet`
|
| 185 |
+
- `pr_files.parquet`
|
| 186 |
+
- `pr_diffs.parquet`
|
| 187 |
+
- `review_comments.parquet`
|
| 188 |
+
- `links.parquet`
|
| 189 |
+
- `events.parquet`
|
| 190 |
+
{new_contributor_file}
|
| 191 |
+
|
| 192 |
+
Use:
|
| 193 |
+
- duplicate PR and issue analysis
|
| 194 |
+
- triage and ranking experiments
|
| 195 |
+
- eval set creation
|
| 196 |
+
|
| 197 |
+
Notes:
|
| 198 |
+
- updated daily
|
| 199 |
+
- latest snapshot: `{snapshot_id}`
|
| 200 |
+
- raw data only; no labels or moderation decisions
|
| 201 |
+
- PR metadata, file-level patch hunks, and full unified diffs are included
|
| 202 |
+
- new contributor reviewer artifacts are included when generated for the snapshot
|
| 203 |
+
- full file contents for changed files are not included
|
| 204 |
+
"""
|
| 205 |
|
| 206 |
|
| 207 |
def _viewer_comment_rows(
|
|
|
|
| 1045 |
output_dir=options.output_dir,
|
| 1046 |
output=None,
|
| 1047 |
json_output=None,
|
|
|
|
|
|
|
|
|
|
| 1048 |
window_days=options.new_contributor_window_days,
|
| 1049 |
max_authors=options.new_contributor_max_authors,
|
| 1050 |
)
|
src/slop_farmer/app/pr_search.py
CHANGED
|
@@ -10,12 +10,9 @@ get_pr_search_status = pr_search_service.get_pr_search_status
|
|
| 10 |
get_pr_search_similar = pr_search_service.get_pr_search_similar
|
| 11 |
get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
|
| 12 |
get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
|
| 13 |
-
get_pr_search_contributor = pr_search_service.get_pr_search_contributor
|
| 14 |
-
get_pr_search_contributor_pulls = pr_search_service.get_pr_search_contributor_pulls
|
| 15 |
get_pr_search_clusters = pr_search_service.get_pr_search_clusters
|
| 16 |
list_pr_search_clusters = pr_search_service.list_pr_search_clusters
|
| 17 |
get_pr_search_cluster = pr_search_service.get_pr_search_cluster
|
| 18 |
-
get_pr_search_pull_contributor = pr_search_service.get_pr_search_pull_contributor
|
| 19 |
explain_pr_search_pair = pr_search_service.explain_pr_search_pair
|
| 20 |
probe_pr_search_live = pr_search_service.probe_pr_search_live
|
| 21 |
probe_pr_search_github = pr_search_service.probe_pr_search_github
|
|
@@ -34,7 +31,6 @@ def format_pr_search_status(result: Mapping[str, Any]) -> str:
|
|
| 34 |
(
|
| 35 |
"Rows: "
|
| 36 |
f"documents={counts['documents']} "
|
| 37 |
-
f"contributors={counts.get('contributors', 0)} "
|
| 38 |
f"features={counts['features']} "
|
| 39 |
f"neighbors={counts['neighbors']} "
|
| 40 |
f"clusters={counts['clusters']} "
|
|
@@ -249,73 +245,3 @@ def format_pr_search_probe(result: Mapping[str, Any]) -> str:
|
|
| 249 |
if row.get("reason"):
|
| 250 |
lines.append(f" reason: {row['reason']}")
|
| 251 |
return "\n".join(lines)
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
def format_pr_search_contributor(result: Mapping[str, Any]) -> str:
|
| 255 |
-
contributor = result["contributor"]
|
| 256 |
-
lines = [
|
| 257 |
-
f"Contributor {contributor['author_login']}",
|
| 258 |
-
f"Repo: {result['repo']}",
|
| 259 |
-
f"Snapshot: {result['snapshot_id']}",
|
| 260 |
-
f"Name: {contributor.get('name') or '-'}",
|
| 261 |
-
f"Profile: {contributor.get('profile_url') or '-'}",
|
| 262 |
-
f"Association: {contributor.get('repo_association') or '-'}",
|
| 263 |
-
f"First seen in snapshot: {'yes' if contributor.get('first_seen_in_snapshot') else 'no'}",
|
| 264 |
-
(
|
| 265 |
-
"Scores: "
|
| 266 |
-
f"follow-through={contributor.get('follow_through_score') or '-'} "
|
| 267 |
-
f"breadth={contributor.get('breadth_score') or '-'} "
|
| 268 |
-
f"risk={contributor.get('automation_risk_signal') or '-'}"
|
| 269 |
-
),
|
| 270 |
-
f"Heuristic: {contributor.get('heuristic_note') or '-'}",
|
| 271 |
-
f"Public orgs: {', '.join(contributor.get('public_orgs') or []) or '-'}",
|
| 272 |
-
"",
|
| 273 |
-
"Recent indexed PRs:",
|
| 274 |
-
]
|
| 275 |
-
pulls = result.get("pulls") or []
|
| 276 |
-
if not pulls:
|
| 277 |
-
lines.append("- none")
|
| 278 |
-
return "\n".join(lines)
|
| 279 |
-
for row in pulls:
|
| 280 |
-
lines.append(
|
| 281 |
-
f"- PR #{row['pr_number']}: {row.get('title') or ''} "
|
| 282 |
-
f"[state={row.get('state') or '-'} merged={'yes' if row.get('merged') else 'no'}]"
|
| 283 |
-
)
|
| 284 |
-
return "\n".join(lines)
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
def format_pr_search_contributor_pulls(result: Mapping[str, Any]) -> str:
|
| 288 |
-
contributor = result["contributor"]
|
| 289 |
-
lines = [
|
| 290 |
-
f"Contributor PRs: {contributor['author_login']}",
|
| 291 |
-
f"Repo: {result['repo']}",
|
| 292 |
-
f"Snapshot: {result['snapshot_id']}",
|
| 293 |
-
f"Pull requests: {result.get('pull_count', len(result.get('pulls') or []))}",
|
| 294 |
-
"",
|
| 295 |
-
]
|
| 296 |
-
pulls = result.get("pulls") or []
|
| 297 |
-
if not pulls:
|
| 298 |
-
lines.append("No indexed PRs found for that contributor.")
|
| 299 |
-
return "\n".join(lines)
|
| 300 |
-
for row in pulls:
|
| 301 |
-
lines.append(
|
| 302 |
-
f"- PR #{row['pr_number']}: {row.get('title') or ''} "
|
| 303 |
-
f"(updated={row.get('updated_at') or '-'}, state={row.get('state') or '-'})"
|
| 304 |
-
)
|
| 305 |
-
return "\n".join(lines)
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
def format_pr_search_pull_contributor(result: Mapping[str, Any]) -> str:
|
| 309 |
-
pr = result["pr"]
|
| 310 |
-
contributor = result["contributor"]
|
| 311 |
-
return "\n".join(
|
| 312 |
-
[
|
| 313 |
-
f"PR #{pr['pr_number']}: {pr.get('title') or ''}",
|
| 314 |
-
f"Author: {contributor['author_login']}",
|
| 315 |
-
f"Risk: {contributor.get('automation_risk_signal') or '-'}",
|
| 316 |
-
f"Follow-through: {contributor.get('follow_through_score') or '-'}",
|
| 317 |
-
f"Breadth: {contributor.get('breadth_score') or '-'}",
|
| 318 |
-
f"Heuristic: {contributor.get('heuristic_note') or '-'}",
|
| 319 |
-
f"Profile: {contributor.get('profile_url') or '-'}",
|
| 320 |
-
]
|
| 321 |
-
)
|
|
|
|
| 10 |
get_pr_search_similar = pr_search_service.get_pr_search_similar
|
| 11 |
get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
|
| 12 |
get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
|
|
|
|
|
|
|
| 13 |
get_pr_search_clusters = pr_search_service.get_pr_search_clusters
|
| 14 |
list_pr_search_clusters = pr_search_service.list_pr_search_clusters
|
| 15 |
get_pr_search_cluster = pr_search_service.get_pr_search_cluster
|
|
|
|
| 16 |
explain_pr_search_pair = pr_search_service.explain_pr_search_pair
|
| 17 |
probe_pr_search_live = pr_search_service.probe_pr_search_live
|
| 18 |
probe_pr_search_github = pr_search_service.probe_pr_search_github
|
|
|
|
| 31 |
(
|
| 32 |
"Rows: "
|
| 33 |
f"documents={counts['documents']} "
|
|
|
|
| 34 |
f"features={counts['features']} "
|
| 35 |
f"neighbors={counts['neighbors']} "
|
| 36 |
f"clusters={counts['clusters']} "
|
|
|
|
| 245 |
if row.get("reason"):
|
| 246 |
lines.append(f" reason: {row['reason']}")
|
| 247 |
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/slop_farmer/app/pr_search_api.py
CHANGED
|
@@ -11,25 +11,30 @@ from fastapi.responses import JSONResponse
|
|
| 11 |
|
| 12 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 13 |
from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
|
| 14 |
-
from slop_farmer.
|
| 15 |
-
|
| 16 |
-
get_analysis_meta_bug,
|
| 17 |
-
get_analysis_status,
|
| 18 |
-
get_pr_analysis,
|
| 19 |
-
list_analysis_duplicate_prs,
|
| 20 |
-
list_analysis_meta_bugs,
|
| 21 |
-
)
|
| 22 |
from slop_farmer.reports.pr_search_service import (
|
| 23 |
get_pr_search_cluster,
|
| 24 |
get_pr_search_clusters,
|
| 25 |
-
get_pr_search_contributor,
|
| 26 |
-
get_pr_search_contributor_pulls,
|
| 27 |
-
get_pr_search_pull_contributor,
|
| 28 |
get_pr_search_similar_lookup,
|
| 29 |
get_pr_search_status,
|
| 30 |
list_pr_search_clusters,
|
| 31 |
run_pr_search_refresh,
|
| 32 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
@dataclass(slots=True)
|
|
@@ -37,7 +42,6 @@ class PrSearchApiSettings:
|
|
| 37 |
default_repo: str | None
|
| 38 |
index_path: Path
|
| 39 |
output_dir: Path
|
| 40 |
-
analysis_dir: Path | None = None
|
| 41 |
snapshot_dir: Path | None = None
|
| 42 |
hf_repo_id: str | None = None
|
| 43 |
hf_revision: str | None = None
|
|
@@ -55,6 +59,10 @@ class PrSearchApiSettings:
|
|
| 55 |
candidate_limit_max: int = 20
|
| 56 |
cluster_list_limit_default: int = 50
|
| 57 |
cluster_list_limit_max: int = 200
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
probe_limit_default: int = 10
|
| 59 |
probe_limit_max: int = 25
|
| 60 |
|
|
@@ -70,7 +78,6 @@ class PrSearchApiSettings:
|
|
| 70 |
default_repo=os.environ.get("DEFAULT_REPO"),
|
| 71 |
index_path=index_path,
|
| 72 |
output_dir=output_dir,
|
| 73 |
-
analysis_dir=_env_path("ANALYSIS_DIR") or (output_dir / "analysis"),
|
| 74 |
snapshot_dir=snapshot_dir,
|
| 75 |
hf_repo_id=os.environ.get("HF_REPO_ID"),
|
| 76 |
hf_revision=os.environ.get("HF_REVISION"),
|
|
@@ -88,6 +95,10 @@ class PrSearchApiSettings:
|
|
| 88 |
candidate_limit_max=_env_int("CANDIDATE_LIMIT_MAX", 20),
|
| 89 |
cluster_list_limit_default=_env_int("CLUSTER_LIST_LIMIT_DEFAULT", 50),
|
| 90 |
cluster_list_limit_max=_env_int("CLUSTER_LIST_LIMIT_MAX", 200),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
probe_limit_default=_env_int("PROBE_LIMIT_DEFAULT", 10),
|
| 92 |
probe_limit_max=_env_int("PROBE_LIMIT_MAX", 25),
|
| 93 |
)
|
|
@@ -102,13 +113,14 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 102 |
app.state.ready = False
|
| 103 |
app.state.startup_error = None
|
| 104 |
try:
|
|
|
|
| 105 |
_bootstrap_index(api_settings)
|
| 106 |
app.state.ready = _is_ready(api_settings)
|
| 107 |
except Exception as exc:
|
| 108 |
app.state.startup_error = str(exc)
|
| 109 |
yield
|
| 110 |
|
| 111 |
-
app = FastAPI(title="slop PR search API", version="0.1.
|
| 112 |
|
| 113 |
@app.exception_handler(ValueError)
|
| 114 |
async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
|
|
@@ -139,7 +151,9 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 139 |
async def repo_status(owner: str, repo: str, request: Request) -> dict[str, Any]:
|
| 140 |
settings = request.app.state.settings
|
| 141 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 142 |
-
|
|
|
|
|
|
|
| 143 |
|
| 144 |
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
|
| 145 |
async def pr_similar(
|
|
@@ -217,80 +231,89 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 217 |
),
|
| 218 |
)
|
| 219 |
|
| 220 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 221 |
-
async def
|
| 222 |
-
owner: str,
|
|
|
|
|
|
|
|
|
|
| 223 |
) -> dict[str, Any]:
|
| 224 |
settings = request.app.state.settings
|
| 225 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 226 |
-
return
|
| 227 |
|
| 228 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 229 |
-
async def
|
| 230 |
owner: str,
|
| 231 |
repo: str,
|
| 232 |
-
login: str,
|
| 233 |
request: Request,
|
| 234 |
limit: int | None = None,
|
|
|
|
| 235 |
) -> dict[str, Any]:
|
| 236 |
settings = request.app.state.settings
|
| 237 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 238 |
-
return
|
| 239 |
-
settings
|
| 240 |
-
repo=repo_slug,
|
| 241 |
-
author_login=login,
|
| 242 |
limit=_limit(
|
| 243 |
-
limit,
|
|
|
|
|
|
|
| 244 |
),
|
|
|
|
| 245 |
)
|
| 246 |
|
| 247 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 248 |
-
async def
|
| 249 |
owner: str,
|
| 250 |
repo: str,
|
| 251 |
-
|
| 252 |
request: Request,
|
|
|
|
| 253 |
) -> dict[str, Any]:
|
| 254 |
settings = request.app.state.settings
|
| 255 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 256 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 259 |
-
async def
|
| 260 |
owner: str,
|
| 261 |
repo: str,
|
|
|
|
| 262 |
request: Request,
|
| 263 |
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 264 |
) -> dict[str, Any]:
|
| 265 |
settings = request.app.state.settings
|
| 266 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 267 |
-
return
|
| 268 |
-
settings
|
| 269 |
-
|
| 270 |
variant=variant,
|
| 271 |
-
analysis_root=settings.analysis_dir,
|
| 272 |
)
|
| 273 |
|
| 274 |
-
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/
|
| 275 |
-
async def
|
| 276 |
owner: str,
|
| 277 |
repo: str,
|
| 278 |
number: int,
|
| 279 |
request: Request,
|
|
|
|
| 280 |
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 281 |
) -> dict[str, Any]:
|
| 282 |
settings = request.app.state.settings
|
| 283 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 284 |
-
return
|
| 285 |
-
settings
|
| 286 |
-
repo=repo_slug,
|
| 287 |
pr_number=number,
|
|
|
|
| 288 |
variant=variant,
|
| 289 |
-
analysis_root=settings.analysis_dir,
|
| 290 |
)
|
| 291 |
|
| 292 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 293 |
-
async def
|
| 294 |
owner: str,
|
| 295 |
repo: str,
|
| 296 |
request: Request,
|
|
@@ -299,73 +322,76 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 299 |
) -> dict[str, Any]:
|
| 300 |
settings = request.app.state.settings
|
| 301 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 302 |
-
return
|
| 303 |
-
settings
|
| 304 |
-
repo=repo_slug,
|
| 305 |
-
variant=variant,
|
| 306 |
-
analysis_root=settings.analysis_dir,
|
| 307 |
limit=_limit(
|
| 308 |
limit,
|
| 309 |
-
default=settings.
|
| 310 |
-
maximum=settings.
|
| 311 |
),
|
|
|
|
| 312 |
)
|
| 313 |
|
| 314 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 315 |
-
async def
|
| 316 |
owner: str,
|
| 317 |
repo: str,
|
| 318 |
-
cluster_id: str,
|
| 319 |
request: Request,
|
| 320 |
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 321 |
) -> dict[str, Any]:
|
| 322 |
settings = request.app.state.settings
|
| 323 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 324 |
-
return
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 333 |
-
async def
|
| 334 |
owner: str,
|
| 335 |
repo: str,
|
| 336 |
request: Request,
|
| 337 |
limit: int | None = None,
|
| 338 |
-
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 339 |
) -> dict[str, Any]:
|
| 340 |
settings = request.app.state.settings
|
| 341 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 342 |
-
return
|
| 343 |
-
settings
|
| 344 |
-
repo=repo_slug,
|
| 345 |
-
variant=variant,
|
| 346 |
-
analysis_root=settings.analysis_dir,
|
| 347 |
limit=_limit(
|
| 348 |
limit,
|
| 349 |
-
default=settings.
|
| 350 |
-
maximum=settings.
|
| 351 |
),
|
| 352 |
)
|
| 353 |
|
| 354 |
-
@app.get("/v1/repos/{owner}/{repo}/
|
| 355 |
-
async def
|
| 356 |
owner: str,
|
| 357 |
repo: str,
|
|
|
|
| 358 |
request: Request,
|
| 359 |
-
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 360 |
) -> dict[str, Any]:
|
| 361 |
settings = request.app.state.settings
|
| 362 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 363 |
-
return
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
return app
|
| 371 |
|
|
@@ -391,6 +417,21 @@ def _bootstrap_index(settings: PrSearchApiSettings) -> None:
|
|
| 391 |
)
|
| 392 |
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
def _needs_refresh(settings: PrSearchApiSettings) -> bool:
|
| 395 |
if settings.rebuild_on_start:
|
| 396 |
return True
|
|
@@ -429,6 +470,17 @@ def _repo_slug(settings: PrSearchApiSettings, owner: str, repo: str) -> str:
|
|
| 429 |
return repo_slug
|
| 430 |
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
def _limit(value: int | None, *, default: int, maximum: int) -> int:
|
| 433 |
limit = default if value is None else value
|
| 434 |
if limit < 1:
|
|
@@ -452,8 +504,6 @@ def _looks_not_found(exc: ValueError) -> bool:
|
|
| 452 |
message = str(exc).lower()
|
| 453 |
return (
|
| 454 |
"not found" in message
|
| 455 |
-
or "analysis report was not found" in message
|
| 456 |
-
or "no analysis report was found" in message
|
| 457 |
or "no active pr search run" in message
|
| 458 |
or "was not found in the active indexed universe" in message
|
| 459 |
)
|
|
|
|
| 11 |
|
| 12 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 13 |
from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
|
| 14 |
+
from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
|
| 15 |
+
from slop_farmer.data.snapshot_paths import default_hf_materialize_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from slop_farmer.reports.pr_search_service import (
|
| 17 |
get_pr_search_cluster,
|
| 18 |
get_pr_search_clusters,
|
|
|
|
|
|
|
|
|
|
| 19 |
get_pr_search_similar_lookup,
|
| 20 |
get_pr_search_status,
|
| 21 |
list_pr_search_clusters,
|
| 22 |
run_pr_search_refresh,
|
| 23 |
)
|
| 24 |
+
from slop_farmer.reports.read_views import (
|
| 25 |
+
check_issue_cluster_membership,
|
| 26 |
+
get_contributor,
|
| 27 |
+
get_contributor_risk,
|
| 28 |
+
get_contributor_status,
|
| 29 |
+
get_issue_best,
|
| 30 |
+
get_issue_cluster,
|
| 31 |
+
get_issue_cluster_status,
|
| 32 |
+
get_issue_clusters_for_pr,
|
| 33 |
+
get_snapshot_surfaces,
|
| 34 |
+
list_contributors,
|
| 35 |
+
list_issue_clusters,
|
| 36 |
+
list_issue_duplicate_prs,
|
| 37 |
+
)
|
| 38 |
|
| 39 |
|
| 40 |
@dataclass(slots=True)
|
|
|
|
| 42 |
default_repo: str | None
|
| 43 |
index_path: Path
|
| 44 |
output_dir: Path
|
|
|
|
| 45 |
snapshot_dir: Path | None = None
|
| 46 |
hf_repo_id: str | None = None
|
| 47 |
hf_revision: str | None = None
|
|
|
|
| 59 |
candidate_limit_max: int = 20
|
| 60 |
cluster_list_limit_default: int = 50
|
| 61 |
cluster_list_limit_max: int = 200
|
| 62 |
+
issue_list_limit_default: int = 50
|
| 63 |
+
issue_list_limit_max: int = 200
|
| 64 |
+
contributor_list_limit_default: int = 50
|
| 65 |
+
contributor_list_limit_max: int = 200
|
| 66 |
probe_limit_default: int = 10
|
| 67 |
probe_limit_max: int = 25
|
| 68 |
|
|
|
|
| 78 |
default_repo=os.environ.get("DEFAULT_REPO"),
|
| 79 |
index_path=index_path,
|
| 80 |
output_dir=output_dir,
|
|
|
|
| 81 |
snapshot_dir=snapshot_dir,
|
| 82 |
hf_repo_id=os.environ.get("HF_REPO_ID"),
|
| 83 |
hf_revision=os.environ.get("HF_REVISION"),
|
|
|
|
| 95 |
candidate_limit_max=_env_int("CANDIDATE_LIMIT_MAX", 20),
|
| 96 |
cluster_list_limit_default=_env_int("CLUSTER_LIST_LIMIT_DEFAULT", 50),
|
| 97 |
cluster_list_limit_max=_env_int("CLUSTER_LIST_LIMIT_MAX", 200),
|
| 98 |
+
issue_list_limit_default=_env_int("ISSUE_LIST_LIMIT_DEFAULT", 50),
|
| 99 |
+
issue_list_limit_max=_env_int("ISSUE_LIST_LIMIT_MAX", 200),
|
| 100 |
+
contributor_list_limit_default=_env_int("CONTRIBUTOR_LIST_LIMIT_DEFAULT", 50),
|
| 101 |
+
contributor_list_limit_max=_env_int("CONTRIBUTOR_LIST_LIMIT_MAX", 200),
|
| 102 |
probe_limit_default=_env_int("PROBE_LIMIT_DEFAULT", 10),
|
| 103 |
probe_limit_max=_env_int("PROBE_LIMIT_MAX", 25),
|
| 104 |
)
|
|
|
|
| 113 |
app.state.ready = False
|
| 114 |
app.state.startup_error = None
|
| 115 |
try:
|
| 116 |
+
_bootstrap_snapshot_assets(api_settings)
|
| 117 |
_bootstrap_index(api_settings)
|
| 118 |
app.state.ready = _is_ready(api_settings)
|
| 119 |
except Exception as exc:
|
| 120 |
app.state.startup_error = str(exc)
|
| 121 |
yield
|
| 122 |
|
| 123 |
+
app = FastAPI(title="slop PR search API", version="0.1.0", lifespan=lifespan)
|
| 124 |
|
| 125 |
@app.exception_handler(ValueError)
|
| 126 |
async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
|
|
|
|
| 151 |
async def repo_status(owner: str, repo: str, request: Request) -> dict[str, Any]:
|
| 152 |
settings = request.app.state.settings
|
| 153 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 154 |
+
status = get_pr_search_status(settings.index_path, repo=repo_slug)
|
| 155 |
+
snapshot_dir = _status_snapshot_dir(status)
|
| 156 |
+
return {**status, "surfaces": get_snapshot_surfaces(snapshot_dir)}
|
| 157 |
|
| 158 |
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
|
| 159 |
async def pr_similar(
|
|
|
|
| 231 |
),
|
| 232 |
)
|
| 233 |
|
| 234 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/status")
|
| 235 |
+
async def issue_status(
|
| 236 |
+
owner: str,
|
| 237 |
+
repo: str,
|
| 238 |
+
request: Request,
|
| 239 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 240 |
) -> dict[str, Any]:
|
| 241 |
settings = request.app.state.settings
|
| 242 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 243 |
+
return get_issue_cluster_status(_active_snapshot_dir(settings, repo_slug), variant=variant)
|
| 244 |
|
| 245 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/clusters")
|
| 246 |
+
async def issue_clusters(
|
| 247 |
owner: str,
|
| 248 |
repo: str,
|
|
|
|
| 249 |
request: Request,
|
| 250 |
limit: int | None = None,
|
| 251 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 252 |
) -> dict[str, Any]:
|
| 253 |
settings = request.app.state.settings
|
| 254 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 255 |
+
return list_issue_clusters(
|
| 256 |
+
_active_snapshot_dir(settings, repo_slug),
|
|
|
|
|
|
|
| 257 |
limit=_limit(
|
| 258 |
+
limit,
|
| 259 |
+
default=settings.issue_list_limit_default,
|
| 260 |
+
maximum=settings.issue_list_limit_max,
|
| 261 |
),
|
| 262 |
+
variant=variant,
|
| 263 |
)
|
| 264 |
|
| 265 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/clusters/{cluster_id}")
|
| 266 |
+
async def issue_cluster(
|
| 267 |
owner: str,
|
| 268 |
repo: str,
|
| 269 |
+
cluster_id: str,
|
| 270 |
request: Request,
|
| 271 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 272 |
) -> dict[str, Any]:
|
| 273 |
settings = request.app.state.settings
|
| 274 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 275 |
+
return get_issue_cluster(
|
| 276 |
+
_active_snapshot_dir(settings, repo_slug),
|
| 277 |
+
cluster_id=cluster_id,
|
| 278 |
+
variant=variant,
|
| 279 |
+
)
|
| 280 |
|
| 281 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/pulls/{number}")
|
| 282 |
+
async def issue_clusters_for_pr(
|
| 283 |
owner: str,
|
| 284 |
repo: str,
|
| 285 |
+
number: int,
|
| 286 |
request: Request,
|
| 287 |
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 288 |
) -> dict[str, Any]:
|
| 289 |
settings = request.app.state.settings
|
| 290 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 291 |
+
return get_issue_clusters_for_pr(
|
| 292 |
+
_active_snapshot_dir(settings, repo_slug),
|
| 293 |
+
pr_number=number,
|
| 294 |
variant=variant,
|
|
|
|
| 295 |
)
|
| 296 |
|
| 297 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/pulls/{number}/membership")
|
| 298 |
+
async def issue_membership_for_pr(
|
| 299 |
owner: str,
|
| 300 |
repo: str,
|
| 301 |
number: int,
|
| 302 |
request: Request,
|
| 303 |
+
cluster_id: str | None = None,
|
| 304 |
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 305 |
) -> dict[str, Any]:
|
| 306 |
settings = request.app.state.settings
|
| 307 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 308 |
+
return check_issue_cluster_membership(
|
| 309 |
+
_active_snapshot_dir(settings, repo_slug),
|
|
|
|
| 310 |
pr_number=number,
|
| 311 |
+
cluster_id=cluster_id,
|
| 312 |
variant=variant,
|
|
|
|
| 313 |
)
|
| 314 |
|
| 315 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/duplicate-prs")
|
| 316 |
+
async def issue_duplicate_prs(
|
| 317 |
owner: str,
|
| 318 |
repo: str,
|
| 319 |
request: Request,
|
|
|
|
| 322 |
) -> dict[str, Any]:
|
| 323 |
settings = request.app.state.settings
|
| 324 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 325 |
+
return list_issue_duplicate_prs(
|
| 326 |
+
_active_snapshot_dir(settings, repo_slug),
|
|
|
|
|
|
|
|
|
|
| 327 |
limit=_limit(
|
| 328 |
limit,
|
| 329 |
+
default=settings.issue_list_limit_default,
|
| 330 |
+
maximum=settings.issue_list_limit_max,
|
| 331 |
),
|
| 332 |
+
variant=variant,
|
| 333 |
)
|
| 334 |
|
| 335 |
+
@app.get("/v1/repos/{owner}/{repo}/issues/best")
|
| 336 |
+
async def issue_best(
|
| 337 |
owner: str,
|
| 338 |
repo: str,
|
|
|
|
| 339 |
request: Request,
|
| 340 |
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 341 |
) -> dict[str, Any]:
|
| 342 |
settings = request.app.state.settings
|
| 343 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 344 |
+
return get_issue_best(_active_snapshot_dir(settings, repo_slug), variant=variant)
|
| 345 |
+
|
| 346 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors/status")
|
| 347 |
+
async def contributor_status(
|
| 348 |
+
owner: str,
|
| 349 |
+
repo: str,
|
| 350 |
+
request: Request,
|
| 351 |
+
) -> dict[str, Any]:
|
| 352 |
+
settings = request.app.state.settings
|
| 353 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 354 |
+
return get_contributor_status(_active_snapshot_dir(settings, repo_slug))
|
| 355 |
|
| 356 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors")
|
| 357 |
+
async def contributors(
|
| 358 |
owner: str,
|
| 359 |
repo: str,
|
| 360 |
request: Request,
|
| 361 |
limit: int | None = None,
|
|
|
|
| 362 |
) -> dict[str, Any]:
|
| 363 |
settings = request.app.state.settings
|
| 364 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 365 |
+
return list_contributors(
|
| 366 |
+
_active_snapshot_dir(settings, repo_slug),
|
|
|
|
|
|
|
|
|
|
| 367 |
limit=_limit(
|
| 368 |
limit,
|
| 369 |
+
default=settings.contributor_list_limit_default,
|
| 370 |
+
maximum=settings.contributor_list_limit_max,
|
| 371 |
),
|
| 372 |
)
|
| 373 |
|
| 374 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors/{login}")
|
| 375 |
+
async def contributor(
|
| 376 |
owner: str,
|
| 377 |
repo: str,
|
| 378 |
+
login: str,
|
| 379 |
request: Request,
|
|
|
|
| 380 |
) -> dict[str, Any]:
|
| 381 |
settings = request.app.state.settings
|
| 382 |
repo_slug = _repo_slug(settings, owner, repo)
|
| 383 |
+
return get_contributor(_active_snapshot_dir(settings, repo_slug), author_login=login)
|
| 384 |
+
|
| 385 |
+
@app.get("/v1/repos/{owner}/{repo}/contributors/{login}/risk")
|
| 386 |
+
async def contributor_risk(
|
| 387 |
+
owner: str,
|
| 388 |
+
repo: str,
|
| 389 |
+
login: str,
|
| 390 |
+
request: Request,
|
| 391 |
+
) -> dict[str, Any]:
|
| 392 |
+
settings = request.app.state.settings
|
| 393 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 394 |
+
return get_contributor_risk(_active_snapshot_dir(settings, repo_slug), author_login=login)
|
| 395 |
|
| 396 |
return app
|
| 397 |
|
|
|
|
| 417 |
)
|
| 418 |
|
| 419 |
|
| 420 |
+
def _bootstrap_snapshot_assets(settings: PrSearchApiSettings) -> None:
|
| 421 |
+
if settings.snapshot_dir is not None or settings.hf_repo_id is None:
|
| 422 |
+
return
|
| 423 |
+
materialize_dir = settings.hf_materialize_dir or default_hf_materialize_dir(
|
| 424 |
+
settings.output_dir,
|
| 425 |
+
settings.hf_repo_id,
|
| 426 |
+
settings.hf_revision,
|
| 427 |
+
)
|
| 428 |
+
materialize_hf_dataset_snapshot(
|
| 429 |
+
repo_id=settings.hf_repo_id,
|
| 430 |
+
local_dir=materialize_dir,
|
| 431 |
+
revision=settings.hf_revision,
|
| 432 |
+
)
|
| 433 |
+
|
| 434 |
+
|
| 435 |
def _needs_refresh(settings: PrSearchApiSettings) -> bool:
|
| 436 |
if settings.rebuild_on_start:
|
| 437 |
return True
|
|
|
|
| 470 |
return repo_slug
|
| 471 |
|
| 472 |
|
| 473 |
+
def _active_snapshot_dir(settings: PrSearchApiSettings, repo_slug: str) -> Path:
|
| 474 |
+
return _status_snapshot_dir(get_pr_search_status(settings.index_path, repo=repo_slug))
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
def _status_snapshot_dir(status: dict[str, Any]) -> Path:
|
| 478 |
+
snapshot_dir = status.get("snapshot_dir")
|
| 479 |
+
if not snapshot_dir:
|
| 480 |
+
raise HTTPException(status_code=503, detail="active snapshot directory is unavailable")
|
| 481 |
+
return Path(str(snapshot_dir))
|
| 482 |
+
|
| 483 |
+
|
| 484 |
def _limit(value: int | None, *, default: int, maximum: int) -> int:
|
| 485 |
limit = default if value is None else value
|
| 486 |
if limit < 1:
|
|
|
|
| 504 |
message = str(exc).lower()
|
| 505 |
return (
|
| 506 |
"not found" in message
|
|
|
|
|
|
|
| 507 |
or "no active pr search run" in message
|
| 508 |
or "was not found in the active indexed universe" in message
|
| 509 |
)
|
src/slop_farmer/app/workflow.py
CHANGED
|
@@ -74,9 +74,6 @@ def run_full_pipeline(options: FullPipelineOptions) -> str:
|
|
| 74 |
analysis_input=analysis_path,
|
| 75 |
contributors_input=snapshot_dir / "new-contributors-report.json",
|
| 76 |
pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
|
| 77 |
-
hf_repo_id=None,
|
| 78 |
-
hf_revision=None,
|
| 79 |
-
hf_materialize_dir=None,
|
| 80 |
window_days=options.dashboard_window_days,
|
| 81 |
)
|
| 82 |
)
|
|
|
|
| 74 |
analysis_input=analysis_path,
|
| 75 |
contributors_input=snapshot_dir / "new-contributors-report.json",
|
| 76 |
pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
|
|
|
|
|
|
|
|
|
|
| 77 |
window_days=options.dashboard_window_days,
|
| 78 |
)
|
| 79 |
)
|
src/slop_farmer/app_config.py
CHANGED
|
@@ -184,18 +184,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 184 |
"new-contributor-window-days": contributor_window_days,
|
| 185 |
"new-contributor-max-authors": contributor_max_authors,
|
| 186 |
},
|
| 187 |
-
"refresh-dataset": {
|
| 188 |
-
"repo": repo,
|
| 189 |
-
"hf-repo-id": dataset_id,
|
| 190 |
-
"fetch-timeline": scrape.get("fetch-timeline"),
|
| 191 |
-
"max-issues": scrape.get("max-issues"),
|
| 192 |
-
"max-prs": scrape.get("max-prs"),
|
| 193 |
-
"max-issue-comments": scrape.get("max-issue-comments"),
|
| 194 |
-
"max-reviews-per-pr": scrape.get("max-reviews-per-pr"),
|
| 195 |
-
"max-review-comments-per-pr": scrape.get("max-review-comments-per-pr"),
|
| 196 |
-
"new-contributor-window-days": contributor_window_days,
|
| 197 |
-
"new-contributor-max-authors": contributor_max_authors,
|
| 198 |
-
},
|
| 199 |
"analyze": {
|
| 200 |
"output-dir": str(data_dir) if data_dir else None,
|
| 201 |
"hf-repo-id": analysis.get("hf-repo-id", dataset_id),
|
|
@@ -213,7 +201,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 213 |
},
|
| 214 |
"pr-scope": {
|
| 215 |
"output-dir": str(data_dir) if data_dir else None,
|
| 216 |
-
"hf-repo-id": dataset_id,
|
| 217 |
"cluster-suppression-rules": cluster_suppression_rules,
|
| 218 |
},
|
| 219 |
"pr-search": {
|
|
@@ -223,14 +210,12 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 223 |
},
|
| 224 |
"new-contributor-report": {
|
| 225 |
"output-dir": str(data_dir) if data_dir else None,
|
| 226 |
-
"hf-repo-id": dataset_id,
|
| 227 |
"window-days": contributor_window_days,
|
| 228 |
"max-authors": contributor_max_authors,
|
| 229 |
},
|
| 230 |
"dashboard-data": {
|
| 231 |
"output-dir": str(dashboard_dir) if dashboard_dir else None,
|
| 232 |
"snapshot-root": str(data_dir / "snapshots") if data_dir else None,
|
| 233 |
-
"hf-repo-id": dataset_id,
|
| 234 |
"window-days": dashboard_window_days,
|
| 235 |
},
|
| 236 |
"publish-snapshot": {
|
|
@@ -251,7 +236,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 251 |
"deploy-dashboard": {
|
| 252 |
"pipeline-data-dir": str(data_dir) if data_dir else None,
|
| 253 |
"web-dir": str(web_dir) if web_dir else None,
|
| 254 |
-
"hf-repo-id": dataset_id,
|
| 255 |
"dashboard-window-days": dashboard_window_days,
|
| 256 |
"contributor-window-days": contributor_window_days,
|
| 257 |
"contributor-max-authors": contributor_max_authors,
|
|
@@ -264,11 +248,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 264 |
"dataset-id": dataset_id,
|
| 265 |
"space-tags": tags_value,
|
| 266 |
},
|
| 267 |
-
"dataset-status": {
|
| 268 |
-
"repo": repo,
|
| 269 |
-
"output-dir": str(data_dir) if data_dir else None,
|
| 270 |
-
"hf-repo-id": dataset_id,
|
| 271 |
-
},
|
| 272 |
}
|
| 273 |
for command, values in defaults.items():
|
| 274 |
defaults[command] = {key: value for key, value in values.items() if value is not None}
|
|
@@ -280,7 +259,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
|
|
| 280 |
defaults[command].update(_resolve_command_paths(config_path, values))
|
| 281 |
|
| 282 |
defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
|
| 283 |
-
defaults["refresh-dataset"].update(_resolve_command_paths(config_path, scrape))
|
| 284 |
defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
|
| 285 |
defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
|
| 286 |
return defaults
|
|
|
|
| 184 |
"new-contributor-window-days": contributor_window_days,
|
| 185 |
"new-contributor-max-authors": contributor_max_authors,
|
| 186 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
"analyze": {
|
| 188 |
"output-dir": str(data_dir) if data_dir else None,
|
| 189 |
"hf-repo-id": analysis.get("hf-repo-id", dataset_id),
|
|
|
|
| 201 |
},
|
| 202 |
"pr-scope": {
|
| 203 |
"output-dir": str(data_dir) if data_dir else None,
|
|
|
|
| 204 |
"cluster-suppression-rules": cluster_suppression_rules,
|
| 205 |
},
|
| 206 |
"pr-search": {
|
|
|
|
| 210 |
},
|
| 211 |
"new-contributor-report": {
|
| 212 |
"output-dir": str(data_dir) if data_dir else None,
|
|
|
|
| 213 |
"window-days": contributor_window_days,
|
| 214 |
"max-authors": contributor_max_authors,
|
| 215 |
},
|
| 216 |
"dashboard-data": {
|
| 217 |
"output-dir": str(dashboard_dir) if dashboard_dir else None,
|
| 218 |
"snapshot-root": str(data_dir / "snapshots") if data_dir else None,
|
|
|
|
| 219 |
"window-days": dashboard_window_days,
|
| 220 |
},
|
| 221 |
"publish-snapshot": {
|
|
|
|
| 236 |
"deploy-dashboard": {
|
| 237 |
"pipeline-data-dir": str(data_dir) if data_dir else None,
|
| 238 |
"web-dir": str(web_dir) if web_dir else None,
|
|
|
|
| 239 |
"dashboard-window-days": dashboard_window_days,
|
| 240 |
"contributor-window-days": contributor_window_days,
|
| 241 |
"contributor-max-authors": contributor_max_authors,
|
|
|
|
| 248 |
"dataset-id": dataset_id,
|
| 249 |
"space-tags": tags_value,
|
| 250 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
}
|
| 252 |
for command, values in defaults.items():
|
| 253 |
defaults[command] = {key: value for key, value in values.items() if value is not None}
|
|
|
|
| 259 |
defaults[command].update(_resolve_command_paths(config_path, values))
|
| 260 |
|
| 261 |
defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
|
|
|
|
| 262 |
defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
|
| 263 |
defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
|
| 264 |
return defaults
|
src/slop_farmer/config.py
CHANGED
|
@@ -127,9 +127,6 @@ class NewContributorReportOptions:
|
|
| 127 |
json_output: Path | None
|
| 128 |
window_days: int
|
| 129 |
max_authors: int
|
| 130 |
-
hf_repo_id: str | None = None
|
| 131 |
-
hf_revision: str | None = None
|
| 132 |
-
hf_materialize_dir: Path | None = None
|
| 133 |
|
| 134 |
|
| 135 |
@dataclass(slots=True)
|
|
@@ -140,9 +137,6 @@ class DashboardDataOptions:
|
|
| 140 |
contributors_input: Path | None
|
| 141 |
pr_scope_input: Path | None
|
| 142 |
window_days: int
|
| 143 |
-
hf_repo_id: str | None = None
|
| 144 |
-
hf_revision: str | None = None
|
| 145 |
-
hf_materialize_dir: Path | None = None
|
| 146 |
snapshot_root: Path | None = None
|
| 147 |
|
| 148 |
|
|
@@ -161,9 +155,6 @@ class DeployDashboardOptions:
|
|
| 161 |
snapshot_dir: Path | None
|
| 162 |
analysis_input: Path | None
|
| 163 |
contributors_input: Path | None
|
| 164 |
-
hf_repo_id: str | None
|
| 165 |
-
hf_revision: str | None
|
| 166 |
-
hf_materialize_dir: Path | None
|
| 167 |
refresh_contributors: bool
|
| 168 |
dashboard_window_days: int
|
| 169 |
contributor_window_days: int
|
|
@@ -242,32 +233,3 @@ class FullPipelineOptions:
|
|
| 242 |
max_issues: int | None
|
| 243 |
max_prs: int | None
|
| 244 |
open_prs_only: bool = False
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
@dataclass(slots=True)
|
| 248 |
-
class DatasetRefreshOptions:
|
| 249 |
-
repo: RepoRef
|
| 250 |
-
hf_repo_id: str
|
| 251 |
-
private_hf_repo: bool
|
| 252 |
-
max_issues: int | None
|
| 253 |
-
max_prs: int | None
|
| 254 |
-
max_issue_comments: int | None
|
| 255 |
-
max_reviews_per_pr: int | None
|
| 256 |
-
max_review_comments_per_pr: int | None
|
| 257 |
-
fetch_timeline: bool
|
| 258 |
-
new_contributor_report: bool
|
| 259 |
-
new_contributor_window_days: int
|
| 260 |
-
new_contributor_max_authors: int
|
| 261 |
-
http_timeout: int
|
| 262 |
-
http_max_retries: int
|
| 263 |
-
checkpoint_every_comments: int
|
| 264 |
-
checkpoint_every_prs: int
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
@dataclass(slots=True)
|
| 268 |
-
class DatasetStatusOptions:
|
| 269 |
-
output_dir: Path
|
| 270 |
-
hf_repo_id: str | None
|
| 271 |
-
hf_revision: str | None
|
| 272 |
-
repo: str | None = None
|
| 273 |
-
json_output: bool = False
|
|
|
|
| 127 |
json_output: Path | None
|
| 128 |
window_days: int
|
| 129 |
max_authors: int
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
|
| 132 |
@dataclass(slots=True)
|
|
|
|
| 137 |
contributors_input: Path | None
|
| 138 |
pr_scope_input: Path | None
|
| 139 |
window_days: int
|
|
|
|
|
|
|
|
|
|
| 140 |
snapshot_root: Path | None = None
|
| 141 |
|
| 142 |
|
|
|
|
| 155 |
snapshot_dir: Path | None
|
| 156 |
analysis_input: Path | None
|
| 157 |
contributors_input: Path | None
|
|
|
|
|
|
|
|
|
|
| 158 |
refresh_contributors: bool
|
| 159 |
dashboard_window_days: int
|
| 160 |
contributor_window_days: int
|
|
|
|
| 233 |
max_issues: int | None
|
| 234 |
max_prs: int | None
|
| 235 |
open_prs_only: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/slop_farmer/data/search_duckdb.py
CHANGED
|
@@ -31,7 +31,6 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
|
|
| 31 |
"repo",
|
| 32 |
"pr_number",
|
| 33 |
"github_id",
|
| 34 |
-
"author_login",
|
| 35 |
"state",
|
| 36 |
"draft",
|
| 37 |
"merged",
|
|
@@ -47,48 +46,6 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
|
|
| 47 |
"review_comments_count",
|
| 48 |
"html_url",
|
| 49 |
),
|
| 50 |
-
"pr_search_contributors": (
|
| 51 |
-
"run_id",
|
| 52 |
-
"repo",
|
| 53 |
-
"snapshot_id",
|
| 54 |
-
"report_generated_at",
|
| 55 |
-
"window_days",
|
| 56 |
-
"author_login",
|
| 57 |
-
"name",
|
| 58 |
-
"profile_url",
|
| 59 |
-
"repo_pull_requests_url",
|
| 60 |
-
"repo_issues_url",
|
| 61 |
-
"repo_first_seen_at",
|
| 62 |
-
"repo_last_seen_at",
|
| 63 |
-
"repo_primary_artifact_count",
|
| 64 |
-
"repo_artifact_count",
|
| 65 |
-
"snapshot_issue_count",
|
| 66 |
-
"snapshot_pr_count",
|
| 67 |
-
"snapshot_comment_count",
|
| 68 |
-
"snapshot_review_count",
|
| 69 |
-
"snapshot_review_comment_count",
|
| 70 |
-
"repo_association",
|
| 71 |
-
"new_to_repo",
|
| 72 |
-
"first_seen_in_snapshot",
|
| 73 |
-
"report_reason",
|
| 74 |
-
"account_age_days",
|
| 75 |
-
"young_account",
|
| 76 |
-
"follow_through_score",
|
| 77 |
-
"breadth_score",
|
| 78 |
-
"automation_risk_signal",
|
| 79 |
-
"heuristic_note",
|
| 80 |
-
"public_orgs_json",
|
| 81 |
-
"visible_authored_pr_count",
|
| 82 |
-
"merged_pr_count",
|
| 83 |
-
"closed_unmerged_pr_count",
|
| 84 |
-
"open_pr_count",
|
| 85 |
-
"merged_pr_rate",
|
| 86 |
-
"closed_unmerged_pr_rate",
|
| 87 |
-
"still_open_pr_rate",
|
| 88 |
-
"distinct_repos_with_authored_prs",
|
| 89 |
-
"distinct_repos_with_open_prs",
|
| 90 |
-
"fetch_error",
|
| 91 |
-
),
|
| 92 |
"pr_scope_features": (
|
| 93 |
"run_id",
|
| 94 |
"repo",
|
|
@@ -187,7 +144,6 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
|
|
| 187 |
repo VARCHAR,
|
| 188 |
pr_number BIGINT,
|
| 189 |
github_id BIGINT,
|
| 190 |
-
author_login VARCHAR,
|
| 191 |
state VARCHAR,
|
| 192 |
draft BOOLEAN,
|
| 193 |
merged BOOLEAN,
|
|
@@ -203,48 +159,6 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
|
|
| 203 |
review_comments_count BIGINT,
|
| 204 |
html_url VARCHAR
|
| 205 |
);
|
| 206 |
-
CREATE TABLE IF NOT EXISTS pr_search_contributors (
|
| 207 |
-
run_id VARCHAR,
|
| 208 |
-
repo VARCHAR,
|
| 209 |
-
snapshot_id VARCHAR,
|
| 210 |
-
report_generated_at VARCHAR,
|
| 211 |
-
window_days BIGINT,
|
| 212 |
-
author_login VARCHAR,
|
| 213 |
-
name VARCHAR,
|
| 214 |
-
profile_url VARCHAR,
|
| 215 |
-
repo_pull_requests_url VARCHAR,
|
| 216 |
-
repo_issues_url VARCHAR,
|
| 217 |
-
repo_first_seen_at VARCHAR,
|
| 218 |
-
repo_last_seen_at VARCHAR,
|
| 219 |
-
repo_primary_artifact_count BIGINT,
|
| 220 |
-
repo_artifact_count BIGINT,
|
| 221 |
-
snapshot_issue_count BIGINT,
|
| 222 |
-
snapshot_pr_count BIGINT,
|
| 223 |
-
snapshot_comment_count BIGINT,
|
| 224 |
-
snapshot_review_count BIGINT,
|
| 225 |
-
snapshot_review_comment_count BIGINT,
|
| 226 |
-
repo_association VARCHAR,
|
| 227 |
-
new_to_repo BOOLEAN,
|
| 228 |
-
first_seen_in_snapshot BOOLEAN,
|
| 229 |
-
report_reason VARCHAR,
|
| 230 |
-
account_age_days BIGINT,
|
| 231 |
-
young_account BOOLEAN,
|
| 232 |
-
follow_through_score VARCHAR,
|
| 233 |
-
breadth_score VARCHAR,
|
| 234 |
-
automation_risk_signal VARCHAR,
|
| 235 |
-
heuristic_note VARCHAR,
|
| 236 |
-
public_orgs_json VARCHAR,
|
| 237 |
-
visible_authored_pr_count BIGINT,
|
| 238 |
-
merged_pr_count BIGINT,
|
| 239 |
-
closed_unmerged_pr_count BIGINT,
|
| 240 |
-
open_pr_count BIGINT,
|
| 241 |
-
merged_pr_rate DOUBLE,
|
| 242 |
-
closed_unmerged_pr_rate DOUBLE,
|
| 243 |
-
still_open_pr_rate DOUBLE,
|
| 244 |
-
distinct_repos_with_authored_prs BIGINT,
|
| 245 |
-
distinct_repos_with_open_prs BIGINT,
|
| 246 |
-
fetch_error VARCHAR
|
| 247 |
-
);
|
| 248 |
CREATE TABLE IF NOT EXISTS pr_scope_features (
|
| 249 |
run_id VARCHAR,
|
| 250 |
repo VARCHAR,
|
|
@@ -318,8 +232,6 @@ CREATE TABLE IF NOT EXISTS pr_scope_cluster_candidates (
|
|
| 318 |
CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
|
| 319 |
CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
|
| 320 |
CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
|
| 321 |
-
CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_author ON pr_search_documents (run_id, author_login);
|
| 322 |
-
CREATE INDEX IF NOT EXISTS idx_pr_search_contributors_run_author ON pr_search_contributors (run_id, author_login);
|
| 323 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
|
| 324 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
|
| 325 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
|
|
@@ -344,9 +256,6 @@ def connect_pr_search_db(path: Path, *, read_only: bool = False) -> duckdb.DuckD
|
|
| 344 |
|
| 345 |
def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
|
| 346 |
connection.execute(SCHEMA_SQL)
|
| 347 |
-
connection.execute(
|
| 348 |
-
"ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
|
| 349 |
-
)
|
| 350 |
|
| 351 |
|
| 352 |
def insert_rows(
|
|
@@ -444,7 +353,6 @@ def resolve_active_run(
|
|
| 444 |
def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
|
| 445 |
return {
|
| 446 |
"documents": _count(connection, "pr_search_documents", run_id),
|
| 447 |
-
"contributors": _count(connection, "pr_search_contributors", run_id),
|
| 448 |
"features": _count(connection, "pr_scope_features", run_id),
|
| 449 |
"run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
|
| 450 |
"neighbors": _count(connection, "pr_scope_neighbors", run_id),
|
|
@@ -467,60 +375,6 @@ def get_document(
|
|
| 467 |
)
|
| 468 |
|
| 469 |
|
| 470 |
-
def get_contributor(
|
| 471 |
-
connection: duckdb.DuckDBPyConnection,
|
| 472 |
-
*,
|
| 473 |
-
run_id: str,
|
| 474 |
-
author_login: str,
|
| 475 |
-
) -> dict[str, Any] | None:
|
| 476 |
-
return fetch_one(
|
| 477 |
-
connection,
|
| 478 |
-
"""
|
| 479 |
-
SELECT *
|
| 480 |
-
FROM pr_search_contributors
|
| 481 |
-
WHERE run_id = ? AND lower(author_login) = lower(?)
|
| 482 |
-
""",
|
| 483 |
-
[run_id, author_login],
|
| 484 |
-
)
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
def get_contributor_pulls(
|
| 488 |
-
connection: duckdb.DuckDBPyConnection,
|
| 489 |
-
*,
|
| 490 |
-
run_id: str,
|
| 491 |
-
author_login: str,
|
| 492 |
-
limit: int,
|
| 493 |
-
) -> list[dict[str, Any]]:
|
| 494 |
-
return fetch_rows(
|
| 495 |
-
connection,
|
| 496 |
-
"""
|
| 497 |
-
SELECT
|
| 498 |
-
pr_number,
|
| 499 |
-
github_id,
|
| 500 |
-
author_login,
|
| 501 |
-
state,
|
| 502 |
-
draft,
|
| 503 |
-
merged,
|
| 504 |
-
title,
|
| 505 |
-
base_ref,
|
| 506 |
-
created_at,
|
| 507 |
-
updated_at,
|
| 508 |
-
merged_at,
|
| 509 |
-
additions,
|
| 510 |
-
deletions,
|
| 511 |
-
changed_files,
|
| 512 |
-
comments_count,
|
| 513 |
-
review_comments_count,
|
| 514 |
-
html_url
|
| 515 |
-
FROM pr_search_documents
|
| 516 |
-
WHERE run_id = ? AND lower(author_login) = lower(?)
|
| 517 |
-
ORDER BY updated_at DESC NULLS LAST, pr_number DESC
|
| 518 |
-
LIMIT ?
|
| 519 |
-
""",
|
| 520 |
-
[run_id, author_login, limit],
|
| 521 |
-
)
|
| 522 |
-
|
| 523 |
-
|
| 524 |
def get_feature(
|
| 525 |
connection: duckdb.DuckDBPyConnection,
|
| 526 |
*,
|
|
|
|
| 31 |
"repo",
|
| 32 |
"pr_number",
|
| 33 |
"github_id",
|
|
|
|
| 34 |
"state",
|
| 35 |
"draft",
|
| 36 |
"merged",
|
|
|
|
| 46 |
"review_comments_count",
|
| 47 |
"html_url",
|
| 48 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"pr_scope_features": (
|
| 50 |
"run_id",
|
| 51 |
"repo",
|
|
|
|
| 144 |
repo VARCHAR,
|
| 145 |
pr_number BIGINT,
|
| 146 |
github_id BIGINT,
|
|
|
|
| 147 |
state VARCHAR,
|
| 148 |
draft BOOLEAN,
|
| 149 |
merged BOOLEAN,
|
|
|
|
| 159 |
review_comments_count BIGINT,
|
| 160 |
html_url VARCHAR
|
| 161 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
CREATE TABLE IF NOT EXISTS pr_scope_features (
|
| 163 |
run_id VARCHAR,
|
| 164 |
repo VARCHAR,
|
|
|
|
| 232 |
CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
|
| 233 |
CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
|
| 234 |
CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
|
|
|
|
|
|
|
| 235 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
|
| 236 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
|
| 237 |
CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
|
|
|
|
| 256 |
|
| 257 |
def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
|
| 258 |
connection.execute(SCHEMA_SQL)
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
def insert_rows(
|
|
|
|
| 353 |
def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
|
| 354 |
return {
|
| 355 |
"documents": _count(connection, "pr_search_documents", run_id),
|
|
|
|
| 356 |
"features": _count(connection, "pr_scope_features", run_id),
|
| 357 |
"run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
|
| 358 |
"neighbors": _count(connection, "pr_scope_neighbors", run_id),
|
|
|
|
| 375 |
)
|
| 376 |
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
def get_feature(
|
| 379 |
connection: duckdb.DuckDBPyConnection,
|
| 380 |
*,
|
src/slop_farmer/data/snapshot_materialize.py
CHANGED
|
@@ -74,6 +74,9 @@ def _materialize_hf_snapshot_repo_snapshot(
|
|
| 74 |
"links.parquet",
|
| 75 |
"events.parquet",
|
| 76 |
"manifest.json",
|
|
|
|
|
|
|
|
|
|
| 77 |
"new_contributors.parquet",
|
| 78 |
"new-contributors-report.json",
|
| 79 |
"new-contributors-report.md",
|
|
@@ -149,6 +152,9 @@ def _materialize_hf_root_snapshot(
|
|
| 149 |
"links.parquet",
|
| 150 |
"events.parquet",
|
| 151 |
"manifest.json",
|
|
|
|
|
|
|
|
|
|
| 152 |
"new_contributors.parquet",
|
| 153 |
"new-contributors-report.json",
|
| 154 |
"new-contributors-report.md",
|
|
|
|
| 74 |
"links.parquet",
|
| 75 |
"events.parquet",
|
| 76 |
"manifest.json",
|
| 77 |
+
"analysis-report.json",
|
| 78 |
+
"analysis-report-hybrid.json",
|
| 79 |
+
"analysis-report-deterministic.json",
|
| 80 |
"new_contributors.parquet",
|
| 81 |
"new-contributors-report.json",
|
| 82 |
"new-contributors-report.md",
|
|
|
|
| 152 |
"links.parquet",
|
| 153 |
"events.parquet",
|
| 154 |
"manifest.json",
|
| 155 |
+
"analysis-report.json",
|
| 156 |
+
"analysis-report-hybrid.json",
|
| 157 |
+
"analysis-report-deterministic.json",
|
| 158 |
"new_contributors.parquet",
|
| 159 |
"new-contributors-report.json",
|
| 160 |
"new-contributors-report.md",
|
src/slop_farmer/reports/analysis.py
CHANGED
|
@@ -19,7 +19,11 @@ from rank_bm25 import BM25Okapi
|
|
| 19 |
from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
|
| 20 |
from slop_farmer.data.links import build_text_link_rows
|
| 21 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
|
| 22 |
-
from slop_farmer.data.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
from slop_farmer.reports.analysis_cache import (
|
| 24 |
HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
|
| 25 |
PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
|
|
@@ -762,14 +766,18 @@ def _artifact_suffix(row: dict[str, Any] | None, kind: str) -> str:
|
|
| 762 |
|
| 763 |
|
| 764 |
def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
|
| 775 |
def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
|
|
|
|
| 19 |
from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
|
| 20 |
from slop_farmer.data.links import build_text_link_rows
|
| 21 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
|
| 22 |
+
from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
|
| 23 |
+
from slop_farmer.data.snapshot_paths import (
|
| 24 |
+
default_hf_materialize_dir,
|
| 25 |
+
resolve_snapshot_dir_from_output,
|
| 26 |
+
)
|
| 27 |
from slop_farmer.reports.analysis_cache import (
|
| 28 |
HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
|
| 29 |
PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
|
|
|
|
| 766 |
|
| 767 |
|
| 768 |
def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
|
| 769 |
+
if options.hf_repo_id:
|
| 770 |
+
materialize_dir = options.hf_materialize_dir or default_hf_materialize_dir(
|
| 771 |
+
options.output_dir,
|
| 772 |
+
options.hf_repo_id,
|
| 773 |
+
options.hf_revision,
|
| 774 |
+
)
|
| 775 |
+
return materialize_hf_dataset_snapshot(
|
| 776 |
+
repo_id=options.hf_repo_id,
|
| 777 |
+
local_dir=materialize_dir,
|
| 778 |
+
revision=options.hf_revision,
|
| 779 |
+
).resolve()
|
| 780 |
+
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
| 781 |
|
| 782 |
|
| 783 |
def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
|
src/slop_farmer/reports/dashboard.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import Any
|
|
| 8 |
|
| 9 |
from slop_farmer.config import DashboardDataOptions
|
| 10 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 11 |
-
from slop_farmer.data.
|
| 12 |
|
| 13 |
|
| 14 |
def run_dashboard_data(options: DashboardDataOptions) -> Path:
|
|
@@ -88,14 +88,7 @@ def _resolve_snapshot_dir(options: DashboardDataOptions) -> Path:
|
|
| 88 |
if options.snapshot_root is not None
|
| 89 |
else (Path("data") / "snapshots").resolve()
|
| 90 |
)
|
| 91 |
-
return
|
| 92 |
-
snapshot_dir=options.snapshot_dir,
|
| 93 |
-
local_snapshots_root=snapshots_root,
|
| 94 |
-
hf_repo_id=options.hf_repo_id,
|
| 95 |
-
hf_revision=options.hf_revision,
|
| 96 |
-
hf_materialize_dir=options.hf_materialize_dir,
|
| 97 |
-
hf_output_dir=snapshots_root.parent,
|
| 98 |
-
)
|
| 99 |
|
| 100 |
|
| 101 |
def _read_optional_json(path: Path) -> dict[str, Any]:
|
|
|
|
| 8 |
|
| 9 |
from slop_farmer.config import DashboardDataOptions
|
| 10 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 11 |
+
from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_snapshots_root
|
| 12 |
|
| 13 |
|
| 14 |
def run_dashboard_data(options: DashboardDataOptions) -> Path:
|
|
|
|
| 88 |
if options.snapshot_root is not None
|
| 89 |
else (Path("data") / "snapshots").resolve()
|
| 90 |
)
|
| 91 |
+
return resolve_snapshot_dir_from_snapshots_root(snapshots_root, options.snapshot_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
def _read_optional_json(path: Path) -> dict[str, Any]:
|
src/slop_farmer/reports/new_contributor_report.py
CHANGED
|
@@ -12,7 +12,7 @@ from typing import Any
|
|
| 12 |
from slop_farmer.config import NewContributorReportOptions, resolve_github_token
|
| 13 |
from slop_farmer.data.http import urlopen_with_retry
|
| 14 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
|
| 15 |
-
from slop_farmer.data.
|
| 16 |
from slop_farmer.reports.user_activity import summarize_user
|
| 17 |
|
| 18 |
GRAPHQL_URL = "https://api.github.com/graphql"
|
|
@@ -131,14 +131,7 @@ def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
|
|
| 131 |
|
| 132 |
|
| 133 |
def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
|
| 134 |
-
return
|
| 135 |
-
snapshot_dir=options.snapshot_dir,
|
| 136 |
-
local_snapshots_root=options.output_dir.resolve() / "snapshots",
|
| 137 |
-
hf_repo_id=options.hf_repo_id,
|
| 138 |
-
hf_revision=options.hf_revision,
|
| 139 |
-
hf_materialize_dir=options.hf_materialize_dir,
|
| 140 |
-
hf_output_dir=options.output_dir,
|
| 141 |
-
)
|
| 142 |
|
| 143 |
|
| 144 |
def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
@@ -251,6 +244,7 @@ def _report_contributors(
|
|
| 251 |
previous_report_reusable
|
| 252 |
and previous_entry is not None
|
| 253 |
and not previous_entry.get("fetch_error")
|
|
|
|
| 254 |
):
|
| 255 |
contributors.append(
|
| 256 |
_reused_previous_report_entry(
|
|
@@ -262,8 +256,6 @@ def _report_contributors(
|
|
| 262 |
)
|
| 263 |
)
|
| 264 |
reused_previous_report += 1
|
| 265 |
-
if known_via_prior_merged_pr:
|
| 266 |
-
reused_known_merged += 1
|
| 267 |
continue
|
| 268 |
try:
|
| 269 |
summary = summarize_user(row["author_login"], options.window_days, None)
|
|
|
|
| 12 |
from slop_farmer.config import NewContributorReportOptions, resolve_github_token
|
| 13 |
from slop_farmer.data.http import urlopen_with_retry
|
| 14 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
|
| 15 |
+
from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_output
|
| 16 |
from slop_farmer.reports.user_activity import summarize_user
|
| 17 |
|
| 18 |
GRAPHQL_URL = "https://api.github.com/graphql"
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
|
| 134 |
+
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
|
|
| 244 |
previous_report_reusable
|
| 245 |
and previous_entry is not None
|
| 246 |
and not previous_entry.get("fetch_error")
|
| 247 |
+
and not known_via_prior_merged_pr
|
| 248 |
):
|
| 249 |
contributors.append(
|
| 250 |
_reused_previous_report_entry(
|
|
|
|
| 256 |
)
|
| 257 |
)
|
| 258 |
reused_previous_report += 1
|
|
|
|
|
|
|
| 259 |
continue
|
| 260 |
try:
|
| 261 |
summary = summarize_user(row["author_login"], options.window_days, None)
|
src/slop_farmer/reports/pr_scope.py
CHANGED
|
@@ -42,7 +42,11 @@ from typing import Any
|
|
| 42 |
from pydantic import BaseModel, Field
|
| 43 |
|
| 44 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 45 |
-
from slop_farmer.data.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
from slop_farmer.reports.pr_heuristics import (
|
| 47 |
compile_cluster_suppression_rules,
|
| 48 |
suppressed_pull_request_reasons,
|
|
@@ -256,14 +260,17 @@ def run_pr_scope_report(options: Any) -> Path:
|
|
| 256 |
|
| 257 |
|
| 258 |
def _resolve_snapshot_dir(options: Any) -> Path:
|
| 259 |
-
|
| 260 |
-
snapshot_dir=
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
|
| 269 |
def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
|
|
|
|
| 42 |
from pydantic import BaseModel, Field
|
| 43 |
|
| 44 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 45 |
+
from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
|
| 46 |
+
from slop_farmer.data.snapshot_paths import (
|
| 47 |
+
default_hf_materialize_dir,
|
| 48 |
+
resolve_snapshot_dir_from_output,
|
| 49 |
+
)
|
| 50 |
from slop_farmer.reports.pr_heuristics import (
|
| 51 |
compile_cluster_suppression_rules,
|
| 52 |
suppressed_pull_request_reasons,
|
|
|
|
| 260 |
|
| 261 |
|
| 262 |
def _resolve_snapshot_dir(options: Any) -> Path:
|
| 263 |
+
if options.hf_repo_id:
|
| 264 |
+
snapshot_dir = materialize_hf_dataset_snapshot(
|
| 265 |
+
repo_id=options.hf_repo_id,
|
| 266 |
+
local_dir=options.hf_materialize_dir
|
| 267 |
+
or default_hf_materialize_dir(
|
| 268 |
+
options.output_dir, options.hf_repo_id, options.hf_revision
|
| 269 |
+
),
|
| 270 |
+
revision=options.hf_revision,
|
| 271 |
+
)
|
| 272 |
+
return snapshot_dir.resolve()
|
| 273 |
+
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
| 274 |
|
| 275 |
|
| 276 |
def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
|
src/slop_farmer/reports/pr_search_scope.py
CHANGED
|
@@ -10,7 +10,11 @@ from typing import Any
|
|
| 10 |
|
| 11 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 12 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 13 |
-
from slop_farmer.data.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from slop_farmer.reports.pr_heuristics import (
|
| 15 |
compile_cluster_suppression_rules,
|
| 16 |
suppressed_pull_request_reasons,
|
|
@@ -32,14 +36,17 @@ DEFAULT_CANDIDATE_LIMIT = 5
|
|
| 32 |
|
| 33 |
|
| 34 |
def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
|
| 35 |
-
|
| 36 |
-
snapshot_dir=
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
@@ -47,7 +54,6 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
| 47 |
manifest = read_json(manifest_path) if manifest_path.exists() else {}
|
| 48 |
pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 49 |
pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
|
| 50 |
-
contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")
|
| 51 |
repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
|
| 52 |
snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
|
| 53 |
return {
|
|
@@ -56,7 +62,6 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
| 56 |
"manifest": manifest,
|
| 57 |
"pull_requests": pull_requests,
|
| 58 |
"pr_files": pr_files,
|
| 59 |
-
"contributors": contributors,
|
| 60 |
}
|
| 61 |
|
| 62 |
|
|
@@ -407,7 +412,6 @@ def _document_row(row: Mapping[str, Any]) -> dict[str, Any]:
|
|
| 407 |
return {
|
| 408 |
"pr_number": int(row["number"]),
|
| 409 |
"github_id": row.get("github_id"),
|
| 410 |
-
"author_login": row.get("author_login"),
|
| 411 |
"state": row.get("state"),
|
| 412 |
"draft": bool(row.get("draft")),
|
| 413 |
"merged": bool(row.get("merged")),
|
|
|
|
| 10 |
|
| 11 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 12 |
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 13 |
+
from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
|
| 14 |
+
from slop_farmer.data.snapshot_paths import (
|
| 15 |
+
default_hf_materialize_dir,
|
| 16 |
+
resolve_snapshot_dir_from_output,
|
| 17 |
+
)
|
| 18 |
from slop_farmer.reports.pr_heuristics import (
|
| 19 |
compile_cluster_suppression_rules,
|
| 20 |
suppressed_pull_request_reasons,
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
|
| 39 |
+
if options.hf_repo_id:
|
| 40 |
+
snapshot_dir = materialize_hf_dataset_snapshot(
|
| 41 |
+
repo_id=options.hf_repo_id,
|
| 42 |
+
local_dir=options.hf_materialize_dir
|
| 43 |
+
or default_hf_materialize_dir(
|
| 44 |
+
options.output_dir, options.hf_repo_id, options.hf_revision
|
| 45 |
+
),
|
| 46 |
+
revision=options.hf_revision,
|
| 47 |
+
)
|
| 48 |
+
return snapshot_dir.resolve()
|
| 49 |
+
return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
|
| 50 |
|
| 51 |
|
| 52 |
def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
|
|
|
|
| 54 |
manifest = read_json(manifest_path) if manifest_path.exists() else {}
|
| 55 |
pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
|
| 56 |
pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
|
|
|
|
| 57 |
repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
|
| 58 |
snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
|
| 59 |
return {
|
|
|
|
| 62 |
"manifest": manifest,
|
| 63 |
"pull_requests": pull_requests,
|
| 64 |
"pr_files": pr_files,
|
|
|
|
| 65 |
}
|
| 66 |
|
| 67 |
|
|
|
|
| 412 |
return {
|
| 413 |
"pr_number": int(row["number"]),
|
| 414 |
"github_id": row.get("github_id"),
|
|
|
|
| 415 |
"state": row.get("state"),
|
| 416 |
"draft": bool(row.get("draft")),
|
| 417 |
"merged": bool(row.get("merged")),
|
src/slop_farmer/reports/pr_search_service.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
-
from collections.abc import Iterable, Mapping
|
| 5 |
from contextlib import suppress
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Any, Protocol
|
|
@@ -17,8 +17,6 @@ from slop_farmer.data.search_duckdb import (
|
|
| 17 |
get_cluster,
|
| 18 |
get_cluster_ids_for_prs,
|
| 19 |
get_cluster_members,
|
| 20 |
-
get_contributor,
|
| 21 |
-
get_contributor_pulls,
|
| 22 |
get_document,
|
| 23 |
get_feature,
|
| 24 |
get_pair_neighbor_row,
|
|
@@ -101,16 +99,6 @@ def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
|
|
| 101 |
"pr_search_documents",
|
| 102 |
_scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
|
| 103 |
)
|
| 104 |
-
insert_rows(
|
| 105 |
-
connection,
|
| 106 |
-
"pr_search_contributors",
|
| 107 |
-
_contributor_rows(
|
| 108 |
-
snapshot["contributors"],
|
| 109 |
-
run_id=run_id,
|
| 110 |
-
repo=repo,
|
| 111 |
-
snapshot_id=str(snapshot["snapshot_id"]),
|
| 112 |
-
),
|
| 113 |
-
)
|
| 114 |
insert_rows(
|
| 115 |
connection,
|
| 116 |
"pr_scope_features",
|
|
@@ -302,85 +290,6 @@ def get_pr_search_candidate_clusters(
|
|
| 302 |
connection.close()
|
| 303 |
|
| 304 |
|
| 305 |
-
def get_pr_search_contributor(
|
| 306 |
-
db_path: Path,
|
| 307 |
-
*,
|
| 308 |
-
author_login: str,
|
| 309 |
-
repo: str | None = None,
|
| 310 |
-
) -> dict[str, Any]:
|
| 311 |
-
connection = connect_pr_search_db(db_path, read_only=True)
|
| 312 |
-
try:
|
| 313 |
-
active_run = resolve_active_run(connection, repo=repo)
|
| 314 |
-
run_id = str(active_run["id"])
|
| 315 |
-
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
|
| 316 |
-
pulls = _document_rows(
|
| 317 |
-
get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
|
| 318 |
-
)
|
| 319 |
-
return {
|
| 320 |
-
"repo": active_run["repo"],
|
| 321 |
-
"snapshot_id": active_run["snapshot_id"],
|
| 322 |
-
"run_id": run_id,
|
| 323 |
-
"contributor": contributor,
|
| 324 |
-
"pulls": pulls,
|
| 325 |
-
"pull_count": len(pulls),
|
| 326 |
-
}
|
| 327 |
-
finally:
|
| 328 |
-
connection.close()
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
def get_pr_search_contributor_pulls(
|
| 332 |
-
db_path: Path,
|
| 333 |
-
*,
|
| 334 |
-
author_login: str,
|
| 335 |
-
repo: str | None = None,
|
| 336 |
-
limit: int = 20,
|
| 337 |
-
) -> dict[str, Any]:
|
| 338 |
-
connection = connect_pr_search_db(db_path, read_only=True)
|
| 339 |
-
try:
|
| 340 |
-
active_run = resolve_active_run(connection, repo=repo)
|
| 341 |
-
run_id = str(active_run["id"])
|
| 342 |
-
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
|
| 343 |
-
pulls = _document_rows(
|
| 344 |
-
get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
|
| 345 |
-
)
|
| 346 |
-
return {
|
| 347 |
-
"repo": active_run["repo"],
|
| 348 |
-
"snapshot_id": active_run["snapshot_id"],
|
| 349 |
-
"run_id": run_id,
|
| 350 |
-
"contributor": contributor,
|
| 351 |
-
"pulls": pulls,
|
| 352 |
-
"pull_count": len(pulls),
|
| 353 |
-
}
|
| 354 |
-
finally:
|
| 355 |
-
connection.close()
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
def get_pr_search_pull_contributor(
|
| 359 |
-
db_path: Path,
|
| 360 |
-
*,
|
| 361 |
-
pr_number: int,
|
| 362 |
-
repo: str | None = None,
|
| 363 |
-
) -> dict[str, Any]:
|
| 364 |
-
connection = connect_pr_search_db(db_path, read_only=True)
|
| 365 |
-
try:
|
| 366 |
-
active_run = resolve_active_run(connection, repo=repo)
|
| 367 |
-
run_id = str(active_run["id"])
|
| 368 |
-
document = _require_document(connection, run_id=run_id, pr_number=pr_number)
|
| 369 |
-
author_login = str(document.get("author_login") or "").strip()
|
| 370 |
-
if not author_login:
|
| 371 |
-
raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
|
| 372 |
-
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
|
| 373 |
-
return {
|
| 374 |
-
"repo": active_run["repo"],
|
| 375 |
-
"snapshot_id": active_run["snapshot_id"],
|
| 376 |
-
"run_id": run_id,
|
| 377 |
-
"pr": _without_json_fields(document),
|
| 378 |
-
"contributor": contributor,
|
| 379 |
-
}
|
| 380 |
-
finally:
|
| 381 |
-
connection.close()
|
| 382 |
-
|
| 383 |
-
|
| 384 |
def get_pr_search_similar_lookup(
|
| 385 |
db_path: Path,
|
| 386 |
*,
|
|
@@ -892,15 +801,6 @@ def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[st
|
|
| 892 |
return feature
|
| 893 |
|
| 894 |
|
| 895 |
-
def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
|
| 896 |
-
contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
|
| 897 |
-
if contributor is None:
|
| 898 |
-
raise ValueError(
|
| 899 |
-
f"Contributor {author_login!r} was not found in the active indexed universe."
|
| 900 |
-
)
|
| 901 |
-
return _contributor_row(contributor)
|
| 902 |
-
|
| 903 |
-
|
| 904 |
def _json_list(raw: Any) -> list[str]:
|
| 905 |
if isinstance(raw, list):
|
| 906 |
return [str(item) for item in raw]
|
|
@@ -938,71 +838,6 @@ def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
|
|
| 938 |
return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
|
| 939 |
|
| 940 |
|
| 941 |
-
def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
|
| 942 |
-
return [_without_json_fields(row) for row in rows]
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
def _contributor_rows(
|
| 946 |
-
rows: list[Mapping[str, Any]],
|
| 947 |
-
*,
|
| 948 |
-
run_id: str,
|
| 949 |
-
repo: str,
|
| 950 |
-
snapshot_id: str,
|
| 951 |
-
) -> list[dict[str, Any]]:
|
| 952 |
-
return [
|
| 953 |
-
{
|
| 954 |
-
"run_id": run_id,
|
| 955 |
-
"repo": repo,
|
| 956 |
-
"snapshot_id": snapshot_id,
|
| 957 |
-
"report_generated_at": row.get("report_generated_at"),
|
| 958 |
-
"window_days": row.get("window_days"),
|
| 959 |
-
"author_login": row.get("author_login"),
|
| 960 |
-
"name": row.get("name"),
|
| 961 |
-
"profile_url": row.get("profile_url"),
|
| 962 |
-
"repo_pull_requests_url": row.get("repo_pull_requests_url"),
|
| 963 |
-
"repo_issues_url": row.get("repo_issues_url"),
|
| 964 |
-
"repo_first_seen_at": row.get("repo_first_seen_at"),
|
| 965 |
-
"repo_last_seen_at": row.get("repo_last_seen_at"),
|
| 966 |
-
"repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
|
| 967 |
-
"repo_artifact_count": row.get("repo_artifact_count"),
|
| 968 |
-
"snapshot_issue_count": row.get("snapshot_issue_count"),
|
| 969 |
-
"snapshot_pr_count": row.get("snapshot_pr_count"),
|
| 970 |
-
"snapshot_comment_count": row.get("snapshot_comment_count"),
|
| 971 |
-
"snapshot_review_count": row.get("snapshot_review_count"),
|
| 972 |
-
"snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
|
| 973 |
-
"repo_association": row.get("repo_association"),
|
| 974 |
-
"new_to_repo": row.get("new_to_repo"),
|
| 975 |
-
"first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
|
| 976 |
-
"report_reason": row.get("report_reason"),
|
| 977 |
-
"account_age_days": row.get("account_age_days"),
|
| 978 |
-
"young_account": row.get("young_account"),
|
| 979 |
-
"follow_through_score": row.get("follow_through_score"),
|
| 980 |
-
"breadth_score": row.get("breadth_score"),
|
| 981 |
-
"automation_risk_signal": row.get("automation_risk_signal"),
|
| 982 |
-
"heuristic_note": row.get("heuristic_note"),
|
| 983 |
-
"public_orgs_json": row.get("public_orgs"),
|
| 984 |
-
"visible_authored_pr_count": row.get("visible_authored_pr_count"),
|
| 985 |
-
"merged_pr_count": row.get("merged_pr_count"),
|
| 986 |
-
"closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
|
| 987 |
-
"open_pr_count": row.get("open_pr_count"),
|
| 988 |
-
"merged_pr_rate": row.get("merged_pr_rate"),
|
| 989 |
-
"closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
|
| 990 |
-
"still_open_pr_rate": row.get("still_open_pr_rate"),
|
| 991 |
-
"distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
|
| 992 |
-
"distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
|
| 993 |
-
"fetch_error": row.get("fetch_error"),
|
| 994 |
-
}
|
| 995 |
-
for row in rows
|
| 996 |
-
]
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
|
| 1000 |
-
return {
|
| 1001 |
-
**_without_json_fields(row),
|
| 1002 |
-
"public_orgs": _json_list(row.get("public_orgs_json")),
|
| 1003 |
-
}
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
def _normalize_lookup_mode(mode: str) -> str:
|
| 1007 |
normalized = mode.strip().lower()
|
| 1008 |
if normalized not in {"auto", "indexed", "live"}:
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
+
from collections.abc import Iterable, Mapping
|
| 5 |
from contextlib import suppress
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Any, Protocol
|
|
|
|
| 17 |
get_cluster,
|
| 18 |
get_cluster_ids_for_prs,
|
| 19 |
get_cluster_members,
|
|
|
|
|
|
|
| 20 |
get_document,
|
| 21 |
get_feature,
|
| 22 |
get_pair_neighbor_row,
|
|
|
|
| 99 |
"pr_search_documents",
|
| 100 |
_scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
|
| 101 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
insert_rows(
|
| 103 |
connection,
|
| 104 |
"pr_scope_features",
|
|
|
|
| 290 |
connection.close()
|
| 291 |
|
| 292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
def get_pr_search_similar_lookup(
|
| 294 |
db_path: Path,
|
| 295 |
*,
|
|
|
|
| 801 |
return feature
|
| 802 |
|
| 803 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
def _json_list(raw: Any) -> list[str]:
|
| 805 |
if isinstance(raw, list):
|
| 806 |
return [str(item) for item in raw]
|
|
|
|
| 838 |
return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
|
| 839 |
|
| 840 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
def _normalize_lookup_mode(mode: str) -> str:
|
| 842 |
normalized = mode.strip().lower()
|
| 843 |
if normalized not in {"auto", "indexed", "live"}:
|
src/slop_farmer/reports/read_views.py
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Literal
|
| 6 |
+
|
| 7 |
+
from slop_farmer.data.parquet_io import read_json, read_parquet_rows
|
| 8 |
+
|
| 9 |
+
AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass(slots=True, frozen=True)
|
| 13 |
+
class _SnapshotMetadata:
|
| 14 |
+
repo: str
|
| 15 |
+
snapshot_id: str
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass(slots=True, frozen=True)
|
| 19 |
+
class _AnalysisSelection:
|
| 20 |
+
path: Path
|
| 21 |
+
payload: dict[str, Any]
|
| 22 |
+
variant_used: str
|
| 23 |
+
llm_enrichment: bool
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def get_snapshot_surfaces(snapshot_dir: Path) -> dict[str, Any]:
    """Summarize the issue-cluster and contributor surfaces of a snapshot.

    The issue section is derived from the ``auto`` variant status; the
    contributor section from the contributor report status.
    """
    issues = get_issue_cluster_status(snapshot_dir, variant="auto")
    contributors = get_contributor_status(snapshot_dir)
    # "counts" may be missing or null; either way fall back to zero counts.
    counts = issues.get("counts") or {}
    issue_surface = {
        "available": issues["available"],
        "variant_used": issues.get("variant_used"),
        "llm_enrichment": issues.get("llm_enrichment"),
        "generated_at": issues.get("generated_at"),
        "cluster_count": counts.get("meta_bugs", 0),
        "duplicate_pr_count": counts.get("duplicate_prs", 0),
        "available_variants": issues.get("available_variants") or [],
    }
    contributor_surface = {
        "available": contributors["available"],
        "generated_at": contributors.get("generated_at"),
        "contributor_count": contributors.get("contributor_count", 0),
    }
    return {"issues": issue_surface, "contributors": contributor_surface}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def get_issue_cluster_status(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str, Any]:
    """Describe whether an analysis report is available for ``variant``.

    The result always carries the snapshot identity, the requested variant
    and the variants found on disk. When a report was selected it also
    carries that report's variant, timestamp, file name and section counts;
    otherwise those fields are null/zero placeholders.
    """
    metadata = _snapshot_metadata(snapshot_dir)
    candidates = _analysis_candidates(snapshot_dir)
    chosen = _select_analysis_report(candidates, variant=variant)
    header = {
        "repo": metadata.repo,
        "snapshot_id": metadata.snapshot_id,
        "variant_requested": variant,
        "available": chosen is not None,
        "available_variants": sorted({item["variant"] for item in candidates}),
    }
    if chosen is None:
        details = {
            "variant_used": None,
            "llm_enrichment": False,
            "generated_at": None,
            "report_path": None,
            "counts": {"meta_bugs": 0, "duplicate_issues": 0, "duplicate_prs": 0},
        }
    else:
        details = {
            "variant_used": chosen.variant_used,
            "llm_enrichment": chosen.llm_enrichment,
            "generated_at": chosen.payload.get("generated_at"),
            # Only the file name is exposed, not the full path.
            "report_path": chosen.path.name,
            "counts": _analysis_counts(chosen.payload),
        }
    return header | details
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def list_issue_clusters(
    snapshot_dir: Path,
    *,
    limit: int | None,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """List issue-cluster summaries from the selected analysis report.

    ``cluster_count`` is the total number of clusters in the report; the
    ``clusters`` list is cut to the first ``limit`` entries when a limit is
    given. Ranks are 1-based positions in the report's ``meta_bugs`` list.
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    response = _analysis_base_payload(metadata, selection, variant=variant)
    summaries: list[dict[str, Any]] = []
    if selection is not None:
        for position, cluster in enumerate(selection.payload.get("meta_bugs") or [], start=1):
            summaries.append(_issue_cluster_summary(cluster, issue_map, pr_map, rank=position))
    visible = summaries if limit is None else summaries[:limit]
    return {**response, "clusters": visible, "cluster_count": len(summaries)}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_issue_cluster(
    snapshot_dir: Path,
    *,
    cluster_id: str,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Return the detail view of one issue cluster, looked up by id.

    When no report is available, or no cluster in the report has the given
    id, the payload has ``found`` false, a null ``cluster`` and empty member
    lists. Otherwise it holds the cluster summary extended with the
    canonical/best reasons, plus one row per member issue and pull request.
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    base = _analysis_base_payload(metadata, selection, variant=variant)

    cluster = None
    if selection is not None:
        for row in selection.payload.get("meta_bugs") or []:
            if str(row.get("cluster_id") or "") == cluster_id:
                cluster = row
                break

    if cluster is None:
        return {
            **base,
            "cluster_id": cluster_id,
            "found": False,
            "cluster": None,
            "issues": [],
            "pull_requests": [],
        }

    issue_numbers = _ordered_ints(cluster.get("issue_numbers"))
    pr_numbers = _ordered_ints(cluster.get("pr_numbers"))
    canonical_pr_number = _coerce_int(cluster.get("canonical_pr_number"))

    detail = _issue_cluster_summary(cluster, issue_map, pr_map)
    for reason_key in (
        "canonical_issue_reason",
        "canonical_pr_reason",
        "best_issue_reason",
        "best_pr_reason",
    ):
        detail[reason_key] = cluster.get(reason_key)

    issue_rows = [_issue_member_row(number, issue_map.get(number)) for number in issue_numbers]
    pr_rows = []
    for number in pr_numbers:
        # The canonical PR is labelled as such; every other PR is a member.
        role = "canonical" if canonical_pr_number == number else "member"
        pr_rows.append(_pr_member_row(number, pr_map.get(number), role=role))

    return {
        **base,
        "cluster_id": cluster_id,
        "found": True,
        "cluster": detail,
        "issues": issue_rows,
        "pull_requests": pr_rows,
    }
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def get_issue_clusters_for_pr(
    snapshot_dir: Path,
    *,
    pr_number: int,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Find every issue cluster whose ``pr_numbers`` contain ``pr_number``.

    Each match is the cluster summary plus a ``membership_role`` of
    ``"canonical"`` (the PR is the cluster's canonical PR) or ``"member"``.
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    base = _analysis_base_payload(metadata, selection, variant=variant)

    clusters = [] if selection is None else (selection.payload.get("meta_bugs") or [])
    hits: list[dict[str, Any]] = []
    for position, cluster in enumerate(clusters, start=1):
        if pr_number in _ordered_ints(cluster.get("pr_numbers")):
            is_canonical = _coerce_int(cluster.get("canonical_pr_number")) == pr_number
            summary = _issue_cluster_summary(cluster, issue_map, pr_map, rank=position)
            hits.append({**summary, "membership_role": "canonical" if is_canonical else "member"})

    return {
        **base,
        "pr_number": pr_number,
        "found": bool(hits),
        "clusters": hits,
        "cluster_count": len(hits),
    }
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def check_issue_cluster_membership(
    snapshot_dir: Path,
    *,
    pr_number: int,
    cluster_id: str | None,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Check whether a PR belongs to a given cluster, or to any cluster.

    With ``cluster_id`` of ``None``, ``matched`` reports whether the PR is in
    at least one cluster. With a concrete id, ``matched`` reports membership
    in that cluster and ``membership`` holds the matching cluster entry (or
    ``None``). The PR lookup payload is always included.
    """
    lookup = get_issue_clusters_for_pr(snapshot_dir, pr_number=pr_number, variant=variant)
    candidates = list(lookup.get("clusters") or [])
    matching_cluster_ids = [
        str(entry.get("cluster_id")) for entry in candidates if entry.get("cluster_id")
    ]

    result = {**lookup, "cluster_id": cluster_id}
    if cluster_id is None:
        result["matched"] = bool(matching_cluster_ids)
        result["matching_cluster_ids"] = matching_cluster_ids
        return result

    membership = None
    for entry in candidates:
        if entry.get("cluster_id") == cluster_id:
            membership = entry
            break
    result["matched"] = membership is not None
    result["matching_cluster_ids"] = matching_cluster_ids
    result["membership"] = membership
    return result
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def list_issue_duplicate_prs(
    snapshot_dir: Path,
    *,
    limit: int | None,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """List duplicate-PR groups from the selected analysis report.

    ``duplicate_pr_count`` is the total number of groups in the report; the
    ``duplicate_prs`` list is cut to the first ``limit`` entries when a limit
    is given. Ranks are 1-based positions in the report's list.
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    response = _analysis_base_payload(metadata, selection, variant=variant)
    groups: list[dict[str, Any]] = []
    if selection is not None:
        for position, entry in enumerate(selection.payload.get("duplicate_prs") or [], start=1):
            groups.append(_duplicate_pr_summary(entry, issue_map, pr_map, rank=position))
    visible = groups if limit is None else groups[:limit]
    return {**response, "duplicate_prs": visible, "duplicate_pr_count": len(groups)}
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def get_issue_best(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str, Any]:
    """Return the report's ``best_issue`` and ``best_pr`` picks.

    Both are ``None`` when no analysis report is available for ``variant``.
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    response = _analysis_base_payload(metadata, selection, variant=variant)
    best_issue = None
    best_pr = None
    if selection is not None:
        best_issue = _best_issue_summary(selection.payload.get("best_issue"), issue_map)
        best_pr = _best_pr_summary(selection.payload.get("best_pr"), pr_map)
    return {**response, "best_issue": best_issue, "best_pr": best_pr}
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
    """Describe the ``new-contributors-report.json`` of a snapshot.

    ``available`` is true when the report exists and parsed to a non-empty
    object. Repo and snapshot id come from the report when present there and
    from the snapshot metadata otherwise.
    """
    metadata = _snapshot_metadata(snapshot_dir)
    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
    entries = report.get("contributors")
    # Anything other than a list counts as "no contributors".
    contributor_count = len(entries) if isinstance(entries, list) else 0
    repo = report.get("repo") or metadata.repo
    snapshot_id = report.get("snapshot_id") or metadata.snapshot_id
    return {
        "repo": str(repo),
        "snapshot_id": str(snapshot_id),
        "available": bool(report),
        "generated_at": report.get("generated_at"),
        "window_days": _coerce_int(report.get("window_days")),
        "contributor_count": contributor_count,
    }
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def list_contributors(snapshot_dir: Path, *, limit: int | None) -> dict[str, Any]:
    """List contributor summaries from the new-contributors report.

    Non-dict entries in the report are skipped. Ranks are 1-based positions
    in the report's ``contributors`` list, so a skipped entry leaves a gap.
    ``contributor_count`` is the number of summaries before ``limit`` is
    applied and overrides the count from the status payload.
    """
    status = get_contributor_status(snapshot_dir)
    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
    summaries: list[dict[str, Any]] = []
    for position, entry in enumerate(report.get("contributors") or [], start=1):
        if isinstance(entry, dict):
            summaries.append(_contributor_summary(entry, rank=position))
    visible = summaries if limit is None else summaries[:limit]
    return {**status, "contributors": visible, "contributor_count": len(summaries)}
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def get_contributor(snapshot_dir: Path, *, author_login: str) -> dict[str, Any]:
    """Return the full report entry, summary and risk view for one contributor.

    When no entry matches ``author_login``, ``found`` is false and the
    ``summary``, ``risk`` and ``contributor`` fields are ``None``.
    """
    status = get_contributor_status(snapshot_dir)
    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
    entry = _find_contributor(report.get("contributors") or [], author_login)
    if entry is None:
        login = author_login
        summary = None
        risk = None
    else:
        # Prefer the login as spelled in the report over the caller's spelling.
        login = str(entry.get("author_login") or author_login)
        summary = _contributor_summary(entry)
        risk = _contributor_risk(entry)
    return {
        **status,
        "author_login": login,
        "found": entry is not None,
        "summary": summary,
        "risk": risk,
        "contributor": entry,
    }
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def get_contributor_risk(snapshot_dir: Path, *, author_login: str) -> dict[str, Any]:
    """Return only the risk view of a contributor lookup.

    Copies the identifying fields from the full contributor payload and adds
    ``risk_available``, which is true when a risk section was produced.
    """
    details = get_contributor(snapshot_dir, author_login=author_login)
    risk = details.get("risk")
    copied_keys = ("repo", "snapshot_id", "available", "generated_at", "author_login", "found")
    response: dict[str, Any] = {key: details.get(key) for key in copied_keys}
    response["risk_available"] = risk is not None
    response["risk"] = risk
    return response
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def _analysis_context(
    snapshot_dir: Path,
    *,
    variant: AnalysisVariant,
) -> tuple[_SnapshotMetadata, _AnalysisSelection | None, dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
    """Load what the issue views need: metadata, selected report, row maps.

    Returns ``(metadata, selection, issue_map, pr_map)``; ``selection`` is
    ``None`` when no report matches ``variant``.
    """
    metadata = _snapshot_metadata(snapshot_dir)
    candidates = _analysis_candidates(snapshot_dir)
    selection = _select_analysis_report(candidates, variant=variant)
    return (metadata, selection, *_artifact_maps(snapshot_dir))
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def _analysis_base_payload(
    metadata: _SnapshotMetadata,
    selection: _AnalysisSelection | None,
    *,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Build the fields shared by every issue-analysis response.

    Without a selection, ``variant_used`` and ``generated_at`` are ``None``
    and ``llm_enrichment`` is ``False``.
    """
    has_report = selection is not None
    return {
        "repo": metadata.repo,
        "snapshot_id": metadata.snapshot_id,
        "variant_requested": variant,
        "available": has_report,
        "variant_used": selection.variant_used if has_report else None,
        "llm_enrichment": selection.llm_enrichment if has_report else False,
        "generated_at": selection.payload.get("generated_at") if has_report else None,
    }
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
def _analysis_candidates(snapshot_dir: Path) -> list[dict[str, Any]]:
    """Load every usable analysis report in the snapshot directory.

    Reports that are missing, empty or not JSON objects are skipped. Each
    candidate records its path, payload, variant label and enrichment flag.
    """
    found: list[dict[str, Any]] = []
    for report_path in _analysis_report_paths(snapshot_dir):
        payload = _read_optional_json(report_path)
        if payload:
            enriched = bool(payload.get("llm_enrichment"))
            found.append(
                {
                    "path": report_path,
                    "payload": payload,
                    "variant": _analysis_variant(report_path.name, payload, llm_enrichment=enriched),
                    "llm_enrichment": enriched,
                }
            )
    return found
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def _select_analysis_report(
    candidates: list[dict[str, Any]],
    *,
    variant: AnalysisVariant,
) -> _AnalysisSelection | None:
    """Pick the best candidate report for the requested variant.

    ``"auto"`` ranks all candidates by the auto priority. Any other variant
    keeps only candidates with that variant label and ranks them by the
    variant-specific priority. Returns ``None`` when nothing qualifies.
    """
    if not candidates:
        return None
    if variant == "auto":
        ranked = sorted(candidates, key=_analysis_auto_priority)
    else:
        ranked = sorted(
            (item for item in candidates if item["variant"] == variant),
            key=_analysis_specific_priority,
        )
    if not ranked:
        return None
    best = ranked[0]
    return _AnalysisSelection(
        path=Path(best["path"]),
        # Shallow copy, so the selection does not share the candidate's dict.
        payload=dict(best["payload"]),
        variant_used=str(best["variant"]),
        llm_enrichment=bool(best["llm_enrichment"]),
    )
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
    """List the existing analysis report files in lookup order.

    The three well-known file names come first, in a fixed order, followed
    by any other ``analysis-report*.json`` file sorted by path.
    """
    well_known = (
        "analysis-report-hybrid.json",
        "analysis-report-deterministic.json",
        "analysis-report.json",
    )
    preferred = [snapshot_dir / name for name in well_known]
    extras = [
        found
        for found in sorted(snapshot_dir.glob("analysis-report*.json"))
        if found.name not in well_known
    ]
    return [path for path in (*preferred, *extras) if path.exists()]
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
    """Sort key for ``auto`` selection; lower tuples win.

    Tiers: the hybrid report file (0), any LLM-enriched report (1), the
    plain ``analysis-report.json`` (2), everything else (3). Ties are broken
    by file name.
    """
    name = Path(candidate["path"]).name
    if name == "analysis-report-hybrid.json":
        tier = 0
    elif bool(candidate.get("llm_enrichment")):
        tier = 1
    elif name == "analysis-report.json":
        tier = 2
    else:
        tier = 3
    return (tier, name)
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
def _analysis_specific_priority(candidate: dict[str, Any]) -> tuple[int, str]:
    """Sort key when a specific variant was requested; lower tuples win.

    Tiers: a file named ``...-<variant>.json`` (0), the plain
    ``analysis-report.json`` (1), everything else (2). Ties are broken by
    file name.
    """
    name = Path(candidate["path"]).name
    variant_suffix = f"-{candidate['variant']}.json"
    if name.endswith(variant_suffix):
        tier = 0
    elif name == "analysis-report.json":
        tier = 1
    else:
        tier = 2
    return (tier, name)
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
def _analysis_variant(path_name: str, payload: dict[str, Any], *, llm_enrichment: bool) -> str:
    """Classify a report as ``"hybrid"`` or ``"deterministic"``.

    The file name decides first (case-insensitive substring, "hybrid" checked
    before "deterministic"), then a recognised string ``variant_used`` in the
    payload, and finally the enrichment flag.
    """
    known = ("hybrid", "deterministic")
    file_name = path_name.lower()
    for label in known:
        if label in file_name:
            return label
    declared = payload.get("variant_used")
    if isinstance(declared, str):
        normalized = declared.strip().lower()
        if normalized in known:
            return normalized
    return "hybrid" if llm_enrichment else "deterministic"
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
def _analysis_counts(payload: dict[str, Any]) -> dict[str, int]:
    """Count the entries in each list section of an analysis report.

    A missing, null or empty section counts as zero.
    """
    sections = ("meta_bugs", "duplicate_issues", "duplicate_prs")
    return {section: len(payload.get(section) or []) for section in sections}
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
def _artifact_maps(snapshot_dir: Path) -> tuple[dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
    """Index the snapshot's issue and pull-request rows by number.

    Returns ``(issue_map, pr_map)``. A missing parquet file yields an empty
    map; rows whose ``number`` cannot be read as an integer are left out.
    """

    def rows_by_number(filename: str) -> dict[int, dict[str, Any]]:
        source = snapshot_dir / filename
        rows = read_parquet_rows(source) if source.exists() else []
        indexed: dict[int, dict[str, Any]] = {}
        for row in rows:
            if _coerce_int(row.get("number")) is not None:
                indexed[int(row["number"])] = row
        return indexed

    issue_map = rows_by_number("issues.parquet")
    pr_map = rows_by_number("pull_requests.parquet")
    return issue_map, pr_map
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def _issue_cluster_summary(
    cluster: dict[str, Any],
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
    *,
    rank: int | None = None,
) -> dict[str, Any]:
    """Flatten one cluster entry into the summary row used by the views.

    Titles and URLs of the canonical issue/PR are resolved through the row
    maps. A cluster without an id gets ``cluster-<rank>`` (``cluster-0``
    when no rank is given).
    """
    issue_no = _coerce_int(cluster.get("canonical_issue_number"))
    pr_no = _coerce_int(cluster.get("canonical_pr_number"))
    member_issues = _ordered_ints(cluster.get("issue_numbers"))
    member_prs = _ordered_ints(cluster.get("pr_numbers"))
    fallback_id = f"cluster-{rank or 0}"
    evidence = [str(kind) for kind in (cluster.get("evidence_types") or []) if kind]
    return {
        "rank": rank,
        "cluster_id": str(cluster.get("cluster_id") or fallback_id),
        "title": _cluster_title(cluster, issue_map, pr_map, issue_no, pr_no),
        "summary": cluster.get("summary"),
        "status": cluster.get("status"),
        "confidence": _coerce_float(cluster.get("confidence")),
        "canonical_issue_number": issue_no,
        "canonical_issue_title": _title_for_issue(issue_no, issue_map),
        "canonical_issue_url": _url_for_issue(issue_no, issue_map),
        "canonical_pr_number": pr_no,
        "canonical_pr_title": _title_for_pr(pr_no, pr_map),
        "canonical_pr_url": _url_for_pr(pr_no, pr_map),
        "issue_numbers": member_issues,
        "issue_count": len(member_issues),
        "pr_numbers": member_prs,
        "pr_count": len(member_prs),
        "evidence_types": evidence,
        "github_url": _cluster_url(issue_no, pr_no, issue_map, pr_map),
    }
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
def _cluster_title(
    cluster: dict[str, Any],
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
    canonical_issue_number: int | None,
    canonical_pr_number: int | None,
) -> str:
    """Choose a display title for a cluster.

    Preference order: canonical issue title, canonical PR title, the
    stripped cluster summary, the cluster id, and finally ``"cluster"``.
    """
    return (
        _title_for_issue(canonical_issue_number, issue_map)
        or _title_for_pr(canonical_pr_number, pr_map)
        or str(cluster.get("summary") or "").strip()
        or str(cluster.get("cluster_id") or "cluster")
    )
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
def _cluster_url(
    canonical_issue_number: int | None,
    canonical_pr_number: int | None,
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
) -> str | None:
    """Return the canonical issue URL, else the canonical PR URL, else ``None``."""
    issue_url = _url_for_issue(canonical_issue_number, issue_map)
    if issue_url:
        return issue_url
    return _url_for_pr(canonical_pr_number, pr_map)
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
def _duplicate_pr_summary(
    entry: dict[str, Any],
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
    *,
    rank: int,
) -> dict[str, Any]:
    """Flatten one duplicate-PR group into the row used by the views.

    A group without a cluster id gets ``duplicate-pr-<rank>``.
    """
    canonical_no = _coerce_int(entry.get("canonical_pr_number"))
    target_no = _coerce_int(entry.get("target_issue_number"))
    duplicate_numbers = _ordered_ints(entry.get("duplicate_pr_numbers"))
    fallback_id = f"duplicate-pr-{rank}"
    return {
        "rank": rank,
        "cluster_id": str(entry.get("cluster_id") or fallback_id),
        "canonical_pr_number": canonical_no,
        "canonical_pr_title": _title_for_pr(canonical_no, pr_map),
        "canonical_pr_url": _url_for_pr(canonical_no, pr_map),
        "target_issue_number": target_no,
        "target_issue_title": _title_for_issue(target_no, issue_map),
        "target_issue_url": _url_for_issue(target_no, issue_map),
        "duplicate_pr_numbers": duplicate_numbers,
        "duplicate_pr_count": len(duplicate_numbers),
        "reason": entry.get("reason"),
    }
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
def _best_issue_summary(entry: Any, issue_map: dict[int, dict[str, Any]]) -> dict[str, Any] | None:
    """Build the ``best_issue`` view, or ``None`` when the entry is not a dict."""
    if isinstance(entry, dict):
        number = _coerce_int(entry.get("issue_number"))
        return {
            "cluster_id": entry.get("cluster_id"),
            "issue_number": number,
            "title": _title_for_issue(number, issue_map),
            "url": _url_for_issue(number, issue_map),
            "reason": entry.get("reason"),
            "score": _coerce_float(entry.get("score")),
        }
    return None
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
def _best_pr_summary(entry: Any, pr_map: dict[int, dict[str, Any]]) -> dict[str, Any] | None:
    """Build the ``best_pr`` view, or ``None`` when the entry is not a dict."""
    if isinstance(entry, dict):
        number = _coerce_int(entry.get("pr_number"))
        return {
            "cluster_id": entry.get("cluster_id"),
            "pr_number": number,
            "title": _title_for_pr(number, pr_map),
            "url": _url_for_pr(number, pr_map),
            "reason": entry.get("reason"),
            "score": _coerce_float(entry.get("score")),
        }
    return None
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
def _issue_member_row(number: int, row: dict[str, Any] | None) -> dict[str, Any]:
    """Build the member row for an issue; unknown issues get null fields."""
    source = row or {}
    copied_fields = ("title", "state", "author_login", "created_at", "updated_at", "html_url")
    member: dict[str, Any] = {"number": number}
    for field in copied_fields:
        member[field] = source.get(field)
    return member
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
def _pr_member_row(number: int, row: dict[str, Any] | None, *, role: str) -> dict[str, Any]:
    """Build the member row for a pull request; unknown PRs get null fields.

    ``draft`` and ``merged`` are always booleans, so they are ``False`` for
    an unknown PR.
    """
    source = row or {}
    member: dict[str, Any] = {"number": number, "role": role}
    for field in ("title", "author_login", "state"):
        member[field] = source.get(field)
    for flag in ("draft", "merged"):
        member[flag] = bool(source.get(flag))
    for field in ("author_association", "created_at", "updated_at", "html_url"):
        member[field] = source.get(field)
    return member
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
    """Flatten one contributor report entry into the list/detail summary row.

    The two ``public_*_42d`` fields are read from the nested ``activity``
    object; a missing or non-dict ``activity`` yields ``None`` for both.
    Snapshot PR/issue counts default to 0.
    """
    activity = contributor.get("activity")
    if not isinstance(activity, dict):
        activity = {}
    pr_count = _coerce_int(contributor.get("snapshot_pr_count")) or 0
    issue_count = _coerce_int(contributor.get("snapshot_issue_count")) or 0
    return {
        "rank": rank,
        "author_login": contributor.get("author_login"),
        "name": contributor.get("name"),
        "profile_url": contributor.get("profile_url"),
        "repo_association": contributor.get("repo_association"),
        "first_seen_in_snapshot": contributor.get("first_seen_in_snapshot"),
        "new_to_repo": contributor.get("new_to_repo"),
        "snapshot_pr_count": pr_count,
        "snapshot_issue_count": issue_count,
        "follow_through_score": contributor.get("follow_through_score"),
        "breadth_score": contributor.get("breadth_score"),
        "automation_risk_signal": contributor.get("automation_risk_signal"),
        "heuristic_note": contributor.get("heuristic_note"),
        "account_age_days": _coerce_int(contributor.get("account_age_days")),
        "public_pr_count_42d": _coerce_int(activity.get("visible_authored_pr_count")),
        "public_repo_count_42d": _coerce_int(activity.get("distinct_repos_with_authored_prs")),
        "repo_pull_requests_url": contributor.get("repo_pull_requests_url"),
        "repo_issues_url": contributor.get("repo_issues_url"),
    }
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
    """Extract the risk-related fields of one contributor report entry.

    The two ``public_*_42d`` fields are read from the nested ``activity``
    object; a missing or non-dict ``activity`` yields ``None`` for both.
    """
    activity = contributor.get("activity")
    if not isinstance(activity, dict):
        activity = {}
    return {
        "automation_risk_signal": contributor.get("automation_risk_signal"),
        "heuristic_note": contributor.get("heuristic_note"),
        "follow_through_score": contributor.get("follow_through_score"),
        "breadth_score": contributor.get("breadth_score"),
        "account_age_days": _coerce_int(contributor.get("account_age_days")),
        "public_pr_count_42d": _coerce_int(activity.get("visible_authored_pr_count")),
        "public_repo_count_42d": _coerce_int(activity.get("distinct_repos_with_authored_prs")),
        "report_reason": contributor.get("report_reason"),
    }
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
def _find_contributor(entries: list[Any], author_login: str) -> dict[str, Any] | None:
    """Return the first dict entry whose login matches, ignoring case.

    Non-dict entries are skipped; ``None`` is returned when nothing matches.
    """
    wanted = author_login.casefold()
    return next(
        (
            entry
            for entry in entries
            if isinstance(entry, dict)
            and str(entry.get("author_login") or "").casefold() == wanted
        ),
        None,
    )
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
def _snapshot_metadata(snapshot_dir: Path) -> _SnapshotMetadata:
    """Resolve the repo and snapshot id of a snapshot directory.

    ``manifest.json`` wins when it has the value. Otherwise the repo is
    inferred from the other snapshot files (empty string if that fails) and
    the snapshot id falls back to the directory name.
    """
    manifest = _read_optional_json(snapshot_dir / "manifest.json")
    repo = manifest.get("repo")
    if not repo:
        # Only touch the other snapshot files when the manifest has no repo.
        repo = _infer_repo(snapshot_dir) or ""
    snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
    return _SnapshotMetadata(repo=str(repo), snapshot_id=str(snapshot_id))
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
def _infer_repo(snapshot_dir: Path) -> str | None:
    """Guess the repo name from the snapshot's data files.

    Looks at the first row of the pull-request and issue parquet files, then
    at the ``repo`` field of each analysis report, then at the
    new-contributors report. Returns ``None`` when none of them names a repo.
    """
    for table_name in ("pull_requests.parquet", "issues.parquet"):
        table_path = snapshot_dir / table_name
        if table_path.exists():
            rows = read_parquet_rows(table_path)
            if rows and rows[0].get("repo"):
                return str(rows[0]["repo"])

    report_paths = [
        *_analysis_report_paths(snapshot_dir),
        snapshot_dir / "new-contributors-report.json",
    ]
    for report_path in report_paths:
        report = _read_optional_json(report_path)
        if report.get("repo"):
            return str(report["repo"])
    return None
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
def _title_for_issue(number: int | None, issue_map: dict[int, dict[str, Any]]) -> str | None:
    """Return the issue's title as a string, or ``None`` if unknown or empty."""
    if number is not None and number in issue_map:
        title = issue_map[number].get("title")
        if title:
            return str(title)
    return None
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def _url_for_issue(number: int | None, issue_map: dict[int, dict[str, Any]]) -> str | None:
    """Return the issue's ``html_url`` as a string, or ``None`` if unknown or empty."""
    if number is not None and number in issue_map:
        url = issue_map[number].get("html_url")
        if url:
            return str(url)
    return None
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
def _title_for_pr(number: int | None, pr_map: dict[int, dict[str, Any]]) -> str | None:
    """Return the pull request's title as a string, or ``None`` if unknown or empty."""
    if number is not None and number in pr_map:
        title = pr_map[number].get("title")
        if title:
            return str(title)
    return None
|
| 700 |
+
|
| 701 |
+
|
| 702 |
+
def _url_for_pr(number: int | None, pr_map: dict[int, dict[str, Any]]) -> str | None:
    """Return the pull request's ``html_url`` as a string, or ``None`` if unknown or empty."""
    if number is not None and number in pr_map:
        url = pr_map[number].get("html_url")
        if url:
            return str(url)
    return None
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
def _ordered_ints(values: Any) -> list[int]:
    """Convert a list to integers, keeping order and dropping bad entries.

    Anything that is not a list yields an empty list.
    """
    if not isinstance(values, list):
        return []
    return [number for value in values if (number := _coerce_int(value)) is not None]
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
def _coerce_int(value: Any) -> int | None:
    """Convert ``value`` to ``int``, returning ``None`` when that is impossible.

    ``None`` stays ``None``. Values that ``int()`` rejects (non-numeric text,
    unsupported types, NaN and infinite floats) also give ``None`` instead
    of raising.
    """
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what int() raises for +/-inf; without it a single
        # infinite value in the input data would abort the whole view.
        return None
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
def _coerce_float(value: Any) -> float | None:
    """Convert ``value`` to ``float``, returning ``None`` when that is impossible.

    ``None`` stays ``None``. Values that ``float()`` rejects (non-numeric
    text, unsupported types, integers too large for a float) also give
    ``None`` instead of raising.
    """
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what float() raises for an integer that does not
        # fit in a double.
        return None
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
def _read_optional_json(path: Path) -> dict[str, Any]:
    """Read a JSON object from ``path``, or return ``{}``.

    An empty dict is returned when the file does not exist or when its
    top-level value is not an object.
    """
    payload = read_json(path) if path.exists() else None
    if isinstance(payload, dict):
        return payload
    return {}
|
uv.lock
CHANGED
|
@@ -561,7 +561,7 @@ wheels = [
|
|
| 561 |
|
| 562 |
[[package]]
|
| 563 |
name = "fast-agent-mcp"
|
| 564 |
-
version = "0.6.
|
| 565 |
source = { registry = "https://pypi.org/simple" }
|
| 566 |
dependencies = [
|
| 567 |
{ name = "a2a-sdk" },
|
|
@@ -598,9 +598,9 @@ dependencies = [
|
|
| 598 |
{ name = "uvloop", marker = "sys_platform != 'win32'" },
|
| 599 |
{ name = "watchfiles" },
|
| 600 |
]
|
| 601 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 602 |
wheels = [
|
| 603 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 604 |
]
|
| 605 |
|
| 606 |
[[package]]
|
|
@@ -2366,7 +2366,7 @@ wheels = [
|
|
| 2366 |
|
| 2367 |
[[package]]
|
| 2368 |
name = "slop-farmer"
|
| 2369 |
-
version = "0.1.
|
| 2370 |
source = { editable = "." }
|
| 2371 |
dependencies = [
|
| 2372 |
{ name = "duckdb" },
|
|
|
|
| 561 |
|
| 562 |
[[package]]
|
| 563 |
name = "fast-agent-mcp"
|
| 564 |
+
version = "0.6.18"
|
| 565 |
source = { registry = "https://pypi.org/simple" }
|
| 566 |
dependencies = [
|
| 567 |
{ name = "a2a-sdk" },
|
|
|
|
| 598 |
{ name = "uvloop", marker = "sys_platform != 'win32'" },
|
| 599 |
{ name = "watchfiles" },
|
| 600 |
]
|
| 601 |
+
sdist = { url = "https://files.pythonhosted.org/packages/68/9f/a66344581177eb70cd817a58a3305c4b2c2b5f98661129c2cecc4aa36e77/fast_agent_mcp-0.6.18.tar.gz", hash = "sha256:5ee5624890a9670b6f1a912998807e0fd451aa1c7205d189a964764a988c7bc0", size = 2091443, upload-time = "2026-04-17T20:52:25.84Z" }
|
| 602 |
wheels = [
|
| 603 |
+
{ url = "https://files.pythonhosted.org/packages/49/63/d8942bde2e706c869f93835ea85a2015be0edf5772c4e9ec8939a1001172/fast_agent_mcp-0.6.18-py3-none-any.whl", hash = "sha256:67c0c011763a28b8d5779b5d4d5cdc61e6f3dbc8cd1a7227388229957429835f", size = 1573842, upload-time = "2026-04-17T20:52:28.807Z" },
|
| 604 |
]
|
| 605 |
|
| 606 |
[[package]]
|
|
|
|
| 2366 |
|
| 2367 |
[[package]]
|
| 2368 |
name = "slop-farmer"
|
| 2369 |
+
version = "0.1.0"
|
| 2370 |
source = { editable = "." }
|
| 2371 |
dependencies = [
|
| 2372 |
{ name = "duckdb" },
|