Spaces:
Sleeping
Sleeping
Deploy OpenClaw PR API
Browse files
pyproject.toml
CHANGED
|
@@ -16,7 +16,7 @@ dependencies = [
|
|
| 16 |
"pydantic>=2.11",
|
| 17 |
"PyYAML>=6.0.2",
|
| 18 |
"rank-bm25>=0.2.2",
|
| 19 |
-
"fast-agent-mcp>=0.6.
|
| 20 |
"uvicorn>=0.34.0",
|
| 21 |
]
|
| 22 |
|
|
@@ -33,7 +33,6 @@ llm = [
|
|
| 33 |
|
| 34 |
[project.scripts]
|
| 35 |
slop-farmer = "slop_farmer.app.cli:main"
|
| 36 |
-
pr-search = "slop_farmer.app.pr_search_client:main"
|
| 37 |
|
| 38 |
[tool.setuptools]
|
| 39 |
package-dir = {"" = "src"}
|
|
|
|
| 16 |
"pydantic>=2.11",
|
| 17 |
"PyYAML>=6.0.2",
|
| 18 |
"rank-bm25>=0.2.2",
|
| 19 |
+
"fast-agent-mcp>=0.6.17",
|
| 20 |
"uvicorn>=0.34.0",
|
| 21 |
]
|
| 22 |
|
|
|
|
| 33 |
|
| 34 |
[project.scripts]
|
| 35 |
slop-farmer = "slop_farmer.app.cli:main"
|
|
|
|
| 36 |
|
| 37 |
[tool.setuptools]
|
| 38 |
package-dir = {"" = "src"}
|
src/slop_farmer/app/pr_search_api.py
CHANGED
|
@@ -11,6 +11,14 @@ from fastapi.responses import JSONResponse
|
|
| 11 |
|
| 12 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 13 |
from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from slop_farmer.reports.pr_search_service import (
|
| 15 |
get_pr_search_cluster,
|
| 16 |
get_pr_search_clusters,
|
|
@@ -204,6 +212,104 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
|
|
| 204 |
),
|
| 205 |
)
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
return app
|
| 208 |
|
| 209 |
|
|
@@ -289,6 +395,7 @@ def _looks_not_found(exc: ValueError) -> bool:
|
|
| 289 |
message = str(exc).lower()
|
| 290 |
return (
|
| 291 |
"not found" in message
|
|
|
|
| 292 |
or "no active pr search run" in message
|
| 293 |
or "was not found in the active indexed universe" in message
|
| 294 |
)
|
|
|
|
| 11 |
|
| 12 |
from slop_farmer.config import PrSearchRefreshOptions
|
| 13 |
from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
|
| 14 |
+
from slop_farmer.reports.analysis_service import (
|
| 15 |
+
get_analysis_best,
|
| 16 |
+
get_analysis_meta_bug,
|
| 17 |
+
get_analysis_status,
|
| 18 |
+
get_pr_analysis,
|
| 19 |
+
list_analysis_duplicate_prs,
|
| 20 |
+
list_analysis_meta_bugs,
|
| 21 |
+
)
|
| 22 |
from slop_farmer.reports.pr_search_service import (
|
| 23 |
get_pr_search_cluster,
|
| 24 |
get_pr_search_clusters,
|
|
|
|
| 212 |
),
|
| 213 |
)
|
| 214 |
|
| 215 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/status")
|
| 216 |
+
async def analysis_status(
|
| 217 |
+
owner: str,
|
| 218 |
+
repo: str,
|
| 219 |
+
request: Request,
|
| 220 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 221 |
+
) -> dict[str, Any]:
|
| 222 |
+
settings = request.app.state.settings
|
| 223 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 224 |
+
return get_analysis_status(settings.index_path, repo=repo_slug, variant=variant)
|
| 225 |
+
|
| 226 |
+
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
|
| 227 |
+
async def pr_analysis(
|
| 228 |
+
owner: str,
|
| 229 |
+
repo: str,
|
| 230 |
+
number: int,
|
| 231 |
+
request: Request,
|
| 232 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 233 |
+
) -> dict[str, Any]:
|
| 234 |
+
settings = request.app.state.settings
|
| 235 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 236 |
+
return get_pr_analysis(
|
| 237 |
+
settings.index_path,
|
| 238 |
+
repo=repo_slug,
|
| 239 |
+
pr_number=number,
|
| 240 |
+
variant=variant,
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
|
| 244 |
+
async def analysis_meta_bugs(
|
| 245 |
+
owner: str,
|
| 246 |
+
repo: str,
|
| 247 |
+
request: Request,
|
| 248 |
+
limit: int | None = None,
|
| 249 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 250 |
+
) -> dict[str, Any]:
|
| 251 |
+
settings = request.app.state.settings
|
| 252 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 253 |
+
return list_analysis_meta_bugs(
|
| 254 |
+
settings.index_path,
|
| 255 |
+
repo=repo_slug,
|
| 256 |
+
variant=variant,
|
| 257 |
+
limit=_limit(
|
| 258 |
+
limit,
|
| 259 |
+
default=settings.cluster_list_limit_default,
|
| 260 |
+
maximum=settings.cluster_list_limit_max,
|
| 261 |
+
),
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs/{cluster_id}")
|
| 265 |
+
async def analysis_meta_bug(
|
| 266 |
+
owner: str,
|
| 267 |
+
repo: str,
|
| 268 |
+
cluster_id: str,
|
| 269 |
+
request: Request,
|
| 270 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 271 |
+
) -> dict[str, Any]:
|
| 272 |
+
settings = request.app.state.settings
|
| 273 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 274 |
+
return get_analysis_meta_bug(
|
| 275 |
+
settings.index_path,
|
| 276 |
+
repo=repo_slug,
|
| 277 |
+
cluster_id=cluster_id,
|
| 278 |
+
variant=variant,
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
|
| 282 |
+
async def analysis_duplicate_prs(
|
| 283 |
+
owner: str,
|
| 284 |
+
repo: str,
|
| 285 |
+
request: Request,
|
| 286 |
+
limit: int | None = None,
|
| 287 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 288 |
+
) -> dict[str, Any]:
|
| 289 |
+
settings = request.app.state.settings
|
| 290 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 291 |
+
return list_analysis_duplicate_prs(
|
| 292 |
+
settings.index_path,
|
| 293 |
+
repo=repo_slug,
|
| 294 |
+
variant=variant,
|
| 295 |
+
limit=_limit(
|
| 296 |
+
limit,
|
| 297 |
+
default=settings.cluster_list_limit_default,
|
| 298 |
+
maximum=settings.cluster_list_limit_max,
|
| 299 |
+
),
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
@app.get("/v1/repos/{owner}/{repo}/analysis/best")
|
| 303 |
+
async def analysis_best(
|
| 304 |
+
owner: str,
|
| 305 |
+
repo: str,
|
| 306 |
+
request: Request,
|
| 307 |
+
variant: Literal["auto", "hybrid", "deterministic"] = "auto",
|
| 308 |
+
) -> dict[str, Any]:
|
| 309 |
+
settings = request.app.state.settings
|
| 310 |
+
repo_slug = _repo_slug(settings, owner, repo)
|
| 311 |
+
return get_analysis_best(settings.index_path, repo=repo_slug, variant=variant)
|
| 312 |
+
|
| 313 |
return app
|
| 314 |
|
| 315 |
|
|
|
|
| 395 |
message = str(exc).lower()
|
| 396 |
return (
|
| 397 |
"not found" in message
|
| 398 |
+
or "no analysis report was found" in message
|
| 399 |
or "no active pr search run" in message
|
| 400 |
or "was not found in the active indexed universe" in message
|
| 401 |
)
|
src/slop_farmer/reports/analysis_service.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from slop_farmer.data.parquet_io import read_json
|
| 8 |
+
from slop_farmer.data.search_duckdb import connect_pr_search_db, resolve_active_run
|
| 9 |
+
|
| 10 |
+
# Variant names a caller may request; "auto" resolves to whichever report file exists.
ANALYSIS_VARIANTS = {"auto", "deterministic", "hybrid"}
# File name, inside the snapshot directory, of the report for each concrete variant.
ANALYSIS_REPORT_FILENAMES = {
    "deterministic": "analysis-report.json",
    "hybrid": "analysis-report-hybrid.json",
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass(frozen=True, slots=True)
class ActiveSnapshotContext:
    """Active PR-search run together with the snapshot directory it points at."""

    # Row returned by ``resolve_active_run``, with its keys coerced to ``str``.
    active_run: dict[str, Any]
    # Resolved path built from the run's ``snapshot_dir`` value.
    snapshot_dir: Path
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass(frozen=True, slots=True)
class AnalysisContext:
    """Loaded analysis report plus the run and variant it was resolved for."""

    # Active run row the report belongs to.
    active_run: dict[str, Any]
    # Parsed JSON object of the analysis report.
    report: dict[str, Any]
    # Normalized variant the caller asked for ("auto", "hybrid" or "deterministic").
    variant_requested: str
    # Concrete variant whose report file was actually loaded.
    variant_used: str
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_analysis_status(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Describe whether an analysis report is available for the active snapshot.

    Args:
        db_path: Path of the PR search database.
        repo: Optional repository passed through to the active-run lookup.
        variant: Requested report variant ("auto", "hybrid" or "deterministic").

    Returns:
        A payload with the run identity, the normalized requested variant and
        an ``available`` flag. When a report file exists the payload also
        carries ``variant_used``, ``llm_enrichment``, ``generated_at`` and
        ``counts``.

    Raises:
        ValueError: If ``variant`` is not a supported variant name.
    """
    snapshot = _resolve_active_snapshot_context(db_path, repo=repo)
    # required=False: a missing report is a normal status, not an error.
    located_path, resolved_variant = _resolve_analysis_report_path(
        snapshot.snapshot_dir,
        variant,
        required=False,
    )
    run = snapshot.active_run
    status: dict[str, Any] = {
        "repo": str(run["repo"]),
        "snapshot_id": str(run["snapshot_id"]),
        "run_id": str(run["id"]),
        "variant_requested": _normalize_analysis_variant(variant),
        "available": located_path is not None,
    }
    if located_path is not None and resolved_variant is not None:
        loaded = _load_report(located_path)
        status.update(
            variant_used=resolved_variant,
            llm_enrichment=bool(loaded.get("llm_enrichment")),
            generated_at=loaded.get("generated_at"),
            counts=_analysis_counts(loaded),
        )
    return status
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def get_pr_analysis(
    db_path: Path,
    *,
    pr_number: int,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Look up what the analysis report says about one pull request.

    Args:
        db_path: Path of the PR search database.
        pr_number: Pull request number to look for.
        repo: Optional repository passed through to the active-run lookup.
        variant: Requested report variant.

    Returns:
        The base analysis payload extended with ``pr_number``, a ``found``
        flag, the matching ``meta_bug`` (with its rank) or ``None``, and the
        matching ``duplicate_pr`` entry or ``None``.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    cluster, position = _find_meta_bug_for_pr(context.report, pr_number)
    duplicate = _find_duplicate_pr_for_pr(context.report, pr_number)

    response = dict(_analysis_base_payload(context))
    response["pr_number"] = pr_number
    response["found"] = not (cluster is None and duplicate is None)
    if cluster is None:
        response["meta_bug"] = None
    else:
        response["meta_bug"] = _meta_bug_payload(cluster, rank=position)
    response["duplicate_pr"] = duplicate
    return response
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def list_analysis_meta_bugs(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
    limit: int = 50,
) -> dict[str, Any]:
    """List the first ``limit`` meta-bug clusters of the active analysis report.

    Args:
        db_path: Path of the PR search database.
        repo: Optional repository passed through to the active-run lookup.
        variant: Requested report variant.
        limit: Maximum number of clusters to return.

    Returns:
        The base analysis payload extended with ``meta_bugs`` (each carrying a
        1-based ``rank``) and ``meta_bug_count``, the number of clusters
        returned.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    # ``or []`` also covers an explicit JSON null, which ``.get(key, [])`` would
    # return as None and then fail to slice; ``_analysis_counts`` does the same.
    clusters = context.report.get("meta_bugs") or []
    meta_bugs = [
        _meta_bug_payload(cluster, rank=index)
        for index, cluster in enumerate(clusters[:limit], start=1)
    ]
    return {
        **_analysis_base_payload(context),
        "meta_bugs": meta_bugs,
        "meta_bug_count": len(meta_bugs),
    }
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_analysis_meta_bug(
    db_path: Path,
    *,
    cluster_id: str,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Return one meta-bug cluster of the active analysis report by its id.

    Args:
        db_path: Path of the PR search database.
        cluster_id: Identifier compared against each cluster's ``cluster_id``.
        repo: Optional repository passed through to the active-run lookup.
        variant: Requested report variant.

    Returns:
        The base analysis payload extended with the ranked ``meta_bug`` and the
        ``duplicate_pr`` entry sharing the same cluster id (or ``None``).

    Raises:
        ValueError: If no cluster in the report has the requested id.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    # ``or []`` tolerates an explicit JSON null for the list.
    for index, cluster in enumerate(context.report.get("meta_bugs") or [], start=1):
        identifier = cluster.get("cluster_id")
        # A cluster without an id must not match the literal request "None".
        if identifier is None or str(identifier) != cluster_id:
            continue
        return {
            **_analysis_base_payload(context),
            "meta_bug": _meta_bug_payload(cluster, rank=index),
            "duplicate_pr": _find_duplicate_pr_by_cluster_id(context.report, cluster_id),
        }
    raise ValueError(f"Analysis cluster {cluster_id!r} was not found in the active snapshot.")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def list_analysis_duplicate_prs(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
    limit: int = 50,
) -> dict[str, Any]:
    """List the first ``limit`` duplicate-PR entries of the active analysis report.

    Args:
        db_path: Path of the PR search database.
        repo: Optional repository passed through to the active-run lookup.
        variant: Requested report variant.
        limit: Maximum number of entries to return.

    Returns:
        The base analysis payload extended with ``duplicate_prs`` (each
        carrying a 1-based ``rank``) and ``duplicate_pr_count``, the number of
        entries returned.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    # ``or []`` also covers an explicit JSON null, which cannot be sliced.
    entries = context.report.get("duplicate_prs") or []
    duplicate_prs = [
        {"rank": index, **dict(entry)}
        for index, entry in enumerate(entries[:limit], start=1)
    ]
    return {
        **_analysis_base_payload(context),
        "duplicate_prs": duplicate_prs,
        "duplicate_pr_count": len(duplicate_prs),
    }
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def get_analysis_best(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Return the report's ``best_issue`` and ``best_pr`` entries.

    Each entry is passed through ``_best_entry_with_cluster_id`` so it can be
    tagged with the id of the meta-bug cluster that contains it.

    Args:
        db_path: Path of the PR search database.
        repo: Optional repository passed through to the active-run lookup.
        variant: Requested report variant.

    Returns:
        The base analysis payload extended with ``best_issue`` and ``best_pr``.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    report = context.report

    result = dict(_analysis_base_payload(context))
    result["best_issue"] = _best_entry_with_cluster_id(
        report,
        report.get("best_issue"),
        number_key="issue_number",
        numbers_key="issue_numbers",
    )
    result["best_pr"] = _best_entry_with_cluster_id(
        report,
        report.get("best_pr"),
        number_key="pr_number",
        numbers_key="pr_numbers",
    )
    return result
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def _resolve_active_snapshot_context(
    db_path: Path,
    *,
    repo: str | None,
) -> ActiveSnapshotContext:
    """Resolve the active run and its snapshot directory from the search database.

    The connection is opened read-only and closed again before returning,
    even when the lookup raises.
    """
    conn = connect_pr_search_db(db_path, read_only=True)
    try:
        run_row = resolve_active_run(conn, repo=repo)
    finally:
        conn.close()

    normalized_run = {str(name): val for name, val in run_row.items()}
    snapshot_root = Path(str(run_row["snapshot_dir"])).resolve()
    return ActiveSnapshotContext(active_run=normalized_run, snapshot_dir=snapshot_root)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _load_analysis_context(
    db_path: Path,
    *,
    repo: str | None,
    variant: str,
) -> AnalysisContext:
    """Resolve the active snapshot and load its analysis report.

    Raises:
        ValueError: If the variant is unsupported or no matching report file
            exists; the path lookup is done with ``required=True``.
    """
    snapshot = _resolve_active_snapshot_context(db_path, repo=repo)
    located_path, resolved_variant = _resolve_analysis_report_path(
        snapshot.snapshot_dir,
        variant,
        required=True,
    )
    # With required=True the resolver raises instead of returning None; the
    # asserts only narrow the optional types.
    assert located_path is not None
    assert resolved_variant is not None

    loaded = _load_report(located_path)
    requested = _normalize_analysis_variant(variant)
    return AnalysisContext(
        active_run=snapshot.active_run,
        report=loaded,
        variant_requested=requested,
        variant_used=resolved_variant,
    )
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _resolve_analysis_report_path(
    snapshot_dir: Path,
    variant: str,
    *,
    required: bool,
) -> tuple[Path | None, str | None]:
    """Locate the analysis report file for ``variant`` inside ``snapshot_dir``.

    For "auto" the hybrid report is preferred and the deterministic report is
    the fallback; any other variant only considers its own file.

    Returns:
        ``(path, variant_used)`` for the first existing file, or
        ``(None, None)`` when nothing exists and ``required`` is false.

    Raises:
        ValueError: If the variant is unsupported, or if ``required`` is true
            and no candidate file exists.
    """
    normalized = _normalize_analysis_variant(variant)
    is_auto = normalized == "auto"
    candidates = ("hybrid", "deterministic") if is_auto else (normalized,)

    for name in candidates:
        candidate_path = snapshot_dir / ANALYSIS_REPORT_FILENAMES[name]
        if candidate_path.exists():
            return candidate_path, name

    if not required:
        return None, None
    if is_auto:
        raise ValueError("No analysis report was found for the active snapshot.")
    raise ValueError(
        f"{normalized.capitalize()} analysis report was not found for the active snapshot."
    )
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _normalize_analysis_variant(variant: str) -> str:
    """Return ``variant`` stripped and lower-cased, rejecting unknown names.

    Raises:
        ValueError: If the normalized name is not in ``ANALYSIS_VARIANTS``.
    """
    candidate = variant.strip().lower()
    if candidate in ANALYSIS_VARIANTS:
        return candidate
    raise ValueError(
        f"Unsupported analysis variant {variant!r}; expected auto, hybrid, or deterministic."
    )
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
|
| 237 |
+
return {
|
| 238 |
+
"repo": str(context.active_run["repo"]),
|
| 239 |
+
"snapshot_id": str(context.active_run["snapshot_id"]),
|
| 240 |
+
"run_id": str(context.active_run["id"]),
|
| 241 |
+
"variant_requested": context.variant_requested,
|
| 242 |
+
"variant_used": context.variant_used,
|
| 243 |
+
"llm_enrichment": bool(context.report.get("llm_enrichment")),
|
| 244 |
+
"generated_at": context.report.get("generated_at"),
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _analysis_counts(report: dict[str, Any]) -> dict[str, int]:
|
| 249 |
+
return {
|
| 250 |
+
"meta_bugs": len(report.get("meta_bugs") or []),
|
| 251 |
+
"duplicate_issues": len(report.get("duplicate_issues") or []),
|
| 252 |
+
"duplicate_prs": len(report.get("duplicate_prs") or []),
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def _meta_bug_payload(cluster: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
|
| 257 |
+
payload = dict(cluster)
|
| 258 |
+
if rank is not None:
|
| 259 |
+
payload["rank"] = rank
|
| 260 |
+
return payload
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def _find_meta_bug_for_pr(
|
| 264 |
+
report: dict[str, Any],
|
| 265 |
+
pr_number: int,
|
| 266 |
+
) -> tuple[dict[str, Any] | None, int | None]:
|
| 267 |
+
for index, cluster in enumerate(report.get("meta_bugs", []), start=1):
|
| 268 |
+
pr_numbers = {int(number) for number in cluster.get("pr_numbers", [])}
|
| 269 |
+
if pr_number in pr_numbers:
|
| 270 |
+
return dict(cluster), index
|
| 271 |
+
return None, None
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def _find_duplicate_pr_for_pr(report: dict[str, Any], pr_number: int) -> dict[str, Any] | None:
|
| 275 |
+
for entry in report.get("duplicate_prs", []):
|
| 276 |
+
numbers = {
|
| 277 |
+
int(entry["canonical_pr_number"]),
|
| 278 |
+
*(int(number) for number in entry.get("duplicate_pr_numbers", [])),
|
| 279 |
+
}
|
| 280 |
+
if pr_number in numbers:
|
| 281 |
+
return dict(entry)
|
| 282 |
+
return None
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _find_duplicate_pr_by_cluster_id(
|
| 286 |
+
report: dict[str, Any],
|
| 287 |
+
cluster_id: str,
|
| 288 |
+
) -> dict[str, Any] | None:
|
| 289 |
+
for entry in report.get("duplicate_prs", []):
|
| 290 |
+
if str(entry.get("cluster_id")) == cluster_id:
|
| 291 |
+
return dict(entry)
|
| 292 |
+
return None
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def _best_entry_with_cluster_id(
|
| 296 |
+
report: dict[str, Any],
|
| 297 |
+
entry: Any,
|
| 298 |
+
*,
|
| 299 |
+
number_key: str,
|
| 300 |
+
numbers_key: str,
|
| 301 |
+
) -> dict[str, Any] | None:
|
| 302 |
+
if not isinstance(entry, dict):
|
| 303 |
+
return None
|
| 304 |
+
number = entry.get(number_key)
|
| 305 |
+
if number is None:
|
| 306 |
+
return dict(entry)
|
| 307 |
+
for cluster in report.get("meta_bugs", []):
|
| 308 |
+
numbers = {int(value) for value in cluster.get(numbers_key, [])}
|
| 309 |
+
if int(number) in numbers:
|
| 310 |
+
return {"cluster_id": cluster.get("cluster_id"), **dict(entry)}
|
| 311 |
+
return dict(entry)
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def _load_report(path: Path) -> dict[str, Any]:
    """Read the analysis report at ``path`` and return it with string keys.

    Raises:
        ValueError: If the file's top-level JSON value is not an object.
    """
    raw = read_json(path)
    if isinstance(raw, dict):
        return {str(name): val for name, val in raw.items()}
    raise ValueError(f"Analysis report at {path} must contain a JSON object.")
|
src/slop_farmer/reports/new_contributor_report.py
CHANGED
|
@@ -102,6 +102,7 @@ query SearchIssues($query: String!, $cursor: String) {
|
|
| 102 |
}
|
| 103 |
}
|
| 104 |
""".strip()
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
|
|
@@ -167,8 +168,16 @@ def _report_contributors(
|
|
| 167 |
previous_snapshot_dir = _previous_snapshot_dir(snapshot)
|
| 168 |
previous_primary_authors = _snapshot_primary_authors(previous_snapshot_dir)
|
| 169 |
previous_merged_pr_authors = _snapshot_merged_pr_authors(previous_snapshot_dir)
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
selected = []
|
| 173 |
for row in author_rows:
|
| 174 |
if row["pr_count"] == 0:
|
|
@@ -224,12 +233,30 @@ def _report_contributors(
|
|
| 224 |
for index, row in enumerate(selected, start=1):
|
| 225 |
first_seen_in_snapshot = row["author_login"] not in previous_primary_authors
|
| 226 |
known_via_prior_merged_pr = row["author_login"] in previous_merged_pr_authors
|
|
|
|
| 227 |
if index == 1 or index == total_selected or index % 10 == 0:
|
| 228 |
_report_log(
|
| 229 |
f"Enriching contributors: {index}/{total_selected} "
|
| 230 |
f"(current={row['author_login']}, first_seen={str(first_seen_in_snapshot).lower()}, "
|
| 231 |
f"known_via_prior_merged_pr={str(known_via_prior_merged_pr).lower()})"
|
| 232 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
try:
|
| 234 |
summary = summarize_user(row["author_login"], options.window_days, None)
|
| 235 |
fetch_error = None
|
|
@@ -442,15 +469,21 @@ def _snapshot_merged_pr_authors(snapshot_dir: Path | None) -> set[str]:
|
|
| 442 |
return authors
|
| 443 |
|
| 444 |
|
| 445 |
-
def
|
| 446 |
if snapshot_dir is None:
|
| 447 |
-
return
|
| 448 |
path = snapshot_dir / "new-contributors-report.json"
|
| 449 |
if not path.exists():
|
| 450 |
-
return
|
| 451 |
try:
|
| 452 |
payload = read_json(path)
|
| 453 |
except Exception:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
return set()
|
| 455 |
contributors = payload.get("contributors")
|
| 456 |
if not isinstance(contributors, list):
|
|
@@ -462,15 +495,10 @@ def _previous_report_contributors(snapshot_dir: Path | None) -> set[str]:
|
|
| 462 |
}
|
| 463 |
|
| 464 |
|
| 465 |
-
def _previous_report_contributor_entries(
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
if not path.exists():
|
| 470 |
-
return {}
|
| 471 |
-
try:
|
| 472 |
-
payload = read_json(path)
|
| 473 |
-
except Exception:
|
| 474 |
return {}
|
| 475 |
contributors = payload.get("contributors")
|
| 476 |
if not isinstance(contributors, list):
|
|
@@ -482,6 +510,104 @@ def _previous_report_contributor_entries(snapshot_dir: Path | None) -> dict[str,
|
|
| 482 |
}
|
| 483 |
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
def _contributor_entry(
|
| 486 |
repo: str,
|
| 487 |
row: dict[str, Any],
|
|
|
|
| 102 |
}
|
| 103 |
}
|
| 104 |
""".strip()
|
| 105 |
+
# Largest allowed gap between a previous report's ``generated_at`` and the
# snapshot reference time for that report's entries to be reused.
PREVIOUS_REPORT_REUSE_MAX_AGE = timedelta(days=2)
|
| 106 |
|
| 107 |
|
| 108 |
def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
|
|
|
|
| 168 |
previous_snapshot_dir = _previous_snapshot_dir(snapshot)
|
| 169 |
previous_primary_authors = _snapshot_primary_authors(previous_snapshot_dir)
|
| 170 |
previous_merged_pr_authors = _snapshot_merged_pr_authors(previous_snapshot_dir)
|
| 171 |
+
previous_report_payload = _previous_report_payload(previous_snapshot_dir)
|
| 172 |
+
previous_report_contributors = _previous_report_contributors(previous_report_payload)
|
| 173 |
+
previous_report_entries = _previous_report_contributor_entries(previous_report_payload)
|
| 174 |
+
snapshot_reference_time = _snapshot_reference_time(snapshot)
|
| 175 |
+
previous_report_reusable = _previous_report_reuse_allowed(
|
| 176 |
+
previous_report_payload,
|
| 177 |
+
window_days=options.window_days,
|
| 178 |
+
reference_time=snapshot_reference_time,
|
| 179 |
+
)
|
| 180 |
+
cutoff = snapshot_reference_time - timedelta(days=options.window_days)
|
| 181 |
selected = []
|
| 182 |
for row in author_rows:
|
| 183 |
if row["pr_count"] == 0:
|
|
|
|
| 233 |
for index, row in enumerate(selected, start=1):
|
| 234 |
first_seen_in_snapshot = row["author_login"] not in previous_primary_authors
|
| 235 |
known_via_prior_merged_pr = row["author_login"] in previous_merged_pr_authors
|
| 236 |
+
previous_entry = previous_report_entries.get(row["author_login"])
|
| 237 |
if index == 1 or index == total_selected or index % 10 == 0:
|
| 238 |
_report_log(
|
| 239 |
f"Enriching contributors: {index}/{total_selected} "
|
| 240 |
f"(current={row['author_login']}, first_seen={str(first_seen_in_snapshot).lower()}, "
|
| 241 |
f"known_via_prior_merged_pr={str(known_via_prior_merged_pr).lower()})"
|
| 242 |
)
|
| 243 |
+
if (
|
| 244 |
+
previous_report_reusable
|
| 245 |
+
and previous_entry is not None
|
| 246 |
+
and not previous_entry.get("fetch_error")
|
| 247 |
+
and not known_via_prior_merged_pr
|
| 248 |
+
):
|
| 249 |
+
contributors.append(
|
| 250 |
+
_reused_previous_report_entry(
|
| 251 |
+
snapshot["repo"],
|
| 252 |
+
row,
|
| 253 |
+
previous_entry,
|
| 254 |
+
first_seen_in_snapshot=first_seen_in_snapshot,
|
| 255 |
+
known_via_prior_merged_pr=known_via_prior_merged_pr,
|
| 256 |
+
)
|
| 257 |
+
)
|
| 258 |
+
reused_previous_report += 1
|
| 259 |
+
continue
|
| 260 |
try:
|
| 261 |
summary = summarize_user(row["author_login"], options.window_days, None)
|
| 262 |
fetch_error = None
|
|
|
|
| 469 |
return authors
|
| 470 |
|
| 471 |
|
| 472 |
+
def _previous_report_payload(snapshot_dir: Path | None) -> dict[str, Any] | None:
    """Load the previous snapshot's new-contributors report, if it is usable.

    Returns:
        The parsed report when ``snapshot_dir`` is given, the report file
        exists, it can be read, and its top-level value is a dict; ``None``
        otherwise.
    """
    if snapshot_dir is None:
        return None
    report_file = snapshot_dir / "new-contributors-report.json"
    if not report_file.exists():
        return None
    try:
        loaded = read_json(report_file)
    except Exception:
        # Best effort: an unreadable previous report just disables reuse.
        return None
    if isinstance(loaded, dict):
        return loaded
    return None
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
def _previous_report_contributors(payload: dict[str, Any] | None) -> set[str]:
|
| 486 |
+
if payload is None:
|
| 487 |
return set()
|
| 488 |
contributors = payload.get("contributors")
|
| 489 |
if not isinstance(contributors, list):
|
|
|
|
| 495 |
}
|
| 496 |
|
| 497 |
|
| 498 |
+
def _previous_report_contributor_entries(
|
| 499 |
+
payload: dict[str, Any] | None,
|
| 500 |
+
) -> dict[str, dict[str, Any]]:
|
| 501 |
+
if payload is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
return {}
|
| 503 |
contributors = payload.get("contributors")
|
| 504 |
if not isinstance(contributors, list):
|
|
|
|
| 510 |
}
|
| 511 |
|
| 512 |
|
| 513 |
+
def _previous_report_reuse_allowed(
    payload: dict[str, Any] | None,
    *,
    window_days: int,
    reference_time: datetime,
) -> bool:
    """Decide whether entries of a previous report may be reused.

    Reuse is allowed only when a payload exists, its ``window_days`` equals
    the requested window, its ``generated_at`` can be coerced to a datetime,
    and that time lies within ``PREVIOUS_REPORT_REUSE_MAX_AGE`` of
    ``reference_time`` in either direction.
    """
    if payload is None:
        return False
    window_matches = _coerce_int(payload.get("window_days")) == window_days
    if not window_matches:
        return False
    produced_at = _coerce_datetime(payload.get("generated_at"))
    if produced_at is None:
        return False
    # NOTE(review): assumes both datetimes are either naive or timezone-aware;
    # mixing the two would raise on subtraction. Confirm against _coerce_datetime.
    drift = abs(reference_time - produced_at)
    return drift <= PREVIOUS_REPORT_REUSE_MAX_AGE
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
def _reused_previous_report_entry(
    repo: str,
    row: dict[str, Any],
    previous_entry: dict[str, Any],
    *,
    first_seen_in_snapshot: bool,
    known_via_prior_merged_pr: bool,
) -> dict[str, Any]:
    """Build a contributor entry from the previous report instead of a live fetch.

    Repository and snapshot counters come from the current ``row``; the
    enrichment fields (name, account age, scores, orgs, activity) are copied
    from ``previous_entry``.

    Args:
        repo: Repository used to build the search URLs.
        row: Current snapshot row for the author.
        previous_entry: The author's entry in the previous report.
        first_seen_in_snapshot: Stored as both ``new_to_repo`` and
            ``first_seen_in_snapshot``.
        known_via_prior_merged_pr: Stored unchanged in the entry.

    Returns:
        The contributor entry, marked with ``enrichment_source`` set to
        ``"previous_report"`` and ``live_refetch_skipped`` set to ``True``.
    """
    login = row["author_login"]
    age_days = _coerce_int(previous_entry.get("account_age_days"))
    return {
        "author_login": login,
        "name": previous_entry.get("name"),
        "profile_url": _profile_url(login),
        "repo_pull_requests_url": _repo_search_url(repo, login, is_pr=True),
        "repo_issues_url": _repo_search_url(repo, login, is_pr=False),
        "repo_first_seen_at": row["first_seen_at"],
        "repo_last_seen_at": row["last_seen_at"],
        "repo_primary_artifact_count": row["primary_artifact_count"],
        "repo_artifact_count": row["artifact_count"],
        "snapshot_issue_count": row["issue_count"],
        "snapshot_pr_count": row["pr_count"],
        "snapshot_comment_count": row["comment_count"],
        "snapshot_review_count": row["review_count"],
        "snapshot_review_comment_count": row["review_comment_count"],
        "repo_association": row.get("repo_association"),
        "new_to_repo": first_seen_in_snapshot,
        "first_seen_in_snapshot": first_seen_in_snapshot,
        "known_via_prior_merged_pr": known_via_prior_merged_pr,
        "report_reason": "first_seen_in_snapshot" if first_seen_in_snapshot else None,
        # Provenance markers: this entry was not refreshed from a live lookup.
        "enrichment_source": "previous_report",
        "live_refetch_skipped": True,
        "account_age_days": age_days,
        # Unknown age counts as not young; the threshold is 365 days.
        "young_account": age_days is not None and age_days <= 365,
        "follow_through_score": previous_entry.get("follow_through_score"),
        "breadth_score": previous_entry.get("breadth_score"),
        "automation_risk_signal": previous_entry.get("automation_risk_signal"),
        "heuristic_note": previous_entry.get("heuristic_note"),
        "public_orgs": _previous_report_public_orgs(previous_entry),
        "activity": _previous_report_activity(previous_entry),
        # Examples are rebuilt from the current row, not copied from the old entry.
        "examples": {
            "pull_requests": [
                _artifact_example(item, "pull_request") for item in row["pull_requests"]
            ],
            "issues": [_artifact_example(item, "issue") for item in row["issues"]],
        },
        "fetch_error": None,
    }
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
def _previous_report_public_orgs(previous_entry: dict[str, Any]) -> list[str]:
|
| 580 |
+
values = previous_entry.get("public_orgs")
|
| 581 |
+
if not isinstance(values, list):
|
| 582 |
+
return []
|
| 583 |
+
public_orgs: list[str] = []
|
| 584 |
+
for value in values:
|
| 585 |
+
if isinstance(value, str) and value.strip():
|
| 586 |
+
public_orgs.append(value.strip())
|
| 587 |
+
elif isinstance(value, dict):
|
| 588 |
+
login = str(value.get("login") or "").strip()
|
| 589 |
+
if login:
|
| 590 |
+
public_orgs.append(login)
|
| 591 |
+
return public_orgs
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
def _previous_report_activity(previous_entry: dict[str, Any]) -> dict[str, Any]:
|
| 595 |
+
activity = previous_entry.get("activity")
|
| 596 |
+
if not isinstance(activity, dict):
|
| 597 |
+
activity = previous_entry
|
| 598 |
+
return {
|
| 599 |
+
"visible_authored_pr_count": activity.get("visible_authored_pr_count"),
|
| 600 |
+
"merged_pr_count": activity.get("merged_pr_count"),
|
| 601 |
+
"closed_unmerged_pr_count": activity.get("closed_unmerged_pr_count"),
|
| 602 |
+
"open_pr_count": activity.get("open_pr_count"),
|
| 603 |
+
"merged_pr_rate": activity.get("merged_pr_rate"),
|
| 604 |
+
"closed_unmerged_pr_rate": activity.get("closed_unmerged_pr_rate"),
|
| 605 |
+
"still_open_pr_rate": activity.get("still_open_pr_rate"),
|
| 606 |
+
"distinct_repos_with_authored_prs": activity.get("distinct_repos_with_authored_prs"),
|
| 607 |
+
"distinct_repos_with_open_prs": activity.get("distinct_repos_with_open_prs"),
|
| 608 |
+
}
|
| 609 |
+
|
| 610 |
+
|
| 611 |
def _contributor_entry(
|
| 612 |
repo: str,
|
| 613 |
row: dict[str, Any],
|