evalstate HF Staff committed on
Commit
41a8c52
·
verified ·
1 Parent(s): 1fb282b

Deploy OpenClaw PR API

Browse files
pyproject.toml CHANGED
@@ -16,7 +16,7 @@ dependencies = [
16
  "pydantic>=2.11",
17
  "PyYAML>=6.0.2",
18
  "rank-bm25>=0.2.2",
19
- "fast-agent-mcp>=0.6.16",
20
  "uvicorn>=0.34.0",
21
  ]
22
 
@@ -33,7 +33,6 @@ llm = [
33
 
34
  [project.scripts]
35
  slop-farmer = "slop_farmer.app.cli:main"
36
- pr-search = "slop_farmer.app.pr_search_client:main"
37
 
38
  [tool.setuptools]
39
  package-dir = {"" = "src"}
 
16
  "pydantic>=2.11",
17
  "PyYAML>=6.0.2",
18
  "rank-bm25>=0.2.2",
19
+ "fast-agent-mcp>=0.6.17",
20
  "uvicorn>=0.34.0",
21
  ]
22
 
 
33
 
34
  [project.scripts]
35
  slop-farmer = "slop_farmer.app.cli:main"
 
36
 
37
  [tool.setuptools]
38
  package-dir = {"" = "src"}
src/slop_farmer/app/pr_search_api.py CHANGED
@@ -11,6 +11,14 @@ from fastapi.responses import JSONResponse
11
 
12
  from slop_farmer.config import PrSearchRefreshOptions
13
  from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
 
 
 
 
 
 
 
 
14
  from slop_farmer.reports.pr_search_service import (
15
  get_pr_search_cluster,
16
  get_pr_search_clusters,
@@ -204,6 +212,104 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
204
  ),
205
  )
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  return app
208
 
209
 
@@ -289,6 +395,7 @@ def _looks_not_found(exc: ValueError) -> bool:
289
  message = str(exc).lower()
290
  return (
291
  "not found" in message
 
292
  or "no active pr search run" in message
293
  or "was not found in the active indexed universe" in message
294
  )
 
11
 
12
  from slop_farmer.config import PrSearchRefreshOptions
13
  from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
14
+ from slop_farmer.reports.analysis_service import (
15
+ get_analysis_best,
16
+ get_analysis_meta_bug,
17
+ get_analysis_status,
18
+ get_pr_analysis,
19
+ list_analysis_duplicate_prs,
20
+ list_analysis_meta_bugs,
21
+ )
22
  from slop_farmer.reports.pr_search_service import (
23
  get_pr_search_cluster,
24
  get_pr_search_clusters,
 
212
  ),
213
  )
214
 
215
+ @app.get("/v1/repos/{owner}/{repo}/analysis/status")
216
+ async def analysis_status(
217
+ owner: str,
218
+ repo: str,
219
+ request: Request,
220
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
221
+ ) -> dict[str, Any]:
222
+ settings = request.app.state.settings
223
+ repo_slug = _repo_slug(settings, owner, repo)
224
+ return get_analysis_status(settings.index_path, repo=repo_slug, variant=variant)
225
+
226
+ @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
227
+ async def pr_analysis(
228
+ owner: str,
229
+ repo: str,
230
+ number: int,
231
+ request: Request,
232
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
233
+ ) -> dict[str, Any]:
234
+ settings = request.app.state.settings
235
+ repo_slug = _repo_slug(settings, owner, repo)
236
+ return get_pr_analysis(
237
+ settings.index_path,
238
+ repo=repo_slug,
239
+ pr_number=number,
240
+ variant=variant,
241
+ )
242
+
243
+ @app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
244
+ async def analysis_meta_bugs(
245
+ owner: str,
246
+ repo: str,
247
+ request: Request,
248
+ limit: int | None = None,
249
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
250
+ ) -> dict[str, Any]:
251
+ settings = request.app.state.settings
252
+ repo_slug = _repo_slug(settings, owner, repo)
253
+ return list_analysis_meta_bugs(
254
+ settings.index_path,
255
+ repo=repo_slug,
256
+ variant=variant,
257
+ limit=_limit(
258
+ limit,
259
+ default=settings.cluster_list_limit_default,
260
+ maximum=settings.cluster_list_limit_max,
261
+ ),
262
+ )
263
+
264
+ @app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs/{cluster_id}")
265
+ async def analysis_meta_bug(
266
+ owner: str,
267
+ repo: str,
268
+ cluster_id: str,
269
+ request: Request,
270
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
271
+ ) -> dict[str, Any]:
272
+ settings = request.app.state.settings
273
+ repo_slug = _repo_slug(settings, owner, repo)
274
+ return get_analysis_meta_bug(
275
+ settings.index_path,
276
+ repo=repo_slug,
277
+ cluster_id=cluster_id,
278
+ variant=variant,
279
+ )
280
+
281
+ @app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
282
+ async def analysis_duplicate_prs(
283
+ owner: str,
284
+ repo: str,
285
+ request: Request,
286
+ limit: int | None = None,
287
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
288
+ ) -> dict[str, Any]:
289
+ settings = request.app.state.settings
290
+ repo_slug = _repo_slug(settings, owner, repo)
291
+ return list_analysis_duplicate_prs(
292
+ settings.index_path,
293
+ repo=repo_slug,
294
+ variant=variant,
295
+ limit=_limit(
296
+ limit,
297
+ default=settings.cluster_list_limit_default,
298
+ maximum=settings.cluster_list_limit_max,
299
+ ),
300
+ )
301
+
302
+ @app.get("/v1/repos/{owner}/{repo}/analysis/best")
303
+ async def analysis_best(
304
+ owner: str,
305
+ repo: str,
306
+ request: Request,
307
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
308
+ ) -> dict[str, Any]:
309
+ settings = request.app.state.settings
310
+ repo_slug = _repo_slug(settings, owner, repo)
311
+ return get_analysis_best(settings.index_path, repo=repo_slug, variant=variant)
312
+
313
  return app
314
 
315
 
 
395
  message = str(exc).lower()
396
  return (
397
  "not found" in message
398
+ or "no analysis report was found" in message
399
  or "no active pr search run" in message
400
  or "was not found in the active indexed universe" in message
401
  )
src/slop_farmer/reports/analysis_service.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from slop_farmer.data.parquet_io import read_json
8
+ from slop_farmer.data.search_duckdb import connect_pr_search_db, resolve_active_run
9
+
10
# File name of the analysis report stored in a snapshot directory, keyed by
# the concrete variant that produced it.
ANALYSIS_REPORT_FILENAMES = {
    "deterministic": "analysis-report.json",
    "hybrid": "analysis-report-hybrid.json",
}
# Every accepted variant selector: the concrete variants plus "auto", which
# is resolved to a concrete variant when the report path is looked up.
ANALYSIS_VARIANTS = {"auto", *ANALYSIS_REPORT_FILENAMES}
15
+
16
+
17
+ @dataclass(frozen=True, slots=True)
18
+ class ActiveSnapshotContext:
19
+ active_run: dict[str, Any]
20
+ snapshot_dir: Path
21
+
22
+
23
+ @dataclass(frozen=True, slots=True)
24
+ class AnalysisContext:
25
+ active_run: dict[str, Any]
26
+ report: dict[str, Any]
27
+ variant_requested: str
28
+ variant_used: str
29
+
30
+
31
def get_analysis_status(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Report whether an analysis report exists for the active snapshot.

    Args:
        db_path: Path of the PR-search database to open.
        repo: Optional repository used to resolve the active run.
        variant: Variant selector ("auto", "hybrid" or "deterministic").

    Returns:
        A dict with ``repo``, ``snapshot_id``, ``run_id``,
        ``variant_requested`` and ``available``. When a report file exists it
        also carries ``variant_used``, ``llm_enrichment``, ``generated_at``
        and ``counts``.

    Raises:
        ValueError: If ``variant`` is not a supported selector.
    """
    snapshot = _resolve_active_snapshot_context(db_path, repo=repo)
    # ``required=False``: a missing report yields ``(None, None)``, not an error.
    located_path, resolved_variant = _resolve_analysis_report_path(
        snapshot.snapshot_dir,
        variant,
        required=False,
    )
    run = snapshot.active_run
    status: dict[str, Any] = {
        "repo": str(run["repo"]),
        "snapshot_id": str(run["snapshot_id"]),
        "run_id": str(run["id"]),
        "variant_requested": _normalize_analysis_variant(variant),
        "available": located_path is not None,
    }
    if located_path is not None and resolved_variant is not None:
        report = _load_report(located_path)
        status["variant_used"] = resolved_variant
        status["llm_enrichment"] = bool(report.get("llm_enrichment"))
        status["generated_at"] = report.get("generated_at")
        status["counts"] = _analysis_counts(report)
    return status
60
+
61
+
62
def get_pr_analysis(
    db_path: Path,
    *,
    pr_number: int,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Look up what the active analysis report says about one pull request.

    Args:
        db_path: Path of the PR-search database to open.
        pr_number: Pull request number to look for.
        repo: Optional repository used to resolve the active run.
        variant: Variant selector ("auto", "hybrid" or "deterministic").

    Returns:
        The base analysis payload plus ``pr_number``, ``found`` (true when the
        PR appears in a meta-bug cluster or a duplicate-PR entry),
        ``meta_bug`` (the ranked cluster or ``None``) and ``duplicate_pr``
        (the matching entry or ``None``).
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    cluster, cluster_rank = _find_meta_bug_for_pr(context.report, pr_number)
    duplicate_entry = _find_duplicate_pr_for_pr(context.report, pr_number)

    result = dict(_analysis_base_payload(context))
    result["pr_number"] = pr_number
    result["found"] = not (cluster is None and duplicate_entry is None)
    if cluster is None:
        result["meta_bug"] = None
    else:
        result["meta_bug"] = _meta_bug_payload(cluster, rank=cluster_rank)
    result["duplicate_pr"] = duplicate_entry
    return result
79
+
80
+
81
def list_analysis_meta_bugs(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
    limit: int = 50,
) -> dict[str, Any]:
    """List the first ``limit`` meta-bug clusters of the active analysis report.

    Args:
        db_path: Path of the PR-search database to open.
        repo: Optional repository used to resolve the active run.
        variant: Variant selector ("auto", "hybrid" or "deterministic").
        limit: Maximum number of clusters to return.

    Returns:
        The base analysis payload plus ``meta_bugs`` (clusters in report
        order, each with a 1-based ``rank``) and ``meta_bug_count`` (the
        number of clusters returned, not the total in the report).
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    # ``or []`` also covers a report that stores ``"meta_bugs": null``; a
    # ``.get`` default only applies when the key is missing entirely.
    clusters = context.report.get("meta_bugs") or []
    meta_bugs = [
        _meta_bug_payload(cluster, rank=rank)
        for rank, cluster in enumerate(clusters[:limit], start=1)
    ]
    return {
        **_analysis_base_payload(context),
        "meta_bugs": meta_bugs,
        "meta_bug_count": len(meta_bugs),
    }
98
+
99
+
100
def get_analysis_meta_bug(
    db_path: Path,
    *,
    cluster_id: str,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Return one meta-bug cluster of the active analysis report by id.

    Args:
        db_path: Path of the PR-search database to open.
        cluster_id: Cluster identifier, compared against ``str(cluster_id)``
            of each cluster in the report.
        repo: Optional repository used to resolve the active run.
        variant: Variant selector ("auto", "hybrid" or "deterministic").

    Returns:
        The base analysis payload plus ``meta_bug`` (the cluster with its
        1-based ``rank``) and ``duplicate_pr`` (the duplicate-PR entry with
        the same cluster id, or ``None``).

    Raises:
        ValueError: If no cluster in the report has the requested id.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    # ``or []`` also covers a report that stores ``"meta_bugs": null``.
    for rank, cluster in enumerate(context.report.get("meta_bugs") or [], start=1):
        raw_id = cluster.get("cluster_id")
        # A cluster without an id must not match the literal text "None".
        if raw_id is None or str(raw_id) != cluster_id:
            continue
        return {
            **_analysis_base_payload(context),
            "meta_bug": _meta_bug_payload(cluster, rank=rank),
            "duplicate_pr": _find_duplicate_pr_by_cluster_id(context.report, cluster_id),
        }
    raise ValueError(f"Analysis cluster {cluster_id!r} was not found in the active snapshot.")
117
+
118
+
119
def list_analysis_duplicate_prs(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
    limit: int = 50,
) -> dict[str, Any]:
    """List the first ``limit`` duplicate-PR entries of the active analysis report.

    Args:
        db_path: Path of the PR-search database to open.
        repo: Optional repository used to resolve the active run.
        variant: Variant selector ("auto", "hybrid" or "deterministic").
        limit: Maximum number of entries to return.

    Returns:
        The base analysis payload plus ``duplicate_prs`` (entries in report
        order, each with a 1-based ``rank``) and ``duplicate_pr_count`` (the
        number of entries returned, not the total in the report).
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    # ``or []`` also covers a report that stores ``"duplicate_prs": null``.
    entries = context.report.get("duplicate_prs") or []
    duplicate_prs = [
        # Entry keys are spread last, so a ``rank`` stored in the report wins.
        {"rank": rank, **dict(entry)}
        for rank, entry in enumerate(entries[:limit], start=1)
    ]
    return {
        **_analysis_base_payload(context),
        "duplicate_prs": duplicate_prs,
        "duplicate_pr_count": len(duplicate_prs),
    }
136
+
137
+
138
def get_analysis_best(
    db_path: Path,
    *,
    repo: str | None = None,
    variant: str = "auto",
) -> dict[str, Any]:
    """Return the report's ``best_issue`` and ``best_pr`` entries.

    Each entry is passed through ``_best_entry_with_cluster_id`` so it can be
    annotated with the meta-bug cluster that lists its number.

    Args:
        db_path: Path of the PR-search database to open.
        repo: Optional repository used to resolve the active run.
        variant: Variant selector ("auto", "hybrid" or "deterministic").

    Returns:
        The base analysis payload plus ``best_issue`` and ``best_pr``.
    """
    context = _load_analysis_context(db_path, repo=repo, variant=variant)
    report = context.report
    result = dict(_analysis_base_payload(context))
    result["best_issue"] = _best_entry_with_cluster_id(
        report,
        report.get("best_issue"),
        number_key="issue_number",
        numbers_key="issue_numbers",
    )
    result["best_pr"] = _best_entry_with_cluster_id(
        report,
        report.get("best_pr"),
        number_key="pr_number",
        numbers_key="pr_numbers",
    )
    return result
160
+
161
+
162
def _resolve_active_snapshot_context(
    db_path: Path,
    *,
    repo: str | None,
) -> ActiveSnapshotContext:
    """Resolve the active run and its snapshot directory from the database.

    The database is opened read-only and closed again before returning,
    whether or not resolving the active run succeeds.
    """
    db = connect_pr_search_db(db_path, read_only=True)
    try:
        run_row = resolve_active_run(db, repo=repo)
    finally:
        db.close()
    # Coerce keys to ``str`` so the payload builders can index by plain names.
    normalized_run = {str(column): value for column, value in run_row.items()}
    snapshot_path = Path(str(run_row["snapshot_dir"])).resolve()
    return ActiveSnapshotContext(active_run=normalized_run, snapshot_dir=snapshot_path)
176
+
177
+
178
def _load_analysis_context(
    db_path: Path,
    *,
    repo: str | None,
    variant: str,
) -> AnalysisContext:
    """Load the analysis report of the active snapshot for ``variant``.

    Raises:
        ValueError: If ``variant`` is unsupported or no matching report file
            exists in the snapshot directory.
    """
    snapshot = _resolve_active_snapshot_context(db_path, repo=repo)
    located_path, resolved_variant = _resolve_analysis_report_path(
        snapshot.snapshot_dir,
        variant,
        required=True,
    )
    # With ``required=True`` the resolver raises instead of returning ``None``;
    # the asserts only narrow the optional types.
    assert located_path is not None
    assert resolved_variant is not None
    loaded_report = _load_report(located_path)
    requested = _normalize_analysis_variant(variant)
    return AnalysisContext(
        active_run=snapshot.active_run,
        report=loaded_report,
        variant_requested=requested,
        variant_used=resolved_variant,
    )
198
+
199
+
200
def _resolve_analysis_report_path(
    snapshot_dir: Path,
    variant: str,
    *,
    required: bool,
) -> tuple[Path | None, str | None]:
    """Locate the analysis report file for ``variant`` inside ``snapshot_dir``.

    With "auto" the hybrid report is preferred and the deterministic report
    is the fallback; otherwise only the named variant's file is considered.

    Returns:
        ``(path, variant_used)`` for the first existing file, or
        ``(None, None)`` when nothing exists and ``required`` is false.

    Raises:
        ValueError: If ``variant`` is unsupported, or if nothing exists and
            ``required`` is true.
    """
    selector = _normalize_analysis_variant(variant)
    if selector == "auto":
        candidates: tuple[str, ...] = ("hybrid", "deterministic")
    else:
        candidates = (selector,)

    for name in candidates:
        candidate_path = snapshot_dir / ANALYSIS_REPORT_FILENAMES[name]
        if candidate_path.exists():
            return candidate_path, name

    if not required:
        return None, None
    if selector == "auto":
        raise ValueError("No analysis report was found for the active snapshot.")
    raise ValueError(
        f"{selector.capitalize()} analysis report was not found for the active snapshot."
    )
225
+
226
+
227
def _normalize_analysis_variant(variant: str) -> str:
    """Return ``variant`` stripped and lower-cased, validating it on the way.

    Raises:
        ValueError: If the cleaned value is not in ``ANALYSIS_VARIANTS``.
    """
    cleaned = variant.strip().lower()
    if cleaned in ANALYSIS_VARIANTS:
        return cleaned
    raise ValueError(
        f"Unsupported analysis variant {variant!r}; expected auto, hybrid, or deterministic."
    )
234
+
235
+
236
+ def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
237
+ return {
238
+ "repo": str(context.active_run["repo"]),
239
+ "snapshot_id": str(context.active_run["snapshot_id"]),
240
+ "run_id": str(context.active_run["id"]),
241
+ "variant_requested": context.variant_requested,
242
+ "variant_used": context.variant_used,
243
+ "llm_enrichment": bool(context.report.get("llm_enrichment")),
244
+ "generated_at": context.report.get("generated_at"),
245
+ }
246
+
247
+
248
+ def _analysis_counts(report: dict[str, Any]) -> dict[str, int]:
249
+ return {
250
+ "meta_bugs": len(report.get("meta_bugs") or []),
251
+ "duplicate_issues": len(report.get("duplicate_issues") or []),
252
+ "duplicate_prs": len(report.get("duplicate_prs") or []),
253
+ }
254
+
255
+
256
+ def _meta_bug_payload(cluster: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
257
+ payload = dict(cluster)
258
+ if rank is not None:
259
+ payload["rank"] = rank
260
+ return payload
261
+
262
+
263
+ def _find_meta_bug_for_pr(
264
+ report: dict[str, Any],
265
+ pr_number: int,
266
+ ) -> tuple[dict[str, Any] | None, int | None]:
267
+ for index, cluster in enumerate(report.get("meta_bugs", []), start=1):
268
+ pr_numbers = {int(number) for number in cluster.get("pr_numbers", [])}
269
+ if pr_number in pr_numbers:
270
+ return dict(cluster), index
271
+ return None, None
272
+
273
+
274
+ def _find_duplicate_pr_for_pr(report: dict[str, Any], pr_number: int) -> dict[str, Any] | None:
275
+ for entry in report.get("duplicate_prs", []):
276
+ numbers = {
277
+ int(entry["canonical_pr_number"]),
278
+ *(int(number) for number in entry.get("duplicate_pr_numbers", [])),
279
+ }
280
+ if pr_number in numbers:
281
+ return dict(entry)
282
+ return None
283
+
284
+
285
+ def _find_duplicate_pr_by_cluster_id(
286
+ report: dict[str, Any],
287
+ cluster_id: str,
288
+ ) -> dict[str, Any] | None:
289
+ for entry in report.get("duplicate_prs", []):
290
+ if str(entry.get("cluster_id")) == cluster_id:
291
+ return dict(entry)
292
+ return None
293
+
294
+
295
+ def _best_entry_with_cluster_id(
296
+ report: dict[str, Any],
297
+ entry: Any,
298
+ *,
299
+ number_key: str,
300
+ numbers_key: str,
301
+ ) -> dict[str, Any] | None:
302
+ if not isinstance(entry, dict):
303
+ return None
304
+ number = entry.get(number_key)
305
+ if number is None:
306
+ return dict(entry)
307
+ for cluster in report.get("meta_bugs", []):
308
+ numbers = {int(value) for value in cluster.get(numbers_key, [])}
309
+ if int(number) in numbers:
310
+ return {"cluster_id": cluster.get("cluster_id"), **dict(entry)}
311
+ return dict(entry)
312
+
313
+
314
def _load_report(path: Path) -> dict[str, Any]:
    """Read the analysis report at ``path`` and return it with ``str`` keys.

    Raises:
        ValueError: If the file does not contain a JSON object.
    """
    raw = read_json(path)
    if isinstance(raw, dict):
        return {str(name): value for name, value in raw.items()}
    raise ValueError(f"Analysis report at {path} must contain a JSON object.")
src/slop_farmer/reports/new_contributor_report.py CHANGED
@@ -102,6 +102,7 @@ query SearchIssues($query: String!, $cursor: String) {
102
  }
103
  }
104
  """.strip()
 
105
 
106
 
107
  def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
@@ -167,8 +168,16 @@ def _report_contributors(
167
  previous_snapshot_dir = _previous_snapshot_dir(snapshot)
168
  previous_primary_authors = _snapshot_primary_authors(previous_snapshot_dir)
169
  previous_merged_pr_authors = _snapshot_merged_pr_authors(previous_snapshot_dir)
170
- previous_report_contributors = _previous_report_contributors(previous_snapshot_dir)
171
- cutoff = _snapshot_reference_time(snapshot) - timedelta(days=options.window_days)
 
 
 
 
 
 
 
 
172
  selected = []
173
  for row in author_rows:
174
  if row["pr_count"] == 0:
@@ -224,12 +233,30 @@ def _report_contributors(
224
  for index, row in enumerate(selected, start=1):
225
  first_seen_in_snapshot = row["author_login"] not in previous_primary_authors
226
  known_via_prior_merged_pr = row["author_login"] in previous_merged_pr_authors
 
227
  if index == 1 or index == total_selected or index % 10 == 0:
228
  _report_log(
229
  f"Enriching contributors: {index}/{total_selected} "
230
  f"(current={row['author_login']}, first_seen={str(first_seen_in_snapshot).lower()}, "
231
  f"known_via_prior_merged_pr={str(known_via_prior_merged_pr).lower()})"
232
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  try:
234
  summary = summarize_user(row["author_login"], options.window_days, None)
235
  fetch_error = None
@@ -442,15 +469,21 @@ def _snapshot_merged_pr_authors(snapshot_dir: Path | None) -> set[str]:
442
  return authors
443
 
444
 
445
- def _previous_report_contributors(snapshot_dir: Path | None) -> set[str]:
446
  if snapshot_dir is None:
447
- return set()
448
  path = snapshot_dir / "new-contributors-report.json"
449
  if not path.exists():
450
- return set()
451
  try:
452
  payload = read_json(path)
453
  except Exception:
 
 
 
 
 
 
454
  return set()
455
  contributors = payload.get("contributors")
456
  if not isinstance(contributors, list):
@@ -462,15 +495,10 @@ def _previous_report_contributors(snapshot_dir: Path | None) -> set[str]:
462
  }
463
 
464
 
465
- def _previous_report_contributor_entries(snapshot_dir: Path | None) -> dict[str, dict[str, Any]]:
466
- if snapshot_dir is None:
467
- return {}
468
- path = snapshot_dir / "new-contributors-report.json"
469
- if not path.exists():
470
- return {}
471
- try:
472
- payload = read_json(path)
473
- except Exception:
474
  return {}
475
  contributors = payload.get("contributors")
476
  if not isinstance(contributors, list):
@@ -482,6 +510,104 @@ def _previous_report_contributor_entries(snapshot_dir: Path | None) -> dict[str,
482
  }
483
 
484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  def _contributor_entry(
486
  repo: str,
487
  row: dict[str, Any],
 
102
  }
103
  }
104
  """.strip()
105
+ PREVIOUS_REPORT_REUSE_MAX_AGE = timedelta(days=2)
106
 
107
 
108
  def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
 
168
  previous_snapshot_dir = _previous_snapshot_dir(snapshot)
169
  previous_primary_authors = _snapshot_primary_authors(previous_snapshot_dir)
170
  previous_merged_pr_authors = _snapshot_merged_pr_authors(previous_snapshot_dir)
171
+ previous_report_payload = _previous_report_payload(previous_snapshot_dir)
172
+ previous_report_contributors = _previous_report_contributors(previous_report_payload)
173
+ previous_report_entries = _previous_report_contributor_entries(previous_report_payload)
174
+ snapshot_reference_time = _snapshot_reference_time(snapshot)
175
+ previous_report_reusable = _previous_report_reuse_allowed(
176
+ previous_report_payload,
177
+ window_days=options.window_days,
178
+ reference_time=snapshot_reference_time,
179
+ )
180
+ cutoff = snapshot_reference_time - timedelta(days=options.window_days)
181
  selected = []
182
  for row in author_rows:
183
  if row["pr_count"] == 0:
 
233
  for index, row in enumerate(selected, start=1):
234
  first_seen_in_snapshot = row["author_login"] not in previous_primary_authors
235
  known_via_prior_merged_pr = row["author_login"] in previous_merged_pr_authors
236
+ previous_entry = previous_report_entries.get(row["author_login"])
237
  if index == 1 or index == total_selected or index % 10 == 0:
238
  _report_log(
239
  f"Enriching contributors: {index}/{total_selected} "
240
  f"(current={row['author_login']}, first_seen={str(first_seen_in_snapshot).lower()}, "
241
  f"known_via_prior_merged_pr={str(known_via_prior_merged_pr).lower()})"
242
  )
243
+ if (
244
+ previous_report_reusable
245
+ and previous_entry is not None
246
+ and not previous_entry.get("fetch_error")
247
+ and not known_via_prior_merged_pr
248
+ ):
249
+ contributors.append(
250
+ _reused_previous_report_entry(
251
+ snapshot["repo"],
252
+ row,
253
+ previous_entry,
254
+ first_seen_in_snapshot=first_seen_in_snapshot,
255
+ known_via_prior_merged_pr=known_via_prior_merged_pr,
256
+ )
257
+ )
258
+ reused_previous_report += 1
259
+ continue
260
  try:
261
  summary = summarize_user(row["author_login"], options.window_days, None)
262
  fetch_error = None
 
469
  return authors
470
 
471
 
472
+ def _previous_report_payload(snapshot_dir: Path | None) -> dict[str, Any] | None:
473
  if snapshot_dir is None:
474
+ return None
475
  path = snapshot_dir / "new-contributors-report.json"
476
  if not path.exists():
477
+ return None
478
  try:
479
  payload = read_json(path)
480
  except Exception:
481
+ return None
482
+ return payload if isinstance(payload, dict) else None
483
+
484
+
485
+ def _previous_report_contributors(payload: dict[str, Any] | None) -> set[str]:
486
+ if payload is None:
487
  return set()
488
  contributors = payload.get("contributors")
489
  if not isinstance(contributors, list):
 
495
  }
496
 
497
 
498
+ def _previous_report_contributor_entries(
499
+ payload: dict[str, Any] | None,
500
+ ) -> dict[str, dict[str, Any]]:
501
+ if payload is None:
 
 
 
 
 
502
  return {}
503
  contributors = payload.get("contributors")
504
  if not isinstance(contributors, list):
 
510
  }
511
 
512
 
513
+ def _previous_report_reuse_allowed(
514
+ payload: dict[str, Any] | None,
515
+ *,
516
+ window_days: int,
517
+ reference_time: datetime,
518
+ ) -> bool:
519
+ if payload is None:
520
+ return False
521
+ if _coerce_int(payload.get("window_days")) != window_days:
522
+ return False
523
+ generated_at = _coerce_datetime(payload.get("generated_at"))
524
+ if generated_at is None:
525
+ return False
526
+ return abs(reference_time - generated_at) <= PREVIOUS_REPORT_REUSE_MAX_AGE
527
+
528
+
529
def _reused_previous_report_entry(
    repo: str,
    row: dict[str, Any],
    previous_entry: dict[str, Any],
    *,
    first_seen_in_snapshot: bool,
    known_via_prior_merged_pr: bool,
) -> dict[str, Any]:
    """Build a contributor entry from a previous report instead of refetching.

    Repository-level fields (counts, first/last seen, examples) come from the
    current ``row``. Profile-level fields (name, account age, scores, public
    orgs, activity) are copied from ``previous_entry``. The result is marked
    with ``enrichment_source="previous_report"`` and
    ``live_refetch_skipped=True``.
    """
    login = row["author_login"]
    age_days = _coerce_int(previous_entry.get("account_age_days"))

    entry: dict[str, Any] = {
        "author_login": login,
        "name": previous_entry.get("name"),
        "profile_url": _profile_url(login),
        "repo_pull_requests_url": _repo_search_url(repo, login, is_pr=True),
        "repo_issues_url": _repo_search_url(repo, login, is_pr=False),
    }
    # Fields taken from the current snapshot row, which is always up to date.
    entry.update(
        {
            "repo_first_seen_at": row["first_seen_at"],
            "repo_last_seen_at": row["last_seen_at"],
            "repo_primary_artifact_count": row["primary_artifact_count"],
            "repo_artifact_count": row["artifact_count"],
            "snapshot_issue_count": row["issue_count"],
            "snapshot_pr_count": row["pr_count"],
            "snapshot_comment_count": row["comment_count"],
            "snapshot_review_count": row["review_count"],
            "snapshot_review_comment_count": row["review_comment_count"],
            "repo_association": row.get("repo_association"),
        }
    )
    if first_seen_in_snapshot:
        reason = "first_seen_in_snapshot"
    else:
        reason = None
    entry.update(
        {
            "new_to_repo": first_seen_in_snapshot,
            "first_seen_in_snapshot": first_seen_in_snapshot,
            "known_via_prior_merged_pr": known_via_prior_merged_pr,
            "report_reason": reason,
            "enrichment_source": "previous_report",
            "live_refetch_skipped": True,
        }
    )
    # Fields carried over from the previous report.
    entry.update(
        {
            "account_age_days": age_days,
            "young_account": age_days is not None and age_days <= 365,
            "follow_through_score": previous_entry.get("follow_through_score"),
            "breadth_score": previous_entry.get("breadth_score"),
            "automation_risk_signal": previous_entry.get("automation_risk_signal"),
            "heuristic_note": previous_entry.get("heuristic_note"),
            "public_orgs": _previous_report_public_orgs(previous_entry),
            "activity": _previous_report_activity(previous_entry),
        }
    )
    entry["examples"] = {
        "pull_requests": [
            _artifact_example(item, "pull_request") for item in row["pull_requests"]
        ],
        "issues": [_artifact_example(item, "issue") for item in row["issues"]],
    }
    entry["fetch_error"] = None
    return entry
577
+
578
+
579
+ def _previous_report_public_orgs(previous_entry: dict[str, Any]) -> list[str]:
580
+ values = previous_entry.get("public_orgs")
581
+ if not isinstance(values, list):
582
+ return []
583
+ public_orgs: list[str] = []
584
+ for value in values:
585
+ if isinstance(value, str) and value.strip():
586
+ public_orgs.append(value.strip())
587
+ elif isinstance(value, dict):
588
+ login = str(value.get("login") or "").strip()
589
+ if login:
590
+ public_orgs.append(login)
591
+ return public_orgs
592
+
593
+
594
+ def _previous_report_activity(previous_entry: dict[str, Any]) -> dict[str, Any]:
595
+ activity = previous_entry.get("activity")
596
+ if not isinstance(activity, dict):
597
+ activity = previous_entry
598
+ return {
599
+ "visible_authored_pr_count": activity.get("visible_authored_pr_count"),
600
+ "merged_pr_count": activity.get("merged_pr_count"),
601
+ "closed_unmerged_pr_count": activity.get("closed_unmerged_pr_count"),
602
+ "open_pr_count": activity.get("open_pr_count"),
603
+ "merged_pr_rate": activity.get("merged_pr_rate"),
604
+ "closed_unmerged_pr_rate": activity.get("closed_unmerged_pr_rate"),
605
+ "still_open_pr_rate": activity.get("still_open_pr_rate"),
606
+ "distinct_repos_with_authored_prs": activity.get("distinct_repos_with_authored_prs"),
607
+ "distinct_repos_with_open_prs": activity.get("distinct_repos_with_open_prs"),
608
+ }
609
+
610
+
611
  def _contributor_entry(
612
  repo: str,
613
  row: dict[str, Any],