evalstate HF Staff committed on
Commit
114bead
·
verified ·
1 Parent(s): d09c394

Deploy PR search API with issues/contributors routes

Browse files
README.md CHANGED
@@ -29,8 +29,9 @@ Defaults for this deployment:
29
  CLI examples:
30
 
31
  ```bash
32
- pr-search repo status
33
- pr-search pr similar 67096
34
- pr-search pr clusters 67096
35
- pr-search --json pr similar 67096
 
36
  ```
 
29
  CLI examples:
30
 
31
  ```bash
32
+ pr-search status
33
+ pr-search code similar 67096
34
+ pr-search code clusters for-pr 67096
35
+ pr-search issues list --limit 5
36
+ pr-search contributors list --limit 10
37
  ```
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "slop-farmer"
7
- version = "0.1.1"
8
  description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
9
  readme = "README.md"
10
  requires-python = ">=3.13.5"
@@ -60,6 +60,13 @@ select = [
60
  ]
61
  ignore = ["E501"]
62
 
 
 
 
 
 
 
 
63
  [tool.slop-farmer.dashboard-data]
64
  output-dir = "web/public/data"
65
  window-days = 14
 
4
 
5
  [project]
6
  name = "slop-farmer"
7
+ version = "0.1.0"
8
  description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
9
  readme = "README.md"
10
  requires-python = ">=3.13.5"
 
60
  ]
61
  ignore = ["E501"]
62
 
63
+ [tool.slop-farmer.analyze]
64
+ output-dir = "eval_data"
65
+ hf-repo-id = "evalstate/transformers-pr"
66
+ ranking-backend = "hybrid"
67
+ model = "gpt-5.4-mini"
68
+ max-clusters = 10
69
+
70
  [tool.slop-farmer.dashboard-data]
71
  output-dir = "web/public/data"
72
  window-days = 14
src/slop_farmer.egg-info/PKG-INFO CHANGED
@@ -11,7 +11,7 @@ Requires-Dist: huggingface_hub>=0.30.0
11
  Requires-Dist: pydantic>=2.11
12
  Requires-Dist: PyYAML>=6.0.2
13
  Requires-Dist: rank-bm25>=0.2.2
14
- Requires-Dist: fast-agent-mcp>=0.6.16
15
  Requires-Dist: uvicorn>=0.34.0
16
  Provides-Extra: dev
17
  Requires-Dist: httpx>=0.28.0; extra == "dev"
@@ -409,3 +409,44 @@ Or use the CLI wrapper with a YAML config:
409
  ```bash
410
  uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
411
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  Requires-Dist: pydantic>=2.11
12
  Requires-Dist: PyYAML>=6.0.2
13
  Requires-Dist: rank-bm25>=0.2.2
14
+ Requires-Dist: fast-agent-mcp>=0.6.17
15
  Requires-Dist: uvicorn>=0.34.0
16
  Provides-Extra: dev
17
  Requires-Dist: httpx>=0.28.0; extra == "dev"
 
409
  ```bash
410
  uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
411
  ```
412
+
413
+ ## Deploy the PR similarity API to a Hugging Face Docker Space
414
+
415
+ The repo includes the FastAPI service for the read-oriented PR similarity surface.
416
+ The standalone `pr-search` client now lives in the downstream `pr-search-cli`
417
+ package.
418
+
419
+ Deploy the OpenClaw API Space with:
420
+
421
+ ```bash
422
+ scripts/update_openclaw_pr_search_api.sh
423
+ ```
424
+
425
+ Or use the generic deploy script directly:
426
+
427
+ ```bash
428
+ SPACE_ID=evalstate/openclaw-pr-api \
429
+ SPACE_TITLE="OpenClaw PR API" \
430
+ DEFAULT_REPO=openclaw/openclaw \
431
+ GHR_BASE_URL=https://ghreplica.dutiful.dev \
432
+ HF_REPO_ID=evalstate/openclaw-pr \
433
+ BUCKET_ID=evalstate/openclaw-pr-api-data \
434
+ scripts/deploy_pr_search_space.sh
435
+ ```
436
+
437
+ This deploy flow:
438
+
439
+ - creates or updates a Docker Space
440
+ - uploads a minimal app bundle with a generated Space `README.md`
441
+ - sets runtime variables for the API
442
+ - mounts the configured HF bucket at `/data`
443
+
444
+ After the Space is live, you can query it either through the in-repo admin CLI:
445
+
446
+ ```bash
447
+ uv run slop-farmer pr-search status --repo openclaw/openclaw
448
+ uv run slop-farmer pr-search similar 67096 --repo openclaw/openclaw
449
+ ```
450
+
451
+ Or through the downstream `pr-search-cli` package, which owns the standalone
452
+ `pr-search` executable.
src/slop_farmer.egg-info/SOURCES.txt CHANGED
@@ -17,7 +17,6 @@ src/slop_farmer/app/hf_checkpoint_import.py
17
  src/slop_farmer/app/pipeline.py
18
  src/slop_farmer/app/pr_search.py
19
  src/slop_farmer/app/pr_search_api.py
20
- src/slop_farmer/app/pr_search_client.py
21
  src/slop_farmer/app/publish.py
22
  src/slop_farmer/app/snapshot_state.py
23
  src/slop_farmer/app/workflow.py
@@ -42,6 +41,7 @@ src/slop_farmer/reports/pr_heuristics.py
42
  src/slop_farmer/reports/pr_scope.py
43
  src/slop_farmer/reports/pr_search_scope.py
44
  src/slop_farmer/reports/pr_search_service.py
 
45
  src/slop_farmer/reports/user_activity.py
46
  tests/test_analysis.py
47
  tests/test_analysis_cache.py
@@ -61,7 +61,6 @@ tests/test_pipeline_checkpoint_resume.py
61
  tests/test_pr_scope.py
62
  tests/test_pr_search.py
63
  tests/test_pr_search_api.py
64
- tests/test_pr_search_client.py
65
  tests/test_publish.py
66
  tests/test_snapshot_state.py
67
  tests/test_update_transformers_dataset.py
 
17
  src/slop_farmer/app/pipeline.py
18
  src/slop_farmer/app/pr_search.py
19
  src/slop_farmer/app/pr_search_api.py
 
20
  src/slop_farmer/app/publish.py
21
  src/slop_farmer/app/snapshot_state.py
22
  src/slop_farmer/app/workflow.py
 
41
  src/slop_farmer/reports/pr_scope.py
42
  src/slop_farmer/reports/pr_search_scope.py
43
  src/slop_farmer/reports/pr_search_service.py
44
+ src/slop_farmer/reports/read_views.py
45
  src/slop_farmer/reports/user_activity.py
46
  tests/test_analysis.py
47
  tests/test_analysis_cache.py
 
61
  tests/test_pr_scope.py
62
  tests/test_pr_search.py
63
  tests/test_pr_search_api.py
 
64
  tests/test_publish.py
65
  tests/test_snapshot_state.py
66
  tests/test_update_transformers_dataset.py
src/slop_farmer.egg-info/entry_points.txt CHANGED
@@ -1,3 +1,2 @@
1
  [console_scripts]
2
- pr-search = slop_farmer.app.pr_search_client:main
3
  slop-farmer = slop_farmer.app.cli:main
 
1
  [console_scripts]
 
2
  slop-farmer = slop_farmer.app.cli:main
src/slop_farmer.egg-info/requires.txt CHANGED
@@ -5,7 +5,7 @@ huggingface_hub>=0.30.0
5
  pydantic>=2.11
6
  PyYAML>=6.0.2
7
  rank-bm25>=0.2.2
8
- fast-agent-mcp>=0.6.16
9
  uvicorn>=0.34.0
10
 
11
  [dev]
 
5
  pydantic>=2.11
6
  PyYAML>=6.0.2
7
  rank-bm25>=0.2.2
8
+ fast-agent-mcp>=0.6.17
9
  uvicorn>=0.34.0
10
 
11
  [dev]
src/slop_farmer/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  __all__ = ["__version__"]
2
 
3
- __version__ = "0.1.1"
 
1
  __all__ = ["__version__"]
2
 
3
+ __version__ = "0.1.0"
src/slop_farmer/app/cli.py CHANGED
@@ -13,8 +13,6 @@ from slop_farmer.config import (
13
  AnalysisOptions,
14
  CheckpointImportOptions,
15
  DashboardDataOptions,
16
- DatasetRefreshOptions,
17
- DatasetStatusOptions,
18
  DeployDashboardOptions,
19
  FullPipelineOptions,
20
  MarkdownReportOptions,
@@ -43,7 +41,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
43
  subparsers = parser.add_subparsers(dest="command", required=True)
44
 
45
  _add_scrape_parser(subparsers, defaults["scrape"])
46
- _add_refresh_dataset_parser(subparsers, defaults["refresh-dataset"])
47
  _add_analyze_parser(subparsers, defaults["analyze"])
48
  _add_pr_scope_parser(subparsers, defaults["pr-scope"])
49
  _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
@@ -55,7 +52,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
55
  _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
56
  _add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
57
  _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
58
- _add_dataset_status_parser(subparsers, defaults["dataset-status"])
59
  _add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
60
  return parser
61
 
@@ -63,7 +59,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
63
  def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
64
  commands = (
65
  "scrape",
66
- "refresh-dataset",
67
  "analyze",
68
  "import-hf-checkpoint",
69
  "pr-scope",
@@ -73,7 +68,6 @@ def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]
73
  "dashboard-data",
74
  "publish-snapshot",
75
  "deploy-dashboard",
76
- "dataset-status",
77
  "full-pipeline",
78
  )
79
  return {command: command_defaults(command, config_path=config_path) for command in commands}
@@ -190,80 +184,6 @@ def _add_scrape_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
190
  )
191
 
192
 
193
- def _add_refresh_dataset_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
194
- refresh = subparsers.add_parser(
195
- "refresh-dataset",
196
- help="Refresh the canonical Hugging Face dataset repo from remote watermark state.",
197
- )
198
- refresh.add_argument(
199
- "--repo",
200
- default=defaults.get("repo", "huggingface/transformers"),
201
- help="GitHub repository in owner/name form.",
202
- )
203
- refresh.add_argument(
204
- "--hf-repo-id",
205
- default=defaults.get("hf-repo-id"),
206
- required=defaults.get("hf-repo-id") is None,
207
- help="Canonical Hugging Face dataset repo id to refresh.",
208
- )
209
- refresh.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
210
- refresh.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
211
- refresh.add_argument(
212
- "--max-issue-comments", type=int, default=defaults.get("max-issue-comments")
213
- )
214
- refresh.add_argument(
215
- "--max-reviews-per-pr", type=int, default=defaults.get("max-reviews-per-pr")
216
- )
217
- refresh.add_argument(
218
- "--max-review-comments-per-pr",
219
- type=int,
220
- default=defaults.get("max-review-comments-per-pr"),
221
- )
222
- refresh.add_argument(
223
- "--fetch-timeline",
224
- action="store_true",
225
- default=bool(defaults.get("fetch-timeline", False)),
226
- )
227
- refresh.add_argument(
228
- "--new-contributor-report",
229
- dest="new_contributor_report",
230
- action="store_true",
231
- default=bool(defaults.get("new-contributor-report", True)),
232
- )
233
- refresh.add_argument(
234
- "--no-new-contributor-report",
235
- dest="new_contributor_report",
236
- action="store_false",
237
- )
238
- refresh.add_argument(
239
- "--new-contributor-window-days",
240
- type=int,
241
- default=int(defaults.get("new-contributor-window-days", 42)),
242
- )
243
- refresh.add_argument(
244
- "--new-contributor-max-authors",
245
- type=int,
246
- default=int(defaults.get("new-contributor-max-authors", 25)),
247
- )
248
- refresh.add_argument("--http-timeout", type=int, default=300)
249
- refresh.add_argument("--http-max-retries", type=int, default=8)
250
- refresh.add_argument("--checkpoint-every-comments", type=int, default=1000)
251
- refresh.add_argument("--checkpoint-every-prs", type=int, default=25)
252
- refresh.add_argument(
253
- "--private-hf-repo",
254
- dest="private_hf_repo",
255
- action="store_true",
256
- default=bool(defaults.get("private-hf-repo", False)),
257
- help="Create the target dataset repo as private if needed.",
258
- )
259
- refresh.add_argument(
260
- "--private",
261
- dest="private_hf_repo",
262
- action="store_true",
263
- help=argparse.SUPPRESS,
264
- )
265
-
266
-
267
  def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
268
  analyze = subparsers.add_parser(
269
  "analyze", help="Analyze a local snapshot and write a shortlist JSON report."
@@ -717,61 +637,6 @@ def _add_pr_search_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
717
  status.add_argument("--repo", help="Optional repo override.")
718
  status.add_argument("--json", action="store_true", help="Emit JSON.")
719
 
720
- contributor = pr_search_subparsers.add_parser(
721
- "contributor", help="Show indexed contributor summary for one author login."
722
- )
723
- contributor.add_argument("login", help="GitHub author login to query.")
724
- contributor.add_argument(
725
- "--db",
726
- type=Path,
727
- default=Path(defaults["db"]) if defaults.get("db") else None,
728
- help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
729
- )
730
- contributor.add_argument(
731
- "--output-dir",
732
- type=Path,
733
- default=Path(defaults.get("output-dir", "data")),
734
- )
735
- contributor.add_argument("--repo", help="Optional repo override.")
736
- contributor.add_argument("--json", action="store_true", help="Emit JSON.")
737
-
738
- contributor_prs = pr_search_subparsers.add_parser(
739
- "contributor-prs", help="List indexed PRs for one contributor login."
740
- )
741
- contributor_prs.add_argument("login", help="GitHub author login to query.")
742
- contributor_prs.add_argument(
743
- "--db",
744
- type=Path,
745
- default=Path(defaults["db"]) if defaults.get("db") else None,
746
- help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
747
- )
748
- contributor_prs.add_argument(
749
- "--output-dir",
750
- type=Path,
751
- default=Path(defaults.get("output-dir", "data")),
752
- )
753
- contributor_prs.add_argument("--repo", help="Optional repo override.")
754
- contributor_prs.add_argument("--limit", type=int, default=20, help="Maximum rows to show.")
755
- contributor_prs.add_argument("--json", action="store_true", help="Emit JSON.")
756
-
757
- pr_contributor = pr_search_subparsers.add_parser(
758
- "pr-contributor", help="Show contributor summary for the author of one indexed PR."
759
- )
760
- pr_contributor.add_argument("pr_number", type=int, help="Pull request number to query.")
761
- pr_contributor.add_argument(
762
- "--db",
763
- type=Path,
764
- default=Path(defaults["db"]) if defaults.get("db") else None,
765
- help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
766
- )
767
- pr_contributor.add_argument(
768
- "--output-dir",
769
- type=Path,
770
- default=Path(defaults.get("output-dir", "data")),
771
- )
772
- pr_contributor.add_argument("--repo", help="Optional repo override.")
773
- pr_contributor.add_argument("--json", action="store_true", help="Emit JSON.")
774
-
775
 
776
  def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
777
  new_contributor = subparsers.add_parser(
@@ -794,24 +659,6 @@ def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]
794
  new_contributor.add_argument(
795
  "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
796
  )
797
- new_contributor.add_argument(
798
- "--hf-repo-id",
799
- default=defaults.get("hf-repo-id"),
800
- help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
801
- )
802
- new_contributor.add_argument(
803
- "--hf-revision",
804
- default=defaults.get("hf-revision"),
805
- help="Optional Hub revision for metadata and README download.",
806
- )
807
- new_contributor.add_argument(
808
- "--hf-materialize-dir",
809
- type=Path,
810
- default=Path(defaults["hf-materialize-dir"])
811
- if defaults.get("hf-materialize-dir")
812
- else None,
813
- help="Optional local directory used when materializing an HF dataset snapshot.",
814
- )
815
  new_contributor.add_argument(
816
  "--window-days",
817
  type=int,
@@ -855,24 +702,6 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
855
  type=Path,
856
  help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
857
  )
858
- dashboard.add_argument(
859
- "--hf-repo-id",
860
- default=defaults.get("hf-repo-id"),
861
- help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
862
- )
863
- dashboard.add_argument(
864
- "--hf-revision",
865
- default=defaults.get("hf-revision"),
866
- help="Optional Hub revision for metadata and README download.",
867
- )
868
- dashboard.add_argument(
869
- "--hf-materialize-dir",
870
- type=Path,
871
- default=Path(defaults["hf-materialize-dir"])
872
- if defaults.get("hf-materialize-dir")
873
- else None,
874
- help="Optional local directory used when materializing an HF dataset snapshot.",
875
- )
876
  dashboard.add_argument(
877
  "--window-days",
878
  type=int,
@@ -932,24 +761,6 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
932
  deploy_dashboard.add_argument(
933
  "--contributors-input", type=Path, help="Optional contributor report JSON override."
934
  )
935
- deploy_dashboard.add_argument(
936
- "--hf-repo-id",
937
- default=defaults.get("hf-repo-id"),
938
- help="Materialize a Hugging Face dataset repo instead of using the latest local snapshot.",
939
- )
940
- deploy_dashboard.add_argument(
941
- "--hf-revision",
942
- default=defaults.get("hf-revision"),
943
- help="Optional Hub revision for metadata and README download.",
944
- )
945
- deploy_dashboard.add_argument(
946
- "--hf-materialize-dir",
947
- type=Path,
948
- default=Path(defaults["hf-materialize-dir"])
949
- if defaults.get("hf-materialize-dir")
950
- else None,
951
- help="Optional local directory used when materializing an HF dataset snapshot.",
952
- )
953
  deploy_dashboard.add_argument(
954
  "--refresh-contributors",
955
  action="store_true",
@@ -1006,31 +817,6 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
1006
  )
1007
 
1008
 
1009
- def _add_dataset_status_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
1010
- dataset_status = subparsers.add_parser(
1011
- "dataset-status",
1012
- help="Inspect canonical dataset freshness and the local latest pointer.",
1013
- )
1014
- dataset_status.add_argument("--repo", default=defaults.get("repo"))
1015
- dataset_status.add_argument(
1016
- "--output-dir",
1017
- type=Path,
1018
- default=Path(defaults.get("output-dir", "data")),
1019
- help="Local workspace root containing snapshots/latest.json.",
1020
- )
1021
- dataset_status.add_argument(
1022
- "--hf-repo-id",
1023
- default=defaults.get("hf-repo-id"),
1024
- help="Canonical Hugging Face dataset repo id to inspect.",
1025
- )
1026
- dataset_status.add_argument(
1027
- "--hf-revision",
1028
- default=defaults.get("hf-revision"),
1029
- help="Optional Hub revision for metadata and README download.",
1030
- )
1031
- dataset_status.add_argument("--json", action="store_true", help="Emit machine-readable JSON.")
1032
-
1033
-
1034
  def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
1035
  full_pipeline = subparsers.add_parser(
1036
  "full-pipeline",
@@ -1147,33 +933,6 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
1147
  print(run_pipeline(options))
1148
 
1149
 
1150
- def _run_refresh_dataset(args: argparse.Namespace, config_path: Path | None) -> None:
1151
- del config_path
1152
- from slop_farmer.app.dataset_refresh import run_dataset_refresh
1153
-
1154
- result = run_dataset_refresh(
1155
- DatasetRefreshOptions(
1156
- repo=RepoRef.parse(args.repo),
1157
- hf_repo_id=args.hf_repo_id,
1158
- private_hf_repo=args.private_hf_repo,
1159
- max_issues=args.max_issues,
1160
- max_prs=args.max_prs,
1161
- max_issue_comments=args.max_issue_comments,
1162
- max_reviews_per_pr=args.max_reviews_per_pr,
1163
- max_review_comments_per_pr=args.max_review_comments_per_pr,
1164
- fetch_timeline=args.fetch_timeline,
1165
- new_contributor_report=args.new_contributor_report,
1166
- new_contributor_window_days=args.new_contributor_window_days,
1167
- new_contributor_max_authors=args.new_contributor_max_authors,
1168
- http_timeout=args.http_timeout,
1169
- http_max_retries=args.http_max_retries,
1170
- checkpoint_every_comments=args.checkpoint_every_comments,
1171
- checkpoint_every_prs=args.checkpoint_every_prs,
1172
- )
1173
- )
1174
- print(json.dumps(result, indent=2))
1175
-
1176
-
1177
  def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
1178
  from slop_farmer.reports.analysis import run_analysis
1179
 
@@ -1282,18 +1041,12 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
1282
  explain_pr_search_pair,
1283
  format_pr_search_candidate_clusters,
1284
  format_pr_search_cluster,
1285
- format_pr_search_contributor,
1286
- format_pr_search_contributor_pulls,
1287
  format_pr_search_pair,
1288
  format_pr_search_probe,
1289
- format_pr_search_pull_contributor,
1290
  format_pr_search_similar,
1291
  format_pr_search_status,
1292
  get_pr_search_candidate_clusters,
1293
  get_pr_search_cluster,
1294
- get_pr_search_contributor,
1295
- get_pr_search_contributor_pulls,
1296
- get_pr_search_pull_contributor,
1297
  get_pr_search_similar,
1298
  get_pr_search_status,
1299
  probe_pr_search_github,
@@ -1387,36 +1140,6 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
1387
  print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
1388
  return
1389
 
1390
- if args.pr_search_command == "contributor":
1391
- result = get_pr_search_contributor(db_path, author_login=args.login, repo=args.repo)
1392
- print(json.dumps(result, indent=2) if args.json else format_pr_search_contributor(result))
1393
- return
1394
-
1395
- if args.pr_search_command == "contributor-prs":
1396
- result = get_pr_search_contributor_pulls(
1397
- db_path,
1398
- author_login=args.login,
1399
- repo=args.repo,
1400
- limit=args.limit,
1401
- )
1402
- print(
1403
- json.dumps(result, indent=2)
1404
- if args.json
1405
- else format_pr_search_contributor_pulls(result)
1406
- )
1407
- return
1408
-
1409
- if args.pr_search_command == "pr-contributor":
1410
- result = get_pr_search_pull_contributor(
1411
- db_path,
1412
- pr_number=args.pr_number,
1413
- repo=args.repo,
1414
- )
1415
- print(
1416
- json.dumps(result, indent=2) if args.json else format_pr_search_pull_contributor(result)
1417
- )
1418
- return
1419
-
1420
  raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
1421
 
1422
 
@@ -1458,7 +1181,6 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
1458
  del config_path
1459
  from slop_farmer.reports.new_contributor_report import run_new_contributor_report
1460
 
1461
- hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
1462
  print(
1463
  run_new_contributor_report(
1464
  NewContributorReportOptions(
@@ -1466,9 +1188,6 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
1466
  output_dir=args.output_dir,
1467
  output=args.output,
1468
  json_output=args.json_output,
1469
- hf_repo_id=hf_repo_id,
1470
- hf_revision=hf_revision,
1471
- hf_materialize_dir=hf_materialize_dir,
1472
  window_days=args.window_days,
1473
  max_authors=args.max_authors,
1474
  )
@@ -1480,7 +1199,6 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
1480
  from slop_farmer.reports.dashboard import run_dashboard_data
1481
 
1482
  dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
1483
- hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
1484
  print(
1485
  run_dashboard_data(
1486
  DashboardDataOptions(
@@ -1489,9 +1207,6 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
1489
  analysis_input=args.analysis_input,
1490
  contributors_input=args.contributors_input,
1491
  pr_scope_input=args.pr_scope_input,
1492
- hf_repo_id=hf_repo_id,
1493
- hf_revision=hf_revision,
1494
- hf_materialize_dir=hf_materialize_dir,
1495
  window_days=args.window_days,
1496
  snapshot_root=(
1497
  Path(dashboard_defaults["snapshot-root"])
@@ -1507,7 +1222,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
1507
  del config_path
1508
  from slop_farmer.app.deploy import run_deploy_dashboard
1509
 
1510
- hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
1511
  run_deploy_dashboard(
1512
  DeployDashboardOptions(
1513
  pipeline_data_dir=args.pipeline_data_dir,
@@ -1515,9 +1229,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
1515
  snapshot_dir=args.snapshot_dir,
1516
  analysis_input=args.analysis_input,
1517
  contributors_input=args.contributors_input,
1518
- hf_repo_id=hf_repo_id,
1519
- hf_revision=hf_revision,
1520
- hf_materialize_dir=hf_materialize_dir,
1521
  refresh_contributors=args.refresh_contributors,
1522
  dashboard_window_days=args.dashboard_window_days,
1523
  contributor_window_days=args.contributor_window_days,
@@ -1536,22 +1247,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
1536
  )
1537
 
1538
 
1539
- def _run_dataset_status(args: argparse.Namespace, config_path: Path | None) -> None:
1540
- del config_path
1541
- from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status
1542
-
1543
- result = get_dataset_status(
1544
- DatasetStatusOptions(
1545
- repo=args.repo,
1546
- output_dir=args.output_dir,
1547
- hf_repo_id=args.hf_repo_id,
1548
- hf_revision=args.hf_revision,
1549
- json_output=args.json,
1550
- )
1551
- )
1552
- print(json.dumps(result, indent=2) if args.json else format_dataset_status(result))
1553
-
1554
-
1555
  def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
1556
  del config_path
1557
  from slop_farmer.app.publish import run_publish_snapshot
@@ -1601,7 +1296,6 @@ def main() -> None:
1601
 
1602
  handlers: dict[str, CommandHandler] = {
1603
  "scrape": _run_scrape,
1604
- "refresh-dataset": _run_refresh_dataset,
1605
  "analyze": _run_analyze,
1606
  "markdown-report": _run_markdown_report,
1607
  "duplicate-prs": _run_duplicate_prs,
@@ -1612,7 +1306,6 @@ def main() -> None:
1612
  "new-contributor-report": _run_new_contributor_report,
1613
  "dashboard-data": _run_dashboard_data,
1614
  "deploy-dashboard": _run_deploy_dashboard,
1615
- "dataset-status": _run_dataset_status,
1616
  "publish-snapshot": _run_publish_snapshot,
1617
  "full-pipeline": _run_full_pipeline,
1618
  }
 
13
  AnalysisOptions,
14
  CheckpointImportOptions,
15
  DashboardDataOptions,
 
 
16
  DeployDashboardOptions,
17
  FullPipelineOptions,
18
  MarkdownReportOptions,
 
41
  subparsers = parser.add_subparsers(dest="command", required=True)
42
 
43
  _add_scrape_parser(subparsers, defaults["scrape"])
 
44
  _add_analyze_parser(subparsers, defaults["analyze"])
45
  _add_pr_scope_parser(subparsers, defaults["pr-scope"])
46
  _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
 
52
  _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
53
  _add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
54
  _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
 
55
  _add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
56
  return parser
57
 
 
59
  def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
60
  commands = (
61
  "scrape",
 
62
  "analyze",
63
  "import-hf-checkpoint",
64
  "pr-scope",
 
68
  "dashboard-data",
69
  "publish-snapshot",
70
  "deploy-dashboard",
 
71
  "full-pipeline",
72
  )
73
  return {command: command_defaults(command, config_path=config_path) for command in commands}
 
184
  )
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
188
  analyze = subparsers.add_parser(
189
  "analyze", help="Analyze a local snapshot and write a shortlist JSON report."
 
637
  status.add_argument("--repo", help="Optional repo override.")
638
  status.add_argument("--json", action="store_true", help="Emit JSON.")
639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
  def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
642
  new_contributor = subparsers.add_parser(
 
659
  new_contributor.add_argument(
660
  "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
661
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  new_contributor.add_argument(
663
  "--window-days",
664
  type=int,
 
702
  type=Path,
703
  help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
704
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
  dashboard.add_argument(
706
  "--window-days",
707
  type=int,
 
761
  deploy_dashboard.add_argument(
762
  "--contributors-input", type=Path, help="Optional contributor report JSON override."
763
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
  deploy_dashboard.add_argument(
765
  "--refresh-contributors",
766
  action="store_true",
 
817
  )
818
 
819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
820
  def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
821
  full_pipeline = subparsers.add_parser(
822
  "full-pipeline",
 
933
  print(run_pipeline(options))
934
 
935
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
  def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
937
  from slop_farmer.reports.analysis import run_analysis
938
 
 
1041
  explain_pr_search_pair,
1042
  format_pr_search_candidate_clusters,
1043
  format_pr_search_cluster,
 
 
1044
  format_pr_search_pair,
1045
  format_pr_search_probe,
 
1046
  format_pr_search_similar,
1047
  format_pr_search_status,
1048
  get_pr_search_candidate_clusters,
1049
  get_pr_search_cluster,
 
 
 
1050
  get_pr_search_similar,
1051
  get_pr_search_status,
1052
  probe_pr_search_github,
 
1140
  print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
1141
  return
1142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1143
  raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
1144
 
1145
 
 
1181
  del config_path
1182
  from slop_farmer.reports.new_contributor_report import run_new_contributor_report
1183
 
 
1184
  print(
1185
  run_new_contributor_report(
1186
  NewContributorReportOptions(
 
1188
  output_dir=args.output_dir,
1189
  output=args.output,
1190
  json_output=args.json_output,
 
 
 
1191
  window_days=args.window_days,
1192
  max_authors=args.max_authors,
1193
  )
 
1199
  from slop_farmer.reports.dashboard import run_dashboard_data
1200
 
1201
  dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
 
1202
  print(
1203
  run_dashboard_data(
1204
  DashboardDataOptions(
 
1207
  analysis_input=args.analysis_input,
1208
  contributors_input=args.contributors_input,
1209
  pr_scope_input=args.pr_scope_input,
 
 
 
1210
  window_days=args.window_days,
1211
  snapshot_root=(
1212
  Path(dashboard_defaults["snapshot-root"])
 
1222
  del config_path
1223
  from slop_farmer.app.deploy import run_deploy_dashboard
1224
 
 
1225
  run_deploy_dashboard(
1226
  DeployDashboardOptions(
1227
  pipeline_data_dir=args.pipeline_data_dir,
 
1229
  snapshot_dir=args.snapshot_dir,
1230
  analysis_input=args.analysis_input,
1231
  contributors_input=args.contributors_input,
 
 
 
1232
  refresh_contributors=args.refresh_contributors,
1233
  dashboard_window_days=args.dashboard_window_days,
1234
  contributor_window_days=args.contributor_window_days,
 
1247
  )
1248
 
1249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1250
  def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
1251
  del config_path
1252
  from slop_farmer.app.publish import run_publish_snapshot
 
1296
 
1297
  handlers: dict[str, CommandHandler] = {
1298
  "scrape": _run_scrape,
 
1299
  "analyze": _run_analyze,
1300
  "markdown-report": _run_markdown_report,
1301
  "duplicate-prs": _run_duplicate_prs,
 
1306
  "new-contributor-report": _run_new_contributor_report,
1307
  "dashboard-data": _run_dashboard_data,
1308
  "deploy-dashboard": _run_deploy_dashboard,
 
1309
  "publish-snapshot": _run_publish_snapshot,
1310
  "full-pipeline": _run_full_pipeline,
1311
  }
src/slop_farmer/app/deploy.py CHANGED
@@ -5,7 +5,6 @@ import subprocess
5
  from pathlib import Path
6
 
7
  from slop_farmer.config import DeployDashboardOptions
8
- from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
9
 
10
 
11
  def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
@@ -18,16 +17,6 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
18
  {
19
  "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
20
  "WEB_DIR": str(options.web_dir),
21
- "SNAPSHOT_DIR": str(
22
- resolve_snapshot_source_dir(
23
- snapshot_dir=options.snapshot_dir,
24
- local_snapshots_root=options.pipeline_data_dir.resolve() / "snapshots",
25
- hf_repo_id=options.hf_repo_id,
26
- hf_revision=options.hf_revision,
27
- hf_materialize_dir=options.hf_materialize_dir,
28
- hf_output_dir=options.pipeline_data_dir,
29
- )
30
- ),
31
  "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
32
  "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
33
  "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
@@ -39,6 +28,8 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
39
  "SPACE_SHORT_DESCRIPTION": options.space_short_description,
40
  }
41
  )
 
 
42
  if options.analysis_input is not None:
43
  env["ANALYSIS_INPUT"] = str(options.analysis_input)
44
  if options.contributors_input is not None:
 
5
  from pathlib import Path
6
 
7
  from slop_farmer.config import DeployDashboardOptions
 
8
 
9
 
10
  def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
 
17
  {
18
  "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
19
  "WEB_DIR": str(options.web_dir),
 
 
 
 
 
 
 
 
 
 
20
  "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
21
  "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
22
  "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
 
28
  "SPACE_SHORT_DESCRIPTION": options.space_short_description,
29
  }
30
  )
31
+ if options.snapshot_dir is not None:
32
+ env["SNAPSHOT_DIR"] = str(options.snapshot_dir)
33
  if options.analysis_input is not None:
34
  env["ANALYSIS_INPUT"] = str(options.analysis_input)
35
  if options.contributors_input is not None:
src/slop_farmer/app/hf_checkpoint_import.py CHANGED
@@ -28,7 +28,6 @@ from huggingface_hub import HfApi, hf_hub_download
28
 
29
  from slop_farmer.app.publish import publish_snapshot
30
  from slop_farmer.config import CheckpointImportOptions
31
- from slop_farmer.data.dataset_card import build_hf_dataset_card
32
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
33
  from slop_farmer.data.parquet_io import (
34
  SCHEMAS,
@@ -456,15 +455,76 @@ def _viewer_comment_rows(
456
  def _dataset_card(
457
  repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
458
  ) -> str:
459
- return build_hf_dataset_card(
460
- repo_slug,
461
- snapshot_id,
462
- notes=[
463
- f"source HF dataset: `{source_repo_id}`",
464
- f"source checkpoint root: `{checkpoint_root}`",
465
- "links were regenerated locally from text references and timeline events",
466
- ],
467
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
 
469
 
470
  def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
 
28
 
29
  from slop_farmer.app.publish import publish_snapshot
30
  from slop_farmer.config import CheckpointImportOptions
 
31
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
32
  from slop_farmer.data.parquet_io import (
33
  SCHEMAS,
 
455
  def _dataset_card(
456
  repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
457
  ) -> str:
458
+ return f"""---
459
+ pretty_name: Transformers PR Slop Dataset
460
+ configs:
461
+ - config_name: issues
462
+ data_files:
463
+ - split: train
464
+ path: issues.parquet
465
+ default: true
466
+ - config_name: prs
467
+ data_files:
468
+ - split: train
469
+ path: pull_requests.parquet
470
+ - config_name: issue_comments
471
+ data_files:
472
+ - split: train
473
+ path: issue_comments.parquet
474
+ - config_name: pr_comments
475
+ data_files:
476
+ - split: train
477
+ path: pr_comments.parquet
478
+ - config_name: pr_reviews
479
+ data_files:
480
+ - split: train
481
+ path: reviews.parquet
482
+ - config_name: pr_files
483
+ data_files:
484
+ - split: train
485
+ path: pr_files.parquet
486
+ - config_name: pr_diffs
487
+ data_files:
488
+ - split: train
489
+ path: pr_diffs.parquet
490
+ - config_name: review_comments
491
+ data_files:
492
+ - split: train
493
+ path: review_comments.parquet
494
+ - config_name: links
495
+ data_files:
496
+ - split: train
497
+ path: links.parquet
498
+ - config_name: events
499
+ data_files:
500
+ - split: train
501
+ path: events.parquet
502
+ ---
503
+ ---
504
+
505
+ # Transformers PR Slop Dataset
506
+
507
+ Imported checkpoint snapshot for `{repo_slug}`.
508
+
509
+ Files:
510
+ - `issues.parquet`
511
+ - `pull_requests.parquet`
512
+ - `comments.parquet`
513
+ - `issue_comments.parquet`
514
+ - `pr_comments.parquet`
515
+ - `reviews.parquet`
516
+ - `pr_files.parquet`
517
+ - `pr_diffs.parquet`
518
+ - `review_comments.parquet`
519
+ - `links.parquet`
520
+ - `events.parquet`
521
+
522
+ Notes:
523
+ - source HF dataset: `{source_repo_id}`
524
+ - source checkpoint root: `{checkpoint_root}`
525
+ - latest imported checkpoint: `{snapshot_id}`
526
+ - links were regenerated locally from text references and timeline events
527
+ """
528
 
529
 
530
  def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
src/slop_farmer/app/pipeline.py CHANGED
@@ -9,7 +9,6 @@ from typing import Any, Protocol
9
 
10
  from slop_farmer.app.publish import publish_snapshot
11
  from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
12
- from slop_farmer.data.dataset_card import build_hf_dataset_card
13
  from slop_farmer.data.github_api import GitHubClient
14
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
15
  from slop_farmer.data.normalize import (
@@ -113,14 +112,96 @@ def _reference_time_for_age_caps(crawl_started_at: str) -> datetime:
113
  def _dataset_card(
114
  repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
115
  ) -> str:
116
- notes = ["new contributor reviewer artifacts are included"] if include_new_contributors else []
117
- del manifest
118
- return build_hf_dataset_card(
119
- repo,
120
- snapshot_id,
121
- include_new_contributors=include_new_contributors,
122
- notes=notes,
123
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
  def _viewer_comment_rows(
@@ -964,9 +1045,6 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
964
  output_dir=options.output_dir,
965
  output=None,
966
  json_output=None,
967
- hf_repo_id=None,
968
- hf_revision=None,
969
- hf_materialize_dir=None,
970
  window_days=options.new_contributor_window_days,
971
  max_authors=options.new_contributor_max_authors,
972
  )
 
9
 
10
  from slop_farmer.app.publish import publish_snapshot
11
  from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
 
12
  from slop_farmer.data.github_api import GitHubClient
13
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
14
  from slop_farmer.data.normalize import (
 
112
  def _dataset_card(
113
  repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
114
  ) -> str:
115
+ new_contributor_config = ""
116
+ new_contributor_file = ""
117
+ if include_new_contributors:
118
+ new_contributor_config = """- config_name: new_contributors
119
+ data_files:
120
+ - split: train
121
+ path: new_contributors.parquet
122
+ """
123
+ new_contributor_file = """- `new_contributors.parquet`
124
+ - `new-contributors-report.json`
125
+ - `new-contributors-report.md`
126
+ """
127
+ return f"""---
128
+ pretty_name: Transformers PR Slop Dataset
129
+ configs:
130
+ - config_name: issues
131
+ data_files:
132
+ - split: train
133
+ path: issues.parquet
134
+ default: true
135
+ - config_name: prs
136
+ data_files:
137
+ - split: train
138
+ path: pull_requests.parquet
139
+ - config_name: issue_comments
140
+ data_files:
141
+ - split: train
142
+ path: issue_comments.parquet
143
+ - config_name: pr_comments
144
+ data_files:
145
+ - split: train
146
+ path: pr_comments.parquet
147
+ - config_name: pr_reviews
148
+ data_files:
149
+ - split: train
150
+ path: reviews.parquet
151
+ - config_name: pr_files
152
+ data_files:
153
+ - split: train
154
+ path: pr_files.parquet
155
+ - config_name: pr_diffs
156
+ data_files:
157
+ - split: train
158
+ path: pr_diffs.parquet
159
+ - config_name: review_comments
160
+ data_files:
161
+ - split: train
162
+ path: review_comments.parquet
163
+ - config_name: links
164
+ data_files:
165
+ - split: train
166
+ path: links.parquet
167
+ - config_name: events
168
+ data_files:
169
+ - split: train
170
+ path: events.parquet
171
+ {new_contributor_config}---
172
+ ---
173
+
174
+ # Transformers PR Slop Dataset
175
+
176
+ Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo}`.
177
+
178
+ Files:
179
+ - `issues.parquet`
180
+ - `pull_requests.parquet`
181
+ - `comments.parquet`
182
+ - `issue_comments.parquet` (derived view of issue discussion comments)
183
+ - `pr_comments.parquet` (derived view of pull request discussion comments)
184
+ - `reviews.parquet`
185
+ - `pr_files.parquet`
186
+ - `pr_diffs.parquet`
187
+ - `review_comments.parquet`
188
+ - `links.parquet`
189
+ - `events.parquet`
190
+ {new_contributor_file}
191
+
192
+ Use:
193
+ - duplicate PR and issue analysis
194
+ - triage and ranking experiments
195
+ - eval set creation
196
+
197
+ Notes:
198
+ - updated daily
199
+ - latest snapshot: `{snapshot_id}`
200
+ - raw data only; no labels or moderation decisions
201
+ - PR metadata, file-level patch hunks, and full unified diffs are included
202
+ - new contributor reviewer artifacts are included when generated for the snapshot
203
+ - full file contents for changed files are not included
204
+ """
205
 
206
 
207
  def _viewer_comment_rows(
 
1045
  output_dir=options.output_dir,
1046
  output=None,
1047
  json_output=None,
 
 
 
1048
  window_days=options.new_contributor_window_days,
1049
  max_authors=options.new_contributor_max_authors,
1050
  )
src/slop_farmer/app/pr_search.py CHANGED
@@ -10,12 +10,9 @@ get_pr_search_status = pr_search_service.get_pr_search_status
10
  get_pr_search_similar = pr_search_service.get_pr_search_similar
11
  get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
12
  get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
13
- get_pr_search_contributor = pr_search_service.get_pr_search_contributor
14
- get_pr_search_contributor_pulls = pr_search_service.get_pr_search_contributor_pulls
15
  get_pr_search_clusters = pr_search_service.get_pr_search_clusters
16
  list_pr_search_clusters = pr_search_service.list_pr_search_clusters
17
  get_pr_search_cluster = pr_search_service.get_pr_search_cluster
18
- get_pr_search_pull_contributor = pr_search_service.get_pr_search_pull_contributor
19
  explain_pr_search_pair = pr_search_service.explain_pr_search_pair
20
  probe_pr_search_live = pr_search_service.probe_pr_search_live
21
  probe_pr_search_github = pr_search_service.probe_pr_search_github
@@ -34,7 +31,6 @@ def format_pr_search_status(result: Mapping[str, Any]) -> str:
34
  (
35
  "Rows: "
36
  f"documents={counts['documents']} "
37
- f"contributors={counts.get('contributors', 0)} "
38
  f"features={counts['features']} "
39
  f"neighbors={counts['neighbors']} "
40
  f"clusters={counts['clusters']} "
@@ -249,73 +245,3 @@ def format_pr_search_probe(result: Mapping[str, Any]) -> str:
249
  if row.get("reason"):
250
  lines.append(f" reason: {row['reason']}")
251
  return "\n".join(lines)
252
-
253
-
254
- def format_pr_search_contributor(result: Mapping[str, Any]) -> str:
255
- contributor = result["contributor"]
256
- lines = [
257
- f"Contributor {contributor['author_login']}",
258
- f"Repo: {result['repo']}",
259
- f"Snapshot: {result['snapshot_id']}",
260
- f"Name: {contributor.get('name') or '-'}",
261
- f"Profile: {contributor.get('profile_url') or '-'}",
262
- f"Association: {contributor.get('repo_association') or '-'}",
263
- f"First seen in snapshot: {'yes' if contributor.get('first_seen_in_snapshot') else 'no'}",
264
- (
265
- "Scores: "
266
- f"follow-through={contributor.get('follow_through_score') or '-'} "
267
- f"breadth={contributor.get('breadth_score') or '-'} "
268
- f"risk={contributor.get('automation_risk_signal') or '-'}"
269
- ),
270
- f"Heuristic: {contributor.get('heuristic_note') or '-'}",
271
- f"Public orgs: {', '.join(contributor.get('public_orgs') or []) or '-'}",
272
- "",
273
- "Recent indexed PRs:",
274
- ]
275
- pulls = result.get("pulls") or []
276
- if not pulls:
277
- lines.append("- none")
278
- return "\n".join(lines)
279
- for row in pulls:
280
- lines.append(
281
- f"- PR #{row['pr_number']}: {row.get('title') or ''} "
282
- f"[state={row.get('state') or '-'} merged={'yes' if row.get('merged') else 'no'}]"
283
- )
284
- return "\n".join(lines)
285
-
286
-
287
- def format_pr_search_contributor_pulls(result: Mapping[str, Any]) -> str:
288
- contributor = result["contributor"]
289
- lines = [
290
- f"Contributor PRs: {contributor['author_login']}",
291
- f"Repo: {result['repo']}",
292
- f"Snapshot: {result['snapshot_id']}",
293
- f"Pull requests: {result.get('pull_count', len(result.get('pulls') or []))}",
294
- "",
295
- ]
296
- pulls = result.get("pulls") or []
297
- if not pulls:
298
- lines.append("No indexed PRs found for that contributor.")
299
- return "\n".join(lines)
300
- for row in pulls:
301
- lines.append(
302
- f"- PR #{row['pr_number']}: {row.get('title') or ''} "
303
- f"(updated={row.get('updated_at') or '-'}, state={row.get('state') or '-'})"
304
- )
305
- return "\n".join(lines)
306
-
307
-
308
- def format_pr_search_pull_contributor(result: Mapping[str, Any]) -> str:
309
- pr = result["pr"]
310
- contributor = result["contributor"]
311
- return "\n".join(
312
- [
313
- f"PR #{pr['pr_number']}: {pr.get('title') or ''}",
314
- f"Author: {contributor['author_login']}",
315
- f"Risk: {contributor.get('automation_risk_signal') or '-'}",
316
- f"Follow-through: {contributor.get('follow_through_score') or '-'}",
317
- f"Breadth: {contributor.get('breadth_score') or '-'}",
318
- f"Heuristic: {contributor.get('heuristic_note') or '-'}",
319
- f"Profile: {contributor.get('profile_url') or '-'}",
320
- ]
321
- )
 
10
  get_pr_search_similar = pr_search_service.get_pr_search_similar
11
  get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
12
  get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
 
 
13
  get_pr_search_clusters = pr_search_service.get_pr_search_clusters
14
  list_pr_search_clusters = pr_search_service.list_pr_search_clusters
15
  get_pr_search_cluster = pr_search_service.get_pr_search_cluster
 
16
  explain_pr_search_pair = pr_search_service.explain_pr_search_pair
17
  probe_pr_search_live = pr_search_service.probe_pr_search_live
18
  probe_pr_search_github = pr_search_service.probe_pr_search_github
 
31
  (
32
  "Rows: "
33
  f"documents={counts['documents']} "
 
34
  f"features={counts['features']} "
35
  f"neighbors={counts['neighbors']} "
36
  f"clusters={counts['clusters']} "
 
245
  if row.get("reason"):
246
  lines.append(f" reason: {row['reason']}")
247
  return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/slop_farmer/app/pr_search_api.py CHANGED
@@ -11,25 +11,30 @@ from fastapi.responses import JSONResponse
11
 
12
  from slop_farmer.config import PrSearchRefreshOptions
13
  from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
14
- from slop_farmer.reports.analysis_service import (
15
- get_analysis_best,
16
- get_analysis_meta_bug,
17
- get_analysis_status,
18
- get_pr_analysis,
19
- list_analysis_duplicate_prs,
20
- list_analysis_meta_bugs,
21
- )
22
  from slop_farmer.reports.pr_search_service import (
23
  get_pr_search_cluster,
24
  get_pr_search_clusters,
25
- get_pr_search_contributor,
26
- get_pr_search_contributor_pulls,
27
- get_pr_search_pull_contributor,
28
  get_pr_search_similar_lookup,
29
  get_pr_search_status,
30
  list_pr_search_clusters,
31
  run_pr_search_refresh,
32
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  @dataclass(slots=True)
@@ -37,7 +42,6 @@ class PrSearchApiSettings:
37
  default_repo: str | None
38
  index_path: Path
39
  output_dir: Path
40
- analysis_dir: Path | None = None
41
  snapshot_dir: Path | None = None
42
  hf_repo_id: str | None = None
43
  hf_revision: str | None = None
@@ -55,6 +59,10 @@ class PrSearchApiSettings:
55
  candidate_limit_max: int = 20
56
  cluster_list_limit_default: int = 50
57
  cluster_list_limit_max: int = 200
 
 
 
 
58
  probe_limit_default: int = 10
59
  probe_limit_max: int = 25
60
 
@@ -70,7 +78,6 @@ class PrSearchApiSettings:
70
  default_repo=os.environ.get("DEFAULT_REPO"),
71
  index_path=index_path,
72
  output_dir=output_dir,
73
- analysis_dir=_env_path("ANALYSIS_DIR") or (output_dir / "analysis"),
74
  snapshot_dir=snapshot_dir,
75
  hf_repo_id=os.environ.get("HF_REPO_ID"),
76
  hf_revision=os.environ.get("HF_REVISION"),
@@ -88,6 +95,10 @@ class PrSearchApiSettings:
88
  candidate_limit_max=_env_int("CANDIDATE_LIMIT_MAX", 20),
89
  cluster_list_limit_default=_env_int("CLUSTER_LIST_LIMIT_DEFAULT", 50),
90
  cluster_list_limit_max=_env_int("CLUSTER_LIST_LIMIT_MAX", 200),
 
 
 
 
91
  probe_limit_default=_env_int("PROBE_LIMIT_DEFAULT", 10),
92
  probe_limit_max=_env_int("PROBE_LIMIT_MAX", 25),
93
  )
@@ -102,13 +113,14 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
102
  app.state.ready = False
103
  app.state.startup_error = None
104
  try:
 
105
  _bootstrap_index(api_settings)
106
  app.state.ready = _is_ready(api_settings)
107
  except Exception as exc:
108
  app.state.startup_error = str(exc)
109
  yield
110
 
111
- app = FastAPI(title="slop PR search API", version="0.1.1", lifespan=lifespan)
112
 
113
  @app.exception_handler(ValueError)
114
  async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
@@ -139,7 +151,9 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
139
  async def repo_status(owner: str, repo: str, request: Request) -> dict[str, Any]:
140
  settings = request.app.state.settings
141
  repo_slug = _repo_slug(settings, owner, repo)
142
- return get_pr_search_status(settings.index_path, repo=repo_slug)
 
 
143
 
144
  @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
145
  async def pr_similar(
@@ -217,80 +231,89 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
217
  ),
218
  )
219
 
220
- @app.get("/v1/repos/{owner}/{repo}/contributors/{login}")
221
- async def contributor_view(
222
- owner: str, repo: str, login: str, request: Request
 
 
 
223
  ) -> dict[str, Any]:
224
  settings = request.app.state.settings
225
  repo_slug = _repo_slug(settings, owner, repo)
226
- return get_pr_search_contributor(settings.index_path, repo=repo_slug, author_login=login)
227
 
228
- @app.get("/v1/repos/{owner}/{repo}/contributors/{login}/pulls")
229
- async def contributor_pulls(
230
  owner: str,
231
  repo: str,
232
- login: str,
233
  request: Request,
234
  limit: int | None = None,
 
235
  ) -> dict[str, Any]:
236
  settings = request.app.state.settings
237
  repo_slug = _repo_slug(settings, owner, repo)
238
- return get_pr_search_contributor_pulls(
239
- settings.index_path,
240
- repo=repo_slug,
241
- author_login=login,
242
  limit=_limit(
243
- limit, default=settings.similar_limit_default, maximum=settings.similar_limit_max
 
 
244
  ),
 
245
  )
246
 
247
- @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/contributor")
248
- async def pull_contributor(
249
  owner: str,
250
  repo: str,
251
- number: int,
252
  request: Request,
 
253
  ) -> dict[str, Any]:
254
  settings = request.app.state.settings
255
  repo_slug = _repo_slug(settings, owner, repo)
256
- return get_pr_search_pull_contributor(settings.index_path, repo=repo_slug, pr_number=number)
 
 
 
 
257
 
258
- @app.get("/v1/repos/{owner}/{repo}/analysis/status")
259
- async def analysis_status(
260
  owner: str,
261
  repo: str,
 
262
  request: Request,
263
  variant: Literal["auto", "hybrid", "deterministic"] = "auto",
264
  ) -> dict[str, Any]:
265
  settings = request.app.state.settings
266
  repo_slug = _repo_slug(settings, owner, repo)
267
- return get_analysis_status(
268
- settings.index_path,
269
- repo=repo_slug,
270
  variant=variant,
271
- analysis_root=settings.analysis_dir,
272
  )
273
 
274
- @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
275
- async def pr_analysis(
276
  owner: str,
277
  repo: str,
278
  number: int,
279
  request: Request,
 
280
  variant: Literal["auto", "hybrid", "deterministic"] = "auto",
281
  ) -> dict[str, Any]:
282
  settings = request.app.state.settings
283
  repo_slug = _repo_slug(settings, owner, repo)
284
- return get_pr_analysis(
285
- settings.index_path,
286
- repo=repo_slug,
287
  pr_number=number,
 
288
  variant=variant,
289
- analysis_root=settings.analysis_dir,
290
  )
291
 
292
- @app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
293
- async def analysis_meta_bugs(
294
  owner: str,
295
  repo: str,
296
  request: Request,
@@ -299,73 +322,76 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
299
  ) -> dict[str, Any]:
300
  settings = request.app.state.settings
301
  repo_slug = _repo_slug(settings, owner, repo)
302
- return list_analysis_meta_bugs(
303
- settings.index_path,
304
- repo=repo_slug,
305
- variant=variant,
306
- analysis_root=settings.analysis_dir,
307
  limit=_limit(
308
  limit,
309
- default=settings.cluster_list_limit_default,
310
- maximum=settings.cluster_list_limit_max,
311
  ),
 
312
  )
313
 
314
- @app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs/{cluster_id}")
315
- async def analysis_meta_bug(
316
  owner: str,
317
  repo: str,
318
- cluster_id: str,
319
  request: Request,
320
  variant: Literal["auto", "hybrid", "deterministic"] = "auto",
321
  ) -> dict[str, Any]:
322
  settings = request.app.state.settings
323
  repo_slug = _repo_slug(settings, owner, repo)
324
- return get_analysis_meta_bug(
325
- settings.index_path,
326
- repo=repo_slug,
327
- cluster_id=cluster_id,
328
- variant=variant,
329
- analysis_root=settings.analysis_dir,
330
- )
 
 
 
 
331
 
332
- @app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
333
- async def analysis_duplicate_prs(
334
  owner: str,
335
  repo: str,
336
  request: Request,
337
  limit: int | None = None,
338
- variant: Literal["auto", "hybrid", "deterministic"] = "auto",
339
  ) -> dict[str, Any]:
340
  settings = request.app.state.settings
341
  repo_slug = _repo_slug(settings, owner, repo)
342
- return list_analysis_duplicate_prs(
343
- settings.index_path,
344
- repo=repo_slug,
345
- variant=variant,
346
- analysis_root=settings.analysis_dir,
347
  limit=_limit(
348
  limit,
349
- default=settings.cluster_list_limit_default,
350
- maximum=settings.cluster_list_limit_max,
351
  ),
352
  )
353
 
354
- @app.get("/v1/repos/{owner}/{repo}/analysis/best")
355
- async def analysis_best(
356
  owner: str,
357
  repo: str,
 
358
  request: Request,
359
- variant: Literal["auto", "hybrid", "deterministic"] = "auto",
360
  ) -> dict[str, Any]:
361
  settings = request.app.state.settings
362
  repo_slug = _repo_slug(settings, owner, repo)
363
- return get_analysis_best(
364
- settings.index_path,
365
- repo=repo_slug,
366
- variant=variant,
367
- analysis_root=settings.analysis_dir,
368
- )
 
 
 
 
 
 
369
 
370
  return app
371
 
@@ -391,6 +417,21 @@ def _bootstrap_index(settings: PrSearchApiSettings) -> None:
391
  )
392
 
393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  def _needs_refresh(settings: PrSearchApiSettings) -> bool:
395
  if settings.rebuild_on_start:
396
  return True
@@ -429,6 +470,17 @@ def _repo_slug(settings: PrSearchApiSettings, owner: str, repo: str) -> str:
429
  return repo_slug
430
 
431
 
 
 
 
 
 
 
 
 
 
 
 
432
  def _limit(value: int | None, *, default: int, maximum: int) -> int:
433
  limit = default if value is None else value
434
  if limit < 1:
@@ -452,8 +504,6 @@ def _looks_not_found(exc: ValueError) -> bool:
452
  message = str(exc).lower()
453
  return (
454
  "not found" in message
455
- or "analysis report was not found" in message
456
- or "no analysis report was found" in message
457
  or "no active pr search run" in message
458
  or "was not found in the active indexed universe" in message
459
  )
 
11
 
12
  from slop_farmer.config import PrSearchRefreshOptions
13
  from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
14
+ from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
15
+ from slop_farmer.data.snapshot_paths import default_hf_materialize_dir
 
 
 
 
 
 
16
  from slop_farmer.reports.pr_search_service import (
17
  get_pr_search_cluster,
18
  get_pr_search_clusters,
 
 
 
19
  get_pr_search_similar_lookup,
20
  get_pr_search_status,
21
  list_pr_search_clusters,
22
  run_pr_search_refresh,
23
  )
24
+ from slop_farmer.reports.read_views import (
25
+ check_issue_cluster_membership,
26
+ get_contributor,
27
+ get_contributor_risk,
28
+ get_contributor_status,
29
+ get_issue_best,
30
+ get_issue_cluster,
31
+ get_issue_cluster_status,
32
+ get_issue_clusters_for_pr,
33
+ get_snapshot_surfaces,
34
+ list_contributors,
35
+ list_issue_clusters,
36
+ list_issue_duplicate_prs,
37
+ )
38
 
39
 
40
  @dataclass(slots=True)
 
42
  default_repo: str | None
43
  index_path: Path
44
  output_dir: Path
 
45
  snapshot_dir: Path | None = None
46
  hf_repo_id: str | None = None
47
  hf_revision: str | None = None
 
59
  candidate_limit_max: int = 20
60
  cluster_list_limit_default: int = 50
61
  cluster_list_limit_max: int = 200
62
+ issue_list_limit_default: int = 50
63
+ issue_list_limit_max: int = 200
64
+ contributor_list_limit_default: int = 50
65
+ contributor_list_limit_max: int = 200
66
  probe_limit_default: int = 10
67
  probe_limit_max: int = 25
68
 
 
78
  default_repo=os.environ.get("DEFAULT_REPO"),
79
  index_path=index_path,
80
  output_dir=output_dir,
 
81
  snapshot_dir=snapshot_dir,
82
  hf_repo_id=os.environ.get("HF_REPO_ID"),
83
  hf_revision=os.environ.get("HF_REVISION"),
 
95
  candidate_limit_max=_env_int("CANDIDATE_LIMIT_MAX", 20),
96
  cluster_list_limit_default=_env_int("CLUSTER_LIST_LIMIT_DEFAULT", 50),
97
  cluster_list_limit_max=_env_int("CLUSTER_LIST_LIMIT_MAX", 200),
98
+ issue_list_limit_default=_env_int("ISSUE_LIST_LIMIT_DEFAULT", 50),
99
+ issue_list_limit_max=_env_int("ISSUE_LIST_LIMIT_MAX", 200),
100
+ contributor_list_limit_default=_env_int("CONTRIBUTOR_LIST_LIMIT_DEFAULT", 50),
101
+ contributor_list_limit_max=_env_int("CONTRIBUTOR_LIST_LIMIT_MAX", 200),
102
  probe_limit_default=_env_int("PROBE_LIMIT_DEFAULT", 10),
103
  probe_limit_max=_env_int("PROBE_LIMIT_MAX", 25),
104
  )
 
113
  app.state.ready = False
114
  app.state.startup_error = None
115
  try:
116
+ _bootstrap_snapshot_assets(api_settings)
117
  _bootstrap_index(api_settings)
118
  app.state.ready = _is_ready(api_settings)
119
  except Exception as exc:
120
  app.state.startup_error = str(exc)
121
  yield
122
 
123
+ app = FastAPI(title="slop PR search API", version="0.1.0", lifespan=lifespan)
124
 
125
  @app.exception_handler(ValueError)
126
  async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
 
151
  async def repo_status(owner: str, repo: str, request: Request) -> dict[str, Any]:
152
  settings = request.app.state.settings
153
  repo_slug = _repo_slug(settings, owner, repo)
154
+ status = get_pr_search_status(settings.index_path, repo=repo_slug)
155
+ snapshot_dir = _status_snapshot_dir(status)
156
+ return {**status, "surfaces": get_snapshot_surfaces(snapshot_dir)}
157
 
158
  @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
159
  async def pr_similar(
 
231
  ),
232
  )
233
 
234
+ @app.get("/v1/repos/{owner}/{repo}/issues/status")
235
+ async def issue_status(
236
+ owner: str,
237
+ repo: str,
238
+ request: Request,
239
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
240
  ) -> dict[str, Any]:
241
  settings = request.app.state.settings
242
  repo_slug = _repo_slug(settings, owner, repo)
243
+ return get_issue_cluster_status(_active_snapshot_dir(settings, repo_slug), variant=variant)
244
 
245
+ @app.get("/v1/repos/{owner}/{repo}/issues/clusters")
246
+ async def issue_clusters(
247
  owner: str,
248
  repo: str,
 
249
  request: Request,
250
  limit: int | None = None,
251
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
252
  ) -> dict[str, Any]:
253
  settings = request.app.state.settings
254
  repo_slug = _repo_slug(settings, owner, repo)
255
+ return list_issue_clusters(
256
+ _active_snapshot_dir(settings, repo_slug),
 
 
257
  limit=_limit(
258
+ limit,
259
+ default=settings.issue_list_limit_default,
260
+ maximum=settings.issue_list_limit_max,
261
  ),
262
+ variant=variant,
263
  )
264
 
265
+ @app.get("/v1/repos/{owner}/{repo}/issues/clusters/{cluster_id}")
266
+ async def issue_cluster(
267
  owner: str,
268
  repo: str,
269
+ cluster_id: str,
270
  request: Request,
271
+ variant: Literal["auto", "hybrid", "deterministic"] = "auto",
272
  ) -> dict[str, Any]:
273
  settings = request.app.state.settings
274
  repo_slug = _repo_slug(settings, owner, repo)
275
+ return get_issue_cluster(
276
+ _active_snapshot_dir(settings, repo_slug),
277
+ cluster_id=cluster_id,
278
+ variant=variant,
279
+ )
280
 
281
+ @app.get("/v1/repos/{owner}/{repo}/issues/pulls/{number}")
282
+ async def issue_clusters_for_pr(
283
  owner: str,
284
  repo: str,
285
+ number: int,
286
  request: Request,
287
  variant: Literal["auto", "hybrid", "deterministic"] = "auto",
288
  ) -> dict[str, Any]:
289
  settings = request.app.state.settings
290
  repo_slug = _repo_slug(settings, owner, repo)
291
+ return get_issue_clusters_for_pr(
292
+ _active_snapshot_dir(settings, repo_slug),
293
+ pr_number=number,
294
  variant=variant,
 
295
  )
296
 
297
+ @app.get("/v1/repos/{owner}/{repo}/issues/pulls/{number}/membership")
298
+ async def issue_membership_for_pr(
299
  owner: str,
300
  repo: str,
301
  number: int,
302
  request: Request,
303
+ cluster_id: str | None = None,
304
  variant: Literal["auto", "hybrid", "deterministic"] = "auto",
305
  ) -> dict[str, Any]:
306
  settings = request.app.state.settings
307
  repo_slug = _repo_slug(settings, owner, repo)
308
+ return check_issue_cluster_membership(
309
+ _active_snapshot_dir(settings, repo_slug),
 
310
  pr_number=number,
311
+ cluster_id=cluster_id,
312
  variant=variant,
 
313
  )
314
 
315
+ @app.get("/v1/repos/{owner}/{repo}/issues/duplicate-prs")
316
+ async def issue_duplicate_prs(
317
  owner: str,
318
  repo: str,
319
  request: Request,
 
322
  ) -> dict[str, Any]:
323
  settings = request.app.state.settings
324
  repo_slug = _repo_slug(settings, owner, repo)
325
+ return list_issue_duplicate_prs(
326
+ _active_snapshot_dir(settings, repo_slug),
 
 
 
327
  limit=_limit(
328
  limit,
329
+ default=settings.issue_list_limit_default,
330
+ maximum=settings.issue_list_limit_max,
331
  ),
332
+ variant=variant,
333
  )
334
 
335
+ @app.get("/v1/repos/{owner}/{repo}/issues/best")
336
+ async def issue_best(
337
  owner: str,
338
  repo: str,
 
339
  request: Request,
340
  variant: Literal["auto", "hybrid", "deterministic"] = "auto",
341
  ) -> dict[str, Any]:
342
  settings = request.app.state.settings
343
  repo_slug = _repo_slug(settings, owner, repo)
344
+ return get_issue_best(_active_snapshot_dir(settings, repo_slug), variant=variant)
345
+
346
+ @app.get("/v1/repos/{owner}/{repo}/contributors/status")
347
+ async def contributor_status(
348
+ owner: str,
349
+ repo: str,
350
+ request: Request,
351
+ ) -> dict[str, Any]:
352
+ settings = request.app.state.settings
353
+ repo_slug = _repo_slug(settings, owner, repo)
354
+ return get_contributor_status(_active_snapshot_dir(settings, repo_slug))
355
 
356
+ @app.get("/v1/repos/{owner}/{repo}/contributors")
357
+ async def contributors(
358
  owner: str,
359
  repo: str,
360
  request: Request,
361
  limit: int | None = None,
 
362
  ) -> dict[str, Any]:
363
  settings = request.app.state.settings
364
  repo_slug = _repo_slug(settings, owner, repo)
365
+ return list_contributors(
366
+ _active_snapshot_dir(settings, repo_slug),
 
 
 
367
  limit=_limit(
368
  limit,
369
+ default=settings.contributor_list_limit_default,
370
+ maximum=settings.contributor_list_limit_max,
371
  ),
372
  )
373
 
374
+ @app.get("/v1/repos/{owner}/{repo}/contributors/{login}")
375
+ async def contributor(
376
  owner: str,
377
  repo: str,
378
+ login: str,
379
  request: Request,
 
380
  ) -> dict[str, Any]:
381
  settings = request.app.state.settings
382
  repo_slug = _repo_slug(settings, owner, repo)
383
+ return get_contributor(_active_snapshot_dir(settings, repo_slug), author_login=login)
384
+
385
+ @app.get("/v1/repos/{owner}/{repo}/contributors/{login}/risk")
386
+ async def contributor_risk(
387
+ owner: str,
388
+ repo: str,
389
+ login: str,
390
+ request: Request,
391
+ ) -> dict[str, Any]:
392
+ settings = request.app.state.settings
393
+ repo_slug = _repo_slug(settings, owner, repo)
394
+ return get_contributor_risk(_active_snapshot_dir(settings, repo_slug), author_login=login)
395
 
396
  return app
397
 
 
417
  )
418
 
419
 
420
+ def _bootstrap_snapshot_assets(settings: PrSearchApiSettings) -> None:
421
+ if settings.snapshot_dir is not None or settings.hf_repo_id is None:
422
+ return
423
+ materialize_dir = settings.hf_materialize_dir or default_hf_materialize_dir(
424
+ settings.output_dir,
425
+ settings.hf_repo_id,
426
+ settings.hf_revision,
427
+ )
428
+ materialize_hf_dataset_snapshot(
429
+ repo_id=settings.hf_repo_id,
430
+ local_dir=materialize_dir,
431
+ revision=settings.hf_revision,
432
+ )
433
+
434
+
435
  def _needs_refresh(settings: PrSearchApiSettings) -> bool:
436
  if settings.rebuild_on_start:
437
  return True
 
470
  return repo_slug
471
 
472
 
473
+ def _active_snapshot_dir(settings: PrSearchApiSettings, repo_slug: str) -> Path:
474
+ return _status_snapshot_dir(get_pr_search_status(settings.index_path, repo=repo_slug))
475
+
476
+
477
+ def _status_snapshot_dir(status: dict[str, Any]) -> Path:
478
+ snapshot_dir = status.get("snapshot_dir")
479
+ if not snapshot_dir:
480
+ raise HTTPException(status_code=503, detail="active snapshot directory is unavailable")
481
+ return Path(str(snapshot_dir))
482
+
483
+
484
  def _limit(value: int | None, *, default: int, maximum: int) -> int:
485
  limit = default if value is None else value
486
  if limit < 1:
 
504
  message = str(exc).lower()
505
  return (
506
  "not found" in message
 
 
507
  or "no active pr search run" in message
508
  or "was not found in the active indexed universe" in message
509
  )
src/slop_farmer/app/workflow.py CHANGED
@@ -74,9 +74,6 @@ def run_full_pipeline(options: FullPipelineOptions) -> str:
74
  analysis_input=analysis_path,
75
  contributors_input=snapshot_dir / "new-contributors-report.json",
76
  pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
77
- hf_repo_id=None,
78
- hf_revision=None,
79
- hf_materialize_dir=None,
80
  window_days=options.dashboard_window_days,
81
  )
82
  )
 
74
  analysis_input=analysis_path,
75
  contributors_input=snapshot_dir / "new-contributors-report.json",
76
  pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
 
 
 
77
  window_days=options.dashboard_window_days,
78
  )
79
  )
src/slop_farmer/app_config.py CHANGED
@@ -184,18 +184,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
184
  "new-contributor-window-days": contributor_window_days,
185
  "new-contributor-max-authors": contributor_max_authors,
186
  },
187
- "refresh-dataset": {
188
- "repo": repo,
189
- "hf-repo-id": dataset_id,
190
- "fetch-timeline": scrape.get("fetch-timeline"),
191
- "max-issues": scrape.get("max-issues"),
192
- "max-prs": scrape.get("max-prs"),
193
- "max-issue-comments": scrape.get("max-issue-comments"),
194
- "max-reviews-per-pr": scrape.get("max-reviews-per-pr"),
195
- "max-review-comments-per-pr": scrape.get("max-review-comments-per-pr"),
196
- "new-contributor-window-days": contributor_window_days,
197
- "new-contributor-max-authors": contributor_max_authors,
198
- },
199
  "analyze": {
200
  "output-dir": str(data_dir) if data_dir else None,
201
  "hf-repo-id": analysis.get("hf-repo-id", dataset_id),
@@ -213,7 +201,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
213
  },
214
  "pr-scope": {
215
  "output-dir": str(data_dir) if data_dir else None,
216
- "hf-repo-id": dataset_id,
217
  "cluster-suppression-rules": cluster_suppression_rules,
218
  },
219
  "pr-search": {
@@ -223,14 +210,12 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
223
  },
224
  "new-contributor-report": {
225
  "output-dir": str(data_dir) if data_dir else None,
226
- "hf-repo-id": dataset_id,
227
  "window-days": contributor_window_days,
228
  "max-authors": contributor_max_authors,
229
  },
230
  "dashboard-data": {
231
  "output-dir": str(dashboard_dir) if dashboard_dir else None,
232
  "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
233
- "hf-repo-id": dataset_id,
234
  "window-days": dashboard_window_days,
235
  },
236
  "publish-snapshot": {
@@ -251,7 +236,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
251
  "deploy-dashboard": {
252
  "pipeline-data-dir": str(data_dir) if data_dir else None,
253
  "web-dir": str(web_dir) if web_dir else None,
254
- "hf-repo-id": dataset_id,
255
  "dashboard-window-days": dashboard_window_days,
256
  "contributor-window-days": contributor_window_days,
257
  "contributor-max-authors": contributor_max_authors,
@@ -264,11 +248,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
264
  "dataset-id": dataset_id,
265
  "space-tags": tags_value,
266
  },
267
- "dataset-status": {
268
- "repo": repo,
269
- "output-dir": str(data_dir) if data_dir else None,
270
- "hf-repo-id": dataset_id,
271
- },
272
  }
273
  for command, values in defaults.items():
274
  defaults[command] = {key: value for key, value in values.items() if value is not None}
@@ -280,7 +259,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
280
  defaults[command].update(_resolve_command_paths(config_path, values))
281
 
282
  defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
283
- defaults["refresh-dataset"].update(_resolve_command_paths(config_path, scrape))
284
  defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
285
  defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
286
  return defaults
 
184
  "new-contributor-window-days": contributor_window_days,
185
  "new-contributor-max-authors": contributor_max_authors,
186
  },
 
 
 
 
 
 
 
 
 
 
 
 
187
  "analyze": {
188
  "output-dir": str(data_dir) if data_dir else None,
189
  "hf-repo-id": analysis.get("hf-repo-id", dataset_id),
 
201
  },
202
  "pr-scope": {
203
  "output-dir": str(data_dir) if data_dir else None,
 
204
  "cluster-suppression-rules": cluster_suppression_rules,
205
  },
206
  "pr-search": {
 
210
  },
211
  "new-contributor-report": {
212
  "output-dir": str(data_dir) if data_dir else None,
 
213
  "window-days": contributor_window_days,
214
  "max-authors": contributor_max_authors,
215
  },
216
  "dashboard-data": {
217
  "output-dir": str(dashboard_dir) if dashboard_dir else None,
218
  "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
 
219
  "window-days": dashboard_window_days,
220
  },
221
  "publish-snapshot": {
 
236
  "deploy-dashboard": {
237
  "pipeline-data-dir": str(data_dir) if data_dir else None,
238
  "web-dir": str(web_dir) if web_dir else None,
 
239
  "dashboard-window-days": dashboard_window_days,
240
  "contributor-window-days": contributor_window_days,
241
  "contributor-max-authors": contributor_max_authors,
 
248
  "dataset-id": dataset_id,
249
  "space-tags": tags_value,
250
  },
 
 
 
 
 
251
  }
252
  for command, values in defaults.items():
253
  defaults[command] = {key: value for key, value in values.items() if value is not None}
 
259
  defaults[command].update(_resolve_command_paths(config_path, values))
260
 
261
  defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
 
262
  defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
263
  defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
264
  return defaults
src/slop_farmer/config.py CHANGED
@@ -127,9 +127,6 @@ class NewContributorReportOptions:
127
  json_output: Path | None
128
  window_days: int
129
  max_authors: int
130
- hf_repo_id: str | None = None
131
- hf_revision: str | None = None
132
- hf_materialize_dir: Path | None = None
133
 
134
 
135
  @dataclass(slots=True)
@@ -140,9 +137,6 @@ class DashboardDataOptions:
140
  contributors_input: Path | None
141
  pr_scope_input: Path | None
142
  window_days: int
143
- hf_repo_id: str | None = None
144
- hf_revision: str | None = None
145
- hf_materialize_dir: Path | None = None
146
  snapshot_root: Path | None = None
147
 
148
 
@@ -161,9 +155,6 @@ class DeployDashboardOptions:
161
  snapshot_dir: Path | None
162
  analysis_input: Path | None
163
  contributors_input: Path | None
164
- hf_repo_id: str | None
165
- hf_revision: str | None
166
- hf_materialize_dir: Path | None
167
  refresh_contributors: bool
168
  dashboard_window_days: int
169
  contributor_window_days: int
@@ -242,32 +233,3 @@ class FullPipelineOptions:
242
  max_issues: int | None
243
  max_prs: int | None
244
  open_prs_only: bool = False
245
-
246
-
247
- @dataclass(slots=True)
248
- class DatasetRefreshOptions:
249
- repo: RepoRef
250
- hf_repo_id: str
251
- private_hf_repo: bool
252
- max_issues: int | None
253
- max_prs: int | None
254
- max_issue_comments: int | None
255
- max_reviews_per_pr: int | None
256
- max_review_comments_per_pr: int | None
257
- fetch_timeline: bool
258
- new_contributor_report: bool
259
- new_contributor_window_days: int
260
- new_contributor_max_authors: int
261
- http_timeout: int
262
- http_max_retries: int
263
- checkpoint_every_comments: int
264
- checkpoint_every_prs: int
265
-
266
-
267
- @dataclass(slots=True)
268
- class DatasetStatusOptions:
269
- output_dir: Path
270
- hf_repo_id: str | None
271
- hf_revision: str | None
272
- repo: str | None = None
273
- json_output: bool = False
 
127
  json_output: Path | None
128
  window_days: int
129
  max_authors: int
 
 
 
130
 
131
 
132
  @dataclass(slots=True)
 
137
  contributors_input: Path | None
138
  pr_scope_input: Path | None
139
  window_days: int
 
 
 
140
  snapshot_root: Path | None = None
141
 
142
 
 
155
  snapshot_dir: Path | None
156
  analysis_input: Path | None
157
  contributors_input: Path | None
 
 
 
158
  refresh_contributors: bool
159
  dashboard_window_days: int
160
  contributor_window_days: int
 
233
  max_issues: int | None
234
  max_prs: int | None
235
  open_prs_only: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/slop_farmer/data/search_duckdb.py CHANGED
@@ -31,7 +31,6 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
31
  "repo",
32
  "pr_number",
33
  "github_id",
34
- "author_login",
35
  "state",
36
  "draft",
37
  "merged",
@@ -47,48 +46,6 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
47
  "review_comments_count",
48
  "html_url",
49
  ),
50
- "pr_search_contributors": (
51
- "run_id",
52
- "repo",
53
- "snapshot_id",
54
- "report_generated_at",
55
- "window_days",
56
- "author_login",
57
- "name",
58
- "profile_url",
59
- "repo_pull_requests_url",
60
- "repo_issues_url",
61
- "repo_first_seen_at",
62
- "repo_last_seen_at",
63
- "repo_primary_artifact_count",
64
- "repo_artifact_count",
65
- "snapshot_issue_count",
66
- "snapshot_pr_count",
67
- "snapshot_comment_count",
68
- "snapshot_review_count",
69
- "snapshot_review_comment_count",
70
- "repo_association",
71
- "new_to_repo",
72
- "first_seen_in_snapshot",
73
- "report_reason",
74
- "account_age_days",
75
- "young_account",
76
- "follow_through_score",
77
- "breadth_score",
78
- "automation_risk_signal",
79
- "heuristic_note",
80
- "public_orgs_json",
81
- "visible_authored_pr_count",
82
- "merged_pr_count",
83
- "closed_unmerged_pr_count",
84
- "open_pr_count",
85
- "merged_pr_rate",
86
- "closed_unmerged_pr_rate",
87
- "still_open_pr_rate",
88
- "distinct_repos_with_authored_prs",
89
- "distinct_repos_with_open_prs",
90
- "fetch_error",
91
- ),
92
  "pr_scope_features": (
93
  "run_id",
94
  "repo",
@@ -187,7 +144,6 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
187
  repo VARCHAR,
188
  pr_number BIGINT,
189
  github_id BIGINT,
190
- author_login VARCHAR,
191
  state VARCHAR,
192
  draft BOOLEAN,
193
  merged BOOLEAN,
@@ -203,48 +159,6 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
203
  review_comments_count BIGINT,
204
  html_url VARCHAR
205
  );
206
- CREATE TABLE IF NOT EXISTS pr_search_contributors (
207
- run_id VARCHAR,
208
- repo VARCHAR,
209
- snapshot_id VARCHAR,
210
- report_generated_at VARCHAR,
211
- window_days BIGINT,
212
- author_login VARCHAR,
213
- name VARCHAR,
214
- profile_url VARCHAR,
215
- repo_pull_requests_url VARCHAR,
216
- repo_issues_url VARCHAR,
217
- repo_first_seen_at VARCHAR,
218
- repo_last_seen_at VARCHAR,
219
- repo_primary_artifact_count BIGINT,
220
- repo_artifact_count BIGINT,
221
- snapshot_issue_count BIGINT,
222
- snapshot_pr_count BIGINT,
223
- snapshot_comment_count BIGINT,
224
- snapshot_review_count BIGINT,
225
- snapshot_review_comment_count BIGINT,
226
- repo_association VARCHAR,
227
- new_to_repo BOOLEAN,
228
- first_seen_in_snapshot BOOLEAN,
229
- report_reason VARCHAR,
230
- account_age_days BIGINT,
231
- young_account BOOLEAN,
232
- follow_through_score VARCHAR,
233
- breadth_score VARCHAR,
234
- automation_risk_signal VARCHAR,
235
- heuristic_note VARCHAR,
236
- public_orgs_json VARCHAR,
237
- visible_authored_pr_count BIGINT,
238
- merged_pr_count BIGINT,
239
- closed_unmerged_pr_count BIGINT,
240
- open_pr_count BIGINT,
241
- merged_pr_rate DOUBLE,
242
- closed_unmerged_pr_rate DOUBLE,
243
- still_open_pr_rate DOUBLE,
244
- distinct_repos_with_authored_prs BIGINT,
245
- distinct_repos_with_open_prs BIGINT,
246
- fetch_error VARCHAR
247
- );
248
  CREATE TABLE IF NOT EXISTS pr_scope_features (
249
  run_id VARCHAR,
250
  repo VARCHAR,
@@ -318,8 +232,6 @@ CREATE TABLE IF NOT EXISTS pr_scope_cluster_candidates (
318
  CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
319
  CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
320
  CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
321
- CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_author ON pr_search_documents (run_id, author_login);
322
- CREATE INDEX IF NOT EXISTS idx_pr_search_contributors_run_author ON pr_search_contributors (run_id, author_login);
323
  CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
324
  CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
325
  CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
@@ -344,9 +256,6 @@ def connect_pr_search_db(path: Path, *, read_only: bool = False) -> duckdb.DuckD
344
 
345
  def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
346
  connection.execute(SCHEMA_SQL)
347
- connection.execute(
348
- "ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
349
- )
350
 
351
 
352
  def insert_rows(
@@ -444,7 +353,6 @@ def resolve_active_run(
444
  def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
445
  return {
446
  "documents": _count(connection, "pr_search_documents", run_id),
447
- "contributors": _count(connection, "pr_search_contributors", run_id),
448
  "features": _count(connection, "pr_scope_features", run_id),
449
  "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
450
  "neighbors": _count(connection, "pr_scope_neighbors", run_id),
@@ -467,60 +375,6 @@ def get_document(
467
  )
468
 
469
 
470
- def get_contributor(
471
- connection: duckdb.DuckDBPyConnection,
472
- *,
473
- run_id: str,
474
- author_login: str,
475
- ) -> dict[str, Any] | None:
476
- return fetch_one(
477
- connection,
478
- """
479
- SELECT *
480
- FROM pr_search_contributors
481
- WHERE run_id = ? AND lower(author_login) = lower(?)
482
- """,
483
- [run_id, author_login],
484
- )
485
-
486
-
487
- def get_contributor_pulls(
488
- connection: duckdb.DuckDBPyConnection,
489
- *,
490
- run_id: str,
491
- author_login: str,
492
- limit: int,
493
- ) -> list[dict[str, Any]]:
494
- return fetch_rows(
495
- connection,
496
- """
497
- SELECT
498
- pr_number,
499
- github_id,
500
- author_login,
501
- state,
502
- draft,
503
- merged,
504
- title,
505
- base_ref,
506
- created_at,
507
- updated_at,
508
- merged_at,
509
- additions,
510
- deletions,
511
- changed_files,
512
- comments_count,
513
- review_comments_count,
514
- html_url
515
- FROM pr_search_documents
516
- WHERE run_id = ? AND lower(author_login) = lower(?)
517
- ORDER BY updated_at DESC NULLS LAST, pr_number DESC
518
- LIMIT ?
519
- """,
520
- [run_id, author_login, limit],
521
- )
522
-
523
-
524
  def get_feature(
525
  connection: duckdb.DuckDBPyConnection,
526
  *,
 
31
  "repo",
32
  "pr_number",
33
  "github_id",
 
34
  "state",
35
  "draft",
36
  "merged",
 
46
  "review_comments_count",
47
  "html_url",
48
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "pr_scope_features": (
50
  "run_id",
51
  "repo",
 
144
  repo VARCHAR,
145
  pr_number BIGINT,
146
  github_id BIGINT,
 
147
  state VARCHAR,
148
  draft BOOLEAN,
149
  merged BOOLEAN,
 
159
  review_comments_count BIGINT,
160
  html_url VARCHAR
161
  );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  CREATE TABLE IF NOT EXISTS pr_scope_features (
163
  run_id VARCHAR,
164
  repo VARCHAR,
 
232
  CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
233
  CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
234
  CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
 
 
235
  CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
236
  CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
237
  CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
 
256
 
257
  def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
258
  connection.execute(SCHEMA_SQL)
 
 
 
259
 
260
 
261
  def insert_rows(
 
353
  def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
354
  return {
355
  "documents": _count(connection, "pr_search_documents", run_id),
 
356
  "features": _count(connection, "pr_scope_features", run_id),
357
  "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
358
  "neighbors": _count(connection, "pr_scope_neighbors", run_id),
 
375
  )
376
 
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  def get_feature(
379
  connection: duckdb.DuckDBPyConnection,
380
  *,
src/slop_farmer/data/snapshot_materialize.py CHANGED
@@ -74,6 +74,9 @@ def _materialize_hf_snapshot_repo_snapshot(
74
  "links.parquet",
75
  "events.parquet",
76
  "manifest.json",
 
 
 
77
  "new_contributors.parquet",
78
  "new-contributors-report.json",
79
  "new-contributors-report.md",
@@ -149,6 +152,9 @@ def _materialize_hf_root_snapshot(
149
  "links.parquet",
150
  "events.parquet",
151
  "manifest.json",
 
 
 
152
  "new_contributors.parquet",
153
  "new-contributors-report.json",
154
  "new-contributors-report.md",
 
74
  "links.parquet",
75
  "events.parquet",
76
  "manifest.json",
77
+ "analysis-report.json",
78
+ "analysis-report-hybrid.json",
79
+ "analysis-report-deterministic.json",
80
  "new_contributors.parquet",
81
  "new-contributors-report.json",
82
  "new-contributors-report.md",
 
152
  "links.parquet",
153
  "events.parquet",
154
  "manifest.json",
155
+ "analysis-report.json",
156
+ "analysis-report-hybrid.json",
157
+ "analysis-report-deterministic.json",
158
  "new_contributors.parquet",
159
  "new-contributors-report.json",
160
  "new-contributors-report.md",
src/slop_farmer/reports/analysis.py CHANGED
@@ -19,7 +19,11 @@ from rank_bm25 import BM25Okapi
19
  from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
20
  from slop_farmer.data.links import build_text_link_rows
21
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
22
- from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 
 
23
  from slop_farmer.reports.analysis_cache import (
24
  HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
25
  PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
@@ -762,14 +766,18 @@ def _artifact_suffix(row: dict[str, Any] | None, kind: str) -> str:
762
 
763
 
764
  def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
765
- return resolve_snapshot_source_dir(
766
- snapshot_dir=options.snapshot_dir,
767
- local_snapshots_root=options.output_dir.resolve() / "snapshots",
768
- hf_repo_id=options.hf_repo_id,
769
- hf_revision=options.hf_revision,
770
- hf_materialize_dir=options.hf_materialize_dir,
771
- hf_output_dir=options.output_dir,
772
- )
 
 
 
 
773
 
774
 
775
  def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
 
19
  from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
20
  from slop_farmer.data.links import build_text_link_rows
21
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
22
+ from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
23
+ from slop_farmer.data.snapshot_paths import (
24
+ default_hf_materialize_dir,
25
+ resolve_snapshot_dir_from_output,
26
+ )
27
  from slop_farmer.reports.analysis_cache import (
28
  HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
29
  PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
 
766
 
767
 
768
  def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
769
+ if options.hf_repo_id:
770
+ materialize_dir = options.hf_materialize_dir or default_hf_materialize_dir(
771
+ options.output_dir,
772
+ options.hf_repo_id,
773
+ options.hf_revision,
774
+ )
775
+ return materialize_hf_dataset_snapshot(
776
+ repo_id=options.hf_repo_id,
777
+ local_dir=materialize_dir,
778
+ revision=options.hf_revision,
779
+ ).resolve()
780
+ return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
781
 
782
 
783
  def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
src/slop_farmer/reports/dashboard.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any
8
 
9
  from slop_farmer.config import DashboardDataOptions
10
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
11
- from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
12
 
13
 
14
  def run_dashboard_data(options: DashboardDataOptions) -> Path:
@@ -88,14 +88,7 @@ def _resolve_snapshot_dir(options: DashboardDataOptions) -> Path:
88
  if options.snapshot_root is not None
89
  else (Path("data") / "snapshots").resolve()
90
  )
91
- return resolve_snapshot_source_dir(
92
- snapshot_dir=options.snapshot_dir,
93
- local_snapshots_root=snapshots_root,
94
- hf_repo_id=options.hf_repo_id,
95
- hf_revision=options.hf_revision,
96
- hf_materialize_dir=options.hf_materialize_dir,
97
- hf_output_dir=snapshots_root.parent,
98
- )
99
 
100
 
101
  def _read_optional_json(path: Path) -> dict[str, Any]:
 
8
 
9
  from slop_farmer.config import DashboardDataOptions
10
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
11
+ from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_snapshots_root
12
 
13
 
14
  def run_dashboard_data(options: DashboardDataOptions) -> Path:
 
88
  if options.snapshot_root is not None
89
  else (Path("data") / "snapshots").resolve()
90
  )
91
+ return resolve_snapshot_dir_from_snapshots_root(snapshots_root, options.snapshot_dir)
 
 
 
 
 
 
 
92
 
93
 
94
  def _read_optional_json(path: Path) -> dict[str, Any]:
src/slop_farmer/reports/new_contributor_report.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any
12
  from slop_farmer.config import NewContributorReportOptions, resolve_github_token
13
  from slop_farmer.data.http import urlopen_with_retry
14
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
15
- from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
16
  from slop_farmer.reports.user_activity import summarize_user
17
 
18
  GRAPHQL_URL = "https://api.github.com/graphql"
@@ -131,14 +131,7 @@ def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
131
 
132
 
133
  def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
134
- return resolve_snapshot_source_dir(
135
- snapshot_dir=options.snapshot_dir,
136
- local_snapshots_root=options.output_dir.resolve() / "snapshots",
137
- hf_repo_id=options.hf_repo_id,
138
- hf_revision=options.hf_revision,
139
- hf_materialize_dir=options.hf_materialize_dir,
140
- hf_output_dir=options.output_dir,
141
- )
142
 
143
 
144
  def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
@@ -251,6 +244,7 @@ def _report_contributors(
251
  previous_report_reusable
252
  and previous_entry is not None
253
  and not previous_entry.get("fetch_error")
 
254
  ):
255
  contributors.append(
256
  _reused_previous_report_entry(
@@ -262,8 +256,6 @@ def _report_contributors(
262
  )
263
  )
264
  reused_previous_report += 1
265
- if known_via_prior_merged_pr:
266
- reused_known_merged += 1
267
  continue
268
  try:
269
  summary = summarize_user(row["author_login"], options.window_days, None)
 
12
  from slop_farmer.config import NewContributorReportOptions, resolve_github_token
13
  from slop_farmer.data.http import urlopen_with_retry
14
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
15
+ from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_output
16
  from slop_farmer.reports.user_activity import summarize_user
17
 
18
  GRAPHQL_URL = "https://api.github.com/graphql"
 
131
 
132
 
133
  def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
134
+ return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
 
 
 
 
 
 
 
135
 
136
 
137
  def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
 
244
  previous_report_reusable
245
  and previous_entry is not None
246
  and not previous_entry.get("fetch_error")
247
+ and not known_via_prior_merged_pr
248
  ):
249
  contributors.append(
250
  _reused_previous_report_entry(
 
256
  )
257
  )
258
  reused_previous_report += 1
 
 
259
  continue
260
  try:
261
  summary = summarize_user(row["author_login"], options.window_days, None)
src/slop_farmer/reports/pr_scope.py CHANGED
@@ -42,7 +42,11 @@ from typing import Any
42
  from pydantic import BaseModel, Field
43
 
44
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
45
- from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 
 
46
  from slop_farmer.reports.pr_heuristics import (
47
  compile_cluster_suppression_rules,
48
  suppressed_pull_request_reasons,
@@ -256,14 +260,17 @@ def run_pr_scope_report(options: Any) -> Path:
256
 
257
 
258
  def _resolve_snapshot_dir(options: Any) -> Path:
259
- return resolve_snapshot_source_dir(
260
- snapshot_dir=options.snapshot_dir,
261
- local_snapshots_root=options.output_dir.resolve() / "snapshots",
262
- hf_repo_id=options.hf_repo_id,
263
- hf_revision=options.hf_revision,
264
- hf_materialize_dir=options.hf_materialize_dir,
265
- hf_output_dir=options.output_dir,
266
- )
 
 
 
267
 
268
 
269
  def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
 
42
  from pydantic import BaseModel, Field
43
 
44
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
45
+ from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
46
+ from slop_farmer.data.snapshot_paths import (
47
+ default_hf_materialize_dir,
48
+ resolve_snapshot_dir_from_output,
49
+ )
50
  from slop_farmer.reports.pr_heuristics import (
51
  compile_cluster_suppression_rules,
52
  suppressed_pull_request_reasons,
 
260
 
261
 
262
  def _resolve_snapshot_dir(options: Any) -> Path:
263
+ if options.hf_repo_id:
264
+ snapshot_dir = materialize_hf_dataset_snapshot(
265
+ repo_id=options.hf_repo_id,
266
+ local_dir=options.hf_materialize_dir
267
+ or default_hf_materialize_dir(
268
+ options.output_dir, options.hf_repo_id, options.hf_revision
269
+ ),
270
+ revision=options.hf_revision,
271
+ )
272
+ return snapshot_dir.resolve()
273
+ return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
274
 
275
 
276
  def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
src/slop_farmer/reports/pr_search_scope.py CHANGED
@@ -10,7 +10,11 @@ from typing import Any
10
 
11
  from slop_farmer.config import PrSearchRefreshOptions
12
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
13
- from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 
 
14
  from slop_farmer.reports.pr_heuristics import (
15
  compile_cluster_suppression_rules,
16
  suppressed_pull_request_reasons,
@@ -32,14 +36,17 @@ DEFAULT_CANDIDATE_LIMIT = 5
32
 
33
 
34
  def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
35
- return resolve_snapshot_source_dir(
36
- snapshot_dir=options.snapshot_dir,
37
- local_snapshots_root=options.output_dir.resolve() / "snapshots",
38
- hf_repo_id=options.hf_repo_id,
39
- hf_revision=options.hf_revision,
40
- hf_materialize_dir=options.hf_materialize_dir,
41
- hf_output_dir=options.output_dir,
42
- )
 
 
 
43
 
44
 
45
  def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
@@ -47,7 +54,6 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
47
  manifest = read_json(manifest_path) if manifest_path.exists() else {}
48
  pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
49
  pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
50
- contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")
51
  repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
52
  snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
53
  return {
@@ -56,7 +62,6 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
56
  "manifest": manifest,
57
  "pull_requests": pull_requests,
58
  "pr_files": pr_files,
59
- "contributors": contributors,
60
  }
61
 
62
 
@@ -407,7 +412,6 @@ def _document_row(row: Mapping[str, Any]) -> dict[str, Any]:
407
  return {
408
  "pr_number": int(row["number"]),
409
  "github_id": row.get("github_id"),
410
- "author_login": row.get("author_login"),
411
  "state": row.get("state"),
412
  "draft": bool(row.get("draft")),
413
  "merged": bool(row.get("merged")),
 
10
 
11
  from slop_farmer.config import PrSearchRefreshOptions
12
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
13
+ from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
14
+ from slop_farmer.data.snapshot_paths import (
15
+ default_hf_materialize_dir,
16
+ resolve_snapshot_dir_from_output,
17
+ )
18
  from slop_farmer.reports.pr_heuristics import (
19
  compile_cluster_suppression_rules,
20
  suppressed_pull_request_reasons,
 
36
 
37
 
38
  def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
39
+ if options.hf_repo_id:
40
+ snapshot_dir = materialize_hf_dataset_snapshot(
41
+ repo_id=options.hf_repo_id,
42
+ local_dir=options.hf_materialize_dir
43
+ or default_hf_materialize_dir(
44
+ options.output_dir, options.hf_repo_id, options.hf_revision
45
+ ),
46
+ revision=options.hf_revision,
47
+ )
48
+ return snapshot_dir.resolve()
49
+ return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
50
 
51
 
52
  def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
 
54
  manifest = read_json(manifest_path) if manifest_path.exists() else {}
55
  pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
56
  pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
 
57
  repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
58
  snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
59
  return {
 
62
  "manifest": manifest,
63
  "pull_requests": pull_requests,
64
  "pr_files": pr_files,
 
65
  }
66
 
67
 
 
412
  return {
413
  "pr_number": int(row["number"]),
414
  "github_id": row.get("github_id"),
 
415
  "state": row.get("state"),
416
  "draft": bool(row.get("draft")),
417
  "merged": bool(row.get("merged")),
src/slop_farmer/reports/pr_search_service.py CHANGED
@@ -1,7 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import json
4
- from collections.abc import Iterable, Mapping, Sequence
5
  from contextlib import suppress
6
  from pathlib import Path
7
  from typing import Any, Protocol
@@ -17,8 +17,6 @@ from slop_farmer.data.search_duckdb import (
17
  get_cluster,
18
  get_cluster_ids_for_prs,
19
  get_cluster_members,
20
- get_contributor,
21
- get_contributor_pulls,
22
  get_document,
23
  get_feature,
24
  get_pair_neighbor_row,
@@ -101,16 +99,6 @@ def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
101
  "pr_search_documents",
102
  _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
103
  )
104
- insert_rows(
105
- connection,
106
- "pr_search_contributors",
107
- _contributor_rows(
108
- snapshot["contributors"],
109
- run_id=run_id,
110
- repo=repo,
111
- snapshot_id=str(snapshot["snapshot_id"]),
112
- ),
113
- )
114
  insert_rows(
115
  connection,
116
  "pr_scope_features",
@@ -302,85 +290,6 @@ def get_pr_search_candidate_clusters(
302
  connection.close()
303
 
304
 
305
- def get_pr_search_contributor(
306
- db_path: Path,
307
- *,
308
- author_login: str,
309
- repo: str | None = None,
310
- ) -> dict[str, Any]:
311
- connection = connect_pr_search_db(db_path, read_only=True)
312
- try:
313
- active_run = resolve_active_run(connection, repo=repo)
314
- run_id = str(active_run["id"])
315
- contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
316
- pulls = _document_rows(
317
- get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
318
- )
319
- return {
320
- "repo": active_run["repo"],
321
- "snapshot_id": active_run["snapshot_id"],
322
- "run_id": run_id,
323
- "contributor": contributor,
324
- "pulls": pulls,
325
- "pull_count": len(pulls),
326
- }
327
- finally:
328
- connection.close()
329
-
330
-
331
- def get_pr_search_contributor_pulls(
332
- db_path: Path,
333
- *,
334
- author_login: str,
335
- repo: str | None = None,
336
- limit: int = 20,
337
- ) -> dict[str, Any]:
338
- connection = connect_pr_search_db(db_path, read_only=True)
339
- try:
340
- active_run = resolve_active_run(connection, repo=repo)
341
- run_id = str(active_run["id"])
342
- contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
343
- pulls = _document_rows(
344
- get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
345
- )
346
- return {
347
- "repo": active_run["repo"],
348
- "snapshot_id": active_run["snapshot_id"],
349
- "run_id": run_id,
350
- "contributor": contributor,
351
- "pulls": pulls,
352
- "pull_count": len(pulls),
353
- }
354
- finally:
355
- connection.close()
356
-
357
-
358
- def get_pr_search_pull_contributor(
359
- db_path: Path,
360
- *,
361
- pr_number: int,
362
- repo: str | None = None,
363
- ) -> dict[str, Any]:
364
- connection = connect_pr_search_db(db_path, read_only=True)
365
- try:
366
- active_run = resolve_active_run(connection, repo=repo)
367
- run_id = str(active_run["id"])
368
- document = _require_document(connection, run_id=run_id, pr_number=pr_number)
369
- author_login = str(document.get("author_login") or "").strip()
370
- if not author_login:
371
- raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
372
- contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
373
- return {
374
- "repo": active_run["repo"],
375
- "snapshot_id": active_run["snapshot_id"],
376
- "run_id": run_id,
377
- "pr": _without_json_fields(document),
378
- "contributor": contributor,
379
- }
380
- finally:
381
- connection.close()
382
-
383
-
384
  def get_pr_search_similar_lookup(
385
  db_path: Path,
386
  *,
@@ -892,15 +801,6 @@ def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[st
892
  return feature
893
 
894
 
895
- def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
896
- contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
897
- if contributor is None:
898
- raise ValueError(
899
- f"Contributor {author_login!r} was not found in the active indexed universe."
900
- )
901
- return _contributor_row(contributor)
902
-
903
-
904
  def _json_list(raw: Any) -> list[str]:
905
  if isinstance(raw, list):
906
  return [str(item) for item in raw]
@@ -938,71 +838,6 @@ def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
938
  return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
939
 
940
 
941
- def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
942
- return [_without_json_fields(row) for row in rows]
943
-
944
-
945
- def _contributor_rows(
946
- rows: list[Mapping[str, Any]],
947
- *,
948
- run_id: str,
949
- repo: str,
950
- snapshot_id: str,
951
- ) -> list[dict[str, Any]]:
952
- return [
953
- {
954
- "run_id": run_id,
955
- "repo": repo,
956
- "snapshot_id": snapshot_id,
957
- "report_generated_at": row.get("report_generated_at"),
958
- "window_days": row.get("window_days"),
959
- "author_login": row.get("author_login"),
960
- "name": row.get("name"),
961
- "profile_url": row.get("profile_url"),
962
- "repo_pull_requests_url": row.get("repo_pull_requests_url"),
963
- "repo_issues_url": row.get("repo_issues_url"),
964
- "repo_first_seen_at": row.get("repo_first_seen_at"),
965
- "repo_last_seen_at": row.get("repo_last_seen_at"),
966
- "repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
967
- "repo_artifact_count": row.get("repo_artifact_count"),
968
- "snapshot_issue_count": row.get("snapshot_issue_count"),
969
- "snapshot_pr_count": row.get("snapshot_pr_count"),
970
- "snapshot_comment_count": row.get("snapshot_comment_count"),
971
- "snapshot_review_count": row.get("snapshot_review_count"),
972
- "snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
973
- "repo_association": row.get("repo_association"),
974
- "new_to_repo": row.get("new_to_repo"),
975
- "first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
976
- "report_reason": row.get("report_reason"),
977
- "account_age_days": row.get("account_age_days"),
978
- "young_account": row.get("young_account"),
979
- "follow_through_score": row.get("follow_through_score"),
980
- "breadth_score": row.get("breadth_score"),
981
- "automation_risk_signal": row.get("automation_risk_signal"),
982
- "heuristic_note": row.get("heuristic_note"),
983
- "public_orgs_json": row.get("public_orgs"),
984
- "visible_authored_pr_count": row.get("visible_authored_pr_count"),
985
- "merged_pr_count": row.get("merged_pr_count"),
986
- "closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
987
- "open_pr_count": row.get("open_pr_count"),
988
- "merged_pr_rate": row.get("merged_pr_rate"),
989
- "closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
990
- "still_open_pr_rate": row.get("still_open_pr_rate"),
991
- "distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
992
- "distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
993
- "fetch_error": row.get("fetch_error"),
994
- }
995
- for row in rows
996
- ]
997
-
998
-
999
- def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
1000
- return {
1001
- **_without_json_fields(row),
1002
- "public_orgs": _json_list(row.get("public_orgs_json")),
1003
- }
1004
-
1005
-
1006
  def _normalize_lookup_mode(mode: str) -> str:
1007
  normalized = mode.strip().lower()
1008
  if normalized not in {"auto", "indexed", "live"}:
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ from collections.abc import Iterable, Mapping
5
  from contextlib import suppress
6
  from pathlib import Path
7
  from typing import Any, Protocol
 
17
  get_cluster,
18
  get_cluster_ids_for_prs,
19
  get_cluster_members,
 
 
20
  get_document,
21
  get_feature,
22
  get_pair_neighbor_row,
 
99
  "pr_search_documents",
100
  _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
101
  )
 
 
 
 
 
 
 
 
 
 
102
  insert_rows(
103
  connection,
104
  "pr_scope_features",
 
290
  connection.close()
291
 
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  def get_pr_search_similar_lookup(
294
  db_path: Path,
295
  *,
 
801
  return feature
802
 
803
 
 
 
 
 
 
 
 
 
 
804
  def _json_list(raw: Any) -> list[str]:
805
  if isinstance(raw, list):
806
  return [str(item) for item in raw]
 
838
  return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
839
 
840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  def _normalize_lookup_mode(mode: str) -> str:
842
  normalized = mode.strip().lower()
843
  if normalized not in {"auto", "indexed", "live"}:
src/slop_farmer/reports/read_views.py ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, Literal
6
+
7
+ from slop_farmer.data.parquet_io import read_json, read_parquet_rows
8
+
9
+ AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
10
+
11
+
12
+ @dataclass(slots=True, frozen=True)
13
+ class _SnapshotMetadata:
14
+ repo: str
15
+ snapshot_id: str
16
+
17
+
18
+ @dataclass(slots=True, frozen=True)
19
+ class _AnalysisSelection:
20
+ path: Path
21
+ payload: dict[str, Any]
22
+ variant_used: str
23
+ llm_enrichment: bool
24
+
25
+
26
def get_snapshot_surfaces(snapshot_dir: Path) -> dict[str, Any]:
    """Summarise which read-view surfaces a snapshot directory can serve.

    Combines the ``"auto"`` issue-cluster status and the contributor status
    into a mapping with two entries, ``"issues"`` and ``"contributors"``.
    """
    issues = get_issue_cluster_status(snapshot_dir, variant="auto")
    contributors = get_contributor_status(snapshot_dir)
    # ``counts`` may be missing or falsy; an empty mapping makes the per-kind
    # lookups below default to zero.
    counts = issues.get("counts") or {}

    issue_surface = {
        "available": issues["available"],
        "variant_used": issues.get("variant_used"),
        "llm_enrichment": issues.get("llm_enrichment"),
        "generated_at": issues.get("generated_at"),
        "cluster_count": counts.get("meta_bugs", 0),
        "duplicate_pr_count": counts.get("duplicate_prs", 0),
        "available_variants": issues.get("available_variants") or [],
    }
    contributor_surface = {
        "available": contributors["available"],
        "generated_at": contributors.get("generated_at"),
        "contributor_count": contributors.get("contributor_count", 0),
    }
    return {"issues": issue_surface, "contributors": contributor_surface}
45
+
46
+
47
def get_issue_cluster_status(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str, Any]:
    """Describe the analysis report that would serve ``variant`` for a snapshot.

    The result always carries the snapshot identity, the requested variant,
    an ``available`` flag and the sorted set of variants found on disk. When
    no report matches, the remaining fields are ``None``/``False`` and all
    counts are zero.
    """
    snapshot = _snapshot_metadata(snapshot_dir)
    reports = _analysis_candidates(snapshot_dir)
    chosen = _select_analysis_report(reports, variant=variant)

    result: dict[str, Any] = {
        "repo": snapshot.repo,
        "snapshot_id": snapshot.snapshot_id,
        "variant_requested": variant,
        "available": chosen is not None,
        "available_variants": sorted({report["variant"] for report in reports}),
    }
    if chosen is None:
        result.update(
            variant_used=None,
            llm_enrichment=False,
            generated_at=None,
            report_path=None,
            counts={"meta_bugs": 0, "duplicate_issues": 0, "duplicate_prs": 0},
        )
        return result

    report = chosen.payload
    result.update(
        variant_used=chosen.variant_used,
        llm_enrichment=chosen.llm_enrichment,
        generated_at=report.get("generated_at"),
        # Only the file name is exposed, not the full path.
        report_path=chosen.path.name,
        counts=_analysis_counts(report),
    )
    return result
76
+
77
+
78
def list_issue_clusters(
    snapshot_dir: Path,
    *,
    limit: int | None,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """List summaries of the ``meta_bugs`` clusters in the selected report.

    Args:
        snapshot_dir: Directory holding the snapshot artifacts.
        limit: Maximum number of clusters to return; ``None`` returns all.
            Negative values are treated as ``0`` instead of slicing from the
            end of the list.
        variant: Which analysis report variant to read.

    Returns:
        The base analysis payload plus ``clusters`` (at most ``limit`` rows,
        ranked from 1 in report order) and ``cluster_count`` (the total
        number of clusters in the report, independent of ``limit``).
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    base = _analysis_base_payload(metadata, selection, variant=variant)
    if selection is None:
        return {**base, "clusters": [], "cluster_count": 0}

    all_clusters = list(selection.payload.get("meta_bugs") or [])
    # Truncate before summarising so only the returned rows are built; the
    # page is a prefix, so enumerate() still yields the report-order ranks.
    page = all_clusters if limit is None else all_clusters[: max(limit, 0)]
    clusters = [
        _issue_cluster_summary(cluster, issue_map, pr_map, rank=index)
        for index, cluster in enumerate(page, start=1)
    ]
    return {
        **base,
        "clusters": clusters,
        "cluster_count": len(all_clusters),
    }
98
+
99
+
100
def get_issue_cluster(
    snapshot_dir: Path,
    *,
    cluster_id: str,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Return the detail view of one cluster from the selected report.

    Clusters are matched on their ``cluster_id``. A cluster without an id is
    matched on the same ``cluster-{rank}`` fallback that the summary rows
    use when given a rank, so every id shown in a listing can be resolved
    here.

    Args:
        snapshot_dir: Directory holding the snapshot artifacts.
        cluster_id: Identifier of the cluster to look up.
        variant: Which analysis report variant to read.

    Returns:
        The base analysis payload plus ``cluster_id``, ``found``, ``cluster``
        (summary and reason fields, or ``None``), ``issues`` and
        ``pull_requests`` member rows. When no report or no matching cluster
        exists, ``found`` is ``False`` and the member lists are empty.
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    base = _analysis_base_payload(metadata, selection, variant=variant)
    not_found = {
        **base,
        "cluster_id": cluster_id,
        "found": False,
        "cluster": None,
        "issues": [],
        "pull_requests": [],
    }
    if selection is None:
        return not_found

    cluster = next(
        (
            row
            for rank, row in enumerate(selection.payload.get("meta_bugs") or [], start=1)
            if str(row.get("cluster_id") or f"cluster-{rank}") == cluster_id
        ),
        None,
    )
    if cluster is None:
        return not_found

    issue_numbers = _ordered_ints(cluster.get("issue_numbers"))
    pr_numbers = _ordered_ints(cluster.get("pr_numbers"))
    canonical_pr_number = _coerce_int(cluster.get("canonical_pr_number"))

    # The detail summary keeps ``rank`` as None (as before); without a rank
    # the summary helper would label an id-less cluster "cluster-0", so the
    # id that was actually resolved is written back.
    summary = _issue_cluster_summary(cluster, issue_map, pr_map)
    summary["cluster_id"] = cluster_id

    return {
        **base,
        "cluster_id": cluster_id,
        "found": True,
        "cluster": {
            **summary,
            "canonical_issue_reason": cluster.get("canonical_issue_reason"),
            "canonical_pr_reason": cluster.get("canonical_pr_reason"),
            "best_issue_reason": cluster.get("best_issue_reason"),
            "best_pr_reason": cluster.get("best_pr_reason"),
        },
        "issues": [_issue_member_row(number, issue_map.get(number)) for number in issue_numbers],
        "pull_requests": [
            _pr_member_row(
                number,
                pr_map.get(number),
                role="canonical" if canonical_pr_number == number else "member",
            )
            for number in pr_numbers
        ],
    }
158
+
159
+
160
def get_issue_clusters_for_pr(
    snapshot_dir: Path,
    *,
    pr_number: int,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Find every cluster in the selected report that lists ``pr_number``.

    Each match is a cluster summary (ranked by report position) extended
    with ``membership_role``: ``"canonical"`` when the PR is the cluster's
    canonical PR, otherwise ``"member"``.
    """
    snapshot, chosen, issues_by_number, prs_by_number = _analysis_context(
        snapshot_dir, variant=variant
    )
    result = _analysis_base_payload(snapshot, chosen, variant=variant)
    result["pr_number"] = pr_number
    if chosen is None:
        result.update(found=False, clusters=[], cluster_count=0)
        return result

    memberships: list[dict[str, Any]] = []
    for position, cluster in enumerate(chosen.payload.get("meta_bugs") or [], start=1):
        if pr_number in _ordered_ints(cluster.get("pr_numbers")):
            lead_pr = _coerce_int(cluster.get("canonical_pr_number"))
            entry = _issue_cluster_summary(
                cluster, issues_by_number, prs_by_number, rank=position
            )
            entry["membership_role"] = "canonical" if lead_pr == pr_number else "member"
            memberships.append(entry)

    result.update(
        found=bool(memberships),
        clusters=memberships,
        cluster_count=len(memberships),
    )
    return result
189
+
190
+
191
def check_issue_cluster_membership(
    snapshot_dir: Path,
    *,
    pr_number: int,
    cluster_id: str | None,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """Report whether ``pr_number`` belongs to a cluster.

    With ``cluster_id`` set to ``None`` the check passes when the PR is in
    any cluster. Otherwise it passes only when the PR is in that specific
    cluster, and the matching row is returned under ``membership``.
    """
    lookup = get_issue_clusters_for_pr(snapshot_dir, pr_number=pr_number, variant=variant)
    memberships = list(lookup.get("clusters") or [])
    member_ids = [
        str(entry.get("cluster_id")) for entry in memberships if entry.get("cluster_id")
    ]

    result = {**lookup, "cluster_id": cluster_id}
    if cluster_id is None:
        result["matched"] = bool(member_ids)
        result["matching_cluster_ids"] = member_ids
        return result

    selected = None
    for entry in memberships:
        if entry.get("cluster_id") == cluster_id:
            selected = entry
            break
    result["matched"] = selected is not None
    result["matching_cluster_ids"] = member_ids
    result["membership"] = selected
    return result
216
+
217
+
218
def list_issue_duplicate_prs(
    snapshot_dir: Path,
    *,
    limit: int | None,
    variant: AnalysisVariant,
) -> dict[str, Any]:
    """List summaries of the ``duplicate_prs`` entries in the selected report.

    Args:
        snapshot_dir: Directory holding the snapshot artifacts.
        limit: Maximum number of entries to return; ``None`` returns all.
            Negative values are treated as ``0`` instead of slicing from the
            end of the list.
        variant: Which analysis report variant to read.

    Returns:
        The base analysis payload plus ``duplicate_prs`` (at most ``limit``
        rows, ranked from 1 in report order) and ``duplicate_pr_count`` (the
        total number of entries in the report, independent of ``limit``).
    """
    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
    base = _analysis_base_payload(metadata, selection, variant=variant)
    if selection is None:
        return {**base, "duplicate_prs": [], "duplicate_pr_count": 0}

    all_entries = list(selection.payload.get("duplicate_prs") or [])
    # Truncate before summarising so only the returned rows are built; the
    # page is a prefix, so enumerate() still yields the report-order ranks.
    page = all_entries if limit is None else all_entries[: max(limit, 0)]
    rows = [
        _duplicate_pr_summary(entry, issue_map, pr_map, rank=index)
        for index, entry in enumerate(page, start=1)
    ]
    return {
        **base,
        "duplicate_prs": rows,
        "duplicate_pr_count": len(all_entries),
    }
238
+
239
+
240
def get_issue_best(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str, Any]:
    """Return the report's ``best_issue`` and ``best_pr`` picks as summaries.

    Both entries are ``None`` when no report is available for ``variant``.
    """
    snapshot, chosen, issues_by_number, prs_by_number = _analysis_context(
        snapshot_dir, variant=variant
    )
    result = _analysis_base_payload(snapshot, chosen, variant=variant)
    if chosen is None:
        result["best_issue"] = None
        result["best_pr"] = None
        return result

    report = chosen.payload
    result["best_issue"] = _best_issue_summary(report.get("best_issue"), issues_by_number)
    result["best_pr"] = _best_pr_summary(report.get("best_pr"), prs_by_number)
    return result
250
+
251
+
252
def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
    """Describe the ``new-contributors-report.json`` of a snapshot.

    ``repo`` and ``snapshot_id`` come from the report when it provides them
    and from the snapshot metadata otherwise. ``available`` is the
    truthiness of the loaded report.
    """
    snapshot = _snapshot_metadata(snapshot_dir)
    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")

    entries = report.get("contributors")
    # Anything other than a list counts as "no contributors".
    contributor_count = len(entries) if isinstance(entries, list) else 0

    repo = str(report.get("repo") or snapshot.repo)
    snapshot_id = str(report.get("snapshot_id") or snapshot.snapshot_id)
    return {
        "repo": repo,
        "snapshot_id": snapshot_id,
        "available": bool(report),
        "generated_at": report.get("generated_at"),
        "window_days": _coerce_int(report.get("window_days")),
        "contributor_count": contributor_count,
    }
264
+
265
+
266
def list_contributors(snapshot_dir: Path, *, limit: int | None) -> dict[str, Any]:
    """List contributor summaries from the snapshot's contributor report.

    Args:
        snapshot_dir: Directory holding the snapshot artifacts.
        limit: Maximum number of contributors to return; ``None`` returns
            all. Negative values are treated as ``0`` instead of slicing
            from the end of the list.

    Returns:
        The contributor status payload plus ``contributors`` (at most
        ``limit`` rows) and ``contributor_count`` (the number of dict
        entries in the report, independent of ``limit``). Each row's
        ``rank`` is its 1-based position in the report, so ranks can skip
        over entries that are not dicts.
    """
    status = get_contributor_status(snapshot_dir)
    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")

    ranked = [
        (index, entry)
        for index, entry in enumerate(report.get("contributors") or [], start=1)
        if isinstance(entry, dict)
    ]
    # Truncate before summarising so only the returned rows are built.
    page = ranked if limit is None else ranked[: max(limit, 0)]
    return {
        **status,
        "contributors": [_contributor_summary(entry, rank=index) for index, entry in page],
        "contributor_count": len(ranked),
    }
280
+
281
+
282
def get_contributor(snapshot_dir: Path, *, author_login: str) -> dict[str, Any]:
    """Look up one contributor in the snapshot's contributor report.

    The result extends the contributor status payload with ``author_login``,
    ``found``, ``summary``, ``risk`` and the raw ``contributor`` entry. The
    last three are ``None`` when the login is not in the report.
    """
    status = get_contributor_status(snapshot_dir)
    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
    match = _find_contributor(report.get("contributors") or [], author_login)

    if match is not None:
        return {
            **status,
            # Prefer the login as stored in the report over the caller's spelling.
            "author_login": str(match.get("author_login") or author_login),
            "found": True,
            "summary": _contributor_summary(match),
            "risk": _contributor_risk(match),
            "contributor": match,
        }
    return {
        **status,
        "author_login": author_login,
        "found": False,
        "summary": None,
        "risk": None,
        "contributor": None,
    }
303
+
304
+
305
def get_contributor_risk(snapshot_dir: Path, *, author_login: str) -> dict[str, Any]:
    """Return only the risk view of a contributor lookup.

    Copies the identifying fields from :func:`get_contributor` and adds
    ``risk_available`` (whether a risk block exists) and ``risk`` itself.
    """
    details = get_contributor(snapshot_dir, author_login=author_login)
    risk = details.get("risk")

    carried_over = (
        "repo",
        "snapshot_id",
        "available",
        "generated_at",
        "author_login",
        "found",
    )
    result: dict[str, Any] = {field: details.get(field) for field in carried_over}
    result["risk_available"] = risk is not None
    result["risk"] = risk
    return result
318
+
319
+
320
def _analysis_context(
    snapshot_dir: Path,
    *,
    variant: AnalysisVariant,
) -> tuple[
    _SnapshotMetadata,
    _AnalysisSelection | None,
    dict[int, dict[str, Any]],
    dict[int, dict[str, Any]],
]:
    """Load what the issue-cluster views need from a snapshot.

    Returns a 4-tuple: snapshot metadata, the selected analysis report (or
    ``None``), the issue map and the pull-request map.
    """
    snapshot = _snapshot_metadata(snapshot_dir)
    reports = _analysis_candidates(snapshot_dir)
    chosen = _select_analysis_report(reports, variant=variant)
    return (snapshot, chosen, *_artifact_maps(snapshot_dir))
329
+
330
+
331
+ def _analysis_base_payload(
332
+ metadata: _SnapshotMetadata,
333
+ selection: _AnalysisSelection | None,
334
+ *,
335
+ variant: AnalysisVariant,
336
+ ) -> dict[str, Any]:
337
+ base = {
338
+ "repo": metadata.repo,
339
+ "snapshot_id": metadata.snapshot_id,
340
+ "variant_requested": variant,
341
+ "available": selection is not None,
342
+ "variant_used": None,
343
+ "llm_enrichment": False,
344
+ "generated_at": None,
345
+ }
346
+ if selection is None:
347
+ return base
348
+ return {
349
+ **base,
350
+ "variant_used": selection.variant_used,
351
+ "llm_enrichment": selection.llm_enrichment,
352
+ "generated_at": selection.payload.get("generated_at"),
353
+ }
354
+
355
+
356
def _analysis_candidates(snapshot_dir: Path) -> list[dict[str, Any]]:
    """Load every usable analysis report found in ``snapshot_dir``.

    Reports whose loaded payload is falsy are skipped. Each candidate is a
    dict with the report ``path``, its ``payload``, the derived ``variant``
    and the boolean ``llm_enrichment`` flag.
    """

    def describe(report_path: Path, report: dict[str, Any]) -> dict[str, Any]:
        enriched = bool(report.get("llm_enrichment"))
        return {
            "path": report_path,
            "payload": report,
            "variant": _analysis_variant(report_path.name, report, llm_enrichment=enriched),
            "llm_enrichment": enriched,
        }

    # Lazy pairing keeps the read-then-describe order of a plain loop.
    loaded = (
        (report_path, _read_optional_json(report_path))
        for report_path in _analysis_report_paths(snapshot_dir)
    )
    return [describe(report_path, report) for report_path, report in loaded if report]
372
+
373
+
374
def _select_analysis_report(
    candidates: list[dict[str, Any]],
    *,
    variant: AnalysisVariant,
) -> _AnalysisSelection | None:
    """Pick the best candidate report for ``variant``.

    ``"auto"`` considers every candidate, ordered by the auto priority. Any
    other variant considers only candidates labelled with that variant,
    ordered by the variant-specific priority. Returns ``None`` when nothing
    qualifies.
    """
    if variant == "auto":
        pool = candidates
        priority = _analysis_auto_priority
    else:
        pool = [entry for entry in candidates if entry["variant"] == variant]
        priority = _analysis_specific_priority
    if not pool:
        return None

    # min() returns the first of several equally ranked entries, which is
    # the same element a stable sort would place first.
    best = min(pool, key=priority)
    return _AnalysisSelection(
        path=Path(best["path"]),
        payload=dict(best["payload"]),
        variant_used=str(best["variant"]),
        llm_enrichment=bool(best["llm_enrichment"]),
    )
395
+
396
+
397
+ def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
398
+ ordered = [
399
+ snapshot_dir / "analysis-report-hybrid.json",
400
+ snapshot_dir / "analysis-report-deterministic.json",
401
+ snapshot_dir / "analysis-report.json",
402
+ ]
403
+ seen = {path.name for path in ordered}
404
+ ordered.extend(
405
+ path for path in sorted(snapshot_dir.glob("analysis-report*.json")) if path.name not in seen
406
+ )
407
+ return [path for path in ordered if path.exists()]
408
+
409
+
410
+ def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
411
+ path = Path(candidate["path"])
412
+ if path.name == "analysis-report-hybrid.json":
413
+ return (0, path.name)
414
+ if bool(candidate.get("llm_enrichment")):
415
+ return (1, path.name)
416
+ if path.name == "analysis-report.json":
417
+ return (2, path.name)
418
+ return (3, path.name)
419
+
420
+
421
+ def _analysis_specific_priority(candidate: dict[str, Any]) -> tuple[int, str]:
422
+ path = Path(candidate["path"])
423
+ if path.name.endswith(f"-{candidate['variant']}.json"):
424
+ return (0, path.name)
425
+ if path.name == "analysis-report.json":
426
+ return (1, path.name)
427
+ return (2, path.name)
428
+
429
+
430
+ def _analysis_variant(path_name: str, payload: dict[str, Any], *, llm_enrichment: bool) -> str:
431
+ lowered = path_name.lower()
432
+ if "hybrid" in lowered:
433
+ return "hybrid"
434
+ if "deterministic" in lowered:
435
+ return "deterministic"
436
+ if isinstance(payload.get("variant_used"), str):
437
+ variant_used = str(payload["variant_used"]).strip().lower()
438
+ if variant_used in {"hybrid", "deterministic"}:
439
+ return variant_used
440
+ return "hybrid" if llm_enrichment else "deterministic"
441
+
442
+
443
+ def _analysis_counts(payload: dict[str, Any]) -> dict[str, int]:
444
+ return {
445
+ "meta_bugs": len(payload.get("meta_bugs") or []),
446
+ "duplicate_issues": len(payload.get("duplicate_issues") or []),
447
+ "duplicate_prs": len(payload.get("duplicate_prs") or []),
448
+ }
449
+
450
+
451
def _artifact_maps(
    snapshot_dir: Path,
) -> tuple[dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
    """Index the snapshot's issue and pull-request rows by number.

    Reads ``issues.parquet`` and ``pull_requests.parquet`` when they exist;
    a missing file yields an empty map. Rows whose ``number`` cannot be
    coerced to an integer are skipped.

    Returns:
        ``(issue_map, pr_map)``, each mapping a number to its row.
    """

    def rows_by_number(file_name: str) -> dict[int, dict[str, Any]]:
        """Load one parquet file (if present) and key its rows by number."""
        parquet_path = snapshot_dir / file_name
        rows = read_parquet_rows(parquet_path) if parquet_path.exists() else []
        indexed: dict[int, dict[str, Any]] = {}
        for row in rows:
            # Key on the value that passed validation instead of converting
            # a second time with int(), which could raise or disagree with
            # the filter. NOTE(review): assumes _coerce_int (defined outside
            # this view) returns an int or None - confirm.
            number = _coerce_int(row.get("number"))
            if number is not None:
                indexed[number] = row
        return indexed

    issue_map = rows_by_number("issues.parquet")
    pr_map = rows_by_number("pull_requests.parquet")
    return issue_map, pr_map
461
+
462
+
463
def _issue_cluster_summary(
    cluster: dict[str, Any],
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
    *,
    rank: int | None = None,
) -> dict[str, Any]:
    """Build the summary row for one cluster of an analysis report.

    When the cluster has no ``cluster_id`` the row uses ``cluster-<rank>``,
    with ``0`` standing in for a missing rank.
    """
    lead_issue = _coerce_int(cluster.get("canonical_issue_number"))
    lead_pr = _coerce_int(cluster.get("canonical_pr_number"))
    member_issues = _ordered_ints(cluster.get("issue_numbers"))
    member_prs = _ordered_ints(cluster.get("pr_numbers"))

    row: dict[str, Any] = {"rank": rank}
    row["cluster_id"] = str(cluster.get("cluster_id") or f"cluster-{rank or 0}")
    row["title"] = _cluster_title(cluster, issue_map, pr_map, lead_issue, lead_pr)
    row["summary"] = cluster.get("summary")
    row["status"] = cluster.get("status")
    row["confidence"] = _coerce_float(cluster.get("confidence"))

    row["canonical_issue_number"] = lead_issue
    row["canonical_issue_title"] = _title_for_issue(lead_issue, issue_map)
    row["canonical_issue_url"] = _url_for_issue(lead_issue, issue_map)

    row["canonical_pr_number"] = lead_pr
    row["canonical_pr_title"] = _title_for_pr(lead_pr, pr_map)
    row["canonical_pr_url"] = _url_for_pr(lead_pr, pr_map)

    row["issue_numbers"] = member_issues
    row["issue_count"] = len(member_issues)
    row["pr_numbers"] = member_prs
    row["pr_count"] = len(member_prs)

    # Falsy evidence entries are dropped; the rest are stringified.
    row["evidence_types"] = [
        str(kind) for kind in (cluster.get("evidence_types") or []) if kind
    ]
    row["github_url"] = _cluster_url(lead_issue, lead_pr, issue_map, pr_map)
    return row
494
+
495
+
496
def _cluster_title(
    cluster: dict[str, Any],
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
    canonical_issue_number: int | None,
    canonical_pr_number: int | None,
) -> str:
    """Choose a display title for a cluster.

    The first non-empty value wins: canonical issue title, canonical PR
    title, the stripped cluster summary, then the cluster id (or the literal
    ``"cluster"`` when there is none).
    """
    # ``or`` evaluates lazily, so later lookups only run when needed.
    return (
        _title_for_issue(canonical_issue_number, issue_map)
        or _title_for_pr(canonical_pr_number, pr_map)
        or str(cluster.get("summary") or "").strip()
        or str(cluster.get("cluster_id") or "cluster")
    )
513
+
514
+
515
def _cluster_url(
    canonical_issue_number: int | None,
    canonical_pr_number: int | None,
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
) -> str | None:
    """Return the canonical issue's URL, falling back to the canonical PR's."""
    issue_url = _url_for_issue(canonical_issue_number, issue_map)
    if issue_url:
        return issue_url
    return _url_for_pr(canonical_pr_number, pr_map)
522
+
523
+
524
def _duplicate_pr_summary(
    entry: dict[str, Any],
    issue_map: dict[int, dict[str, Any]],
    pr_map: dict[int, dict[str, Any]],
    *,
    rank: int,
) -> dict[str, Any]:
    """Build the summary row for one ``duplicate_prs`` entry of a report.

    An entry without a ``cluster_id`` is labelled ``duplicate-pr-<rank>``.
    """
    lead_pr = _coerce_int(entry.get("canonical_pr_number"))
    target_issue = _coerce_int(entry.get("target_issue_number"))
    duplicate_numbers = _ordered_ints(entry.get("duplicate_pr_numbers"))

    row: dict[str, Any] = {"rank": rank}
    row["cluster_id"] = str(entry.get("cluster_id") or f"duplicate-pr-{rank}")

    row["canonical_pr_number"] = lead_pr
    row["canonical_pr_title"] = _title_for_pr(lead_pr, pr_map)
    row["canonical_pr_url"] = _url_for_pr(lead_pr, pr_map)

    row["target_issue_number"] = target_issue
    row["target_issue_title"] = _title_for_issue(target_issue, issue_map)
    row["target_issue_url"] = _url_for_issue(target_issue, issue_map)

    row["duplicate_pr_numbers"] = duplicate_numbers
    row["duplicate_pr_count"] = len(duplicate_numbers)
    row["reason"] = entry.get("reason")
    return row
547
+
548
+
549
def _best_issue_summary(entry: Any, issue_map: dict[int, dict[str, Any]]) -> dict[str, Any] | None:
    """Summarise the report's ``best_issue`` entry, or return ``None`` if it is not a dict."""
    if isinstance(entry, dict):
        number = _coerce_int(entry.get("issue_number"))
        summary: dict[str, Any] = {"cluster_id": entry.get("cluster_id")}
        summary["issue_number"] = number
        summary["title"] = _title_for_issue(number, issue_map)
        summary["url"] = _url_for_issue(number, issue_map)
        summary["reason"] = entry.get("reason")
        summary["score"] = _coerce_float(entry.get("score"))
        return summary
    return None
561
+
562
+
563
def _best_pr_summary(entry: Any, pr_map: dict[int, dict[str, Any]]) -> dict[str, Any] | None:
    """Summarise the report's ``best_pr`` entry, or return ``None`` if it is not a dict."""
    if isinstance(entry, dict):
        number = _coerce_int(entry.get("pr_number"))
        summary: dict[str, Any] = {"cluster_id": entry.get("cluster_id")}
        summary["pr_number"] = number
        summary["title"] = _title_for_pr(number, pr_map)
        summary["url"] = _url_for_pr(number, pr_map)
        summary["reason"] = entry.get("reason")
        summary["score"] = _coerce_float(entry.get("score"))
        return summary
    return None
575
+
576
+
577
+ def _issue_member_row(number: int, row: dict[str, Any] | None) -> dict[str, Any]:
578
+ row = row or {}
579
+ return {
580
+ "number": number,
581
+ "title": row.get("title"),
582
+ "state": row.get("state"),
583
+ "author_login": row.get("author_login"),
584
+ "created_at": row.get("created_at"),
585
+ "updated_at": row.get("updated_at"),
586
+ "html_url": row.get("html_url"),
587
+ }
588
+
589
+
590
+ def _pr_member_row(number: int, row: dict[str, Any] | None, *, role: str) -> dict[str, Any]:
591
+ row = row or {}
592
+ return {
593
+ "number": number,
594
+ "role": role,
595
+ "title": row.get("title"),
596
+ "author_login": row.get("author_login"),
597
+ "state": row.get("state"),
598
+ "draft": bool(row.get("draft")),
599
+ "merged": bool(row.get("merged")),
600
+ "author_association": row.get("author_association"),
601
+ "created_at": row.get("created_at"),
602
+ "updated_at": row.get("updated_at"),
603
+ "html_url": row.get("html_url"),
604
+ }
605
+
606
+
607
def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
    """Build the summary row for one entry of the contributor report.

    The two ``*_42d`` fields are read from the nested ``activity`` mapping;
    they are looked up in an empty mapping when ``activity`` is not a dict.
    """
    activity = contributor.get("activity")
    if not isinstance(activity, dict):
        activity = {}

    row: dict[str, Any] = {"rank": rank}
    for field in (
        "author_login",
        "name",
        "profile_url",
        "repo_association",
        "first_seen_in_snapshot",
        "new_to_repo",
    ):
        row[field] = contributor.get(field)
    # Snapshot counts default to zero when missing or not coercible.
    for field in ("snapshot_pr_count", "snapshot_issue_count"):
        row[field] = _coerce_int(contributor.get(field)) or 0
    for field in (
        "follow_through_score",
        "breadth_score",
        "automation_risk_signal",
        "heuristic_note",
    ):
        row[field] = contributor.get(field)
    row["account_age_days"] = _coerce_int(contributor.get("account_age_days"))
    row["public_pr_count_42d"] = _coerce_int(activity.get("visible_authored_pr_count"))
    row["public_repo_count_42d"] = _coerce_int(activity.get("distinct_repos_with_authored_prs"))
    row["repo_pull_requests_url"] = contributor.get("repo_pull_requests_url")
    row["repo_issues_url"] = contributor.get("repo_issues_url")
    return row
629
+
630
+
631
def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
    """Extract the risk-related fields of a contributor record.

    Signal, note, score and ``report_reason`` fields are copied as-is;
    ``account_age_days`` and the two ``public_*_42d`` values (read from the
    nested ``activity`` dict) are coerced to ints and may be ``None``.
    """
    field = contributor.get
    activity = field("activity")
    if not isinstance(activity, dict):
        # A missing or malformed activity payload yields None for its derived fields.
        activity = {}

    risk: dict[str, Any] = {
        key: field(key)
        for key in (
            "automation_risk_signal",
            "heuristic_note",
            "follow_through_score",
            "breadth_score",
        )
    }
    risk["account_age_days"] = _coerce_int(field("account_age_days"))
    risk["public_pr_count_42d"] = _coerce_int(activity.get("visible_authored_pr_count"))
    risk["public_repo_count_42d"] = _coerce_int(activity.get("distinct_repos_with_authored_prs"))
    risk["report_reason"] = field("report_reason")
    return risk
643
+
644
+
645
+ def _find_contributor(entries: list[Any], author_login: str) -> dict[str, Any] | None:
646
+ lowered = author_login.casefold()
647
+ for entry in entries:
648
+ if not isinstance(entry, dict):
649
+ continue
650
+ login = str(entry.get("author_login") or "")
651
+ if login.casefold() == lowered:
652
+ return entry
653
+ return None
654
+
655
+
656
def _snapshot_metadata(snapshot_dir: Path) -> _SnapshotMetadata:
    """Resolve the repo name and snapshot id for ``snapshot_dir``.

    Values come from ``manifest.json`` when present. The repo falls back to
    ``_infer_repo`` and then to the empty string; the snapshot id falls
    back to the directory name.
    """
    manifest = _read_optional_json(snapshot_dir / "manifest.json")

    repo_value = manifest.get("repo")
    if not repo_value:
        # Only scan the snapshot contents when the manifest has no repo.
        repo_value = _infer_repo(snapshot_dir) or ""

    snapshot_value = manifest.get("snapshot_id") or snapshot_dir.name
    return _SnapshotMetadata(repo=str(repo_value), snapshot_id=str(snapshot_value))
661
+
662
+
663
def _infer_repo(snapshot_dir: Path) -> str | None:
    """Guess the repo name from the files found in ``snapshot_dir``.

    Sources are tried in order and the first truthy ``repo`` value wins:
    the first row of ``pull_requests.parquet`` / ``issues.parquet``, the
    analysis report files, then ``new-contributors-report.json``.
    Returns ``None`` when none of them provides a repo.
    """
    # 1) Tabular snapshot files: inspect the first row only.
    for table_name in ("pull_requests.parquet", "issues.parquet"):
        table_path = snapshot_dir / table_name
        if table_path.exists():
            table_rows = read_parquet_rows(table_path)
            if table_rows and table_rows[0].get("repo"):
                return str(table_rows[0]["repo"])

    # 2) Analysis reports.
    for report_path in _analysis_report_paths(snapshot_dir):
        analysis = _read_optional_json(report_path)
        if analysis.get("repo"):
            return str(analysis["repo"])

    # 3) New-contributors report.
    contributors_report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
    if contributors_report.get("repo"):
        return str(contributors_report["repo"])
    return None
679
+
680
+
681
+ def _title_for_issue(number: int | None, issue_map: dict[int, dict[str, Any]]) -> str | None:
682
+ if number is None or number not in issue_map:
683
+ return None
684
+ title = issue_map[number].get("title")
685
+ return str(title) if title else None
686
+
687
+
688
+ def _url_for_issue(number: int | None, issue_map: dict[int, dict[str, Any]]) -> str | None:
689
+ if number is None or number not in issue_map:
690
+ return None
691
+ value = issue_map[number].get("html_url")
692
+ return str(value) if value else None
693
+
694
+
695
+ def _title_for_pr(number: int | None, pr_map: dict[int, dict[str, Any]]) -> str | None:
696
+ if number is None or number not in pr_map:
697
+ return None
698
+ title = pr_map[number].get("title")
699
+ return str(title) if title else None
700
+
701
+
702
+ def _url_for_pr(number: int | None, pr_map: dict[int, dict[str, Any]]) -> str | None:
703
+ if number is None or number not in pr_map:
704
+ return None
705
+ value = pr_map[number].get("html_url")
706
+ return str(value) if value else None
707
+
708
+
709
def _ordered_ints(values: Any) -> list[int]:
    """Coerce a list of values to ints, preserving order.

    Returns an empty list when ``values`` is not a list. Items that
    ``_coerce_int`` cannot convert are dropped; duplicates are kept.
    """
    if isinstance(values, list):
        return [number for item in values if (number := _coerce_int(item)) is not None]
    return []
718
+
719
+
720
+ def _coerce_int(value: Any) -> int | None:
721
+ if value is None:
722
+ return None
723
+ try:
724
+ return int(value)
725
+ except (TypeError, ValueError):
726
+ return None
727
+
728
+
729
+ def _coerce_float(value: Any) -> float | None:
730
+ if value is None:
731
+ return None
732
+ try:
733
+ return float(value)
734
+ except (TypeError, ValueError):
735
+ return None
736
+
737
+
738
+ def _read_optional_json(path: Path) -> dict[str, Any]:
739
+ if not path.exists():
740
+ return {}
741
+ payload = read_json(path)
742
+ return payload if isinstance(payload, dict) else {}
uv.lock CHANGED
@@ -561,7 +561,7 @@ wheels = [
561
 
562
  [[package]]
563
  name = "fast-agent-mcp"
564
- version = "0.6.17"
565
  source = { registry = "https://pypi.org/simple" }
566
  dependencies = [
567
  { name = "a2a-sdk" },
@@ -598,9 +598,9 @@ dependencies = [
598
  { name = "uvloop", marker = "sys_platform != 'win32'" },
599
  { name = "watchfiles" },
600
  ]
601
- sdist = { url = "https://files.pythonhosted.org/packages/8c/a1/b6b1045345d38b342da3def7723a2dc6a44faff9c01fee6d81afbd272d62/fast_agent_mcp-0.6.17.tar.gz", hash = "sha256:a920113d47ef2ab82be1bd63b77d3bf78f8f862a5a6e91f1fd0aa931850fb25f", size = 2091401, upload-time = "2026-04-16T21:48:43.334Z" }
602
  wheels = [
603
- { url = "https://files.pythonhosted.org/packages/b4/ef/47e05d6fa95e04ed8ad60afac3ae29d8205894fb220ffde193bd33578f3a/fast_agent_mcp-0.6.17-py3-none-any.whl", hash = "sha256:a23c5a5ed8924e38809dabd31f994e5cc81b8c084e84632bb1eb246b257c4752", size = 1573794, upload-time = "2026-04-16T21:48:38.999Z" },
604
  ]
605
 
606
  [[package]]
@@ -2366,7 +2366,7 @@ wheels = [
2366
 
2367
  [[package]]
2368
  name = "slop-farmer"
2369
- version = "0.1.1"
2370
  source = { editable = "." }
2371
  dependencies = [
2372
  { name = "duckdb" },
 
561
 
562
  [[package]]
563
  name = "fast-agent-mcp"
564
+ version = "0.6.18"
565
  source = { registry = "https://pypi.org/simple" }
566
  dependencies = [
567
  { name = "a2a-sdk" },
 
598
  { name = "uvloop", marker = "sys_platform != 'win32'" },
599
  { name = "watchfiles" },
600
  ]
601
+ sdist = { url = "https://files.pythonhosted.org/packages/68/9f/a66344581177eb70cd817a58a3305c4b2c2b5f98661129c2cecc4aa36e77/fast_agent_mcp-0.6.18.tar.gz", hash = "sha256:5ee5624890a9670b6f1a912998807e0fd451aa1c7205d189a964764a988c7bc0", size = 2091443, upload-time = "2026-04-17T20:52:25.84Z" }
602
  wheels = [
603
+ { url = "https://files.pythonhosted.org/packages/49/63/d8942bde2e706c869f93835ea85a2015be0edf5772c4e9ec8939a1001172/fast_agent_mcp-0.6.18-py3-none-any.whl", hash = "sha256:67c0c011763a28b8d5779b5d4d5cdc61e6f3dbc8cd1a7227388229957429835f", size = 1573842, upload-time = "2026-04-17T20:52:28.807Z" },
604
  ]
605
 
606
  [[package]]
 
2366
 
2367
  [[package]]
2368
  name = "slop-farmer"
2369
+ version = "0.1.0"
2370
  source = { editable = "." }
2371
  dependencies = [
2372
  { name = "duckdb" },