evalstate HF Staff committed on
Commit
938e923
·
verified ·
1 Parent(s): b137fbb

Deploy OpenClaw PR API

Browse files
README.md CHANGED
@@ -20,6 +20,12 @@ tags:
20
 
21
  Machine-oriented API for PR similarity search.
22
 
 
 
 
 
 
 
23
  Defaults for this deployment:
24
 
25
  - repo: `openclaw/openclaw`
 
20
 
21
  Machine-oriented API for PR similarity search.
22
 
23
+ Canonical storage roles:
24
+
25
+ - dataset repo: published latest state and canonical current analysis
26
+ - mounted bucket: mutable operational cache only
27
+ - Space disk: ephemeral runtime storage
28
+
29
  Defaults for this deployment:
30
 
31
  - repo: `openclaw/openclaw`
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "slop-farmer"
7
- version = "0.1.0"
8
  description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
9
  readme = "README.md"
10
  requires-python = ">=3.13.5"
@@ -12,7 +12,7 @@ dependencies = [
12
  "duckdb>=1.2.2",
13
  "pyarrow>=18.0.0",
14
  "fastapi>=0.115.0",
15
- "huggingface_hub>=0.30.0",
16
  "pydantic>=2.11",
17
  "PyYAML>=6.0.2",
18
  "rank-bm25>=0.2.2",
@@ -60,13 +60,6 @@ select = [
60
  ]
61
  ignore = ["E501"]
62
 
63
- [tool.slop-farmer.analyze]
64
- output-dir = "eval_data"
65
- hf-repo-id = "evalstate/transformers-pr"
66
- ranking-backend = "hybrid"
67
- model = "gpt-5.4-mini"
68
- max-clusters = 10
69
-
70
  [tool.slop-farmer.dashboard-data]
71
  output-dir = "web/public/data"
72
  window-days = 14
 
4
 
5
  [project]
6
  name = "slop-farmer"
7
+ version = "0.1.1"
8
  description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
9
  readme = "README.md"
10
  requires-python = ">=3.13.5"
 
12
  "duckdb>=1.2.2",
13
  "pyarrow>=18.0.0",
14
  "fastapi>=0.115.0",
15
+ "huggingface_hub>=1.11.0",
16
  "pydantic>=2.11",
17
  "PyYAML>=6.0.2",
18
  "rank-bm25>=0.2.2",
 
60
  ]
61
  ignore = ["E501"]
62
 
 
 
 
 
 
 
 
63
  [tool.slop-farmer.dashboard-data]
64
  output-dir = "web/public/data"
65
  window-days = 14
src/slop_farmer.egg-info/PKG-INFO CHANGED
@@ -1,13 +1,13 @@
1
  Metadata-Version: 2.4
2
  Name: slop-farmer
3
- Version: 0.1.0
4
  Summary: GitHub-to-Hub data pipeline for transformers issue and PR triage research.
5
  Requires-Python: >=3.13.5
6
  Description-Content-Type: text/markdown
7
  Requires-Dist: duckdb>=1.2.2
8
  Requires-Dist: pyarrow>=18.0.0
9
  Requires-Dist: fastapi>=0.115.0
10
- Requires-Dist: huggingface_hub>=0.30.0
11
  Requires-Dist: pydantic>=2.11
12
  Requires-Dist: PyYAML>=6.0.2
13
  Requires-Dist: rank-bm25>=0.2.2
@@ -126,18 +126,25 @@ Authentication defaults:
126
  - GitHub: `GITHUB_TOKEN`, then `gh auth token`
127
  - Hugging Face: `HF_TOKEN`, otherwise existing `hf auth` login
128
 
129
- ## Scheduled Hugging Face Job for `configs/transformers.yaml`
130
 
131
- To keep the Transformers dataset fresh on the Hub without relying on a local watermark,
132
- submit the repo's job script instead:
 
 
 
 
 
 
 
133
 
134
  ```bash
135
- scripts/submit_transformers_dataset_job.sh
136
  ```
137
 
138
  By default this creates a scheduled HF Job that:
139
 
140
- - reads `configs/transformers.yaml`
141
  - refreshes `dataset_id` incrementally against the current Hub dataset state
142
  - regenerates the new contributor report
143
  - uploads the updated snapshot back to the dataset repo
@@ -146,20 +153,28 @@ Useful overrides:
146
 
147
  ```bash
148
  # fire once immediately instead of creating a schedule
149
- MODE=run scripts/submit_transformers_dataset_job.sh
150
 
151
  # change the cron schedule
152
- SCHEDULE="0 */6 * * *" scripts/submit_transformers_dataset_job.sh
153
 
154
  # optionally mount a writable HF bucket for temp files
155
  SCRATCH_BUCKET=evalstate/slop-farmer-scratch \
156
- scripts/submit_transformers_dataset_job.sh
157
  ```
158
 
159
  Buckets are best treated here as optional scratch space via `TMPDIR`, not as the canonical
160
  published dataset. The repo's local analysis and PR-scope tooling already knows how to
161
  materialize versioned Hub **dataset repos**; it does not currently read HF buckets directly.
162
 
 
 
 
 
 
 
 
 
163
  ## Analyze a Hub dataset
164
 
165
  You can analyze the published Hugging Face dataset directly without scraping GitHub again:
@@ -176,15 +191,12 @@ This materializes the dataset-viewer parquet export into a local snapshot cache
176
 
177
  Repo-local defaults for `analyze` can be stored in `pyproject.toml` under `[tool.slop-farmer.analyze]`. This repo currently defaults to:
178
 
179
- - `output-dir = "eval_data"`
180
- - `hf-repo-id = "evalstate/transformers-pr"`
181
- - `ranking-backend = "hybrid"`
182
- - `model = "gpt-5-mini?reasoning=low"`
183
 
184
- So from repo root you can now usually just run:
185
 
186
  ```bash
187
- uv run slop-farmer analyze
188
  ```
189
 
190
  ## Cluster open PRs by code scope
@@ -353,15 +365,19 @@ scrape:
353
  Then commands stay aligned without repeating repo/workspace/window settings:
354
 
355
  ```bash
356
- uv run slop-farmer --config configs/diffusers.yaml scrape --publish
357
  uv run slop-farmer --config configs/diffusers.yaml analyze
358
  uv run slop-farmer --config configs/diffusers.yaml pr-scope
 
359
  uv run slop-farmer --config configs/diffusers.yaml new-contributor-report
360
  uv run slop-farmer --config configs/diffusers.yaml dashboard-data
361
- uv run slop-farmer --config configs/diffusers.yaml publish-snapshot
362
  uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
 
363
  ```
364
 
 
 
 
365
  If you run `analyze` before `publish-snapshot`, the uploaded snapshot will also include
366
  `analysis-state/`, which makes the hybrid cache portable across machines and reusable in
367
  later snapshots when `analysis.cached_analysis: true` is enabled.
 
1
  Metadata-Version: 2.4
2
  Name: slop-farmer
3
+ Version: 0.1.1
4
  Summary: GitHub-to-Hub data pipeline for transformers issue and PR triage research.
5
  Requires-Python: >=3.13.5
6
  Description-Content-Type: text/markdown
7
  Requires-Dist: duckdb>=1.2.2
8
  Requires-Dist: pyarrow>=18.0.0
9
  Requires-Dist: fastapi>=0.115.0
10
+ Requires-Dist: huggingface_hub>=1.11.0
11
  Requires-Dist: pydantic>=2.11
12
  Requires-Dist: PyYAML>=6.0.2
13
  Requires-Dist: rank-bm25>=0.2.2
 
126
  - GitHub: `GITHUB_TOKEN`, then `gh auth token`
127
  - Hugging Face: `HF_TOKEN`, otherwise existing `hf auth` login
128
 
129
+ ## Canonical dataset upkeep
130
 
131
+ `dataset_id` is the canonical latest dataset repo.
132
+
133
+ Use the remote-first writer:
134
+
135
+ ```bash
136
+ uv run slop-farmer --config configs/transformers.yaml refresh-dataset
137
+ ```
138
+
139
+ Or submit the generic HF Job wrapper:
140
 
141
  ```bash
142
+ scripts/submit_dataset_job.sh
143
  ```
144
 
145
  By default this creates a scheduled HF Job that:
146
 
147
+ - reads `CONFIG_PATH` (defaults to `configs/transformers.yaml`)
148
  - refreshes `dataset_id` incrementally against the current Hub dataset state
149
  - regenerates the new contributor report
150
  - uploads the updated snapshot back to the dataset repo
 
153
 
154
  ```bash
155
  # fire once immediately instead of creating a schedule
156
+ MODE=run scripts/submit_dataset_job.sh
157
 
158
  # change the cron schedule
159
+ SCHEDULE="0 */6 * * *" scripts/submit_dataset_job.sh
160
 
161
  # optionally mount a writable HF bucket for temp files
162
  SCRATCH_BUCKET=evalstate/slop-farmer-scratch \
163
+ scripts/submit_dataset_job.sh
164
  ```
165
 
166
  Buckets are best treated here as optional scratch space via `TMPDIR`, not as the canonical
167
  published dataset. The repo's local analysis and PR-scope tooling already knows how to
168
  materialize versioned Hub **dataset repos**; it does not currently read HF buckets directly.
169
 
170
+ Compatibility wrappers remain available:
171
+
172
+ - `scripts/submit_transformers_dataset_job.sh`
173
+ - `scripts/submit_openclaw_dataset_job.sh`
174
+
175
+ For the current storage model and recommended modes, see
176
+ [`docs/data-architecture.md`](docs/data-architecture.md).
177
+
178
  ## Analyze a Hub dataset
179
 
180
  You can analyze the published Hugging Face dataset directly without scraping GitHub again:
 
191
 
192
  Repo-local command defaults can be stored in `pyproject.toml` under `[tool.slop-farmer.<command>]` tables (for example `[tool.slop-farmer.analyze]`). This repo currently defaults to:
193
 
194
+ - `dashboard-data.output-dir = "web/public/data"`
 
 
 
195
 
196
+ For repo-specific remote-first analysis, prefer a YAML config with `dataset_id`, e.g.:
197
 
198
  ```bash
199
+ uv run slop-farmer --config configs/openclaw.yaml analyze
200
  ```
201
 
202
  ## Cluster open PRs by code scope
 
365
  Then commands stay aligned without repeating repo/workspace/window settings:
366
 
367
  ```bash
368
+ uv run slop-farmer --config configs/diffusers.yaml refresh-dataset
369
  uv run slop-farmer --config configs/diffusers.yaml analyze
370
  uv run slop-farmer --config configs/diffusers.yaml pr-scope
371
+ uv run slop-farmer --config configs/diffusers.yaml pr-search refresh
372
  uv run slop-farmer --config configs/diffusers.yaml new-contributor-report
373
  uv run slop-farmer --config configs/diffusers.yaml dashboard-data
 
374
  uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
375
+ uv run slop-farmer --config configs/diffusers.yaml dataset-status
376
  ```
377
 
378
+ Those reader commands default to `dataset_id` when configured. Pass `--snapshot-dir` to force
379
+ an explicit local snapshot instead.
380
+
381
  If you run `analyze` before `publish-snapshot`, the uploaded snapshot will also include
382
  `analysis-state/`, which makes the hybrid cache portable across machines and reusable in
383
  later snapshots when `analysis.cached_analysis: true` is enabled.
src/slop_farmer.egg-info/SOURCES.txt CHANGED
@@ -11,6 +11,8 @@ src/slop_farmer.egg-info/requires.txt
11
  src/slop_farmer.egg-info/top_level.txt
12
  src/slop_farmer/app/__init__.py
13
  src/slop_farmer/app/cli.py
 
 
14
  src/slop_farmer/app/deploy.py
15
  src/slop_farmer/app/duplicate_prs.py
16
  src/slop_farmer/app/hf_checkpoint_import.py
@@ -21,8 +23,10 @@ src/slop_farmer/app/publish.py
21
  src/slop_farmer/app/snapshot_state.py
22
  src/slop_farmer/app/workflow.py
23
  src/slop_farmer/data/__init__.py
 
24
  src/slop_farmer/data/ghreplica_api.py
25
  src/slop_farmer/data/github_api.py
 
26
  src/slop_farmer/data/http.py
27
  src/slop_farmer/data/links.py
28
  src/slop_farmer/data/normalize.py
@@ -30,9 +34,11 @@ src/slop_farmer/data/parquet_io.py
30
  src/slop_farmer/data/search_duckdb.py
31
  src/slop_farmer/data/snapshot_materialize.py
32
  src/slop_farmer/data/snapshot_paths.py
 
33
  src/slop_farmer/reports/__init__.py
34
  src/slop_farmer/reports/analysis.py
35
  src/slop_farmer/reports/analysis_cache.py
 
36
  src/slop_farmer/reports/canonical_duplicate_pr.py
37
  src/slop_farmer/reports/dashboard.py
38
  src/slop_farmer/reports/duplicate_prs.py
@@ -49,6 +55,7 @@ tests/test_canonical_duplicate_pr.py
49
  tests/test_cli.py
50
  tests/test_config.py
51
  tests/test_dashboard.py
 
52
  tests/test_farmer_setup_assets.py
53
  tests/test_ghreplica_api.py
54
  tests/test_github_api.py
 
11
  src/slop_farmer.egg-info/top_level.txt
12
  src/slop_farmer/app/__init__.py
13
  src/slop_farmer/app/cli.py
14
+ src/slop_farmer/app/dataset_refresh.py
15
+ src/slop_farmer/app/dataset_status.py
16
  src/slop_farmer/app/deploy.py
17
  src/slop_farmer/app/duplicate_prs.py
18
  src/slop_farmer/app/hf_checkpoint_import.py
 
23
  src/slop_farmer/app/snapshot_state.py
24
  src/slop_farmer/app/workflow.py
25
  src/slop_farmer/data/__init__.py
26
+ src/slop_farmer/data/dataset_card.py
27
  src/slop_farmer/data/ghreplica_api.py
28
  src/slop_farmer/data/github_api.py
29
+ src/slop_farmer/data/hf_dataset_repo.py
30
  src/slop_farmer/data/http.py
31
  src/slop_farmer/data/links.py
32
  src/slop_farmer/data/normalize.py
 
34
  src/slop_farmer/data/search_duckdb.py
35
  src/slop_farmer/data/snapshot_materialize.py
36
  src/slop_farmer/data/snapshot_paths.py
37
+ src/slop_farmer/data/snapshot_source.py
38
  src/slop_farmer/reports/__init__.py
39
  src/slop_farmer/reports/analysis.py
40
  src/slop_farmer/reports/analysis_cache.py
41
+ src/slop_farmer/reports/analysis_service.py
42
  src/slop_farmer/reports/canonical_duplicate_pr.py
43
  src/slop_farmer/reports/dashboard.py
44
  src/slop_farmer/reports/duplicate_prs.py
 
55
  tests/test_cli.py
56
  tests/test_config.py
57
  tests/test_dashboard.py
58
+ tests/test_dataset_status.py
59
  tests/test_farmer_setup_assets.py
60
  tests/test_ghreplica_api.py
61
  tests/test_github_api.py
src/slop_farmer.egg-info/requires.txt CHANGED
@@ -1,7 +1,7 @@
1
  duckdb>=1.2.2
2
  pyarrow>=18.0.0
3
  fastapi>=0.115.0
4
- huggingface_hub>=0.30.0
5
  pydantic>=2.11
6
  PyYAML>=6.0.2
7
  rank-bm25>=0.2.2
 
1
  duckdb>=1.2.2
2
  pyarrow>=18.0.0
3
  fastapi>=0.115.0
4
+ huggingface_hub>=1.11.0
5
  pydantic>=2.11
6
  PyYAML>=6.0.2
7
  rank-bm25>=0.2.2
src/slop_farmer/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  __all__ = ["__version__"]
2
 
3
- __version__ = "0.1.0"
 
1
  __all__ = ["__version__"]
2
 
3
+ __version__ = "0.1.1"
src/slop_farmer/app/analysis_id.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ from slop_farmer.app_config import command_defaults
7
+ from slop_farmer.data.parquet_io import read_json
8
+ from slop_farmer.data.snapshot_paths import ROOT_MANIFEST_FILENAME, resolve_snapshot_dir_from_output
9
+
10
+ MODEL_SLUG_PATTERN = re.compile(r"[^a-z0-9]+")
11
+
12
+
13
def model_slug(model: str) -> str:
    """Return a compact, lowercase slug for a model string.

    Everything from the first ``?`` onward is discarded, the remainder is
    stripped and lowercased, and every run of characters matched by
    ``MODEL_SLUG_PATTERN`` (anything outside ``a-z0-9``) is removed.

    Returns:
        The slug, or the literal ``"model"`` when nothing is left.
    """
    name, _, _ = model.partition("?")
    compact = MODEL_SLUG_PATTERN.sub("", name.strip().lower())
    if compact:
        return compact
    return "model"
17
+
18
+
19
def build_analysis_id(
    *,
    snapshot_id: str,
    model: str,
    ranking_backend: str,
    suffix: str | None = None,
) -> str:
    """Join backend, model, snapshot id and optional suffix into one id.

    Segments, in order and separated by ``-``:

    1. ``ranking_backend`` stripped, lowercased and with non ``a-z0-9``
       runs removed (``"analysis"`` when that leaves nothing),
    2. ``model_slug(model)``,
    3. ``snapshot_id`` stripped and lowercased (otherwise left as-is),
    4. ``suffix`` stripped, lowercased, with non ``a-z0-9`` runs replaced by
       ``-`` and leading/trailing ``-`` trimmed; omitted when ``suffix`` is
       falsy or normalizes to an empty string.
    """
    backend_slug = MODEL_SLUG_PATTERN.sub("", ranking_backend.strip().lower())
    segments = [
        backend_slug if backend_slug else "analysis",
        model_slug(model),
        snapshot_id.strip().lower(),
    ]
    # A falsy suffix is never normalized, so it is never appended.
    cleaned_suffix = (
        MODEL_SLUG_PATTERN.sub("-", suffix.strip().lower()).strip("-") if suffix else ""
    )
    if cleaned_suffix:
        segments.append(cleaned_suffix)
    return "-".join(segments)
36
+
37
+
38
def analysis_id_from_snapshot(
    *,
    snapshot_dir: Path,
    model: str,
    ranking_backend: str,
    suffix: str | None = None,
) -> str:
    """Build an analysis id for the snapshot stored in ``snapshot_dir``.

    The snapshot id is taken from the ``snapshot_id`` entry of the manifest
    file ``snapshot_dir / ROOT_MANIFEST_FILENAME`` when that file exists and
    the entry is truthy; otherwise the directory name is used.

    Raises:
        ValueError: if the manifest does not load as a ``dict``, or if the
            resolved snapshot id is empty after stripping.
    """
    manifest_path = snapshot_dir / ROOT_MANIFEST_FILENAME
    if manifest_path.exists():
        manifest = read_json(manifest_path)
    else:
        # No manifest on disk: behave as if it were an empty object.
        manifest = {}
    if not isinstance(manifest, dict):
        raise ValueError(f"Snapshot manifest at {manifest_path} must contain a JSON object.")

    raw_snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
    snapshot_id = str(raw_snapshot_id).strip()
    if snapshot_id:
        return build_analysis_id(
            snapshot_id=snapshot_id,
            model=model,
            ranking_backend=ranking_backend,
            suffix=suffix,
        )
    raise ValueError(f"Could not resolve snapshot_id from {manifest_path}")
58
+
59
+
60
def analysis_id_from_config(
    *,
    config_path: Path,
    output_dir: Path | None = None,
    snapshot_dir: Path | None = None,
    model: str | None = None,
    ranking_backend: str | None = None,
    suffix: str | None = None,
) -> str:
    """Build an analysis id using the ``analyze`` command defaults.

    Each of ``output_dir``, ``model`` and ``ranking_backend`` falls back to
    the value from ``command_defaults("analyze", config_path=config_path)``
    when the argument is falsy, and then to a hard-coded default
    (``"data"``, ``"gpt-5.4-mini?service_tier=flex"`` and ``"hybrid"``).
    The snapshot directory is resolved by
    ``resolve_snapshot_dir_from_output`` from the workspace path and the
    optional explicit ``snapshot_dir``.
    """
    # NOTE(review): command_defaults is used as a mapping with ``.get``;
    # its exact return type is not visible here.
    defaults = command_defaults("analyze", config_path=config_path)
    workspace = Path(output_dir or defaults.get("output-dir", "data"))
    return analysis_id_from_snapshot(
        snapshot_dir=resolve_snapshot_dir_from_output(workspace, snapshot_dir),
        model=str(model or defaults.get("model", "gpt-5.4-mini?service_tier=flex")),
        ranking_backend=str(ranking_backend or defaults.get("ranking-backend", "hybrid")),
        suffix=suffix,
    )
src/slop_farmer/app/cli.py CHANGED
@@ -13,15 +13,17 @@ from slop_farmer.config import (
13
  AnalysisOptions,
14
  CheckpointImportOptions,
15
  DashboardDataOptions,
 
 
16
  DeployDashboardOptions,
17
- FullPipelineOptions,
18
  MarkdownReportOptions,
19
  NewContributorReportOptions,
20
  PipelineOptions,
21
  PrScopeOptions,
22
  PrSearchRefreshOptions,
23
- PublishSnapshotOptions,
24
  RepoRef,
 
25
  SnapshotAdoptOptions,
26
  )
27
  from slop_farmer.reports.duplicate_prs import DEFAULT_DUPLICATE_PR_MODEL
@@ -29,6 +31,16 @@ from slop_farmer.reports.duplicate_prs import DEFAULT_DUPLICATE_PR_MODEL
29
  CommandHandler = Callable[[argparse.Namespace, Path | None], None]
30
 
31
 
 
 
 
 
 
 
 
 
 
 
32
  def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
33
  defaults = _load_parser_defaults(config_path)
34
 
@@ -41,6 +53,7 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
41
  subparsers = parser.add_subparsers(dest="command", required=True)
42
 
43
  _add_scrape_parser(subparsers, defaults["scrape"])
 
44
  _add_analyze_parser(subparsers, defaults["analyze"])
45
  _add_pr_scope_parser(subparsers, defaults["pr-scope"])
46
  _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
@@ -50,15 +63,17 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
50
  _add_pr_search_parser(subparsers, defaults["pr-search"])
51
  _add_new_contributor_report_parser(subparsers, defaults["new-contributor-report"])
52
  _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
53
- _add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
 
54
  _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
55
- _add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
56
  return parser
57
 
58
 
59
  def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
60
  commands = (
61
  "scrape",
 
62
  "analyze",
63
  "import-hf-checkpoint",
64
  "pr-scope",
@@ -66,9 +81,10 @@ def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]
66
  "adopt-snapshot",
67
  "new-contributor-report",
68
  "dashboard-data",
69
- "publish-snapshot",
 
70
  "deploy-dashboard",
71
- "full-pipeline",
72
  )
73
  return {command: command_defaults(command, config_path=config_path) for command in commands}
74
 
@@ -141,52 +157,110 @@ def _add_scrape_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
141
  help="Fetch issue timeline events for linkage rows.",
142
  )
143
  scrape.add_argument(
144
- "--publish",
 
145
  action="store_true",
146
- default=bool(defaults.get("publish", False)),
147
- help="Upload the snapshot to the Hugging Face Hub.",
 
 
 
 
 
 
 
 
 
 
 
 
148
  )
149
  scrape.add_argument(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  "--hf-repo-id",
151
  default=defaults.get("hf-repo-id"),
152
- help="Hub dataset repo ID used with --publish.",
 
153
  )
154
- scrape.add_argument(
155
- "--private-hf-repo",
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  action="store_true",
157
- default=bool(defaults.get("private-hf-repo", False)),
158
- help="Create the Hub dataset repo as private.",
159
  )
160
- scrape.add_argument(
161
  "--new-contributor-report",
162
  dest="new_contributor_report",
163
  action="store_true",
164
- default=defaults.get("new-contributor-report"),
165
- help="Generate new contributor dataset/report artifacts. Defaults to enabled when --publish is used.",
166
  )
167
- scrape.add_argument(
168
  "--no-new-contributor-report",
169
  dest="new_contributor_report",
170
  action="store_false",
171
- help="Skip new contributor dataset/report generation.",
172
  )
173
- scrape.add_argument(
174
  "--new-contributor-window-days",
175
  type=int,
176
  default=int(defaults.get("new-contributor-window-days", 42)),
177
- help="Recent public activity window for contributor enrichment.",
178
  )
179
- scrape.add_argument(
180
  "--new-contributor-max-authors",
181
  type=int,
182
  default=int(defaults.get("new-contributor-max-authors", 25)),
183
- help="Maximum number of contributors to include in the new contributor report. Use 0 for no cap.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  )
185
 
186
 
187
  def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
188
  analyze = subparsers.add_parser(
189
- "analyze", help="Analyze a local snapshot and write a shortlist JSON report."
 
190
  )
191
  analyze.add_argument(
192
  "--snapshot-dir",
@@ -200,7 +274,7 @@ def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
200
  analyze.add_argument(
201
  "--hf-repo-id",
202
  default=defaults.get("hf-repo-id"),
203
- help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
204
  )
205
  analyze.add_argument(
206
  "--hf-revision",
@@ -223,7 +297,7 @@ def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
223
  )
224
  analyze.add_argument(
225
  "--model",
226
- default=defaults.get("model", "gpt-5-mini?reasoning=low"),
227
  help="Model string used by fast-agent when enabled.",
228
  )
229
  analyze.add_argument(
@@ -232,6 +306,15 @@ def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
232
  default=int(defaults.get("max-clusters", 10)),
233
  help="Maximum number of meta clusters to include in the report.",
234
  )
 
 
 
 
 
 
 
 
 
235
  analyze.add_argument(
236
  "--open-prs-only",
237
  action="store_true",
@@ -637,6 +720,61 @@ def _add_pr_search_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
637
  status.add_argument("--repo", help="Optional repo override.")
638
  status.add_argument("--json", action="store_true", help="Emit JSON.")
639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
  def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
642
  new_contributor = subparsers.add_parser(
@@ -659,6 +797,24 @@ def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]
659
  new_contributor.add_argument(
660
  "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
661
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  new_contributor.add_argument(
663
  "--window-days",
664
  type=int,
@@ -690,17 +846,35 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
690
  dashboard.add_argument(
691
  "--analysis-input",
692
  type=Path,
693
- help="Optional analysis report JSON. Defaults to analysis-report.json in the snapshot.",
694
  )
695
  dashboard.add_argument(
696
  "--contributors-input",
697
  type=Path,
698
- help="Optional new contributor report JSON. Defaults to new-contributors-report.json in the snapshot.",
699
  )
700
  dashboard.add_argument(
701
  "--pr-scope-input",
702
  type=Path,
703
- help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  )
705
  dashboard.add_argument(
706
  "--window-days",
@@ -710,27 +884,77 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
710
  )
711
 
712
 
713
- def _add_publish_snapshot_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
714
- publish_snapshot = subparsers.add_parser(
715
- "publish-snapshot",
716
- help="Publish an existing local snapshot to a Hugging Face dataset repo.",
717
  )
718
- publish_snapshot.add_argument(
719
  "--output-dir",
720
  type=Path,
721
  default=Path(defaults.get("output-dir", "data")),
722
  help="Pipeline workspace root containing snapshots/latest.json.",
723
  )
724
- publish_snapshot.add_argument(
725
- "--snapshot-dir", type=Path, help="Optional explicit snapshot directory to upload."
 
 
726
  )
727
- publish_snapshot.add_argument(
 
 
 
 
 
728
  "--hf-repo-id",
729
  default=defaults.get("hf-repo-id"),
730
  required=defaults.get("hf-repo-id") is None,
731
  help="Target Hugging Face dataset repo id.",
732
  )
733
- publish_snapshot.add_argument(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  "--private-hf-repo",
735
  action="store_true",
736
  default=bool(defaults.get("private-hf-repo", False)),
@@ -740,7 +964,8 @@ def _add_publish_snapshot_parser(subparsers: Any, defaults: dict[str, Any]) -> N
740
 
741
  def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
742
  deploy_dashboard = subparsers.add_parser(
743
- "deploy-dashboard", help="Build and publish the static dashboard to a Hugging Face Space."
 
744
  )
745
  deploy_dashboard.add_argument(
746
  "--pipeline-data-dir",
@@ -756,10 +981,37 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
756
  help="Optional snapshot directory to publish. Defaults to the latest snapshot in --pipeline-data-dir.",
757
  )
758
  deploy_dashboard.add_argument(
759
- "--analysis-input", type=Path, help="Optional analysis report JSON override."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
  )
761
  deploy_dashboard.add_argument(
762
- "--contributors-input", type=Path, help="Optional contributor report JSON override."
 
 
 
 
 
 
 
 
 
 
763
  )
764
  deploy_dashboard.add_argument(
765
  "--refresh-contributors",
@@ -817,71 +1069,29 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
817
  )
818
 
819
 
820
- def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
821
- full_pipeline = subparsers.add_parser(
822
- "full-pipeline",
823
- help="Run scrape, publish, analyze, markdown export, and dashboard export for one repo.",
824
- )
825
- full_pipeline.add_argument(
826
- "--repo",
827
- default=defaults.get("repo"),
828
- required=defaults.get("repo") is None,
829
- help="GitHub repository in owner/name form.",
830
- )
831
- full_pipeline.add_argument(
832
- "--dataset",
833
- default=defaults.get("dataset"),
834
- required=defaults.get("dataset") is None,
835
- help="Target Hugging Face dataset repo id.",
836
  )
837
- full_pipeline.add_argument("--model", default=defaults.get("model", "gpt-5-mini?reasoning=low"))
838
- full_pipeline.add_argument(
839
- "--workspace-root",
840
  type=Path,
841
- default=Path(defaults.get("workspace-root", "runs")),
842
- )
843
- full_pipeline.add_argument("--private-hf-repo", action="store_true")
844
- full_pipeline.add_argument(
845
- "--ranking-backend",
846
- choices=("hybrid", "deterministic"),
847
- default=defaults.get("ranking-backend", "hybrid"),
848
- )
849
- full_pipeline.add_argument(
850
- "--max-clusters", type=int, default=int(defaults.get("max-clusters", 10))
851
- )
852
- full_pipeline.add_argument(
853
- "--fetch-timeline", dest="fetch_timeline", action="store_true", default=True
854
- )
855
- full_pipeline.add_argument("--no-fetch-timeline", dest="fetch_timeline", action="store_false")
856
- full_pipeline.add_argument(
857
- "--dashboard-window-days",
858
- type=int,
859
- default=int(defaults.get("dashboard-window-days", 14)),
860
- )
861
- full_pipeline.add_argument(
862
- "--new-contributor-window-days",
863
- type=int,
864
- default=int(defaults.get("new-contributor-window-days", 42)),
865
- )
866
- full_pipeline.add_argument(
867
- "--new-contributor-max-authors",
868
- type=int,
869
- default=int(defaults.get("new-contributor-max-authors", 25)),
870
- help="Contributor enrichment cap override. Full pipeline treats 0 as no cap and currently forces no cap.",
871
- )
872
- full_pipeline.add_argument(
873
- "--issue-max-age-days", type=int, default=defaults.get("issue-max-age-days")
874
  )
875
- full_pipeline.add_argument(
876
- "--pr-max-age-days", type=int, default=defaults.get("pr-max-age-days")
 
 
877
  )
878
- full_pipeline.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
879
- full_pipeline.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
880
- full_pipeline.add_argument(
881
- "--open-prs-only",
882
- action="store_true",
883
- default=bool(defaults.get("open-prs-only", False)),
884
  )
 
885
 
886
 
887
  # Dispatch helpers
@@ -905,9 +1115,7 @@ def _resolve_hf_inputs(args: argparse.Namespace) -> tuple[str | None, str | None
905
  def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
906
  from slop_farmer.app.pipeline import run_pipeline
907
 
908
- new_contributor_report = args.new_contributor_report
909
- if new_contributor_report is None:
910
- new_contributor_report = bool(args.publish)
911
  options = PipelineOptions(
912
  repo=RepoRef.parse(args.repo),
913
  output_dir=args.output_dir,
@@ -921,9 +1129,6 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
921
  max_reviews_per_pr=args.max_reviews_per_pr,
922
  max_review_comments_per_pr=args.max_review_comments_per_pr,
923
  fetch_timeline=args.fetch_timeline,
924
- publish=args.publish,
925
- hf_repo_id=args.hf_repo_id,
926
- private_hf_repo=args.private_hf_repo,
927
  new_contributor_report=new_contributor_report,
928
  new_contributor_window_days=args.new_contributor_window_days,
929
  new_contributor_max_authors=args.new_contributor_max_authors,
@@ -933,6 +1138,34 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
933
  print(run_pipeline(options))
934
 
935
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
  def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
937
  from slop_farmer.reports.analysis import run_analysis
938
 
@@ -948,6 +1181,7 @@ def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
948
  ranking_backend=args.ranking_backend,
949
  model=args.model,
950
  max_clusters=args.max_clusters,
 
951
  open_prs_only=args.open_prs_only,
952
  cached_analysis=bool(analyze_defaults.get("cached_analysis", False)),
953
  pr_template_cleanup_mode=str(
@@ -1041,12 +1275,18 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
1041
  explain_pr_search_pair,
1042
  format_pr_search_candidate_clusters,
1043
  format_pr_search_cluster,
 
 
1044
  format_pr_search_pair,
1045
  format_pr_search_probe,
 
1046
  format_pr_search_similar,
1047
  format_pr_search_status,
1048
  get_pr_search_candidate_clusters,
1049
  get_pr_search_cluster,
 
 
 
1050
  get_pr_search_similar,
1051
  get_pr_search_status,
1052
  probe_pr_search_github,
@@ -1140,6 +1380,36 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
1140
  print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
1141
  return
1142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1143
  raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
1144
 
1145
 
@@ -1181,6 +1451,7 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
1181
  del config_path
1182
  from slop_farmer.reports.new_contributor_report import run_new_contributor_report
1183
 
 
1184
  print(
1185
  run_new_contributor_report(
1186
  NewContributorReportOptions(
@@ -1188,6 +1459,9 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
1188
  output_dir=args.output_dir,
1189
  output=args.output,
1190
  json_output=args.json_output,
 
 
 
1191
  window_days=args.window_days,
1192
  max_authors=args.max_authors,
1193
  )
@@ -1199,6 +1473,7 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
1199
  from slop_farmer.reports.dashboard import run_dashboard_data
1200
 
1201
  dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
 
1202
  print(
1203
  run_dashboard_data(
1204
  DashboardDataOptions(
@@ -1207,6 +1482,9 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
1207
  analysis_input=args.analysis_input,
1208
  contributors_input=args.contributors_input,
1209
  pr_scope_input=args.pr_scope_input,
 
 
 
1210
  window_days=args.window_days,
1211
  snapshot_root=(
1212
  Path(dashboard_defaults["snapshot-root"])
@@ -1222,6 +1500,7 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
1222
  del config_path
1223
  from slop_farmer.app.deploy import run_deploy_dashboard
1224
 
 
1225
  run_deploy_dashboard(
1226
  DeployDashboardOptions(
1227
  pipeline_data_dir=args.pipeline_data_dir,
@@ -1229,6 +1508,10 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
1229
  snapshot_dir=args.snapshot_dir,
1230
  analysis_input=args.analysis_input,
1231
  contributors_input=args.contributors_input,
 
 
 
 
1232
  refresh_contributors=args.refresh_contributors,
1233
  dashboard_window_days=args.dashboard_window_days,
1234
  contributor_window_days=args.contributor_window_days,
@@ -1247,44 +1530,60 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
1247
  )
1248
 
1249
 
1250
- def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
1251
  del config_path
1252
- from slop_farmer.app.publish import run_publish_snapshot
1253
 
1254
- run_publish_snapshot(
1255
- PublishSnapshotOptions(
 
1256
  output_dir=args.output_dir,
1257
- snapshot_dir=args.snapshot_dir,
1258
  hf_repo_id=args.hf_repo_id,
1259
- private_hf_repo=args.private_hf_repo,
 
1260
  )
1261
  )
 
1262
 
1263
 
1264
- def _run_full_pipeline(args: argparse.Namespace, config_path: Path | None) -> None:
1265
  del config_path
1266
- from slop_farmer.app.workflow import run_full_pipeline
1267
 
1268
  print(
1269
- run_full_pipeline(
1270
- FullPipelineOptions(
1271
- repo=RepoRef.parse(args.repo),
1272
- dataset=args.dataset,
1273
- model=args.model,
1274
- workspace_root=args.workspace_root,
1275
- private_hf_repo=args.private_hf_repo,
1276
- ranking_backend=args.ranking_backend,
1277
- max_clusters=args.max_clusters,
1278
- fetch_timeline=args.fetch_timeline,
1279
- dashboard_window_days=args.dashboard_window_days,
1280
- new_contributor_window_days=args.new_contributor_window_days,
1281
- new_contributor_max_authors=args.new_contributor_max_authors,
1282
- issue_max_age_days=args.issue_max_age_days,
1283
- pr_max_age_days=args.pr_max_age_days,
1284
- max_issues=args.max_issues,
1285
- max_prs=args.max_prs,
1286
- open_prs_only=args.open_prs_only,
1287
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1288
  )
1289
  )
1290
 
@@ -1296,6 +1595,7 @@ def main() -> None:
1296
 
1297
  handlers: dict[str, CommandHandler] = {
1298
  "scrape": _run_scrape,
 
1299
  "analyze": _run_analyze,
1300
  "markdown-report": _run_markdown_report,
1301
  "duplicate-prs": _run_duplicate_prs,
@@ -1306,8 +1606,9 @@ def main() -> None:
1306
  "new-contributor-report": _run_new_contributor_report,
1307
  "dashboard-data": _run_dashboard_data,
1308
  "deploy-dashboard": _run_deploy_dashboard,
1309
- "publish-snapshot": _run_publish_snapshot,
1310
- "full-pipeline": _run_full_pipeline,
 
1311
  }
1312
  handler = handlers.get(args.command)
1313
  if handler is None:
 
13
  AnalysisOptions,
14
  CheckpointImportOptions,
15
  DashboardDataOptions,
16
+ DatasetRefreshOptions,
17
+ DatasetStatusOptions,
18
  DeployDashboardOptions,
 
19
  MarkdownReportOptions,
20
  NewContributorReportOptions,
21
  PipelineOptions,
22
  PrScopeOptions,
23
  PrSearchRefreshOptions,
24
+ PublishAnalysisArtifactsOptions,
25
  RepoRef,
26
+ SaveCacheOptions,
27
  SnapshotAdoptOptions,
28
  )
29
  from slop_farmer.reports.duplicate_prs import DEFAULT_DUPLICATE_PR_MODEL
 
31
  CommandHandler = Callable[[argparse.Namespace, Path | None], None]
32
 
33
 
34
+ def _int_at_least(minimum: int) -> Callable[[str], int]:
35
+ def parse(raw: str) -> int:
36
+ value = int(raw)
37
+ if value < minimum:
38
+ raise argparse.ArgumentTypeError(f"expected integer >= {minimum}")
39
+ return value
40
+
41
+ return parse
42
+
43
+
44
  def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
45
  defaults = _load_parser_defaults(config_path)
46
 
 
53
  subparsers = parser.add_subparsers(dest="command", required=True)
54
 
55
  _add_scrape_parser(subparsers, defaults["scrape"])
56
+ _add_refresh_dataset_parser(subparsers, defaults["refresh-dataset"])
57
  _add_analyze_parser(subparsers, defaults["analyze"])
58
  _add_pr_scope_parser(subparsers, defaults["pr-scope"])
59
  _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
 
63
  _add_pr_search_parser(subparsers, defaults["pr-search"])
64
  _add_new_contributor_report_parser(subparsers, defaults["new-contributor-report"])
65
  _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
66
+ _add_publish_analysis_artifacts_parser(subparsers, defaults["publish-analysis-artifacts"])
67
+ _add_save_cache_parser(subparsers, defaults["save-cache"])
68
  _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
69
+ _add_dataset_status_parser(subparsers, defaults["dataset-status"])
70
  return parser
71
 
72
 
73
  def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
74
  commands = (
75
  "scrape",
76
+ "refresh-dataset",
77
  "analyze",
78
  "import-hf-checkpoint",
79
  "pr-scope",
 
81
  "adopt-snapshot",
82
  "new-contributor-report",
83
  "dashboard-data",
84
+ "publish-analysis-artifacts",
85
+ "save-cache",
86
  "deploy-dashboard",
87
+ "dataset-status",
88
  )
89
  return {command: command_defaults(command, config_path=config_path) for command in commands}
90
 
 
157
  help="Fetch issue timeline events for linkage rows.",
158
  )
159
  scrape.add_argument(
160
+ "--new-contributor-report",
161
+ dest="new_contributor_report",
162
  action="store_true",
163
+ default=defaults.get("new-contributor-report"),
164
+ help="Generate new contributor dataset/report artifacts for the local snapshot.",
165
+ )
166
+ scrape.add_argument(
167
+ "--no-new-contributor-report",
168
+ dest="new_contributor_report",
169
+ action="store_false",
170
+ help="Skip new contributor dataset/report generation.",
171
+ )
172
+ scrape.add_argument(
173
+ "--new-contributor-window-days",
174
+ type=int,
175
+ default=int(defaults.get("new-contributor-window-days", 42)),
176
+ help="Recent public activity window for contributor enrichment.",
177
  )
178
  scrape.add_argument(
179
+ "--new-contributor-max-authors",
180
+ type=int,
181
+ default=int(defaults.get("new-contributor-max-authors", 25)),
182
+ help="Maximum number of contributors to include in the new contributor report. Use 0 for no cap.",
183
+ )
184
+
185
+
186
+ def _add_refresh_dataset_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
187
+ refresh = subparsers.add_parser(
188
+ "refresh-dataset",
189
+ help="Refresh the canonical Hugging Face dataset repo from remote watermark state.",
190
+ )
191
+ refresh.add_argument(
192
+ "--repo",
193
+ default=defaults.get("repo", "huggingface/transformers"),
194
+ help="GitHub repository in owner/name form.",
195
+ )
196
+ refresh.add_argument(
197
  "--hf-repo-id",
198
  default=defaults.get("hf-repo-id"),
199
+ required=defaults.get("hf-repo-id") is None,
200
+ help="Canonical Hugging Face dataset repo id to refresh.",
201
  )
202
+ refresh.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
203
+ refresh.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
204
+ refresh.add_argument(
205
+ "--max-issue-comments", type=int, default=defaults.get("max-issue-comments")
206
+ )
207
+ refresh.add_argument(
208
+ "--max-reviews-per-pr", type=int, default=defaults.get("max-reviews-per-pr")
209
+ )
210
+ refresh.add_argument(
211
+ "--max-review-comments-per-pr",
212
+ type=int,
213
+ default=defaults.get("max-review-comments-per-pr"),
214
+ )
215
+ refresh.add_argument(
216
+ "--fetch-timeline",
217
  action="store_true",
218
+ default=bool(defaults.get("fetch-timeline", False)),
 
219
  )
220
+ refresh.add_argument(
221
  "--new-contributor-report",
222
  dest="new_contributor_report",
223
  action="store_true",
224
+ default=bool(defaults.get("new-contributor-report", True)),
 
225
  )
226
+ refresh.add_argument(
227
  "--no-new-contributor-report",
228
  dest="new_contributor_report",
229
  action="store_false",
 
230
  )
231
+ refresh.add_argument(
232
  "--new-contributor-window-days",
233
  type=int,
234
  default=int(defaults.get("new-contributor-window-days", 42)),
 
235
  )
236
+ refresh.add_argument(
237
  "--new-contributor-max-authors",
238
  type=int,
239
  default=int(defaults.get("new-contributor-max-authors", 25)),
240
+ )
241
+ refresh.add_argument("--http-timeout", type=int, default=300)
242
+ refresh.add_argument("--http-max-retries", type=int, default=8)
243
+ refresh.add_argument("--checkpoint-every-comments", type=int, default=1000)
244
+ refresh.add_argument("--checkpoint-every-prs", type=int, default=25)
245
+ refresh.add_argument(
246
+ "--private-hf-repo",
247
+ dest="private_hf_repo",
248
+ action="store_true",
249
+ default=bool(defaults.get("private-hf-repo", False)),
250
+ help="Create the target dataset repo as private if needed.",
251
+ )
252
+ refresh.add_argument(
253
+ "--private",
254
+ dest="private_hf_repo",
255
+ action="store_true",
256
+ help=argparse.SUPPRESS,
257
  )
258
 
259
 
260
  def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
261
  analyze = subparsers.add_parser(
262
+ "analyze",
263
+ help="Analyze a snapshot and write a local JSON report. Canonical publication is separate.",
264
  )
265
  analyze.add_argument(
266
  "--snapshot-dir",
 
274
  analyze.add_argument(
275
  "--hf-repo-id",
276
  default=defaults.get("hf-repo-id"),
277
+ help="Analyze a canonical Hugging Face dataset repo by materializing a self-consistent published snapshot locally.",
278
  )
279
  analyze.add_argument(
280
  "--hf-revision",
 
297
  )
298
  analyze.add_argument(
299
  "--model",
300
+ default=defaults.get("model", "gpt-5.4-mini?service_tier=flex"),
301
  help="Model string used by fast-agent when enabled.",
302
  )
303
  analyze.add_argument(
 
306
  default=int(defaults.get("max-clusters", 10)),
307
  help="Maximum number of meta clusters to include in the report.",
308
  )
309
+ analyze.add_argument(
310
+ "--hybrid-llm-concurrency",
311
+ type=_int_at_least(1),
312
+ default=int(defaults.get("hybrid-llm-concurrency", 1)),
313
+ help=(
314
+ "Maximum number of hybrid LLM review units to run at once. "
315
+ "Use 1 to minimize provider pressure."
316
+ ),
317
+ )
318
  analyze.add_argument(
319
  "--open-prs-only",
320
  action="store_true",
 
720
  status.add_argument("--repo", help="Optional repo override.")
721
  status.add_argument("--json", action="store_true", help="Emit JSON.")
722
 
723
+ contributor = pr_search_subparsers.add_parser(
724
+ "contributor", help="Show indexed contributor summary for one author login."
725
+ )
726
+ contributor.add_argument("login", help="GitHub author login to query.")
727
+ contributor.add_argument(
728
+ "--db",
729
+ type=Path,
730
+ default=Path(defaults["db"]) if defaults.get("db") else None,
731
+ help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
732
+ )
733
+ contributor.add_argument(
734
+ "--output-dir",
735
+ type=Path,
736
+ default=Path(defaults.get("output-dir", "data")),
737
+ )
738
+ contributor.add_argument("--repo", help="Optional repo override.")
739
+ contributor.add_argument("--json", action="store_true", help="Emit JSON.")
740
+
741
+ contributor_prs = pr_search_subparsers.add_parser(
742
+ "contributor-prs", help="List indexed PRs for one contributor login."
743
+ )
744
+ contributor_prs.add_argument("login", help="GitHub author login to query.")
745
+ contributor_prs.add_argument(
746
+ "--db",
747
+ type=Path,
748
+ default=Path(defaults["db"]) if defaults.get("db") else None,
749
+ help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
750
+ )
751
+ contributor_prs.add_argument(
752
+ "--output-dir",
753
+ type=Path,
754
+ default=Path(defaults.get("output-dir", "data")),
755
+ )
756
+ contributor_prs.add_argument("--repo", help="Optional repo override.")
757
+ contributor_prs.add_argument("--limit", type=int, default=20, help="Maximum rows to show.")
758
+ contributor_prs.add_argument("--json", action="store_true", help="Emit JSON.")
759
+
760
+ pr_contributor = pr_search_subparsers.add_parser(
761
+ "pr-contributor", help="Show contributor summary for the author of one indexed PR."
762
+ )
763
+ pr_contributor.add_argument("pr_number", type=int, help="Pull request number to query.")
764
+ pr_contributor.add_argument(
765
+ "--db",
766
+ type=Path,
767
+ default=Path(defaults["db"]) if defaults.get("db") else None,
768
+ help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
769
+ )
770
+ pr_contributor.add_argument(
771
+ "--output-dir",
772
+ type=Path,
773
+ default=Path(defaults.get("output-dir", "data")),
774
+ )
775
+ pr_contributor.add_argument("--repo", help="Optional repo override.")
776
+ pr_contributor.add_argument("--json", action="store_true", help="Emit JSON.")
777
+
778
 
779
  def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
780
  new_contributor = subparsers.add_parser(
 
797
  new_contributor.add_argument(
798
  "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
799
  )
800
+ new_contributor.add_argument(
801
+ "--hf-repo-id",
802
+ default=defaults.get("hf-repo-id"),
803
+ help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
804
+ )
805
+ new_contributor.add_argument(
806
+ "--hf-revision",
807
+ default=defaults.get("hf-revision"),
808
+ help="Optional Hub revision for metadata and README download.",
809
+ )
810
+ new_contributor.add_argument(
811
+ "--hf-materialize-dir",
812
+ type=Path,
813
+ default=Path(defaults["hf-materialize-dir"])
814
+ if defaults.get("hf-materialize-dir")
815
+ else None,
816
+ help="Optional local directory used when materializing an HF dataset snapshot.",
817
+ )
818
  new_contributor.add_argument(
819
  "--window-days",
820
  type=int,
 
846
  dashboard.add_argument(
847
  "--analysis-input",
848
  type=Path,
849
+ help="Optional analysis report JSON override. Defaults to canonical published current analysis when available, otherwise falls back to snapshot-local analysis files.",
850
  )
851
  dashboard.add_argument(
852
  "--contributors-input",
853
  type=Path,
854
+ help="Optional contributor report JSON override. Defaults to the materialized snapshot's new-contributors-report.json.",
855
  )
856
  dashboard.add_argument(
857
  "--pr-scope-input",
858
  type=Path,
859
+ help="Optional PR scope cluster JSON override. Defaults to the materialized snapshot's pr-scope-clusters.json.",
860
+ )
861
+ dashboard.add_argument(
862
+ "--hf-repo-id",
863
+ default=defaults.get("hf-repo-id"),
864
+ help="Materialize the canonical Hugging Face dataset repo instead of using the latest local snapshot.",
865
+ )
866
+ dashboard.add_argument(
867
+ "--hf-revision",
868
+ default=defaults.get("hf-revision"),
869
+ help="Optional Hub revision for metadata and README download.",
870
+ )
871
+ dashboard.add_argument(
872
+ "--hf-materialize-dir",
873
+ type=Path,
874
+ default=Path(defaults["hf-materialize-dir"])
875
+ if defaults.get("hf-materialize-dir")
876
+ else None,
877
+ help="Optional local directory used when materializing an HF dataset snapshot.",
878
  )
879
  dashboard.add_argument(
880
  "--window-days",
 
884
  )
885
 
886
 
887
+ def _add_publish_analysis_artifacts_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
888
+ publish_analysis = subparsers.add_parser(
889
+ "publish-analysis-artifacts",
890
+ help="Publish archived and optional canonical hybrid analysis artifacts to a dataset repo.",
891
  )
892
+ publish_analysis.add_argument(
893
  "--output-dir",
894
  type=Path,
895
  default=Path(defaults.get("output-dir", "data")),
896
  help="Pipeline workspace root containing snapshots/latest.json.",
897
  )
898
+ publish_analysis.add_argument(
899
+ "--snapshot-dir",
900
+ type=Path,
901
+ help="Optional explicit snapshot directory containing analysis-report-hybrid.json.",
902
  )
903
+ publish_analysis.add_argument(
904
+ "--analysis-input",
905
+ type=Path,
906
+ help="Optional explicit hybrid analysis report JSON to publish instead of snapshot-dir discovery.",
907
+ )
908
+ publish_analysis.add_argument(
909
  "--hf-repo-id",
910
  default=defaults.get("hf-repo-id"),
911
  required=defaults.get("hf-repo-id") is None,
912
  help="Target Hugging Face dataset repo id.",
913
  )
914
+ publish_analysis.add_argument("--analysis-id", required=True, help="Immutable analysis run id.")
915
+ publish_analysis.add_argument(
916
+ "--canonical",
917
+ action="store_true",
918
+ default=bool(defaults.get("canonical", False)),
919
+ help="Also update the stable analysis/current canonical alias.",
920
+ )
921
+ publish_analysis.add_argument(
922
+ "--save-cache",
923
+ action="store_true",
924
+ default=bool(defaults.get("save-cache", False)),
925
+ help="Also upload snapshot-local analysis-state/ as mutable operational cache at repo-root analysis-state/.",
926
+ )
927
+ publish_analysis.add_argument(
928
+ "--private-hf-repo",
929
+ action="store_true",
930
+ default=bool(defaults.get("private-hf-repo", False)),
931
+ help="Create the target dataset repo as private if needed.",
932
+ )
933
+
934
+
935
+ def _add_save_cache_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
936
+ save_cache = subparsers.add_parser(
937
+ "save-cache",
938
+ help="Upload snapshot-local analysis-state/ as mutable operational cache to a dataset repo.",
939
+ )
940
+ save_cache.add_argument(
941
+ "--output-dir",
942
+ type=Path,
943
+ default=Path(defaults.get("output-dir", "data")),
944
+ help="Pipeline workspace root containing snapshots/latest.json.",
945
+ )
946
+ save_cache.add_argument(
947
+ "--snapshot-dir",
948
+ type=Path,
949
+ help="Optional explicit snapshot directory containing analysis-state/.",
950
+ )
951
+ save_cache.add_argument(
952
+ "--hf-repo-id",
953
+ default=defaults.get("hf-repo-id"),
954
+ required=defaults.get("hf-repo-id") is None,
955
+ help="Target Hugging Face dataset repo id.",
956
+ )
957
+ save_cache.add_argument(
958
  "--private-hf-repo",
959
  action="store_true",
960
  default=bool(defaults.get("private-hf-repo", False)),
 
964
 
965
  def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
966
  deploy_dashboard = subparsers.add_parser(
967
+ "deploy-dashboard",
968
+ help="Build and publish the static dashboard to a Hugging Face Space from a materialized dataset view.",
969
  )
970
  deploy_dashboard.add_argument(
971
  "--pipeline-data-dir",
 
981
  help="Optional snapshot directory to publish. Defaults to the latest snapshot in --pipeline-data-dir.",
982
  )
983
  deploy_dashboard.add_argument(
984
+ "--analysis-input",
985
+ type=Path,
986
+ help="Optional analysis report JSON override. Omit to prefer canonical published current analysis when available.",
987
+ )
988
+ deploy_dashboard.add_argument(
989
+ "--contributors-input",
990
+ type=Path,
991
+ help="Optional contributor report JSON override.",
992
+ )
993
+ deploy_dashboard.add_argument(
994
+ "--pr-scope-input",
995
+ type=Path,
996
+ help="Optional PR scope cluster JSON override.",
997
+ )
998
+ deploy_dashboard.add_argument(
999
+ "--hf-repo-id",
1000
+ default=defaults.get("hf-repo-id"),
1001
+ help="Materialize the canonical Hugging Face dataset repo instead of using the latest local snapshot.",
1002
  )
1003
  deploy_dashboard.add_argument(
1004
+ "--hf-revision",
1005
+ default=defaults.get("hf-revision"),
1006
+ help="Optional Hub revision for metadata and README download.",
1007
+ )
1008
+ deploy_dashboard.add_argument(
1009
+ "--hf-materialize-dir",
1010
+ type=Path,
1011
+ default=Path(defaults["hf-materialize-dir"])
1012
+ if defaults.get("hf-materialize-dir")
1013
+ else None,
1014
+ help="Optional local directory used when materializing an HF dataset snapshot.",
1015
  )
1016
  deploy_dashboard.add_argument(
1017
  "--refresh-contributors",
 
1069
  )
1070
 
1071
 
1072
+ def _add_dataset_status_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
1073
+ dataset_status = subparsers.add_parser(
1074
+ "dataset-status",
1075
+ help="Inspect canonical dataset freshness and the local latest pointer.",
 
 
 
 
 
 
 
 
 
 
 
 
1076
  )
1077
+ dataset_status.add_argument("--repo", default=defaults.get("repo"))
1078
+ dataset_status.add_argument(
1079
+ "--output-dir",
1080
  type=Path,
1081
+ default=Path(defaults.get("output-dir", "data")),
1082
+ help="Local workspace root containing snapshots/latest.json.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1083
  )
1084
+ dataset_status.add_argument(
1085
+ "--hf-repo-id",
1086
+ default=defaults.get("hf-repo-id"),
1087
+ help="Canonical Hugging Face dataset repo id to inspect.",
1088
  )
1089
+ dataset_status.add_argument(
1090
+ "--hf-revision",
1091
+ default=defaults.get("hf-revision"),
1092
+ help="Optional Hub revision for metadata and README download.",
 
 
1093
  )
1094
+ dataset_status.add_argument("--json", action="store_true", help="Emit machine-readable JSON.")
1095
 
1096
 
1097
  # Dispatch helpers
 
1115
  def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
1116
  from slop_farmer.app.pipeline import run_pipeline
1117
 
1118
+ new_contributor_report = bool(args.new_contributor_report)
 
 
1119
  options = PipelineOptions(
1120
  repo=RepoRef.parse(args.repo),
1121
  output_dir=args.output_dir,
 
1129
  max_reviews_per_pr=args.max_reviews_per_pr,
1130
  max_review_comments_per_pr=args.max_review_comments_per_pr,
1131
  fetch_timeline=args.fetch_timeline,
 
 
 
1132
  new_contributor_report=new_contributor_report,
1133
  new_contributor_window_days=args.new_contributor_window_days,
1134
  new_contributor_max_authors=args.new_contributor_max_authors,
 
1138
  print(run_pipeline(options))
1139
 
1140
 
1141
+ def _run_refresh_dataset(args: argparse.Namespace, config_path: Path | None) -> None:
1142
+ from slop_farmer.app.dataset_refresh import run_dataset_refresh
1143
+
1144
+ refresh_defaults = command_defaults("refresh-dataset", config_path=config_path)
1145
+ result = run_dataset_refresh(
1146
+ DatasetRefreshOptions(
1147
+ repo=RepoRef.parse(args.repo),
1148
+ hf_repo_id=args.hf_repo_id,
1149
+ private_hf_repo=args.private_hf_repo,
1150
+ max_issues=args.max_issues,
1151
+ max_prs=args.max_prs,
1152
+ max_issue_comments=args.max_issue_comments,
1153
+ max_reviews_per_pr=args.max_reviews_per_pr,
1154
+ max_review_comments_per_pr=args.max_review_comments_per_pr,
1155
+ fetch_timeline=args.fetch_timeline,
1156
+ new_contributor_report=args.new_contributor_report,
1157
+ new_contributor_window_days=args.new_contributor_window_days,
1158
+ new_contributor_max_authors=args.new_contributor_max_authors,
1159
+ http_timeout=args.http_timeout,
1160
+ http_max_retries=args.http_max_retries,
1161
+ checkpoint_every_comments=args.checkpoint_every_comments,
1162
+ checkpoint_every_prs=args.checkpoint_every_prs,
1163
+ cluster_suppression_rules=tuple(refresh_defaults.get("cluster-suppression-rules", ())),
1164
+ )
1165
+ )
1166
+ print(json.dumps(result, indent=2))
1167
+
1168
+
1169
  def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
1170
  from slop_farmer.reports.analysis import run_analysis
1171
 
 
1181
  ranking_backend=args.ranking_backend,
1182
  model=args.model,
1183
  max_clusters=args.max_clusters,
1184
+ hybrid_llm_concurrency=args.hybrid_llm_concurrency,
1185
  open_prs_only=args.open_prs_only,
1186
  cached_analysis=bool(analyze_defaults.get("cached_analysis", False)),
1187
  pr_template_cleanup_mode=str(
 
1275
  explain_pr_search_pair,
1276
  format_pr_search_candidate_clusters,
1277
  format_pr_search_cluster,
1278
+ format_pr_search_contributor,
1279
+ format_pr_search_contributor_pulls,
1280
  format_pr_search_pair,
1281
  format_pr_search_probe,
1282
+ format_pr_search_pull_contributor,
1283
  format_pr_search_similar,
1284
  format_pr_search_status,
1285
  get_pr_search_candidate_clusters,
1286
  get_pr_search_cluster,
1287
+ get_pr_search_contributor,
1288
+ get_pr_search_contributor_pulls,
1289
+ get_pr_search_pull_contributor,
1290
  get_pr_search_similar,
1291
  get_pr_search_status,
1292
  probe_pr_search_github,
 
1380
  print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
1381
  return
1382
 
1383
+ if args.pr_search_command == "contributor":
1384
+ result = get_pr_search_contributor(db_path, author_login=args.login, repo=args.repo)
1385
+ print(json.dumps(result, indent=2) if args.json else format_pr_search_contributor(result))
1386
+ return
1387
+
1388
+ if args.pr_search_command == "contributor-prs":
1389
+ result = get_pr_search_contributor_pulls(
1390
+ db_path,
1391
+ author_login=args.login,
1392
+ repo=args.repo,
1393
+ limit=args.limit,
1394
+ )
1395
+ print(
1396
+ json.dumps(result, indent=2)
1397
+ if args.json
1398
+ else format_pr_search_contributor_pulls(result)
1399
+ )
1400
+ return
1401
+
1402
+ if args.pr_search_command == "pr-contributor":
1403
+ result = get_pr_search_pull_contributor(
1404
+ db_path,
1405
+ pr_number=args.pr_number,
1406
+ repo=args.repo,
1407
+ )
1408
+ print(
1409
+ json.dumps(result, indent=2) if args.json else format_pr_search_pull_contributor(result)
1410
+ )
1411
+ return
1412
+
1413
  raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
1414
 
1415
 
 
1451
  del config_path
1452
  from slop_farmer.reports.new_contributor_report import run_new_contributor_report
1453
 
1454
+ hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
1455
  print(
1456
  run_new_contributor_report(
1457
  NewContributorReportOptions(
 
1459
  output_dir=args.output_dir,
1460
  output=args.output,
1461
  json_output=args.json_output,
1462
+ hf_repo_id=hf_repo_id,
1463
+ hf_revision=hf_revision,
1464
+ hf_materialize_dir=hf_materialize_dir,
1465
  window_days=args.window_days,
1466
  max_authors=args.max_authors,
1467
  )
 
1473
  from slop_farmer.reports.dashboard import run_dashboard_data
1474
 
1475
  dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
1476
+ hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
1477
  print(
1478
  run_dashboard_data(
1479
  DashboardDataOptions(
 
1482
  analysis_input=args.analysis_input,
1483
  contributors_input=args.contributors_input,
1484
  pr_scope_input=args.pr_scope_input,
1485
+ hf_repo_id=hf_repo_id,
1486
+ hf_revision=hf_revision,
1487
+ hf_materialize_dir=hf_materialize_dir,
1488
  window_days=args.window_days,
1489
  snapshot_root=(
1490
  Path(dashboard_defaults["snapshot-root"])
 
1500
  del config_path
1501
  from slop_farmer.app.deploy import run_deploy_dashboard
1502
 
1503
+ hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
1504
  run_deploy_dashboard(
1505
  DeployDashboardOptions(
1506
  pipeline_data_dir=args.pipeline_data_dir,
 
1508
  snapshot_dir=args.snapshot_dir,
1509
  analysis_input=args.analysis_input,
1510
  contributors_input=args.contributors_input,
1511
+ pr_scope_input=args.pr_scope_input,
1512
+ hf_repo_id=hf_repo_id,
1513
+ hf_revision=hf_revision,
1514
+ hf_materialize_dir=hf_materialize_dir,
1515
  refresh_contributors=args.refresh_contributors,
1516
  dashboard_window_days=args.dashboard_window_days,
1517
  contributor_window_days=args.contributor_window_days,
 
1530
  )
1531
 
1532
 
1533
+ def _run_dataset_status(args: argparse.Namespace, config_path: Path | None) -> None:
1534
  del config_path
1535
+ from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status
1536
 
1537
+ result = get_dataset_status(
1538
+ DatasetStatusOptions(
1539
+ repo=args.repo,
1540
  output_dir=args.output_dir,
 
1541
  hf_repo_id=args.hf_repo_id,
1542
+ hf_revision=args.hf_revision,
1543
+ json_output=args.json,
1544
  )
1545
  )
1546
+ print(json.dumps(result, indent=2) if args.json else format_dataset_status(result))
1547
 
1548
 
1549
+ def _run_publish_analysis_artifacts(args: argparse.Namespace, config_path: Path | None) -> None:
1550
  del config_path
1551
+ from slop_farmer.app.publish_analysis import run_publish_analysis_artifacts
1552
 
1553
  print(
1554
+ json.dumps(
1555
+ run_publish_analysis_artifacts(
1556
+ PublishAnalysisArtifactsOptions(
1557
+ output_dir=args.output_dir,
1558
+ snapshot_dir=args.snapshot_dir,
1559
+ analysis_input=args.analysis_input,
1560
+ hf_repo_id=args.hf_repo_id,
1561
+ analysis_id=args.analysis_id,
1562
+ canonical=args.canonical,
1563
+ save_cache=args.save_cache,
1564
+ private_hf_repo=args.private_hf_repo,
1565
+ )
1566
+ ),
1567
+ indent=2,
1568
+ )
1569
+ )
1570
+
1571
+
1572
+ def _run_save_cache(args: argparse.Namespace, config_path: Path | None) -> None:
1573
+ del config_path
1574
+ from slop_farmer.app.save_cache import run_save_cache
1575
+
1576
+ print(
1577
+ json.dumps(
1578
+ run_save_cache(
1579
+ SaveCacheOptions(
1580
+ output_dir=args.output_dir,
1581
+ snapshot_dir=args.snapshot_dir,
1582
+ hf_repo_id=args.hf_repo_id,
1583
+ private_hf_repo=args.private_hf_repo,
1584
+ )
1585
+ ),
1586
+ indent=2,
1587
  )
1588
  )
1589
 
 
1595
 
1596
  handlers: dict[str, CommandHandler] = {
1597
  "scrape": _run_scrape,
1598
+ "refresh-dataset": _run_refresh_dataset,
1599
  "analyze": _run_analyze,
1600
  "markdown-report": _run_markdown_report,
1601
  "duplicate-prs": _run_duplicate_prs,
 
1606
  "new-contributor-report": _run_new_contributor_report,
1607
  "dashboard-data": _run_dashboard_data,
1608
  "deploy-dashboard": _run_deploy_dashboard,
1609
+ "dataset-status": _run_dataset_status,
1610
+ "publish-analysis-artifacts": _run_publish_analysis_artifacts,
1611
+ "save-cache": _run_save_cache,
1612
  }
1613
  handler = handlers.get(args.command)
1614
  if handler is None:
src/slop_farmer/app/dataset_refresh.py CHANGED
@@ -17,6 +17,7 @@ from slop_farmer.app_config import command_defaults, extract_cli_config_path
17
  from slop_farmer.config import (
18
  DatasetRefreshOptions,
19
  NewContributorReportOptions,
 
20
  RepoRef,
21
  resolve_github_token,
22
  )
@@ -48,6 +49,7 @@ from slop_farmer.data.parquet_io import (
48
  write_text,
49
  )
50
  from slop_farmer.reports.new_contributor_report import run_new_contributor_report
 
51
 
52
  PRIMARY_KEYS: dict[str, tuple[str, ...]] = {
53
  "issues": ("github_id",),
@@ -318,6 +320,9 @@ def _build_argument_parser(*, config_path: Path | None = None) -> argparse.Argum
318
  default=bool(defaults.get("private-hf-repo", False)),
319
  )
320
  parser.add_argument("--private", dest="private_hf_repo", action="store_true")
 
 
 
321
  return parser
322
 
323
 
@@ -872,7 +877,7 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
872
  table_name: merge_rows(table_name, previous_tables[table_name], delta_rows)
873
  for table_name, delta_rows in delta_tables.items()
874
  }
875
- manifest = {
876
  "repo": repo_slug,
877
  "snapshot_id": sid,
878
  "crawl_started_at": crawl_started_at,
@@ -918,8 +923,27 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
918
  )
919
  write_parquet(issue_comment_rows, output_root / "issue_comments.parquet", "comments")
920
  write_parquet(pr_comment_rows, output_root / "pr_comments.parquet", "comments")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
921
  if options.new_contributor_report:
922
- write_json(manifest, output_root / "manifest.json")
923
  log("Generating new contributor dataset/report artifacts")
924
  run_new_contributor_report(
925
  NewContributorReportOptions(
@@ -937,11 +961,14 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
937
  manifest["counts"]["new_contributors"] = len(
938
  read_parquet_rows(output_root / "new_contributors.parquet")
939
  )
940
- manifest["artifacts"] = {
941
- "new_contributors_parquet": "new_contributors.parquet",
942
- "new_contributors_json": "new-contributors-report.json",
943
- "new_contributors_markdown": "new-contributors-report.md",
944
- }
 
 
 
945
  manifest["watermark"].pop("previous_snapshot_dir", None)
946
  write_json(manifest, output_root / "manifest.json")
947
  write_text(
@@ -962,7 +989,7 @@ def run_dataset_refresh(options: DatasetRefreshOptions) -> dict[str, Any]:
962
  },
963
  output_root / "state" / "watermark.json",
964
  )
965
- write_json(manifest, output_root / "snapshots" / sid / "manifest.json")
966
  write_json(
967
  {
968
  "repo": repo_slug,
@@ -1012,6 +1039,7 @@ def main(argv: list[str] | None = None) -> None:
1012
  http_max_retries=args.http_max_retries,
1013
  checkpoint_every_comments=args.checkpoint_every_comments,
1014
  checkpoint_every_prs=args.checkpoint_every_prs,
 
1015
  )
1016
  )
1017
  print(json.dumps(result, indent=2))
 
17
  from slop_farmer.config import (
18
  DatasetRefreshOptions,
19
  NewContributorReportOptions,
20
+ PrScopeOptions,
21
  RepoRef,
22
  resolve_github_token,
23
  )
 
49
  write_text,
50
  )
51
  from slop_farmer.reports.new_contributor_report import run_new_contributor_report
52
+ from slop_farmer.reports.pr_scope import run_pr_scope_report
53
 
54
  PRIMARY_KEYS: dict[str, tuple[str, ...]] = {
55
  "issues": ("github_id",),
 
320
  default=bool(defaults.get("private-hf-repo", False)),
321
  )
322
  parser.add_argument("--private", dest="private_hf_repo", action="store_true")
323
+ parser.set_defaults(
324
+ cluster_suppression_rules=tuple(defaults.get("cluster-suppression-rules", ()))
325
+ )
326
  return parser
327
 
328
 
 
877
  table_name: merge_rows(table_name, previous_tables[table_name], delta_rows)
878
  for table_name, delta_rows in delta_tables.items()
879
  }
880
+ manifest: dict[str, Any] = {
881
  "repo": repo_slug,
882
  "snapshot_id": sid,
883
  "crawl_started_at": crawl_started_at,
 
923
  )
924
  write_parquet(issue_comment_rows, output_root / "issue_comments.parquet", "comments")
925
  write_parquet(pr_comment_rows, output_root / "pr_comments.parquet", "comments")
926
+ archived_snapshot_dir = output_root / "snapshots" / sid
927
+ archived_snapshot_dir.mkdir(parents=True, exist_ok=True)
928
+ write_json(manifest, output_root / "manifest.json")
929
+ log("Generating PR scope clusters")
930
+ pr_scope_path = run_pr_scope_report(
931
+ PrScopeOptions(
932
+ snapshot_dir=output_root,
933
+ output_dir=output_root,
934
+ output=output_root / "pr-scope-clusters.json",
935
+ hf_repo_id=None,
936
+ hf_revision=None,
937
+ hf_materialize_dir=None,
938
+ cluster_suppression_rules=options.cluster_suppression_rules,
939
+ )
940
+ )
941
+ shutil.copy2(pr_scope_path, archived_snapshot_dir / pr_scope_path.name)
942
+ artifacts: dict[str, str] = {
943
+ "pr_scope_clusters_json": pr_scope_path.name,
944
+ "archived_pr_scope_clusters_json": f"snapshots/{sid}/{pr_scope_path.name}",
945
+ }
946
  if options.new_contributor_report:
 
947
  log("Generating new contributor dataset/report artifacts")
948
  run_new_contributor_report(
949
  NewContributorReportOptions(
 
961
  manifest["counts"]["new_contributors"] = len(
962
  read_parquet_rows(output_root / "new_contributors.parquet")
963
  )
964
+ artifacts.update(
965
+ {
966
+ "new_contributors_parquet": "new_contributors.parquet",
967
+ "new_contributors_json": "new-contributors-report.json",
968
+ "new_contributors_markdown": "new-contributors-report.md",
969
+ }
970
+ )
971
+ manifest["artifacts"] = artifacts
972
  manifest["watermark"].pop("previous_snapshot_dir", None)
973
  write_json(manifest, output_root / "manifest.json")
974
  write_text(
 
989
  },
990
  output_root / "state" / "watermark.json",
991
  )
992
+ write_json(manifest, archived_snapshot_dir / "manifest.json")
993
  write_json(
994
  {
995
  "repo": repo_slug,
 
1039
  http_max_retries=args.http_max_retries,
1040
  checkpoint_every_comments=args.checkpoint_every_comments,
1041
  checkpoint_every_prs=args.checkpoint_every_prs,
1042
+ cluster_suppression_rules=tuple(args.cluster_suppression_rules),
1043
  )
1044
  )
1045
  print(json.dumps(result, indent=2))
src/slop_farmer/app/dataset_status.py CHANGED
@@ -15,6 +15,14 @@ from slop_farmer.data.hf_dataset_repo import (
15
  stable_snapshot_candidates,
16
  )
17
  from slop_farmer.data.parquet_io import read_json
 
 
 
 
 
 
 
 
18
 
19
 
20
  def _coerce_datetime(value: Any) -> datetime | None:
@@ -51,17 +59,41 @@ def _local_status(output_dir: Path) -> dict[str, Any] | None:
51
  if not latest_path.exists():
52
  return None
53
  payload = read_json(latest_path)
54
- snapshot_dir = payload.get("snapshot_dir")
55
- manifest = {}
56
- if isinstance(snapshot_dir, str) and snapshot_dir:
57
- manifest_path = Path(snapshot_dir).resolve() / "manifest.json"
 
 
58
  if manifest_path.exists():
59
  manifest = read_json(manifest_path)
 
60
  return {
61
  "latest_path": str(latest_path),
62
  "latest_pointer": payload,
63
- "snapshot_dir": snapshot_dir,
64
  "snapshot_id": manifest.get("snapshot_id") or payload.get("latest_snapshot_id"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  }
66
 
67
 
@@ -73,7 +105,7 @@ def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
73
  latest_pointer = load_remote_json_file(
74
  api,
75
  repo_id,
76
- "snapshots/latest.json",
77
  root,
78
  revision=revision,
79
  )
@@ -98,27 +130,32 @@ def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
98
  continue
99
  manifest = read_json(downloaded)
100
  break
101
- snapshot_prefix = (
102
- str(latest_pointer.get("snapshot_dir") or "").strip("/")
 
 
 
 
 
 
 
 
103
  if isinstance(latest_pointer, dict)
104
- else ""
105
  )
106
- contributors_present = any(
107
- path in remote_paths
108
- for path in (
109
- "new_contributors.parquet",
110
- "new-contributors-report.json",
111
- "new-contributors-report.md",
112
- )
113
  )
114
- if snapshot_prefix:
115
- contributors_present = contributors_present or any(
116
- path in remote_paths
117
- for path in (
118
- f"{snapshot_prefix}/new_contributors.parquet",
119
- f"{snapshot_prefix}/new-contributors-report.json",
120
- f"{snapshot_prefix}/new-contributors-report.md",
121
- )
122
  )
123
  extracted_at = manifest.get("extracted_at") if manifest else None
124
  return {
@@ -127,12 +164,75 @@ def _remote_status(repo_id: str, revision: str | None) -> dict[str, Any]:
127
  "latest_pointer": latest_pointer,
128
  "watermark": watermark,
129
  "manifest": manifest,
130
- "contributors_present": contributors_present,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  "remote_path_count": len(remote_paths),
132
  "age": _age_summary(extracted_at),
133
  }
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def get_dataset_status(options: DatasetStatusOptions) -> dict[str, Any]:
137
  remote = _remote_status(options.hf_repo_id, options.hf_revision) if options.hf_repo_id else None
138
  local = _local_status(options.output_dir)
@@ -156,6 +256,9 @@ def format_dataset_status(status: dict[str, Any]) -> str:
156
  watermark = remote.get("watermark") or {}
157
  latest_pointer = remote.get("latest_pointer") or {}
158
  age = remote.get("age") or {}
 
 
 
159
  lines = [
160
  f"Repo: {status.get('repo') or '?'}",
161
  f"Dataset: {status.get('dataset_id') or 'not configured'}",
@@ -166,10 +269,32 @@ def format_dataset_status(status: dict[str, Any]) -> str:
166
  f"Remote latest snapshot: {manifest.get('snapshot_id') or latest_pointer.get('latest_snapshot_id') or '?'}",
167
  f"Remote extracted at: {manifest.get('extracted_at') or '?'}",
168
  f"Remote next_since: {watermark.get('next_since') or latest_pointer.get('next_since') or '?'}",
169
- f"Contributor artifacts: {'yes' if remote.get('contributors_present') else 'no'}",
170
- f"Freshness: {age.get('summary') or 'unknown'} ({age.get('staleness') or 'unknown'})",
171
  ]
172
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  if local:
174
  lines.extend(
175
  [
@@ -177,6 +302,15 @@ def format_dataset_status(status: dict[str, Any]) -> str:
177
  f"Local snapshot id: {local.get('snapshot_id') or '?'}",
178
  ]
179
  )
 
 
 
 
 
 
 
 
 
180
  else:
181
  lines.append("Local latest pointer: none")
182
  return "\n".join(lines)
 
15
  stable_snapshot_candidates,
16
  )
17
  from slop_farmer.data.parquet_io import read_json
18
+ from slop_farmer.data.snapshot_paths import (
19
+ CONTRIBUTOR_ARTIFACT_FILENAMES,
20
+ CURRENT_ANALYSIS_MANIFEST_PATH,
21
+ PR_SCOPE_CLUSTERS_FILENAME,
22
+ SNAPSHOTS_LATEST_PATH,
23
+ load_current_analysis_manifest,
24
+ repo_relative_path_to_local,
25
+ )
26
 
27
 
28
  def _coerce_datetime(value: Any) -> datetime | None:
 
59
  if not latest_path.exists():
60
  return None
61
  payload = read_json(latest_path)
62
+ snapshot_dir_raw = payload.get("snapshot_dir")
63
+ manifest: dict[str, Any] = {}
64
+ snapshot_dir: Path | None = None
65
+ if isinstance(snapshot_dir_raw, str) and snapshot_dir_raw:
66
+ snapshot_dir = Path(snapshot_dir_raw).resolve()
67
+ manifest_path = snapshot_dir / "manifest.json"
68
  if manifest_path.exists():
69
  manifest = read_json(manifest_path)
70
+ current_analysis = _local_current_analysis(snapshot_dir)
71
  return {
72
  "latest_path": str(latest_path),
73
  "latest_pointer": payload,
74
+ "snapshot_dir": snapshot_dir_raw,
75
  "snapshot_id": manifest.get("snapshot_id") or payload.get("latest_snapshot_id"),
76
+ "current_analysis": current_analysis,
77
+ }
78
+
79
+
80
+ def _local_current_analysis(snapshot_dir: Path | None) -> dict[str, Any]:
81
+ if snapshot_dir is None:
82
+ return {"present": False}
83
+ manifest_path = repo_relative_path_to_local(snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH)
84
+ if not manifest_path.exists():
85
+ return {"present": False}
86
+ try:
87
+ manifest = load_current_analysis_manifest(manifest_path)
88
+ except ValueError as exc:
89
+ return {"present": True, "valid": False, "detail": str(exc)}
90
+ return {
91
+ "present": True,
92
+ "valid": True,
93
+ "snapshot_id": manifest["snapshot_id"],
94
+ "analysis_id": manifest["analysis_id"],
95
+ "variant": manifest["variant"],
96
+ "published_at": manifest["published_at"],
97
  }
98
 
99
 
 
105
  latest_pointer = load_remote_json_file(
106
  api,
107
  repo_id,
108
+ SNAPSHOTS_LATEST_PATH,
109
  root,
110
  revision=revision,
111
  )
 
130
  continue
131
  manifest = read_json(downloaded)
132
  break
133
+ current_analysis = _remote_current_analysis(
134
+ api,
135
+ repo_id,
136
+ root,
137
+ revision=revision,
138
+ remote_paths=remote_paths,
139
+ latest_pointer=latest_pointer,
140
+ )
141
+ latest_snapshot_id = (
142
+ str(latest_pointer.get("latest_snapshot_id"))
143
  if isinstance(latest_pointer, dict)
144
+ else None
145
  )
146
+ archived_run_manifests = sorted(
147
+ path
148
+ for path in remote_paths
149
+ if path.startswith("snapshots/")
150
+ and "/analysis-runs/" in path
151
+ and path.endswith("/manifest.json")
 
152
  )
153
+ current_snapshot_run_count = 0
154
+ if latest_snapshot_id:
155
+ current_snapshot_run_count = sum(
156
+ 1
157
+ for path in archived_run_manifests
158
+ if path.startswith(f"snapshots/{latest_snapshot_id}/analysis-runs/")
 
 
159
  )
160
  extracted_at = manifest.get("extracted_at") if manifest else None
161
  return {
 
164
  "latest_pointer": latest_pointer,
165
  "watermark": watermark,
166
  "manifest": manifest,
167
+ "cheap_artifacts": {
168
+ "pr_scope_clusters": _remote_has_latest_artifact(
169
+ remote_paths,
170
+ latest_pointer,
171
+ PR_SCOPE_CLUSTERS_FILENAME,
172
+ ),
173
+ "contributors": all(
174
+ _remote_has_latest_artifact(remote_paths, latest_pointer, filename)
175
+ for filename in CONTRIBUTOR_ARTIFACT_FILENAMES
176
+ ),
177
+ },
178
+ "current_analysis": current_analysis,
179
+ "archived_analysis_runs": {
180
+ "count": len(archived_run_manifests),
181
+ "current_snapshot_count": current_snapshot_run_count,
182
+ },
183
  "remote_path_count": len(remote_paths),
184
  "age": _age_summary(extracted_at),
185
  }
186
 
187
 
188
+ def _remote_current_analysis(
189
+ api: HfApi,
190
+ repo_id: str,
191
+ root: Path,
192
+ *,
193
+ revision: str | None,
194
+ remote_paths: set[str],
195
+ latest_pointer: dict[str, Any] | None,
196
+ ) -> dict[str, Any]:
197
+ if CURRENT_ANALYSIS_MANIFEST_PATH not in remote_paths:
198
+ return {"present": False}
199
+ downloaded = load_remote_file(
200
+ api,
201
+ repo_id,
202
+ CURRENT_ANALYSIS_MANIFEST_PATH,
203
+ root,
204
+ revision=revision,
205
+ )
206
+ if downloaded is None:
207
+ return {"present": False}
208
+ try:
209
+ manifest = load_current_analysis_manifest(downloaded)
210
+ except ValueError as exc:
211
+ return {"present": True, "valid": False, "detail": str(exc)}
212
+ latest_snapshot_id = (
213
+ str(latest_pointer.get("latest_snapshot_id")) if isinstance(latest_pointer, dict) else None
214
+ )
215
+ return {
216
+ "present": True,
217
+ "valid": True,
218
+ "snapshot_id": manifest["snapshot_id"],
219
+ "analysis_id": manifest["analysis_id"],
220
+ "variant": manifest["variant"],
221
+ "published_at": manifest["published_at"],
222
+ "matches_latest_snapshot": manifest["snapshot_id"] == latest_snapshot_id,
223
+ "artifact_count": len(manifest["artifacts"]),
224
+ }
225
+
226
+
227
+ def _remote_has_latest_artifact(
228
+ remote_paths: set[str],
229
+ latest_pointer: dict[str, Any] | None,
230
+ filename: str,
231
+ ) -> bool:
232
+ candidates = stable_snapshot_candidates(latest_pointer, filename)
233
+ return any(candidate in remote_paths for candidate in candidates)
234
+
235
+
236
  def get_dataset_status(options: DatasetStatusOptions) -> dict[str, Any]:
237
  remote = _remote_status(options.hf_repo_id, options.hf_revision) if options.hf_repo_id else None
238
  local = _local_status(options.output_dir)
 
256
  watermark = remote.get("watermark") or {}
257
  latest_pointer = remote.get("latest_pointer") or {}
258
  age = remote.get("age") or {}
259
+ current_analysis = remote.get("current_analysis") or {}
260
+ cheap_artifacts = remote.get("cheap_artifacts") or {}
261
+ archived_runs = remote.get("archived_analysis_runs") or {}
262
  lines = [
263
  f"Repo: {status.get('repo') or '?'}",
264
  f"Dataset: {status.get('dataset_id') or 'not configured'}",
 
269
  f"Remote latest snapshot: {manifest.get('snapshot_id') or latest_pointer.get('latest_snapshot_id') or '?'}",
270
  f"Remote extracted at: {manifest.get('extracted_at') or '?'}",
271
  f"Remote next_since: {watermark.get('next_since') or latest_pointer.get('next_since') or '?'}",
272
+ f"PR scope artifact: {'yes' if cheap_artifacts.get('pr_scope_clusters') else 'no'}",
273
+ f"Contributor artifacts: {'yes' if cheap_artifacts.get('contributors') else 'no'}",
274
  ]
275
  )
276
+ if current_analysis.get("present"):
277
+ if current_analysis.get("valid") is False:
278
+ lines.append(f"Current analysis: invalid ({current_analysis.get('detail')})")
279
+ else:
280
+ lines.append(
281
+ "Current analysis: "
282
+ f"snapshot={current_analysis.get('snapshot_id')} "
283
+ f"analysis_id={current_analysis.get('analysis_id')}"
284
+ )
285
+ lines.append(
286
+ "Current analysis matches latest snapshot: "
287
+ f"{'yes' if current_analysis.get('matches_latest_snapshot') else 'no'}"
288
+ )
289
+ else:
290
+ lines.append("Current analysis: none")
291
+ lines.append(
292
+ "Archived analysis runs: "
293
+ f"{archived_runs.get('count', 0)} total, {archived_runs.get('current_snapshot_count', 0)} for latest snapshot"
294
+ )
295
+ lines.append(
296
+ f"Freshness: {age.get('summary') or 'unknown'} ({age.get('staleness') or 'unknown'})"
297
+ )
298
  if local:
299
  lines.extend(
300
  [
 
302
  f"Local snapshot id: {local.get('snapshot_id') or '?'}",
303
  ]
304
  )
305
+ local_current_analysis = local.get("current_analysis") or {}
306
+ if local_current_analysis.get("present"):
307
+ lines.append(
308
+ "Local current analysis: "
309
+ f"snapshot={local_current_analysis.get('snapshot_id')} "
310
+ f"analysis_id={local_current_analysis.get('analysis_id')}"
311
+ )
312
+ else:
313
+ lines.append("Local current analysis: none")
314
  else:
315
  lines.append("Local latest pointer: none")
316
  return "\n".join(lines)
src/slop_farmer/app/deploy.py CHANGED
@@ -5,6 +5,7 @@ import subprocess
5
  from pathlib import Path
6
 
7
  from slop_farmer.config import DeployDashboardOptions
 
8
 
9
 
10
  def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
@@ -17,6 +18,16 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
17
  {
18
  "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
19
  "WEB_DIR": str(options.web_dir),
 
 
 
 
 
 
 
 
 
 
20
  "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
21
  "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
22
  "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
@@ -28,12 +39,12 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
28
  "SPACE_SHORT_DESCRIPTION": options.space_short_description,
29
  }
30
  )
31
- if options.snapshot_dir is not None:
32
- env["SNAPSHOT_DIR"] = str(options.snapshot_dir)
33
  if options.analysis_input is not None:
34
- env["ANALYSIS_INPUT"] = str(options.analysis_input)
35
  if options.contributors_input is not None:
36
- env["CONTRIBUTORS_INPUT"] = str(options.contributors_input)
 
 
37
  if options.refresh_contributors:
38
  env["REFRESH_CONTRIBUTORS"] = "1"
39
  if options.private_space:
 
5
  from pathlib import Path
6
 
7
  from slop_farmer.config import DeployDashboardOptions
8
+ from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
9
 
10
 
11
  def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
 
18
  {
19
  "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
20
  "WEB_DIR": str(options.web_dir),
21
+ "SNAPSHOT_DIR": str(
22
+ resolve_snapshot_source_dir(
23
+ snapshot_dir=options.snapshot_dir,
24
+ local_snapshots_root=options.pipeline_data_dir.resolve() / "snapshots",
25
+ hf_repo_id=options.hf_repo_id,
26
+ hf_revision=options.hf_revision,
27
+ hf_materialize_dir=options.hf_materialize_dir,
28
+ hf_output_dir=options.pipeline_data_dir,
29
+ )
30
+ ),
31
  "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
32
  "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
33
  "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
 
39
  "SPACE_SHORT_DESCRIPTION": options.space_short_description,
40
  }
41
  )
 
 
42
  if options.analysis_input is not None:
43
+ env["ANALYSIS_INPUT"] = str(options.analysis_input.resolve())
44
  if options.contributors_input is not None:
45
+ env["CONTRIBUTORS_INPUT"] = str(options.contributors_input.resolve())
46
+ if options.pr_scope_input is not None:
47
+ env["PR_SCOPE_INPUT"] = str(options.pr_scope_input.resolve())
48
  if options.refresh_contributors:
49
  env["REFRESH_CONTRIBUTORS"] = "1"
50
  if options.private_space:
src/slop_farmer/app/hf_checkpoint_import.py CHANGED
@@ -26,8 +26,9 @@ from typing import Any
26
 
27
  from huggingface_hub import HfApi, hf_hub_download
28
 
29
- from slop_farmer.app.publish import publish_snapshot
30
  from slop_farmer.config import CheckpointImportOptions
 
31
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
32
  from slop_farmer.data.parquet_io import (
33
  SCHEMAS,
@@ -106,7 +107,9 @@ def import_hf_checkpoint(options: CheckpointImportOptions) -> Path:
106
  force=options.force,
107
  )
108
  if options.publish_repo_id:
109
- publish_snapshot(snapshot_dir, options.publish_repo_id, private=options.private_hf_repo)
 
 
110
  return snapshot_dir
111
 
112
 
@@ -455,76 +458,15 @@ def _viewer_comment_rows(
455
  def _dataset_card(
456
  repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
457
  ) -> str:
458
- return f"""---
459
- pretty_name: Transformers PR Slop Dataset
460
- configs:
461
- - config_name: issues
462
- data_files:
463
- - split: train
464
- path: issues.parquet
465
- default: true
466
- - config_name: prs
467
- data_files:
468
- - split: train
469
- path: pull_requests.parquet
470
- - config_name: issue_comments
471
- data_files:
472
- - split: train
473
- path: issue_comments.parquet
474
- - config_name: pr_comments
475
- data_files:
476
- - split: train
477
- path: pr_comments.parquet
478
- - config_name: pr_reviews
479
- data_files:
480
- - split: train
481
- path: reviews.parquet
482
- - config_name: pr_files
483
- data_files:
484
- - split: train
485
- path: pr_files.parquet
486
- - config_name: pr_diffs
487
- data_files:
488
- - split: train
489
- path: pr_diffs.parquet
490
- - config_name: review_comments
491
- data_files:
492
- - split: train
493
- path: review_comments.parquet
494
- - config_name: links
495
- data_files:
496
- - split: train
497
- path: links.parquet
498
- - config_name: events
499
- data_files:
500
- - split: train
501
- path: events.parquet
502
- ---
503
- ---
504
-
505
- # Transformers PR Slop Dataset
506
-
507
- Imported checkpoint snapshot for `{repo_slug}`.
508
-
509
- Files:
510
- - `issues.parquet`
511
- - `pull_requests.parquet`
512
- - `comments.parquet`
513
- - `issue_comments.parquet`
514
- - `pr_comments.parquet`
515
- - `reviews.parquet`
516
- - `pr_files.parquet`
517
- - `pr_diffs.parquet`
518
- - `review_comments.parquet`
519
- - `links.parquet`
520
- - `events.parquet`
521
-
522
- Notes:
523
- - source HF dataset: `{source_repo_id}`
524
- - source checkpoint root: `{checkpoint_root}`
525
- - latest imported checkpoint: `{snapshot_id}`
526
- - links were regenerated locally from text references and timeline events
527
- """
528
 
529
 
530
  def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
 
26
 
27
  from huggingface_hub import HfApi, hf_hub_download
28
 
29
+ from slop_farmer.app.publish_dataset_snapshot import publish_dataset_snapshot
30
  from slop_farmer.config import CheckpointImportOptions
31
+ from slop_farmer.data.dataset_card import build_hf_dataset_card
32
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
33
  from slop_farmer.data.parquet_io import (
34
  SCHEMAS,
 
107
  force=options.force,
108
  )
109
  if options.publish_repo_id:
110
+ publish_dataset_snapshot(
111
+ snapshot_dir, options.publish_repo_id, private=options.private_hf_repo
112
+ )
113
  return snapshot_dir
114
 
115
 
 
458
  def _dataset_card(
459
  repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
460
  ) -> str:
461
+ return build_hf_dataset_card(
462
+ repo_slug,
463
+ snapshot_id,
464
+ notes=[
465
+ f"source HF dataset: `{source_repo_id}`",
466
+ f"source checkpoint root: `{checkpoint_root}`",
467
+ "links were regenerated locally from text references and timeline events",
468
+ ],
469
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
 
472
  def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:
src/slop_farmer/app/pipeline.py CHANGED
@@ -7,8 +7,8 @@ from datetime import UTC, datetime, timedelta
7
  from pathlib import Path
8
  from typing import Any, Protocol
9
 
10
- from slop_farmer.app.publish import publish_snapshot
11
  from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
 
12
  from slop_farmer.data.github_api import GitHubClient
13
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
14
  from slop_farmer.data.normalize import (
@@ -112,96 +112,14 @@ def _reference_time_for_age_caps(crawl_started_at: str) -> datetime:
112
  def _dataset_card(
113
  repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
114
  ) -> str:
115
- new_contributor_config = ""
116
- new_contributor_file = ""
117
- if include_new_contributors:
118
- new_contributor_config = """- config_name: new_contributors
119
- data_files:
120
- - split: train
121
- path: new_contributors.parquet
122
- """
123
- new_contributor_file = """- `new_contributors.parquet`
124
- - `new-contributors-report.json`
125
- - `new-contributors-report.md`
126
- """
127
- return f"""---
128
- pretty_name: Transformers PR Slop Dataset
129
- configs:
130
- - config_name: issues
131
- data_files:
132
- - split: train
133
- path: issues.parquet
134
- default: true
135
- - config_name: prs
136
- data_files:
137
- - split: train
138
- path: pull_requests.parquet
139
- - config_name: issue_comments
140
- data_files:
141
- - split: train
142
- path: issue_comments.parquet
143
- - config_name: pr_comments
144
- data_files:
145
- - split: train
146
- path: pr_comments.parquet
147
- - config_name: pr_reviews
148
- data_files:
149
- - split: train
150
- path: reviews.parquet
151
- - config_name: pr_files
152
- data_files:
153
- - split: train
154
- path: pr_files.parquet
155
- - config_name: pr_diffs
156
- data_files:
157
- - split: train
158
- path: pr_diffs.parquet
159
- - config_name: review_comments
160
- data_files:
161
- - split: train
162
- path: review_comments.parquet
163
- - config_name: links
164
- data_files:
165
- - split: train
166
- path: links.parquet
167
- - config_name: events
168
- data_files:
169
- - split: train
170
- path: events.parquet
171
- {new_contributor_config}---
172
- ---
173
-
174
- # Transformers PR Slop Dataset
175
-
176
- Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo}`.
177
-
178
- Files:
179
- - `issues.parquet`
180
- - `pull_requests.parquet`
181
- - `comments.parquet`
182
- - `issue_comments.parquet` (derived view of issue discussion comments)
183
- - `pr_comments.parquet` (derived view of pull request discussion comments)
184
- - `reviews.parquet`
185
- - `pr_files.parquet`
186
- - `pr_diffs.parquet`
187
- - `review_comments.parquet`
188
- - `links.parquet`
189
- - `events.parquet`
190
- {new_contributor_file}
191
-
192
- Use:
193
- - duplicate PR and issue analysis
194
- - triage and ranking experiments
195
- - eval set creation
196
-
197
- Notes:
198
- - updated daily
199
- - latest snapshot: `{snapshot_id}`
200
- - raw data only; no labels or moderation decisions
201
- - PR metadata, file-level patch hunks, and full unified diffs are included
202
- - new contributor reviewer artifacts are included when generated for the snapshot
203
- - full file contents for changed files are not included
204
- """
205
 
206
 
207
  def _viewer_comment_rows(
@@ -982,9 +900,6 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
982
  "issue_max_age_days": options.issue_max_age_days,
983
  "pr_max_age_days": options.pr_max_age_days,
984
  "fetch_timeline": options.fetch_timeline,
985
- "publish": options.publish,
986
- "hf_repo_id": options.hf_repo_id,
987
- "private_hf_repo": options.private_hf_repo,
988
  "new_contributor_report": options.new_contributor_report,
989
  "new_contributor_window_days": options.new_contributor_window_days,
990
  "new_contributor_max_authors": options.new_contributor_max_authors,
@@ -1045,6 +960,9 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
1045
  output_dir=options.output_dir,
1046
  output=None,
1047
  json_output=None,
 
 
 
1048
  window_days=options.new_contributor_window_days,
1049
  max_authors=options.new_contributor_max_authors,
1050
  )
@@ -1094,12 +1012,5 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
1094
  _log(f"Updated watermark state: {_watermark_path(options.output_dir)}")
1095
 
1096
  _clear_checkpoint(options.output_dir, snapshot_dir)
1097
- if options.publish:
1098
- if not options.hf_repo_id:
1099
- raise ValueError("--publish requires --hf-repo-id")
1100
- publish_snapshot(
1101
- snapshot_dir, options.hf_repo_id, private=options.private_hf_repo, log=_log
1102
- )
1103
-
1104
  _log(f"Snapshot complete: {snapshot_dir}")
1105
  return snapshot_dir
 
7
  from pathlib import Path
8
  from typing import Any, Protocol
9
 
 
10
  from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
11
+ from slop_farmer.data.dataset_card import build_hf_dataset_card
12
  from slop_farmer.data.github_api import GitHubClient
13
  from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
14
  from slop_farmer.data.normalize import (
 
112
  def _dataset_card(
113
  repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
114
  ) -> str:
115
+ notes = ["new contributor reviewer artifacts are included"] if include_new_contributors else []
116
+ del manifest
117
+ return build_hf_dataset_card(
118
+ repo,
119
+ snapshot_id,
120
+ include_new_contributors=include_new_contributors,
121
+ notes=notes,
122
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
 
125
  def _viewer_comment_rows(
 
900
  "issue_max_age_days": options.issue_max_age_days,
901
  "pr_max_age_days": options.pr_max_age_days,
902
  "fetch_timeline": options.fetch_timeline,
 
 
 
903
  "new_contributor_report": options.new_contributor_report,
904
  "new_contributor_window_days": options.new_contributor_window_days,
905
  "new_contributor_max_authors": options.new_contributor_max_authors,
 
960
  output_dir=options.output_dir,
961
  output=None,
962
  json_output=None,
963
+ hf_repo_id=None,
964
+ hf_revision=None,
965
+ hf_materialize_dir=None,
966
  window_days=options.new_contributor_window_days,
967
  max_authors=options.new_contributor_max_authors,
968
  )
 
1012
  _log(f"Updated watermark state: {_watermark_path(options.output_dir)}")
1013
 
1014
  _clear_checkpoint(options.output_dir, snapshot_dir)
 
 
 
 
 
 
 
1015
  _log(f"Snapshot complete: {snapshot_dir}")
1016
  return snapshot_dir
src/slop_farmer/app/pr_search.py CHANGED
@@ -10,9 +10,12 @@ get_pr_search_status = pr_search_service.get_pr_search_status
10
  get_pr_search_similar = pr_search_service.get_pr_search_similar
11
  get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
12
  get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
 
 
13
  get_pr_search_clusters = pr_search_service.get_pr_search_clusters
14
  list_pr_search_clusters = pr_search_service.list_pr_search_clusters
15
  get_pr_search_cluster = pr_search_service.get_pr_search_cluster
 
16
  explain_pr_search_pair = pr_search_service.explain_pr_search_pair
17
  probe_pr_search_live = pr_search_service.probe_pr_search_live
18
  probe_pr_search_github = pr_search_service.probe_pr_search_github
@@ -31,6 +34,7 @@ def format_pr_search_status(result: Mapping[str, Any]) -> str:
31
  (
32
  "Rows: "
33
  f"documents={counts['documents']} "
 
34
  f"features={counts['features']} "
35
  f"neighbors={counts['neighbors']} "
36
  f"clusters={counts['clusters']} "
@@ -245,3 +249,73 @@ def format_pr_search_probe(result: Mapping[str, Any]) -> str:
245
  if row.get("reason"):
246
  lines.append(f" reason: {row['reason']}")
247
  return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  get_pr_search_similar = pr_search_service.get_pr_search_similar
11
  get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
12
  get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
13
+ get_pr_search_contributor = pr_search_service.get_pr_search_contributor
14
+ get_pr_search_contributor_pulls = pr_search_service.get_pr_search_contributor_pulls
15
  get_pr_search_clusters = pr_search_service.get_pr_search_clusters
16
  list_pr_search_clusters = pr_search_service.list_pr_search_clusters
17
  get_pr_search_cluster = pr_search_service.get_pr_search_cluster
18
+ get_pr_search_pull_contributor = pr_search_service.get_pr_search_pull_contributor
19
  explain_pr_search_pair = pr_search_service.explain_pr_search_pair
20
  probe_pr_search_live = pr_search_service.probe_pr_search_live
21
  probe_pr_search_github = pr_search_service.probe_pr_search_github
 
34
  (
35
  "Rows: "
36
  f"documents={counts['documents']} "
37
+ f"contributors={counts.get('contributors', 0)} "
38
  f"features={counts['features']} "
39
  f"neighbors={counts['neighbors']} "
40
  f"clusters={counts['clusters']} "
 
249
  if row.get("reason"):
250
  lines.append(f" reason: {row['reason']}")
251
  return "\n".join(lines)
252
+
253
+
254
def format_pr_search_contributor(result: Mapping[str, Any]) -> str:
    """Render a contributor lookup result as a plain-text report.

    Args:
        result: Mapping holding ``contributor`` (with at least ``author_login``),
            ``repo`` and ``snapshot_id``; an optional ``pulls`` list supplies the
            PR rows (each row needs ``pr_number``).

    Returns:
        Newline-joined text: a contributor header followed by one line per PR,
        or ``- none`` when no PR rows are present.

    Raises:
        KeyError: If ``contributor``, ``repo``, ``snapshot_id``,
            ``author_login`` or a row's ``pr_number`` is missing.
    """

    def _shown(value: Any) -> Any:
        # Only missing/blank values become "-". `value or "-"` would also hide
        # a legitimate score of 0.
        return "-" if value is None or value == "" else value

    contributor = result["contributor"]
    first_seen = "yes" if contributor.get("first_seen_in_snapshot") else "no"
    public_orgs = ", ".join(contributor.get("public_orgs") or [])
    lines = [
        f"Contributor {contributor['author_login']}",
        f"Repo: {result['repo']}",
        f"Snapshot: {result['snapshot_id']}",
        f"Name: {contributor.get('name') or '-'}",
        f"Profile: {contributor.get('profile_url') or '-'}",
        f"Association: {contributor.get('repo_association') or '-'}",
        f"First seen in snapshot: {first_seen}",
        (
            "Scores: "
            f"follow-through={_shown(contributor.get('follow_through_score'))} "
            f"breadth={_shown(contributor.get('breadth_score'))} "
            f"risk={_shown(contributor.get('automation_risk_signal'))}"
        ),
        f"Heuristic: {contributor.get('heuristic_note') or '-'}",
        f"Public orgs: {public_orgs or '-'}",
        "",
        "Recent indexed PRs:",
    ]
    pulls = result.get("pulls") or []
    if not pulls:
        lines.append("- none")
        return "\n".join(lines)
    for row in pulls:
        merged = "yes" if row.get("merged") else "no"
        lines.append(
            f"- PR #{row['pr_number']}: {row.get('title') or ''} "
            f"[state={row.get('state') or '-'} merged={merged}]"
        )
    return "\n".join(lines)
285
+
286
+
287
def format_pr_search_contributor_pulls(result: Mapping[str, Any]) -> str:
    """Render the indexed pull requests of one contributor as plain text.

    Args:
        result: Mapping holding ``contributor`` (with ``author_login``), ``repo``
            and ``snapshot_id``; optional ``pulls`` rows and ``pull_count``.

    Returns:
        Newline-joined text: a four-line header, a blank line, then one line
        per PR or a "no indexed PRs" message.

    Raises:
        KeyError: If a required key (``contributor``, ``author_login``, ``repo``,
            ``snapshot_id`` or a row's ``pr_number``) is missing.
    """
    contributor = result["contributor"]
    pulls = result.get("pulls") or []
    # Fall back to the row count when pull_count is absent *or* explicitly None,
    # so the header never reads "Pull requests: None".
    pull_count = result.get("pull_count")
    if pull_count is None:
        pull_count = len(pulls)
    lines = [
        f"Contributor PRs: {contributor['author_login']}",
        f"Repo: {result['repo']}",
        f"Snapshot: {result['snapshot_id']}",
        f"Pull requests: {pull_count}",
        "",
    ]
    if not pulls:
        lines.append("No indexed PRs found for that contributor.")
        return "\n".join(lines)
    for row in pulls:
        lines.append(
            f"- PR #{row['pr_number']}: {row.get('title') or ''} "
            f"(updated={row.get('updated_at') or '-'}, state={row.get('state') or '-'})"
        )
    return "\n".join(lines)
306
+
307
+
308
def format_pr_search_pull_contributor(result: Mapping[str, Any]) -> str:
    """Render the contributor summary attached to a single pull request.

    Args:
        result: Mapping holding ``pr`` (with ``pr_number``) and ``contributor``
            (with ``author_login``).

    Returns:
        Seven newline-joined lines: PR title, author, risk, follow-through,
        breadth, heuristic note and profile URL.

    Raises:
        KeyError: If ``pr``, ``contributor``, ``pr_number`` or ``author_login``
            is missing.
    """

    def _shown(value: Any) -> Any:
        # Only missing/blank values become "-". `value or "-"` would also hide
        # a legitimate score of 0.
        return "-" if value is None or value == "" else value

    pr = result["pr"]
    contributor = result["contributor"]
    lines = [
        f"PR #{pr['pr_number']}: {pr.get('title') or ''}",
        f"Author: {contributor['author_login']}",
        f"Risk: {_shown(contributor.get('automation_risk_signal'))}",
        f"Follow-through: {_shown(contributor.get('follow_through_score'))}",
        f"Breadth: {_shown(contributor.get('breadth_score'))}",
        f"Heuristic: {contributor.get('heuristic_note') or '-'}",
        f"Profile: {contributor.get('profile_url') or '-'}",
    ]
    return "\n".join(lines)
src/slop_farmer/app/pr_search_api.py CHANGED
@@ -12,10 +12,23 @@ from fastapi.responses import JSONResponse
12
  from slop_farmer.config import PrSearchRefreshOptions
13
  from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
14
  from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
15
- from slop_farmer.data.snapshot_paths import default_hf_materialize_dir
 
 
 
 
 
 
 
 
 
 
 
16
  from slop_farmer.reports.pr_search_service import (
17
  get_pr_search_cluster,
18
  get_pr_search_clusters,
 
 
19
  get_pr_search_similar_lookup,
20
  get_pr_search_status,
21
  list_pr_search_clusters,
@@ -120,7 +133,7 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
120
  app.state.startup_error = str(exc)
121
  yield
122
 
123
- app = FastAPI(title="slop PR search API", version="0.1.0", lifespan=lifespan)
124
 
125
  @app.exception_handler(ValueError)
126
  async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
@@ -156,11 +169,13 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
156
  contributor_snapshot_dir = _surface_snapshot_dir(
157
  settings, repo_slug, surface="contributors"
158
  )
159
- surface_payload = {
160
- "issues": get_snapshot_surfaces(issue_snapshot_dir)["issues"],
161
- "contributors": get_snapshot_surfaces(contributor_snapshot_dir)["contributors"],
 
 
 
162
  }
163
- return {**status, "surfaces": surface_payload}
164
 
165
  @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
166
  async def pr_similar(
@@ -238,6 +253,166 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
238
  ),
239
  )
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  @app.get("/v1/repos/{owner}/{repo}/issues/status")
242
  async def issue_status(
243
  owner: str,
@@ -364,7 +539,9 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
364
  ) -> dict[str, Any]:
365
  settings = request.app.state.settings
366
  repo_slug = _repo_slug(settings, owner, repo)
367
- return get_contributor_status(_surface_snapshot_dir(settings, repo_slug, surface="contributors"))
 
 
368
 
369
  @app.get("/v1/repos/{owner}/{repo}/contributors")
370
  async def contributors(
@@ -531,7 +708,9 @@ def _surface_available(snapshot_dir: Path, *, surface: Literal["issues", "contri
531
  if not snapshot_dir.exists():
532
  return False
533
  if surface == "issues":
534
- return any(snapshot_dir.glob("analysis-report*.json"))
 
 
535
  return (snapshot_dir / "new-contributors-report.json").exists()
536
 
537
 
@@ -558,6 +737,10 @@ def _looks_not_found(exc: ValueError) -> bool:
558
  message = str(exc).lower()
559
  return (
560
  "not found" in message
 
 
 
 
561
  or "no active pr search run" in message
562
  or "was not found in the active indexed universe" in message
563
  )
 
12
  from slop_farmer.config import PrSearchRefreshOptions
13
  from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
14
  from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
15
+ from slop_farmer.data.snapshot_paths import (
16
+ CURRENT_ANALYSIS_MANIFEST_PATH,
17
+ default_hf_materialize_dir,
18
+ )
19
+ from slop_farmer.reports.analysis_service import (
20
+ get_analysis_best,
21
+ get_analysis_meta_bug,
22
+ get_analysis_status,
23
+ get_pr_analysis,
24
+ list_analysis_duplicate_prs,
25
+ list_analysis_meta_bugs,
26
+ )
27
  from slop_farmer.reports.pr_search_service import (
28
  get_pr_search_cluster,
29
  get_pr_search_clusters,
30
+ get_pr_search_contributor_pulls,
31
+ get_pr_search_pull_contributor,
32
  get_pr_search_similar_lookup,
33
  get_pr_search_status,
34
  list_pr_search_clusters,
 
133
  app.state.startup_error = str(exc)
134
  yield
135
 
136
+ app = FastAPI(title="slop PR search API", version="0.1.1", lifespan=lifespan)
137
 
138
  @app.exception_handler(ValueError)
139
  async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
 
169
  contributor_snapshot_dir = _surface_snapshot_dir(
170
  settings, repo_slug, surface="contributors"
171
  )
172
+ return {
173
+ **status,
174
+ "surfaces": {
175
+ "issues": get_snapshot_surfaces(issue_snapshot_dir)["issues"],
176
+ "contributors": get_snapshot_surfaces(contributor_snapshot_dir)["contributors"],
177
+ },
178
  }
 
179
 
180
  @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
181
  async def pr_similar(
 
253
  ),
254
  )
255
 
256
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Lists the indexed pull requests of one contributor; `limit` is passed through
# `_limit` with the similar-search default/maximum taken from settings.
@app.get("/v1/repos/{owner}/{repo}/contributors/{login}/pulls")
async def contributor_pulls(
    owner: str,
    repo: str,
    login: str,
    request: Request,
    limit: int | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return get_pr_search_contributor_pulls(
        settings.index_path,
        repo=repo_slug,
        author_login=login,
        limit=_limit(
            limit, default=settings.similar_limit_default, maximum=settings.similar_limit_max
        ),
    )
274
+
275
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Returns the contributor record associated with a single pull request.
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/contributor")
async def pull_contributor(
    owner: str,
    repo: str,
    number: int,
    request: Request,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return get_pr_search_pull_contributor(settings.index_path, repo=repo_slug, pr_number=number)
285
+
286
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Forwards the variant / snapshot_id / analysis_id query parameters to
# get_analysis_status for the resolved repo slug.
@app.get("/v1/repos/{owner}/{repo}/analysis/status")
async def analysis_status(
    owner: str,
    repo: str,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return get_analysis_status(
        settings.index_path,
        repo=repo_slug,
        variant=variant,
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
304
+
305
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Returns the analysis for one pull request via get_pr_analysis.
@app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
async def pr_analysis(
    owner: str,
    repo: str,
    number: int,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return get_pr_analysis(
        settings.index_path,
        repo=repo_slug,
        pr_number=number,
        variant=variant,
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
325
+
326
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Lists analysis meta-bugs; `limit` is passed through `_limit` with the
# cluster-list default/maximum taken from settings.
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
async def analysis_meta_bugs(
    owner: str,
    repo: str,
    request: Request,
    limit: int | None = None,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return list_analysis_meta_bugs(
        settings.index_path,
        repo=repo_slug,
        variant=variant,
        limit=_limit(
            limit,
            default=settings.cluster_list_limit_default,
            maximum=settings.cluster_list_limit_max,
        ),
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
350
+
351
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Returns a single meta-bug, identified by `cluster_id`, via get_analysis_meta_bug.
@app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs/{cluster_id}")
async def analysis_meta_bug(
    owner: str,
    repo: str,
    cluster_id: str,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return get_analysis_meta_bug(
        settings.index_path,
        repo=repo_slug,
        cluster_id=cluster_id,
        variant=variant,
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
371
+
372
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Lists duplicate-PR analysis entries; `limit` is passed through `_limit` with
# the cluster-list default/maximum taken from settings.
@app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
async def analysis_duplicate_prs(
    owner: str,
    repo: str,
    request: Request,
    limit: int | None = None,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return list_analysis_duplicate_prs(
        settings.index_path,
        repo=repo_slug,
        variant=variant,
        limit=_limit(
            limit,
            default=settings.cluster_list_limit_default,
            maximum=settings.cluster_list_limit_max,
        ),
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
396
+
397
# Route handler registered on `app` (defined inside create_app per the diff hunk header).
# Forwards the variant / snapshot_id / analysis_id query parameters to
# get_analysis_best for the resolved repo slug.
@app.get("/v1/repos/{owner}/{repo}/analysis/best")
async def analysis_best(
    owner: str,
    repo: str,
    request: Request,
    variant: Literal["auto", "hybrid", "deterministic"] = "auto",
    snapshot_id: str | None = None,
    analysis_id: str | None = None,
) -> dict[str, Any]:
    settings = request.app.state.settings
    repo_slug = _repo_slug(settings, owner, repo)
    return get_analysis_best(
        settings.index_path,
        repo=repo_slug,
        variant=variant,
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
    )
415
+
416
  @app.get("/v1/repos/{owner}/{repo}/issues/status")
417
  async def issue_status(
418
  owner: str,
 
539
  ) -> dict[str, Any]:
540
  settings = request.app.state.settings
541
  repo_slug = _repo_slug(settings, owner, repo)
542
+ return get_contributor_status(
543
+ _surface_snapshot_dir(settings, repo_slug, surface="contributors")
544
+ )
545
 
546
  @app.get("/v1/repos/{owner}/{repo}/contributors")
547
  async def contributors(
 
708
  if not snapshot_dir.exists():
709
  return False
710
  if surface == "issues":
711
+ return (snapshot_dir / CURRENT_ANALYSIS_MANIFEST_PATH).exists() or any(
712
+ snapshot_dir.glob("analysis-report*.json")
713
+ )
714
  return (snapshot_dir / "new-contributors-report.json").exists()
715
 
716
 
 
737
  message = str(exc).lower()
738
  return (
739
  "not found" in message
740
+ or "analysis report was not found" in message
741
+ or "no analysis report was found" in message
742
+ or "published analysis" in message
743
+ or "materialized snapshot" in message
744
  or "no active pr search run" in message
745
  or "was not found in the active indexed universe" in message
746
  )
src/slop_farmer/app/publish_analysis.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections.abc import Callable, Iterable
5
+ from dataclasses import dataclass
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+ from typing import Any, Protocol, cast
9
+
10
+ from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
11
+
12
+ from slop_farmer.app.save_cache import _save_analysis_cache_api
13
+ from slop_farmer.config import PublishAnalysisArtifactsOptions
14
+ from slop_farmer.data.parquet_io import read_json
15
+ from slop_farmer.data.snapshot_paths import (
16
+ ANALYSIS_REPORT_FILENAME_BY_VARIANT,
17
+ HYBRID_ANALYSIS_REVIEWS_FILENAME,
18
+ ROOT_MANIFEST_FILENAME,
19
+ analysis_run_artifact_path,
20
+ analysis_run_manifest_path,
21
+ archived_snapshot_manifest_path,
22
+ build_archived_analysis_run_manifest,
23
+ build_current_analysis_manifest,
24
+ current_analysis_artifact_path,
25
+ resolve_snapshot_dir_from_output,
26
+ )
27
+
28
+
29
class HubApiLike(Protocol):
    """Structural type for the Hub client methods this module calls.

    The publish helpers accept any object with these three methods, which lets
    callers substitute a stand-in for ``HfApi``.
    """

    def create_repo(
        self,
        repo_id: str,
        *,
        repo_type: str,
        private: bool,
        exist_ok: bool,
    ) -> None: ...

    def create_commit(
        self,
        repo_id: str,
        operations: Iterable[CommitOperationAdd],
        *,
        commit_message: str,
        repo_type: str,
    ) -> Any: ...

    def upload_folder(
        self,
        *,
        repo_id: str,
        folder_path: Path,
        path_in_repo: str,
        repo_type: str,
        commit_message: str,
    ) -> None: ...
57
+
58
+
59
+ @dataclass(frozen=True, slots=True)
60
+ class PublishableAnalysisArtifacts:
61
+ repo: str
62
+ snapshot_id: str
63
+ model: str | None
64
+ report_path: Path
65
+ reviews_path: Path | None
66
+ report_payload: dict[str, Any]
67
+
68
+
69
def run_publish_analysis_artifacts(options: PublishAnalysisArtifactsOptions) -> dict[str, Any]:
    """Publish analysis artifacts using settings taken from an options object.

    Resolves the snapshot directory from ``options.output_dir`` and
    ``options.snapshot_dir`` and forwards the remaining option fields to
    ``publish_analysis_artifacts``. No ``log`` callback is supplied here.

    Returns:
        The result mapping produced by ``publish_analysis_artifacts``.
    """
    snapshot_dir = resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
    return publish_analysis_artifacts(
        snapshot_dir=snapshot_dir,
        analysis_input=options.analysis_input,
        hf_repo_id=options.hf_repo_id,
        analysis_id=options.analysis_id,
        canonical=options.canonical,
        save_cache=options.save_cache,
        private=options.private_hf_repo,
    )
80
+
81
+
82
def publish_analysis_artifacts(
    *,
    snapshot_dir: Path,
    analysis_input: Path | None,
    hf_repo_id: str,
    analysis_id: str,
    canonical: bool,
    private: bool,
    save_cache: bool = False,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Publish analysis artifacts to a Hub dataset repo with a real ``HfApi``.

    Thin wrapper that constructs ``HfApi()`` and delegates every argument to
    ``_publish_analysis_artifacts_api``.

    Returns:
        The result mapping produced by ``_publish_analysis_artifacts_api``.
    """
    return _publish_analysis_artifacts_api(
        cast("HubApiLike", HfApi()),
        snapshot_dir=snapshot_dir,
        analysis_input=analysis_input,
        hf_repo_id=hf_repo_id,
        analysis_id=analysis_id,
        canonical=canonical,
        private=private,
        save_cache=save_cache,
        log=log,
    )
104
+
105
+
106
def _publish_analysis_artifacts_api(
    api: HubApiLike,
    *,
    snapshot_dir: Path,
    analysis_input: Path | None = None,
    hf_repo_id: str,
    analysis_id: str,
    canonical: bool,
    private: bool,
    save_cache: bool = False,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Publish one analysis run to a Hub dataset repo in a single commit.

    Steps, in order: discover the local report, build the archived run
    manifest (and the current-analysis manifest when ``canonical``), build the
    updated snapshot manifest, ensure the repo exists, commit all operations,
    then optionally upload the analysis cache.

    Args:
        api: Hub client used for ``create_repo`` / ``create_commit`` (and passed
            on to the cache upload).
        snapshot_dir: Local snapshot directory.
        analysis_input: Explicit report path; ``None`` uses the default hybrid
            report inside ``snapshot_dir``.
        hf_repo_id: Target dataset repo id.
        analysis_id: Identifier recorded for this run.
        canonical: When true the run is published on the "canonical" channel
            and the current-analysis artifacts are written as well; otherwise
            the channel is "comparison".
        private: Passed to ``create_repo``.
        save_cache: When true, upload the analysis cache after the commit.
        log: Optional progress callback.

    Returns:
        Summary mapping with repo, dataset id, snapshot/analysis ids, flags,
        publish timestamp and committed artifact paths; includes ``cache`` when
        the cache was uploaded.
    """
    artifacts = _discover_publishable_analysis(snapshot_dir, analysis_input=analysis_input)
    published_at = _iso_now()
    # Both manifest builders take the same keyword arguments; build them once.
    manifest_fields: dict[str, Any] = {
        "repo": artifacts.repo,
        "snapshot_id": artifacts.snapshot_id,
        "analysis_id": analysis_id,
        "variant": "hybrid",
        "channel": "canonical" if canonical else "comparison",
        "model": artifacts.model,
        "published_at": published_at,
        "include_hybrid_reviews": artifacts.reviews_path is not None,
    }
    archived_manifest = build_archived_analysis_run_manifest(**manifest_fields)
    current_manifest = build_current_analysis_manifest(**manifest_fields) if canonical else None
    snapshot_manifest = _updated_snapshot_manifest(
        snapshot_dir=snapshot_dir,
        hf_repo_id=hf_repo_id,
        snapshot_id=artifacts.snapshot_id,
        analysis_id=analysis_id,
        archived_manifest=archived_manifest,
        canonical=canonical,
    )
    operations = _commit_operations(
        artifacts=artifacts,
        analysis_id=analysis_id,
        archived_manifest=archived_manifest,
        current_manifest=current_manifest,
        snapshot_manifest=snapshot_manifest,
    )

    if log:
        log(f"Ensuring Hub dataset repo exists: {hf_repo_id}")
    api.create_repo(hf_repo_id, repo_type="dataset", private=private, exist_ok=True)
    if log:
        log(f"Publishing analysis {analysis_id} for snapshot {artifacts.snapshot_id}")
    api.create_commit(
        hf_repo_id,
        operations,
        commit_message=f"Publish analysis {analysis_id} for snapshot {artifacts.snapshot_id}",
        repo_type="dataset",
    )

    # The cache upload runs only after the analysis commit has succeeded.
    cache_result = None
    if save_cache:
        cache_result = _save_analysis_cache_api(
            api,
            snapshot_dir=snapshot_dir,
            hf_repo_id=hf_repo_id,
            private=private,
            log=log,
        )

    result: dict[str, Any] = {
        "repo": artifacts.repo,
        "dataset_id": hf_repo_id,
        "snapshot_id": artifacts.snapshot_id,
        "analysis_id": analysis_id,
        "canonical": canonical,
        "save_cache": save_cache,
        "published_at": published_at,
        "artifact_paths": [operation.path_in_repo for operation in operations],
    }
    if cache_result is not None:
        result["cache"] = cache_result
    if log:
        log(f"Published analysis artifacts to {hf_repo_id}")
    return result
198
+
199
+
200
def _discover_publishable_analysis(
    snapshot_dir: Path, *, analysis_input: Path | None
) -> PublishableAnalysisArtifacts:
    """Locate and validate the local analysis report that will be published.

    Args:
        snapshot_dir: Directory containing the snapshot manifest.
        analysis_input: Explicit report path (resolved to an absolute path), or
            ``None`` to use the default hybrid report inside ``snapshot_dir``.

    Returns:
        A ``PublishableAnalysisArtifacts`` describing the report, the optional
        sibling ``<stem>.llm-reviews.json`` file, and the parsed payload.

    Raises:
        FileNotFoundError: The snapshot manifest or the report file is missing.
        ValueError: The manifest or report is not a JSON object, the manifest
            lacks ``repo``, or the report's ``snapshot_id`` / ``repo`` disagree
            with the manifest.
    """
    manifest_path = snapshot_dir / ROOT_MANIFEST_FILENAME
    if not manifest_path.exists():
        raise FileNotFoundError(f"Snapshot manifest is missing: {manifest_path}")
    manifest = read_json(manifest_path)
    if not isinstance(manifest, dict):
        raise ValueError(f"Snapshot manifest at {manifest_path} must contain a JSON object.")
    # The directory name stands in when the manifest carries no snapshot_id.
    snapshot_id = str(manifest.get("snapshot_id") or snapshot_dir.name).strip()
    repo = str(manifest.get("repo") or "").strip()
    if not repo:
        raise ValueError(f"Snapshot manifest at {manifest_path} does not define repo.")

    if analysis_input is not None:
        report_path = analysis_input.resolve()
    else:
        report_path = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]
    if not report_path.exists():
        raise FileNotFoundError(f"Hybrid analysis report is missing: {report_path}")
    report_payload = read_json(report_path)
    if not isinstance(report_payload, dict):
        raise ValueError(f"Hybrid analysis report at {report_path} must contain a JSON object.")

    # A report that omits snapshot_id / repo is treated as matching the manifest.
    report_snapshot_id = str(report_payload.get("snapshot_id") or snapshot_id).strip()
    if report_snapshot_id != snapshot_id:
        raise ValueError(
            f"Hybrid analysis report snapshot_id {report_snapshot_id!r} does not match manifest snapshot_id {snapshot_id!r}."
        )
    report_repo = str(report_payload.get("repo") or repo).strip()
    if report_repo != repo:
        raise ValueError(
            f"Hybrid analysis report repo {report_repo!r} does not match manifest repo {repo!r}."
        )

    model = report_payload.get("model")
    if model is not None:
        model = str(model)

    reviews_path = report_path.with_name(f"{report_path.stem}.llm-reviews.json")
    return PublishableAnalysisArtifacts(
        repo=repo,
        snapshot_id=snapshot_id,
        model=model,
        report_path=report_path,
        reviews_path=reviews_path if reviews_path.exists() else None,
        report_payload={str(key): value for key, value in report_payload.items()},
    )
247
+
248
+
249
def _updated_snapshot_manifest(
    *,
    snapshot_dir: Path,
    hf_repo_id: str,
    snapshot_id: str,
    analysis_id: str,
    archived_manifest: dict[str, Any],
    canonical: bool,
) -> dict[str, Any]:
    """Return the snapshot manifest with this analysis run registered.

    The base manifest is the remote archived manifest when it can be loaded,
    otherwise the local manifest in ``snapshot_dir``. The run is stored under
    ``published_analysis.runs[analysis_id]``; when ``canonical`` is true,
    ``published_analysis.canonical_analysis_id`` is set to this run.

    Raises:
        ValueError: The base manifest is not a JSON object.
        KeyError: ``archived_manifest`` lacks ``variant``, ``channel``,
            ``published_at`` or ``artifacts``.
    """
    manifest = _load_remote_snapshot_manifest(hf_repo_id, snapshot_id) or read_json(
        snapshot_dir / ROOT_MANIFEST_FILENAME
    )
    if not isinstance(manifest, dict):
        raise ValueError("Archived snapshot manifest must contain a JSON object.")
    updated = {str(key): value for key, value in manifest.items()}

    # Work on copies of the nested mappings so the loaded manifest is not mutated.
    existing = updated.get("published_analysis")
    published_analysis: dict[str, Any] = (
        dict(existing) if isinstance(existing, dict) else {"schema_version": 1, "runs": {}}
    )
    existing_runs = published_analysis.get("runs")
    runs: dict[str, Any] = dict(existing_runs) if isinstance(existing_runs, dict) else {}

    runs[analysis_id] = {
        "analysis_id": analysis_id,
        "variant": archived_manifest["variant"],
        "channel": archived_manifest["channel"],
        "model": archived_manifest.get("model"),
        "published_at": archived_manifest["published_at"],
        "manifest_path": analysis_run_manifest_path(snapshot_id, analysis_id),
        "artifacts": archived_manifest["artifacts"],
    }
    published_analysis["schema_version"] = 1
    published_analysis["runs"] = runs
    if canonical:
        published_analysis["canonical_analysis_id"] = analysis_id
    updated["published_analysis"] = published_analysis
    return updated
285
+
286
+
287
def _load_remote_snapshot_manifest(hf_repo_id: str, snapshot_id: str) -> dict[str, Any] | None:
    """Best-effort download of the archived snapshot manifest from the Hub.

    Returns:
        The parsed manifest when the download succeeds and the file holds a
        JSON object; ``None`` when the download fails for any reason or the
        payload is not an object.

    Raises:
        json.JSONDecodeError: The downloaded file is not valid JSON (parsing
            happens outside the ``try`` block).
    """
    try:
        downloaded = hf_hub_download(
            repo_id=hf_repo_id,
            repo_type="dataset",
            filename=archived_snapshot_manifest_path(snapshot_id),
        )
    except Exception:
        # NOTE(review): every download failure is treated as "no remote
        # manifest", so the caller falls back to the local manifest. Confirm
        # this is intended for auth/network errors too, since the fallback
        # manifest is then committed over the remote one.
        return None
    payload = json.loads(Path(downloaded).read_text(encoding="utf-8"))
    return payload if isinstance(payload, dict) else None
298
+
299
+
300
def _commit_operations(
    *,
    artifacts: PublishableAnalysisArtifacts,
    analysis_id: str,
    archived_manifest: dict[str, Any],
    current_manifest: dict[str, Any] | None,
    snapshot_manifest: dict[str, Any],
) -> list[CommitOperationAdd]:
    """Build the ordered list of file additions for the publish commit.

    Order: archived report, archived run manifest, archived snapshot manifest,
    archived reviews (if any), then, only when ``current_manifest`` is given,
    the current report, current manifest and current reviews (if any).
    """
    snapshot_id = artifacts.snapshot_id
    report_filename = ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]
    has_reviews = artifacts.reviews_path is not None

    operations = [
        CommitOperationAdd(
            path_in_repo=analysis_run_artifact_path(snapshot_id, analysis_id, report_filename),
            path_or_fileobj=artifacts.report_path,
        ),
        CommitOperationAdd(
            path_in_repo=analysis_run_manifest_path(snapshot_id, analysis_id),
            path_or_fileobj=_json_bytes(archived_manifest),
        ),
        CommitOperationAdd(
            path_in_repo=archived_snapshot_manifest_path(snapshot_id),
            path_or_fileobj=_json_bytes(snapshot_manifest),
        ),
    ]
    if has_reviews:
        operations.append(
            CommitOperationAdd(
                path_in_repo=analysis_run_artifact_path(
                    snapshot_id,
                    analysis_id,
                    HYBRID_ANALYSIS_REVIEWS_FILENAME,
                ),
                path_or_fileobj=artifacts.reviews_path,
            )
        )

    if current_manifest is None:
        return operations

    # Canonical publish: also refresh the "current analysis" copies.
    operations.append(
        CommitOperationAdd(
            path_in_repo=current_analysis_artifact_path(report_filename),
            path_or_fileobj=artifacts.report_path,
        )
    )
    operations.append(
        CommitOperationAdd(
            path_in_repo=current_analysis_artifact_path(ROOT_MANIFEST_FILENAME),
            path_or_fileobj=_json_bytes(current_manifest),
        )
    )
    if has_reviews:
        operations.append(
            CommitOperationAdd(
                path_in_repo=current_analysis_artifact_path(HYBRID_ANALYSIS_REVIEWS_FILENAME),
                path_or_fileobj=artifacts.reviews_path,
            )
        )
    return operations
359
+
360
+
361
+ def _json_bytes(payload: dict[str, Any]) -> bytes:
362
+ return (json.dumps(payload, indent=2, sort_keys=True) + "\n").encode("utf-8")
363
+
364
+
365
+ def _iso_now() -> str:
366
+ return datetime.now(tz=UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
src/slop_farmer/app/publish_dataset_snapshot.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable
4
+ from pathlib import Path
5
+ from typing import Protocol, cast
6
+
7
+ from huggingface_hub import HfApi
8
+
9
+
10
class HubApiLike(Protocol):
    """Structural type for the two Hub client methods used to publish a snapshot."""

    def create_repo(
        self, repo_id: str, *, repo_type: str, private: bool, exist_ok: bool
    ) -> None: ...

    def upload_folder(
        self,
        *,
        repo_id: str,
        folder_path: Path,
        path_in_repo: str,
        repo_type: str,
        commit_message: str,
    ) -> None: ...
24
+
25
+
26
def publish_dataset_snapshot(
    snapshot_dir: Path,
    hf_repo_id: str,
    *,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> None:
    """Upload a snapshot directory to a Hub dataset repo using a real ``HfApi``.

    Thin wrapper that constructs ``HfApi()`` and delegates to
    ``_publish_dataset_snapshot_api``.
    """
    _publish_dataset_snapshot_api(
        cast("HubApiLike", HfApi()),
        snapshot_dir,
        hf_repo_id,
        private,
        log=log,
    )
40
+
41
+
42
+ def _publish_dataset_snapshot_api(
43
+ api: HubApiLike,
44
+ snapshot_dir: Path,
45
+ hf_repo_id: str,
46
+ private: bool,
47
+ log: Callable[[str], None] | None = None,
48
+ ) -> None:
49
+ if log:
50
+ log(f"Ensuring Hub dataset repo exists: {hf_repo_id}")
51
+ api.create_repo(hf_repo_id, repo_type="dataset", private=private, exist_ok=True)
52
+ if log:
53
+ log(f"Uploading snapshot to Hub: {snapshot_dir}")
54
+ api.upload_folder(
55
+ repo_id=hf_repo_id,
56
+ folder_path=snapshot_dir,
57
+ path_in_repo=".",
58
+ repo_type="dataset",
59
+ commit_message=f"Add snapshot {snapshot_dir.name}",
60
+ )
61
+ if log:
62
+ log(f"Upload finished: {hf_repo_id}")
src/slop_farmer/app/save_cache.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable
4
+ from pathlib import Path
5
+ from typing import Any, Protocol, cast
6
+
7
+ from huggingface_hub import HfApi
8
+
9
+ from slop_farmer.config import SaveCacheOptions
10
+ from slop_farmer.data.parquet_io import read_json
11
+ from slop_farmer.data.snapshot_paths import ROOT_MANIFEST_FILENAME, resolve_snapshot_dir_from_output
12
+
13
+ ANALYSIS_STATE_DIRNAME = "analysis-state"
14
+
15
+
16
class HubApiLike(Protocol):
    """Structural type for the two Hub client methods used to save the analysis cache."""

    def create_repo(
        self,
        repo_id: str,
        *,
        repo_type: str,
        private: bool,
        exist_ok: bool,
    ) -> None: ...

    def upload_folder(
        self,
        *,
        repo_id: str,
        folder_path: Path,
        path_in_repo: str,
        repo_type: str,
        commit_message: str,
    ) -> None: ...
35
+
36
+
37
def run_save_cache(options: SaveCacheOptions) -> dict[str, Any]:
    """Save the analysis cache using settings taken from an options object.

    Resolves the snapshot directory from ``options.output_dir`` and
    ``options.snapshot_dir`` and delegates to ``save_analysis_cache``.

    Returns:
        The result mapping produced by ``save_analysis_cache``.
    """
    snapshot_dir = resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
    return save_analysis_cache(
        snapshot_dir=snapshot_dir,
        hf_repo_id=options.hf_repo_id,
        private=options.private_hf_repo,
    )
44
+
45
+
46
def save_analysis_cache(
    *,
    snapshot_dir: Path,
    hf_repo_id: str,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Upload a snapshot's analysis cache to a Hub dataset repo with a real ``HfApi``.

    Thin wrapper that constructs ``HfApi()`` and delegates to
    ``_save_analysis_cache_api``.

    Returns:
        The result mapping produced by ``_save_analysis_cache_api``.
    """
    return _save_analysis_cache_api(
        cast("HubApiLike", HfApi()),
        snapshot_dir=snapshot_dir,
        hf_repo_id=hf_repo_id,
        private=private,
        log=log,
    )
60
+
61
+
62
def _save_analysis_cache_api(
    api: HubApiLike,
    *,
    snapshot_dir: Path,
    hf_repo_id: str,
    private: bool,
    log: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Upload ``snapshot_dir / ANALYSIS_STATE_DIRNAME`` to a Hub dataset repo.

    Args:
        api: Hub client providing ``create_repo`` and ``upload_folder``.
        snapshot_dir: Local snapshot directory that contains the cache folder.
        hf_repo_id: Target dataset repo id.
        private: Passed to ``create_repo``.
        log: Optional progress callback.

    Returns:
        Mapping with ``dataset_id``, ``snapshot_id`` and ``artifact_paths``
        (cache files prefixed with the cache directory name); ``repo`` is
        included when the snapshot manifest defines it.

    Raises:
        FileNotFoundError: The cache directory does not exist.
        NotADirectoryError: The cache path exists but is not a directory.
        ValueError: The cache directory holds no files, or the snapshot
            manifest is not a JSON object.
    """
    cache_dir = snapshot_dir / ANALYSIS_STATE_DIRNAME
    if not cache_dir.exists():
        raise FileNotFoundError(f"Analysis cache directory is missing: {cache_dir}")
    if not cache_dir.is_dir():
        raise NotADirectoryError(f"Analysis cache path is not a directory: {cache_dir}")
    artifact_paths = _cache_artifact_paths(cache_dir)
    if not artifact_paths:
        raise ValueError(f"Analysis cache directory is empty: {cache_dir}")

    # The manifest is optional here: without it the directory name is the
    # snapshot id and no repo is reported.
    manifest_path = snapshot_dir / ROOT_MANIFEST_FILENAME
    manifest = read_json(manifest_path) if manifest_path.exists() else {}
    if not isinstance(manifest, dict):
        raise ValueError(f"Snapshot manifest at {manifest_path} must contain a JSON object.")
    snapshot_id = str(manifest.get("snapshot_id") or snapshot_dir.name).strip()
    repo = str(manifest.get("repo") or "").strip()

    if log:
        log(f"Ensuring Hub dataset repo exists: {hf_repo_id}")
    api.create_repo(hf_repo_id, repo_type="dataset", private=private, exist_ok=True)
    if log:
        log(f"Saving analysis cache for snapshot {snapshot_id}")
    api.upload_folder(
        repo_id=hf_repo_id,
        folder_path=cache_dir,
        path_in_repo=ANALYSIS_STATE_DIRNAME,
        repo_type="dataset",
        commit_message=f"Save analysis cache for snapshot {snapshot_id}",
    )

    result: dict[str, Any] = {
        "dataset_id": hf_repo_id,
        "snapshot_id": snapshot_id,
        "artifact_paths": [f"{ANALYSIS_STATE_DIRNAME}/{path}" for path in artifact_paths],
    }
    if repo:
        result["repo"] = repo
    if log:
        log(f"Saved analysis cache to {hf_repo_id}")
    return result
108
+
109
+
110
+ def _cache_artifact_paths(cache_dir: Path) -> list[str]:
111
+ return sorted(
112
+ str(path.relative_to(cache_dir).as_posix())
113
+ for path in cache_dir.rglob("*")
114
+ if path.is_file()
115
+ )
src/slop_farmer/app_config.py CHANGED
@@ -109,7 +109,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
109
  dashboard = payload.get("dashboard")
110
  analysis = payload.get("analysis")
111
  scrape = payload.get("scrape")
112
- full_pipeline = payload.get("full-pipeline")
113
  pull_requests = payload.get("pull-requests")
114
  if dashboard is None:
115
  dashboard = {}
@@ -117,8 +116,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
117
  analysis = {}
118
  if scrape is None:
119
  scrape = {}
120
- if full_pipeline is None:
121
- full_pipeline = {}
122
  if pull_requests is None:
123
  pull_requests = {}
124
  if not isinstance(dashboard, dict):
@@ -127,8 +124,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
127
  raise ValueError(f"Expected analysis mapping in config file: {config_path}")
128
  if not isinstance(scrape, dict):
129
  raise ValueError(f"Expected scrape mapping in config file: {config_path}")
130
- if not isinstance(full_pipeline, dict):
131
- raise ValueError(f"Expected full-pipeline mapping in config file: {config_path}")
132
  if not isinstance(pull_requests, dict):
133
  raise ValueError(f"Expected pull-requests mapping in config file: {config_path}")
134
 
@@ -184,12 +179,26 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
184
  "new-contributor-window-days": contributor_window_days,
185
  "new-contributor-max-authors": contributor_max_authors,
186
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  "analyze": {
188
  "output-dir": str(data_dir) if data_dir else None,
189
  "hf-repo-id": analysis.get("hf-repo-id", dataset_id),
190
  "model": analysis.get("model"),
191
  "ranking-backend": analysis.get("ranking_backend"),
192
  "max-clusters": analysis.get("max_clusters"),
 
193
  "cached_analysis": analysis.get("cached_analysis"),
194
  "open-prs-only": analysis.get("open_prs_only"),
195
  "pr-template-cleanup-mode": pr_template_cleanup_mode,
@@ -201,6 +210,7 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
201
  },
202
  "pr-scope": {
203
  "output-dir": str(data_dir) if data_dir else None,
 
204
  "cluster-suppression-rules": cluster_suppression_rules,
205
  },
206
  "pr-search": {
@@ -210,32 +220,28 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
210
  },
211
  "new-contributor-report": {
212
  "output-dir": str(data_dir) if data_dir else None,
 
213
  "window-days": contributor_window_days,
214
  "max-authors": contributor_max_authors,
215
  },
216
  "dashboard-data": {
217
  "output-dir": str(dashboard_dir) if dashboard_dir else None,
218
  "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
 
219
  "window-days": dashboard_window_days,
220
  },
221
- "publish-snapshot": {
222
  "output-dir": str(data_dir) if data_dir else None,
223
  "hf-repo-id": dataset_id,
224
  },
225
- "full-pipeline": {
226
- "repo": repo,
227
- "dataset": dataset_id,
228
- "workspace-root": str(workspace_path.parent) if workspace_path else None,
229
- "model": analysis.get("model"),
230
- "ranking-backend": analysis.get("ranking_backend"),
231
- "max-clusters": analysis.get("max_clusters"),
232
- "dashboard-window-days": dashboard_window_days,
233
- "new-contributor-window-days": contributor_window_days,
234
- "new-contributor-max-authors": contributor_max_authors,
235
  },
236
  "deploy-dashboard": {
237
  "pipeline-data-dir": str(data_dir) if data_dir else None,
238
  "web-dir": str(web_dir) if web_dir else None,
 
239
  "dashboard-window-days": dashboard_window_days,
240
  "contributor-window-days": contributor_window_days,
241
  "contributor-max-authors": contributor_max_authors,
@@ -248,6 +254,11 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
248
  "dataset-id": dataset_id,
249
  "space-tags": tags_value,
250
  },
 
 
 
 
 
251
  }
252
  for command, values in defaults.items():
253
  defaults[command] = {key: value for key, value in values.items() if value is not None}
@@ -259,8 +270,8 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
259
  defaults[command].update(_resolve_command_paths(config_path, values))
260
 
261
  defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
 
262
  defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
263
- defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
264
  return defaults
265
 
266
 
 
109
  dashboard = payload.get("dashboard")
110
  analysis = payload.get("analysis")
111
  scrape = payload.get("scrape")
 
112
  pull_requests = payload.get("pull-requests")
113
  if dashboard is None:
114
  dashboard = {}
 
116
  analysis = {}
117
  if scrape is None:
118
  scrape = {}
 
 
119
  if pull_requests is None:
120
  pull_requests = {}
121
  if not isinstance(dashboard, dict):
 
124
  raise ValueError(f"Expected analysis mapping in config file: {config_path}")
125
  if not isinstance(scrape, dict):
126
  raise ValueError(f"Expected scrape mapping in config file: {config_path}")
 
 
127
  if not isinstance(pull_requests, dict):
128
  raise ValueError(f"Expected pull-requests mapping in config file: {config_path}")
129
 
 
179
  "new-contributor-window-days": contributor_window_days,
180
  "new-contributor-max-authors": contributor_max_authors,
181
  },
182
+ "refresh-dataset": {
183
+ "repo": repo,
184
+ "hf-repo-id": dataset_id,
185
+ "fetch-timeline": scrape.get("fetch-timeline"),
186
+ "max-issues": scrape.get("max-issues"),
187
+ "max-prs": scrape.get("max-prs"),
188
+ "max-issue-comments": scrape.get("max-issue-comments"),
189
+ "max-reviews-per-pr": scrape.get("max-reviews-per-pr"),
190
+ "max-review-comments-per-pr": scrape.get("max-review-comments-per-pr"),
191
+ "new-contributor-window-days": contributor_window_days,
192
+ "new-contributor-max-authors": contributor_max_authors,
193
+ "cluster-suppression-rules": cluster_suppression_rules,
194
+ },
195
  "analyze": {
196
  "output-dir": str(data_dir) if data_dir else None,
197
  "hf-repo-id": analysis.get("hf-repo-id", dataset_id),
198
  "model": analysis.get("model"),
199
  "ranking-backend": analysis.get("ranking_backend"),
200
  "max-clusters": analysis.get("max_clusters"),
201
+ "hybrid-llm-concurrency": analysis.get("hybrid_llm_concurrency"),
202
  "cached_analysis": analysis.get("cached_analysis"),
203
  "open-prs-only": analysis.get("open_prs_only"),
204
  "pr-template-cleanup-mode": pr_template_cleanup_mode,
 
210
  },
211
  "pr-scope": {
212
  "output-dir": str(data_dir) if data_dir else None,
213
+ "hf-repo-id": dataset_id,
214
  "cluster-suppression-rules": cluster_suppression_rules,
215
  },
216
  "pr-search": {
 
220
  },
221
  "new-contributor-report": {
222
  "output-dir": str(data_dir) if data_dir else None,
223
+ "hf-repo-id": dataset_id,
224
  "window-days": contributor_window_days,
225
  "max-authors": contributor_max_authors,
226
  },
227
  "dashboard-data": {
228
  "output-dir": str(dashboard_dir) if dashboard_dir else None,
229
  "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
230
+ "hf-repo-id": dataset_id,
231
  "window-days": dashboard_window_days,
232
  },
233
+ "publish-analysis-artifacts": {
234
  "output-dir": str(data_dir) if data_dir else None,
235
  "hf-repo-id": dataset_id,
236
  },
237
+ "save-cache": {
238
+ "output-dir": str(data_dir) if data_dir else None,
239
+ "hf-repo-id": dataset_id,
 
 
 
 
 
 
 
240
  },
241
  "deploy-dashboard": {
242
  "pipeline-data-dir": str(data_dir) if data_dir else None,
243
  "web-dir": str(web_dir) if web_dir else None,
244
+ "hf-repo-id": dataset_id,
245
  "dashboard-window-days": dashboard_window_days,
246
  "contributor-window-days": contributor_window_days,
247
  "contributor-max-authors": contributor_max_authors,
 
254
  "dataset-id": dataset_id,
255
  "space-tags": tags_value,
256
  },
257
+ "dataset-status": {
258
+ "repo": repo,
259
+ "output-dir": str(data_dir) if data_dir else None,
260
+ "hf-repo-id": dataset_id,
261
+ },
262
  }
263
  for command, values in defaults.items():
264
  defaults[command] = {key: value for key, value in values.items() if value is not None}
 
270
  defaults[command].update(_resolve_command_paths(config_path, values))
271
 
272
  defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
273
+ defaults["refresh-dataset"].update(_resolve_command_paths(config_path, scrape))
274
  defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
 
275
  return defaults
276
 
277
 
src/slop_farmer/config.py CHANGED
@@ -81,9 +81,6 @@ class PipelineOptions:
81
  max_reviews_per_pr: int | None
82
  max_review_comments_per_pr: int | None
83
  fetch_timeline: bool
84
- publish: bool
85
- hf_repo_id: str | None
86
- private_hf_repo: bool
87
  new_contributor_report: bool
88
  new_contributor_window_days: int
89
  new_contributor_max_authors: int
@@ -102,6 +99,7 @@ class AnalysisOptions:
102
  ranking_backend: str
103
  model: str
104
  max_clusters: int
 
105
  open_prs_only: bool = False
106
  cached_analysis: bool = False
107
  pr_template_cleanup_mode: str = "merge_defaults"
@@ -111,6 +109,10 @@ class AnalysisOptions:
111
  pr_template_line_patterns: tuple[str, ...] = ()
112
  cluster_suppression_rules: tuple[dict[str, Any], ...] = ()
113
 
 
 
 
 
114
 
115
  @dataclass(slots=True)
116
  class MarkdownReportOptions:
@@ -127,6 +129,9 @@ class NewContributorReportOptions:
127
  json_output: Path | None
128
  window_days: int
129
  max_authors: int
 
 
 
130
 
131
 
132
  @dataclass(slots=True)
@@ -137,17 +142,12 @@ class DashboardDataOptions:
137
  contributors_input: Path | None
138
  pr_scope_input: Path | None
139
  window_days: int
 
 
 
140
  snapshot_root: Path | None = None
141
 
142
 
143
- @dataclass(slots=True)
144
- class PublishSnapshotOptions:
145
- output_dir: Path
146
- snapshot_dir: Path | None
147
- hf_repo_id: str
148
- private_hf_repo: bool
149
-
150
-
151
  @dataclass(slots=True)
152
  class DeployDashboardOptions:
153
  pipeline_data_dir: Path
@@ -155,6 +155,10 @@ class DeployDashboardOptions:
155
  snapshot_dir: Path | None
156
  analysis_input: Path | None
157
  contributors_input: Path | None
 
 
 
 
158
  refresh_contributors: bool
159
  dashboard_window_days: int
160
  contributor_window_days: int
@@ -216,20 +220,50 @@ class SnapshotAdoptOptions:
216
 
217
 
218
  @dataclass(slots=True)
219
- class FullPipelineOptions:
220
  repo: RepoRef
221
- dataset: str
222
- model: str
223
- workspace_root: Path
224
  private_hf_repo: bool
225
- ranking_backend: str
226
- max_clusters: int
 
 
 
227
  fetch_timeline: bool
228
- dashboard_window_days: int
229
  new_contributor_window_days: int
230
  new_contributor_max_authors: int
231
- issue_max_age_days: int | None
232
- pr_max_age_days: int | None
233
- max_issues: int | None
234
- max_prs: int | None
235
- open_prs_only: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  max_reviews_per_pr: int | None
82
  max_review_comments_per_pr: int | None
83
  fetch_timeline: bool
 
 
 
84
  new_contributor_report: bool
85
  new_contributor_window_days: int
86
  new_contributor_max_authors: int
 
99
  ranking_backend: str
100
  model: str
101
  max_clusters: int
102
+ hybrid_llm_concurrency: int = 1
103
  open_prs_only: bool = False
104
  cached_analysis: bool = False
105
  pr_template_cleanup_mode: str = "merge_defaults"
 
109
  pr_template_line_patterns: tuple[str, ...] = ()
110
  cluster_suppression_rules: tuple[dict[str, Any], ...] = ()
111
 
112
+ def __post_init__(self) -> None:
113
+ if self.hybrid_llm_concurrency < 1:
114
+ raise ValueError("hybrid_llm_concurrency must be >= 1")
115
+
116
 
117
  @dataclass(slots=True)
118
  class MarkdownReportOptions:
 
129
  json_output: Path | None
130
  window_days: int
131
  max_authors: int
132
+ hf_repo_id: str | None = None
133
+ hf_revision: str | None = None
134
+ hf_materialize_dir: Path | None = None
135
 
136
 
137
  @dataclass(slots=True)
 
142
  contributors_input: Path | None
143
  pr_scope_input: Path | None
144
  window_days: int
145
+ hf_repo_id: str | None = None
146
+ hf_revision: str | None = None
147
+ hf_materialize_dir: Path | None = None
148
  snapshot_root: Path | None = None
149
 
150
 
 
 
 
 
 
 
 
 
151
  @dataclass(slots=True)
152
  class DeployDashboardOptions:
153
  pipeline_data_dir: Path
 
155
  snapshot_dir: Path | None
156
  analysis_input: Path | None
157
  contributors_input: Path | None
158
+ pr_scope_input: Path | None
159
+ hf_repo_id: str | None
160
+ hf_revision: str | None
161
+ hf_materialize_dir: Path | None
162
  refresh_contributors: bool
163
  dashboard_window_days: int
164
  contributor_window_days: int
 
220
 
221
 
222
@dataclass(slots=True)
class DatasetRefreshOptions:
    """Option bundle for a dataset refresh; declares fields only, no behavior.

    Field order is part of the positional constructor signature and must not
    be changed. Only ``cluster_suppression_rules`` has a default.
    """

    # Source repository and destination Hub dataset.
    # NOTE(review): presumably ``hf_repo_id`` is the Hub dataset id and
    # ``private_hf_repo`` its visibility - confirm against the caller.
    repo: RepoRef
    hf_repo_id: str
    private_hf_repo: bool

    # Optional caps; ``None`` is allowed by the annotations (presumably
    # meaning "no limit" - TODO confirm).
    max_issues: int | None
    max_prs: int | None
    max_issue_comments: int | None
    max_reviews_per_pr: int | None
    max_review_comments_per_pr: int | None

    fetch_timeline: bool

    # New-contributor report settings.
    new_contributor_report: bool
    new_contributor_window_days: int
    new_contributor_max_authors: int

    # HTTP and checkpoint tuning (units are not visible here).
    http_timeout: int
    http_max_retries: int
    checkpoint_every_comments: int
    checkpoint_every_prs: int

    # Immutable tuple of rule mappings; empty by default.
    cluster_suppression_rules: tuple[dict[str, Any], ...] = ()
241
+
242
+
243
+ @dataclass(slots=True)
244
+ class PublishAnalysisArtifactsOptions:
245
+ output_dir: Path
246
+ snapshot_dir: Path | None
247
+ analysis_input: Path | None
248
+ hf_repo_id: str
249
+ analysis_id: str
250
+ canonical: bool = False
251
+ save_cache: bool = False
252
+ private_hf_repo: bool = False
253
+
254
+
255
+ @dataclass(slots=True)
256
+ class SaveCacheOptions:
257
+ output_dir: Path
258
+ snapshot_dir: Path | None
259
+ hf_repo_id: str
260
+ private_hf_repo: bool = False
261
+
262
+
263
+ @dataclass(slots=True)
264
+ class DatasetStatusOptions:
265
+ output_dir: Path
266
+ hf_repo_id: str | None
267
+ hf_revision: str | None
268
+ repo: str | None = None
269
+ json_output: bool = False
src/slop_farmer/data/search_duckdb.py CHANGED
@@ -31,6 +31,7 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
31
  "repo",
32
  "pr_number",
33
  "github_id",
 
34
  "state",
35
  "draft",
36
  "merged",
@@ -46,6 +47,48 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
46
  "review_comments_count",
47
  "html_url",
48
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "pr_scope_features": (
50
  "run_id",
51
  "repo",
@@ -144,6 +187,7 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
144
  repo VARCHAR,
145
  pr_number BIGINT,
146
  github_id BIGINT,
 
147
  state VARCHAR,
148
  draft BOOLEAN,
149
  merged BOOLEAN,
@@ -159,6 +203,48 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
159
  review_comments_count BIGINT,
160
  html_url VARCHAR
161
  );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  CREATE TABLE IF NOT EXISTS pr_scope_features (
163
  run_id VARCHAR,
164
  repo VARCHAR,
@@ -232,6 +318,8 @@ CREATE TABLE IF NOT EXISTS pr_scope_cluster_candidates (
232
  CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
233
  CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
234
  CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
 
 
235
  CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
236
  CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
237
  CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
@@ -256,6 +344,9 @@ def connect_pr_search_db(path: Path, *, read_only: bool = False) -> duckdb.DuckD
256
 
257
  def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
258
  connection.execute(SCHEMA_SQL)
 
 
 
259
 
260
 
261
  def insert_rows(
@@ -353,6 +444,7 @@ def resolve_active_run(
353
  def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
354
  return {
355
  "documents": _count(connection, "pr_search_documents", run_id),
 
356
  "features": _count(connection, "pr_scope_features", run_id),
357
  "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
358
  "neighbors": _count(connection, "pr_scope_neighbors", run_id),
@@ -375,6 +467,60 @@ def get_document(
375
  )
376
 
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  def get_feature(
379
  connection: duckdb.DuckDBPyConnection,
380
  *,
 
31
  "repo",
32
  "pr_number",
33
  "github_id",
34
+ "author_login",
35
  "state",
36
  "draft",
37
  "merged",
 
47
  "review_comments_count",
48
  "html_url",
49
  ),
50
+ "pr_search_contributors": (
51
+ "run_id",
52
+ "repo",
53
+ "snapshot_id",
54
+ "report_generated_at",
55
+ "window_days",
56
+ "author_login",
57
+ "name",
58
+ "profile_url",
59
+ "repo_pull_requests_url",
60
+ "repo_issues_url",
61
+ "repo_first_seen_at",
62
+ "repo_last_seen_at",
63
+ "repo_primary_artifact_count",
64
+ "repo_artifact_count",
65
+ "snapshot_issue_count",
66
+ "snapshot_pr_count",
67
+ "snapshot_comment_count",
68
+ "snapshot_review_count",
69
+ "snapshot_review_comment_count",
70
+ "repo_association",
71
+ "new_to_repo",
72
+ "first_seen_in_snapshot",
73
+ "report_reason",
74
+ "account_age_days",
75
+ "young_account",
76
+ "follow_through_score",
77
+ "breadth_score",
78
+ "automation_risk_signal",
79
+ "heuristic_note",
80
+ "public_orgs_json",
81
+ "visible_authored_pr_count",
82
+ "merged_pr_count",
83
+ "closed_unmerged_pr_count",
84
+ "open_pr_count",
85
+ "merged_pr_rate",
86
+ "closed_unmerged_pr_rate",
87
+ "still_open_pr_rate",
88
+ "distinct_repos_with_authored_prs",
89
+ "distinct_repos_with_open_prs",
90
+ "fetch_error",
91
+ ),
92
  "pr_scope_features": (
93
  "run_id",
94
  "repo",
 
187
  repo VARCHAR,
188
  pr_number BIGINT,
189
  github_id BIGINT,
190
+ author_login VARCHAR,
191
  state VARCHAR,
192
  draft BOOLEAN,
193
  merged BOOLEAN,
 
203
  review_comments_count BIGINT,
204
  html_url VARCHAR
205
  );
206
+ CREATE TABLE IF NOT EXISTS pr_search_contributors (
207
+ run_id VARCHAR,
208
+ repo VARCHAR,
209
+ snapshot_id VARCHAR,
210
+ report_generated_at VARCHAR,
211
+ window_days BIGINT,
212
+ author_login VARCHAR,
213
+ name VARCHAR,
214
+ profile_url VARCHAR,
215
+ repo_pull_requests_url VARCHAR,
216
+ repo_issues_url VARCHAR,
217
+ repo_first_seen_at VARCHAR,
218
+ repo_last_seen_at VARCHAR,
219
+ repo_primary_artifact_count BIGINT,
220
+ repo_artifact_count BIGINT,
221
+ snapshot_issue_count BIGINT,
222
+ snapshot_pr_count BIGINT,
223
+ snapshot_comment_count BIGINT,
224
+ snapshot_review_count BIGINT,
225
+ snapshot_review_comment_count BIGINT,
226
+ repo_association VARCHAR,
227
+ new_to_repo BOOLEAN,
228
+ first_seen_in_snapshot BOOLEAN,
229
+ report_reason VARCHAR,
230
+ account_age_days BIGINT,
231
+ young_account BOOLEAN,
232
+ follow_through_score VARCHAR,
233
+ breadth_score VARCHAR,
234
+ automation_risk_signal VARCHAR,
235
+ heuristic_note VARCHAR,
236
+ public_orgs_json VARCHAR,
237
+ visible_authored_pr_count BIGINT,
238
+ merged_pr_count BIGINT,
239
+ closed_unmerged_pr_count BIGINT,
240
+ open_pr_count BIGINT,
241
+ merged_pr_rate DOUBLE,
242
+ closed_unmerged_pr_rate DOUBLE,
243
+ still_open_pr_rate DOUBLE,
244
+ distinct_repos_with_authored_prs BIGINT,
245
+ distinct_repos_with_open_prs BIGINT,
246
+ fetch_error VARCHAR
247
+ );
248
  CREATE TABLE IF NOT EXISTS pr_scope_features (
249
  run_id VARCHAR,
250
  repo VARCHAR,
 
318
  CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
319
  CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
320
  CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
321
+ CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_author ON pr_search_documents (run_id, author_login);
322
+ CREATE INDEX IF NOT EXISTS idx_pr_search_contributors_run_author ON pr_search_contributors (run_id, author_login);
323
  CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
324
  CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
325
  CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
 
344
 
345
def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
    """Create the PR-search schema and upgrade databases written by older versions.

    The ``author_login`` column of ``pr_search_documents`` is added *before*
    ``SCHEMA_SQL`` runs when the table already exists. ``CREATE TABLE IF NOT
    EXISTS`` leaves a pre-existing table untouched, while the schema script
    visible in this module also creates an index on
    ``pr_search_documents (run_id, author_login)``; executing that statement
    against a legacy table without the column would abort the whole script
    before any later migration could run.

    Args:
        connection: Open DuckDB connection; it must be writable because DDL
            statements are executed on it.
    """
    # Detect a table left behind by an earlier schema version, scoped to the
    # current catalog/schema so attached databases are not matched by accident.
    existing_documents_table = connection.execute(
        """
        SELECT 1
        FROM information_schema.tables
        WHERE table_catalog = current_database()
          AND table_schema = current_schema()
          AND table_name = 'pr_search_documents'
        LIMIT 1
        """
    ).fetchone()
    if existing_documents_table is not None:
        connection.execute(
            "ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
        )

    connection.execute(SCHEMA_SQL)

    # Kept from the original implementation as an idempotent safety net; it is
    # a no-op whenever the column already exists.
    connection.execute(
        "ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
    )
350
 
351
 
352
  def insert_rows(
 
444
  def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
445
  return {
446
  "documents": _count(connection, "pr_search_documents", run_id),
447
+ "contributors": _count(connection, "pr_search_contributors", run_id),
448
  "features": _count(connection, "pr_scope_features", run_id),
449
  "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
450
  "neighbors": _count(connection, "pr_scope_neighbors", run_id),
 
467
  )
468
 
469
 
470
def get_contributor(
    connection: duckdb.DuckDBPyConnection,
    *,
    run_id: str,
    author_login: str,
) -> dict[str, Any] | None:
    """Look up one contributor row of a run by login, ignoring letter case.

    Args:
        connection: DuckDB connection the query is issued on.
        run_id: Run whose ``pr_search_contributors`` rows are searched.
        author_login: Login to match; both sides are compared via ``lower()``.

    Returns:
        Whatever ``fetch_one`` produces for the query - per the annotation a
        column mapping, or ``None``.
    """
    query = """
        SELECT *
        FROM pr_search_contributors
        WHERE run_id = ? AND lower(author_login) = lower(?)
        """
    parameters = [run_id, author_login]
    return fetch_one(connection, query, parameters)
485
+
486
+
487
def get_contributor_pulls(
    connection: duckdb.DuckDBPyConnection,
    *,
    run_id: str,
    author_login: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Fetch a contributor's pull-request documents for a run, newest update first.

    Args:
        connection: DuckDB connection the query is issued on.
        run_id: Run whose ``pr_search_documents`` rows are searched.
        author_login: Login to match; both sides are compared via ``lower()``.
        limit: Bound passed to the SQL ``LIMIT`` clause.

    Returns:
        The rows returned by ``fetch_rows``, ordered by ``updated_at``
        descending (NULLs last) and then by ``pr_number`` descending.
    """
    query = """
        SELECT
            pr_number,
            github_id,
            author_login,
            state,
            draft,
            merged,
            title,
            base_ref,
            created_at,
            updated_at,
            merged_at,
            additions,
            deletions,
            changed_files,
            comments_count,
            review_comments_count,
            html_url
        FROM pr_search_documents
        WHERE run_id = ? AND lower(author_login) = lower(?)
        ORDER BY updated_at DESC NULLS LAST, pr_number DESC
        LIMIT ?
        """
    parameters = [run_id, author_login, limit]
    return fetch_rows(connection, query, parameters)
522
+
523
+
524
  def get_feature(
525
  connection: duckdb.DuckDBPyConnection,
526
  *,
src/slop_farmer/data/snapshot_materialize.py CHANGED
@@ -5,13 +5,27 @@ import shutil
5
  import urllib.parse
6
  import urllib.request
7
  from datetime import UTC, datetime
8
- from pathlib import Path
9
  from typing import Any
10
 
11
  from huggingface_hub import HfApi, hf_hub_download
12
 
13
  from slop_farmer.data.http import urlopen_with_retry
14
  from slop_farmer.data.parquet_io import read_json, write_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  def materialize_hf_dataset_snapshot(
@@ -22,11 +36,13 @@ def materialize_hf_dataset_snapshot(
22
  ) -> Path:
23
  info = _hf_dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
24
  remote_paths = {sibling.rfilename for sibling in info.siblings}
25
- if "snapshots/latest.json" in remote_paths:
 
26
  return _materialize_hf_snapshot_repo_snapshot(
27
  repo_id=repo_id,
28
  local_dir=local_dir,
29
- revision=revision,
 
30
  hf_sha=info.sha,
31
  remote_paths=remote_paths,
32
  )
@@ -34,14 +50,16 @@ def materialize_hf_dataset_snapshot(
34
  return _materialize_hf_root_snapshot(
35
  repo_id=repo_id,
36
  local_dir=local_dir,
37
- revision=revision,
 
38
  hf_sha=info.sha,
39
  remote_paths=remote_paths,
40
  )
41
  return _materialize_hf_dataset_viewer_snapshot(
42
  repo_id=repo_id,
43
  local_dir=local_dir,
44
- revision=revision,
 
45
  hf_sha=info.sha,
46
  )
47
 
@@ -50,84 +68,101 @@ def _materialize_hf_snapshot_repo_snapshot(
50
  *,
51
  repo_id: str,
52
  local_dir: Path,
53
- revision: str | None,
 
54
  hf_sha: str | None,
55
  remote_paths: set[str],
56
  ) -> Path:
57
  local_dir.mkdir(parents=True, exist_ok=True)
58
- latest_path = hf_hub_download(
59
- repo_id=repo_id,
60
- repo_type="dataset",
61
- filename="snapshots/latest.json",
62
- revision=revision,
 
 
63
  )
64
- latest_payload = json.loads(Path(latest_path).read_text(encoding="utf-8"))
65
  downloaded_files: set[str] = set()
 
 
 
 
 
66
  for filename in (
67
- "issues.parquet",
68
- "pull_requests.parquet",
69
- "comments.parquet",
70
- "reviews.parquet",
71
- "review_comments.parquet",
72
- "pr_files.parquet",
73
- "pr_diffs.parquet",
74
- "links.parquet",
75
- "events.parquet",
76
- "manifest.json",
77
- "analysis-report.json",
78
- "analysis-report-hybrid.json",
79
- "analysis-report-deterministic.json",
80
- "new_contributors.parquet",
81
- "new-contributors-report.json",
82
- "new-contributors-report.md",
83
  ):
84
- downloaded_path = _download_first_available_hf_file(
85
  repo_id=repo_id,
86
  revision=revision,
87
  filenames=_hf_latest_snapshot_candidates(latest_payload, filename),
88
  )
89
- if downloaded_path is None:
90
  continue
91
- shutil.copy2(downloaded_path, local_dir / filename)
92
  downloaded_files.add(filename)
93
- downloaded_files.update(
94
- _download_hf_analysis_state_files(
 
95
  repo_id=repo_id,
96
  revision=revision,
97
  local_dir=local_dir,
98
- path_pairs=_hf_analysis_state_path_pairs(
99
- remote_paths,
100
- prefixes=_hf_latest_snapshot_prefixes(latest_payload),
101
- ),
102
  )
 
 
 
 
 
 
 
103
  )
104
 
105
- readme_path = hf_hub_download(
106
  repo_id=repo_id,
107
- repo_type="dataset",
108
- filename="README.md",
109
- revision=revision or "main",
 
110
  )
111
- shutil.copy2(readme_path, local_dir / "README.md")
 
 
 
 
 
 
 
 
 
112
  manifest = (
113
- read_json(local_dir / "manifest.json") if (local_dir / "manifest.json").exists() else {}
 
 
114
  )
115
  manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
116
  manifest.setdefault(
117
- "snapshot_id", str(latest_payload.get("latest_snapshot_id") or hf_sha or local_dir.name)
 
118
  )
119
  manifest.update(
120
  {
121
  "source_type": "hf_snapshot_repo",
122
  "hf_repo_id": repo_id,
123
- "hf_revision": revision,
 
124
  "hf_sha": hf_sha,
125
  "materialized_at": _iso_now(),
126
  "downloaded_files": sorted(downloaded_files),
127
  "hf_latest_pointer": latest_payload,
128
  }
129
  )
130
- write_text(json.dumps(manifest, indent=2) + "\n", local_dir / "manifest.json")
131
  return local_dir
132
 
133
 
@@ -135,60 +170,53 @@ def _materialize_hf_root_snapshot(
135
  *,
136
  repo_id: str,
137
  local_dir: Path,
138
- revision: str | None,
 
139
  hf_sha: str | None,
140
  remote_paths: set[str],
141
  ) -> Path:
142
  local_dir.mkdir(parents=True, exist_ok=True)
143
  downloaded_files: set[str] = set()
144
- for filename in (
145
- "issues.parquet",
146
- "pull_requests.parquet",
147
- "comments.parquet",
148
- "reviews.parquet",
149
- "review_comments.parquet",
150
- "pr_files.parquet",
151
- "pr_diffs.parquet",
152
- "links.parquet",
153
- "events.parquet",
154
- "manifest.json",
155
- "analysis-report.json",
156
- "analysis-report-hybrid.json",
157
- "analysis-report-deterministic.json",
158
- "new_contributors.parquet",
159
- "new-contributors-report.json",
160
- "new-contributors-report.md",
161
  ):
162
- if filename not in remote_paths:
163
  continue
164
- downloaded_path = hf_hub_download(
165
- repo_id=repo_id,
166
- repo_type="dataset",
167
- filename=filename,
168
- revision=revision,
169
- )
170
- shutil.copy2(downloaded_path, local_dir / filename)
171
- downloaded_files.add(filename)
172
- downloaded_files.update(
173
- _download_hf_analysis_state_files(
174
  repo_id=repo_id,
175
  revision=revision,
176
  local_dir=local_dir,
177
- path_pairs=_hf_analysis_state_path_pairs(remote_paths, prefixes=[""]),
 
178
  )
 
 
 
 
 
 
 
179
  )
180
 
181
- if "README.md" in remote_paths:
182
- readme_path = hf_hub_download(
183
- repo_id=repo_id,
184
- repo_type="dataset",
185
- filename="README.md",
186
- revision=revision or "main",
187
- )
188
- shutil.copy2(readme_path, local_dir / "README.md")
189
 
190
  manifest = (
191
- read_json(local_dir / "manifest.json") if (local_dir / "manifest.json").exists() else {}
 
 
192
  )
193
  manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
194
  manifest.setdefault("snapshot_id", hf_sha or local_dir.name)
@@ -196,13 +224,14 @@ def _materialize_hf_root_snapshot(
196
  {
197
  "source_type": "hf_root_snapshot",
198
  "hf_repo_id": repo_id,
199
- "hf_revision": revision,
 
200
  "hf_sha": hf_sha,
201
  "materialized_at": _iso_now(),
202
  "downloaded_files": sorted(downloaded_files),
203
  }
204
  )
205
- write_text(json.dumps(manifest, indent=2) + "\n", local_dir / "manifest.json")
206
  return local_dir
207
 
208
 
@@ -210,7 +239,8 @@ def _materialize_hf_dataset_viewer_snapshot(
210
  *,
211
  repo_id: str,
212
  local_dir: Path,
213
- revision: str | None,
 
214
  hf_sha: str | None,
215
  ) -> Path:
216
  local_dir.mkdir(parents=True, exist_ok=True)
@@ -225,24 +255,165 @@ def _materialize_hf_dataset_viewer_snapshot(
225
  readme_path = hf_hub_download(
226
  repo_id=repo_id,
227
  repo_type="dataset",
228
- filename="README.md",
229
- revision=revision or "main",
230
  )
231
- shutil.copy2(readme_path, local_dir / "README.md")
 
232
  manifest = {
233
  "repo": _infer_repo_from_materialized_snapshot(local_dir),
234
  "snapshot_id": hf_sha or local_dir.name,
235
  "source_type": "hf_dataset_viewer",
236
  "hf_repo_id": repo_id,
237
- "hf_revision": revision,
 
238
  "hf_sha": hf_sha,
239
  "materialized_at": _iso_now(),
240
  "downloaded_files": sorted(downloaded_files),
241
  }
242
- write_text(json.dumps(manifest, indent=2) + "\n", local_dir / "manifest.json")
243
  return local_dir
244
 
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  def _hf_dataset_info(repo_id: str, revision: str | None, *, files_metadata: bool) -> Any:
247
  api = HfApi()
248
  try:
@@ -270,7 +441,7 @@ def _hf_dataset_parquet_urls(repo_id: str, revision: str | None = None) -> list[
270
  def _download_first_available_hf_file(
271
  *,
272
  repo_id: str,
273
- revision: str | None,
274
  filenames: list[str],
275
  ) -> Path | None:
276
  for filename in filenames:
@@ -290,65 +461,24 @@ def _download_first_available_hf_file(
290
  return None
291
 
292
 
293
- def _download_hf_analysis_state_files(
294
- *,
295
- repo_id: str,
296
- revision: str | None,
297
- local_dir: Path,
298
- path_pairs: list[tuple[str, str]],
299
- ) -> set[str]:
300
- downloaded_files: set[str] = set()
301
- for remote_path, relative_path in path_pairs:
302
- downloaded_path = Path(
303
- hf_hub_download(
304
- repo_id=repo_id,
305
- repo_type="dataset",
306
- filename=remote_path,
307
- revision=revision,
308
- )
309
- )
310
- destination = local_dir / "analysis-state" / relative_path
311
- destination.parent.mkdir(parents=True, exist_ok=True)
312
- shutil.copy2(downloaded_path, destination)
313
- downloaded_files.add(str(Path("analysis-state") / relative_path))
314
- return downloaded_files
315
-
316
-
317
- def _hf_analysis_state_path_pairs(
318
- remote_paths: set[str],
319
- *,
320
- prefixes: list[str],
321
- ) -> list[tuple[str, str]]:
322
- pairs: list[tuple[str, str]] = []
323
- seen_relative_paths: set[str] = set()
324
- for prefix in prefixes:
325
- base = f"{prefix.strip('/')}/analysis-state/" if prefix else "analysis-state/"
326
- for remote_path in sorted(remote_paths):
327
- if not remote_path.startswith(base):
328
- continue
329
- relative_path = remote_path.removeprefix(base)
330
- if not relative_path or relative_path in seen_relative_paths:
331
- continue
332
- seen_relative_paths.add(relative_path)
333
- pairs.append((remote_path, relative_path))
334
- return pairs
335
-
336
-
337
  def _hf_latest_snapshot_candidates(latest_payload: dict[str, Any], filename: str) -> list[str]:
338
  candidates: list[str] = []
339
  manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
340
  snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
341
  latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
 
342
 
343
- if filename == "manifest.json" and manifest_path:
344
  candidates.append(manifest_path)
345
  if snapshot_dir and snapshot_dir not in {".", "/"}:
346
  candidates.append(f"{snapshot_dir}/{filename}")
 
 
347
  if manifest_path and "/" in manifest_path:
348
  manifest_dir = manifest_path.rsplit("/", 1)[0]
349
  candidates.append(f"{manifest_dir}/{filename}")
350
  if latest_snapshot_id:
351
- candidates.append(f"snapshots/{latest_snapshot_id}/{filename}")
352
  candidates.append(filename)
353
 
354
  seen: set[str] = set()
@@ -362,31 +492,6 @@ def _hf_latest_snapshot_candidates(latest_payload: dict[str, Any], filename: str
362
  return deduped
363
 
364
 
365
- def _hf_latest_snapshot_prefixes(latest_payload: dict[str, Any]) -> list[str]:
366
- prefixes: list[str] = []
367
- manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
368
- snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
369
- latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
370
-
371
- if snapshot_dir and snapshot_dir not in {".", "/"}:
372
- prefixes.append(snapshot_dir)
373
- if manifest_path and "/" in manifest_path:
374
- prefixes.append(manifest_path.rsplit("/", 1)[0])
375
- if latest_snapshot_id:
376
- prefixes.append(f"snapshots/{latest_snapshot_id}")
377
- prefixes.append("")
378
-
379
- seen: set[str] = set()
380
- deduped: list[str] = []
381
- for prefix in prefixes:
382
- normalized = prefix.lstrip("./")
383
- if normalized in seen:
384
- continue
385
- seen.add(normalized)
386
- deduped.append(normalized)
387
- return deduped
388
-
389
-
390
  def _download_url_to_path(url: str, destination: Path) -> None:
391
  destination.parent.mkdir(parents=True, exist_ok=True)
392
  urllib.request.urlretrieve(url, destination)
@@ -420,18 +525,8 @@ def _parquet_table_name(path: Path) -> str:
420
  def _infer_repo_from_materialized_snapshot(local_dir: Path) -> str:
421
  import pyarrow.parquet as pq
422
 
423
- for table_name in (
424
- "issues",
425
- "pull_requests",
426
- "comments",
427
- "reviews",
428
- "review_comments",
429
- "pr_files",
430
- "pr_diffs",
431
- "links",
432
- "events",
433
- ):
434
- path = local_dir / f"{table_name}.parquet"
435
  if not path.exists():
436
  continue
437
  rows = pq.read_table(path).slice(0, 1).to_pylist()
 
5
  import urllib.parse
6
  import urllib.request
7
  from datetime import UTC, datetime
8
+ from pathlib import Path, PurePosixPath
9
  from typing import Any
10
 
11
  from huggingface_hub import HfApi, hf_hub_download
12
 
13
  from slop_farmer.data.http import urlopen_with_retry
14
  from slop_farmer.data.parquet_io import read_json, write_text
15
+ from slop_farmer.data.snapshot_paths import (
16
+ CONTRIBUTOR_ARTIFACT_FILENAMES,
17
+ CURRENT_ANALYSIS_MANIFEST_PATH,
18
+ LEGACY_ANALYSIS_FILENAMES,
19
+ PR_SCOPE_CLUSTERS_FILENAME,
20
+ RAW_TABLE_FILENAMES,
21
+ README_FILENAME,
22
+ ROOT_MANIFEST_FILENAME,
23
+ SNAPSHOTS_LATEST_PATH,
24
+ STATE_WATERMARK_PATH,
25
+ load_archived_analysis_run_manifest,
26
+ load_current_analysis_manifest,
27
+ repo_relative_path_to_local,
28
+ )
29
 
30
 
31
  def materialize_hf_dataset_snapshot(
 
36
  ) -> Path:
37
  info = _hf_dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
38
  remote_paths = {sibling.rfilename for sibling in info.siblings}
39
+ resolved_revision = str(info.sha or revision or "main")
40
+ if SNAPSHOTS_LATEST_PATH in remote_paths:
41
  return _materialize_hf_snapshot_repo_snapshot(
42
  repo_id=repo_id,
43
  local_dir=local_dir,
44
+ revision=resolved_revision,
45
+ requested_revision=revision,
46
  hf_sha=info.sha,
47
  remote_paths=remote_paths,
48
  )
 
50
  return _materialize_hf_root_snapshot(
51
  repo_id=repo_id,
52
  local_dir=local_dir,
53
+ revision=resolved_revision,
54
+ requested_revision=revision,
55
  hf_sha=info.sha,
56
  remote_paths=remote_paths,
57
  )
58
  return _materialize_hf_dataset_viewer_snapshot(
59
  repo_id=repo_id,
60
  local_dir=local_dir,
61
+ revision=resolved_revision,
62
+ requested_revision=revision,
63
  hf_sha=info.sha,
64
  )
65
 
 
68
  *,
69
  repo_id: str,
70
  local_dir: Path,
71
+ revision: str,
72
+ requested_revision: str | None,
73
  hf_sha: str | None,
74
  remote_paths: set[str],
75
  ) -> Path:
76
  local_dir.mkdir(parents=True, exist_ok=True)
77
+ latest_download = Path(
78
+ hf_hub_download(
79
+ repo_id=repo_id,
80
+ repo_type="dataset",
81
+ filename=SNAPSHOTS_LATEST_PATH,
82
+ revision=revision,
83
+ )
84
  )
85
+ latest_payload = json.loads(latest_download.read_text(encoding="utf-8"))
86
  downloaded_files: set[str] = set()
87
+ _copy_downloaded_file(
88
+ latest_download, repo_relative_path_to_local(local_dir, SNAPSHOTS_LATEST_PATH)
89
+ )
90
+ downloaded_files.add(SNAPSHOTS_LATEST_PATH)
91
+
92
  for filename in (
93
+ *RAW_TABLE_FILENAMES,
94
+ ROOT_MANIFEST_FILENAME,
95
+ PR_SCOPE_CLUSTERS_FILENAME,
96
+ *CONTRIBUTOR_ARTIFACT_FILENAMES,
97
+ *LEGACY_ANALYSIS_FILENAMES,
 
 
 
 
 
 
 
 
 
 
 
98
  ):
99
+ downloaded = _download_first_available_hf_file(
100
  repo_id=repo_id,
101
  revision=revision,
102
  filenames=_hf_latest_snapshot_candidates(latest_payload, filename),
103
  )
104
+ if downloaded is None:
105
  continue
106
+ _copy_downloaded_file(downloaded, local_dir / filename)
107
  downloaded_files.add(filename)
108
+
109
+ if STATE_WATERMARK_PATH in remote_paths:
110
+ _download_repo_file(
111
  repo_id=repo_id,
112
  revision=revision,
113
  local_dir=local_dir,
114
+ repo_path=STATE_WATERMARK_PATH,
115
+ downloaded_files=downloaded_files,
 
 
116
  )
117
+
118
+ _download_analysis_state_files(
119
+ repo_id=repo_id,
120
+ revision=revision,
121
+ local_dir=local_dir,
122
+ remote_paths=remote_paths,
123
+ downloaded_files=downloaded_files,
124
  )
125
 
126
+ _download_published_analysis_files(
127
  repo_id=repo_id,
128
+ revision=revision,
129
+ local_dir=local_dir,
130
+ remote_paths=remote_paths,
131
+ downloaded_files=downloaded_files,
132
  )
133
+
134
+ _download_repo_file(
135
+ repo_id=repo_id,
136
+ revision=revision,
137
+ local_dir=local_dir,
138
+ repo_path=README_FILENAME,
139
+ downloaded_files=downloaded_files,
140
+ required=False,
141
+ )
142
+
143
  manifest = (
144
+ read_json(local_dir / ROOT_MANIFEST_FILENAME)
145
+ if (local_dir / ROOT_MANIFEST_FILENAME).exists()
146
+ else {}
147
  )
148
  manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
149
  manifest.setdefault(
150
+ "snapshot_id",
151
+ str(latest_payload.get("latest_snapshot_id") or hf_sha or local_dir.name),
152
  )
153
  manifest.update(
154
  {
155
  "source_type": "hf_snapshot_repo",
156
  "hf_repo_id": repo_id,
157
+ "hf_revision": requested_revision,
158
+ "hf_resolved_revision": revision,
159
  "hf_sha": hf_sha,
160
  "materialized_at": _iso_now(),
161
  "downloaded_files": sorted(downloaded_files),
162
  "hf_latest_pointer": latest_payload,
163
  }
164
  )
165
+ write_text(json.dumps(manifest, indent=2) + "\n", local_dir / ROOT_MANIFEST_FILENAME)
166
  return local_dir
167
 
168
 
 
170
  *,
171
  repo_id: str,
172
  local_dir: Path,
173
+ revision: str,
174
+ requested_revision: str | None,
175
  hf_sha: str | None,
176
  remote_paths: set[str],
177
  ) -> Path:
178
  local_dir.mkdir(parents=True, exist_ok=True)
179
  downloaded_files: set[str] = set()
180
+ for repo_path in (
181
+ *RAW_TABLE_FILENAMES,
182
+ ROOT_MANIFEST_FILENAME,
183
+ PR_SCOPE_CLUSTERS_FILENAME,
184
+ *CONTRIBUTOR_ARTIFACT_FILENAMES,
185
+ *LEGACY_ANALYSIS_FILENAMES,
186
+ SNAPSHOTS_LATEST_PATH,
187
+ STATE_WATERMARK_PATH,
188
+ README_FILENAME,
 
 
 
 
 
 
 
 
189
  ):
190
+ if repo_path not in remote_paths:
191
  continue
192
+ _download_repo_file(
 
 
 
 
 
 
 
 
 
193
  repo_id=repo_id,
194
  revision=revision,
195
  local_dir=local_dir,
196
+ repo_path=repo_path,
197
+ downloaded_files=downloaded_files,
198
  )
199
+
200
+ _download_analysis_state_files(
201
+ repo_id=repo_id,
202
+ revision=revision,
203
+ local_dir=local_dir,
204
+ remote_paths=remote_paths,
205
+ downloaded_files=downloaded_files,
206
  )
207
 
208
+ _download_published_analysis_files(
209
+ repo_id=repo_id,
210
+ revision=revision,
211
+ local_dir=local_dir,
212
+ remote_paths=remote_paths,
213
+ downloaded_files=downloaded_files,
214
+ )
 
215
 
216
  manifest = (
217
+ read_json(local_dir / ROOT_MANIFEST_FILENAME)
218
+ if (local_dir / ROOT_MANIFEST_FILENAME).exists()
219
+ else {}
220
  )
221
  manifest.setdefault("repo", _infer_repo_from_materialized_snapshot(local_dir))
222
  manifest.setdefault("snapshot_id", hf_sha or local_dir.name)
 
224
  {
225
  "source_type": "hf_root_snapshot",
226
  "hf_repo_id": repo_id,
227
+ "hf_revision": requested_revision,
228
+ "hf_resolved_revision": revision,
229
  "hf_sha": hf_sha,
230
  "materialized_at": _iso_now(),
231
  "downloaded_files": sorted(downloaded_files),
232
  }
233
  )
234
+ write_text(json.dumps(manifest, indent=2) + "\n", local_dir / ROOT_MANIFEST_FILENAME)
235
  return local_dir
236
 
237
 
 
239
  *,
240
  repo_id: str,
241
  local_dir: Path,
242
+ revision: str,
243
+ requested_revision: str | None,
244
  hf_sha: str | None,
245
  ) -> Path:
246
  local_dir.mkdir(parents=True, exist_ok=True)
 
255
  readme_path = hf_hub_download(
256
  repo_id=repo_id,
257
  repo_type="dataset",
258
+ filename=README_FILENAME,
259
+ revision=revision,
260
  )
261
+ shutil.copy2(readme_path, local_dir / README_FILENAME)
262
+ downloaded_files.add(README_FILENAME)
263
  manifest = {
264
  "repo": _infer_repo_from_materialized_snapshot(local_dir),
265
  "snapshot_id": hf_sha or local_dir.name,
266
  "source_type": "hf_dataset_viewer",
267
  "hf_repo_id": repo_id,
268
+ "hf_revision": requested_revision,
269
+ "hf_resolved_revision": revision,
270
  "hf_sha": hf_sha,
271
  "materialized_at": _iso_now(),
272
  "downloaded_files": sorted(downloaded_files),
273
  }
274
+ write_text(json.dumps(manifest, indent=2) + "\n", local_dir / ROOT_MANIFEST_FILENAME)
275
  return local_dir
276
 
277
 
278
def _download_published_analysis_files(
    *,
    repo_id: str,
    revision: str,
    local_dir: Path,
    remote_paths: set[str],
    downloaded_files: set[str],
) -> None:
    """Mirror the published analysis manifests and the artifacts they list.

    The current-analysis manifest (when present remotely) is fetched first,
    followed by every artifact and archived artifact it names. Then each
    archived analysis-run manifest found in ``remote_paths`` is fetched along
    with its artifacts. Paths named by a manifest but absent from
    ``remote_paths`` are skipped. ``downloaded_files`` is updated in place.
    """

    def fetch(repo_path: str) -> Path:
        # Every download in this function shares the same repo/revision/target.
        return _download_repo_file(
            repo_id=repo_id,
            revision=revision,
            local_dir=local_dir,
            repo_path=repo_path,
            downloaded_files=downloaded_files,
        )

    def fetch_listed(manifest: dict[str, Any], *, include_archived: bool) -> None:
        for listed_path in _manifest_artifact_paths(manifest, include_archived=include_archived):
            if listed_path in remote_paths:
                fetch(listed_path)

    if CURRENT_ANALYSIS_MANIFEST_PATH in remote_paths:
        current_manifest_file = fetch(CURRENT_ANALYSIS_MANIFEST_PATH)
        fetch_listed(
            load_current_analysis_manifest(current_manifest_file),
            include_archived=True,
        )

    archived_manifest_paths = sorted(
        candidate for candidate in remote_paths if _is_archived_analysis_manifest_path(candidate)
    )
    for archived_manifest_path in archived_manifest_paths:
        archived_manifest_file = fetch(archived_manifest_path)
        fetch_listed(
            load_archived_analysis_run_manifest(archived_manifest_file),
            include_archived=False,
        )
327
+
328
+
329
def _download_analysis_state_files(
    *,
    repo_id: str,
    revision: str,
    local_dir: Path,
    remote_paths: set[str],
    downloaded_files: set[str],
) -> None:
    """Download every remote file whose first path segment is ``analysis-state``.

    Files are fetched in sorted order and recorded in ``downloaded_files``.
    """
    state_paths = [
        candidate
        for candidate in remote_paths
        if PurePosixPath(candidate).parts[:1] == ("analysis-state",)
    ]
    state_paths.sort()
    for state_path in state_paths:
        _download_repo_file(
            repo_id=repo_id,
            revision=revision,
            local_dir=local_dir,
            repo_path=state_path,
            downloaded_files=downloaded_files,
        )
347
+
348
+
349
+ def _manifest_artifact_paths(
350
+ payload: dict[str, Any],
351
+ *,
352
+ include_archived: bool,
353
+ ) -> list[str]:
354
+ paths = [
355
+ str(value) for value in (payload.get("artifacts") or {}).values() if isinstance(value, str)
356
+ ]
357
+ if include_archived:
358
+ paths.extend(
359
+ str(value)
360
+ for value in (payload.get("archived_artifacts") or {}).values()
361
+ if isinstance(value, str)
362
+ )
363
+ deduped: list[str] = []
364
+ seen: set[str] = set()
365
+ for repo_path in paths:
366
+ normalized = repo_path.lstrip("./")
367
+ if not normalized or normalized in seen:
368
+ continue
369
+ seen.add(normalized)
370
+ deduped.append(normalized)
371
+ return deduped
372
+
373
+
374
+ def _is_archived_analysis_manifest_path(repo_path: str) -> bool:
375
+ parts = PurePosixPath(repo_path).parts
376
+ return (
377
+ len(parts) == 5
378
+ and parts[0] == "snapshots"
379
+ and parts[2] == "analysis-runs"
380
+ and parts[4] == ROOT_MANIFEST_FILENAME
381
+ )
382
+
383
+
384
def _download_repo_file(
    *,
    repo_id: str,
    revision: str,
    local_dir: Path,
    repo_path: str,
    downloaded_files: set[str],
    required: bool = True,
) -> Path:
    """Fetch one dataset-repo file and copy it under ``local_dir``.

    On success the repo path is added to ``downloaded_files`` and the local
    destination is returned. If the download raises and ``required`` is False,
    the error is swallowed and ``local_dir / repo_path`` is returned without
    anything being written; otherwise the exception propagates.
    """
    try:
        cached_copy = Path(
            hf_hub_download(
                repo_id=repo_id,
                repo_type="dataset",
                filename=repo_path,
                revision=revision,
            )
        )
    except Exception:
        if not required:
            # Best-effort file: report where it would have lived.
            return local_dir / repo_path
        raise

    target = repo_relative_path_to_local(local_dir, repo_path)
    _copy_downloaded_file(cached_copy, target)
    downloaded_files.add(repo_path)
    return target
410
+
411
+
412
+ def _copy_downloaded_file(downloaded_path: Path, destination: Path) -> None:
413
+ destination.parent.mkdir(parents=True, exist_ok=True)
414
+ shutil.copy2(downloaded_path, destination)
415
+
416
+
417
  def _hf_dataset_info(repo_id: str, revision: str | None, *, files_metadata: bool) -> Any:
418
  api = HfApi()
419
  try:
 
441
  def _download_first_available_hf_file(
442
  *,
443
  repo_id: str,
444
+ revision: str,
445
  filenames: list[str],
446
  ) -> Path | None:
447
  for filename in filenames:
 
461
  return None
462
 
463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  def _hf_latest_snapshot_candidates(latest_payload: dict[str, Any], filename: str) -> list[str]:
465
  candidates: list[str] = []
466
  manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
467
  snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
468
  latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
469
+ archived_manifest_path = str(latest_payload.get("archived_manifest_path") or "").strip("/")
470
 
471
+ if filename == ROOT_MANIFEST_FILENAME and manifest_path:
472
  candidates.append(manifest_path)
473
  if snapshot_dir and snapshot_dir not in {".", "/"}:
474
  candidates.append(f"{snapshot_dir}/{filename}")
475
+ if filename == ROOT_MANIFEST_FILENAME and archived_manifest_path:
476
+ candidates.append(archived_manifest_path)
477
  if manifest_path and "/" in manifest_path:
478
  manifest_dir = manifest_path.rsplit("/", 1)[0]
479
  candidates.append(f"{manifest_dir}/{filename}")
480
  if latest_snapshot_id:
481
+ candidates.append(str(PurePosixPath("snapshots") / latest_snapshot_id / filename))
482
  candidates.append(filename)
483
 
484
  seen: set[str] = set()
 
492
  return deduped
493
 
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  def _download_url_to_path(url: str, destination: Path) -> None:
496
  destination.parent.mkdir(parents=True, exist_ok=True)
497
  urllib.request.urlretrieve(url, destination)
 
525
  def _infer_repo_from_materialized_snapshot(local_dir: Path) -> str:
526
  import pyarrow.parquet as pq
527
 
528
+ for table_filename in RAW_TABLE_FILENAMES:
529
+ path = local_dir / table_filename
 
 
 
 
 
 
 
 
 
 
530
  if not path.exists():
531
  continue
532
  rows = pq.read_table(path).slice(0, 1).to_pylist()
src/slop_farmer/data/snapshot_paths.py CHANGED
@@ -1,9 +1,63 @@
1
  from __future__ import annotations
2
 
3
- from pathlib import Path
 
 
 
4
 
5
  from slop_farmer.data.parquet_io import read_json
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def default_hf_materialize_dir(output_dir: Path, repo_id: str, revision: str | None) -> Path:
9
  suffix = repo_id.replace("/", "--")
@@ -12,14 +66,241 @@ def default_hf_materialize_dir(output_dir: Path, repo_id: str, revision: str | N
12
  return output_dir.resolve() / "snapshots" / f"hf-{suffix}"
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def load_latest_snapshot_pointer(snapshots_root: Path) -> Path | None:
16
- latest_path = snapshots_root.resolve() / "latest.json"
 
17
  if not latest_path.exists():
18
  return None
19
  payload = read_json(latest_path)
20
  snapshot_dir = payload.get("snapshot_dir")
21
  if isinstance(snapshot_dir, str) and snapshot_dir:
22
- return Path(snapshot_dir).resolve()
 
 
 
23
  return None
24
 
25
 
@@ -46,3 +327,104 @@ def resolve_snapshot_dir_from_snapshots_root(
46
  if snapshot_dirs:
47
  return snapshot_dirs[-1].resolve()
48
  raise FileNotFoundError(f"Could not resolve a snapshot directory from {latest_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import re
4
+ from dataclasses import dataclass
5
+ from pathlib import Path, PurePosixPath
6
+ from typing import Any
7
 
8
  from slop_farmer.data.parquet_io import read_json
9
 
10
# Parquet file names, one per raw table.
RAW_TABLE_FILENAMES: tuple[str, ...] = tuple(
    f"{table}.parquet"
    for table in (
        "issues",
        "pull_requests",
        "comments",
        "reviews",
        "review_comments",
        "pr_files",
        "pr_diffs",
        "links",
        "events",
    )
)
# NOTE(review): presumably the comment splits used by the dataset-viewer layout — confirm.
VIEWER_SPLIT_FILENAMES: tuple[str, ...] = ("issue_comments.parquet", "pr_comments.parquet")

# Well-known repo-relative file names and paths.
ROOT_MANIFEST_FILENAME = "manifest.json"
README_FILENAME = "README.md"
STATE_WATERMARK_PATH = "state/watermark.json"
SNAPSHOTS_LATEST_PATH = "snapshots/latest.json"
PR_SCOPE_CLUSTERS_FILENAME = "pr-scope-clusters.json"

# New-contributor artifacts.
NEW_CONTRIBUTORS_PARQUET_FILENAME = "new_contributors.parquet"
NEW_CONTRIBUTORS_REPORT_JSON_FILENAME = "new-contributors-report.json"
NEW_CONTRIBUTORS_REPORT_MARKDOWN_FILENAME = "new-contributors-report.md"
CONTRIBUTOR_ARTIFACT_FILENAMES: tuple[str, ...] = (
    NEW_CONTRIBUTORS_PARQUET_FILENAME,
    NEW_CONTRIBUTORS_REPORT_JSON_FILENAME,
    NEW_CONTRIBUTORS_REPORT_MARKDOWN_FILENAME,
)

# Analysis report file names keyed by analysis variant.
ANALYSIS_REPORT_FILENAME_BY_VARIANT: dict[str, str] = dict(
    deterministic="analysis-report.json",
    hybrid="analysis-report-hybrid.json",
)
HYBRID_ANALYSIS_REVIEWS_FILENAME = "analysis-report-hybrid.llm-reviews.json"
# Report files in variant order (deterministic, hybrid), then the hybrid reviews file.
LEGACY_ANALYSIS_FILENAMES: tuple[str, ...] = (
    *ANALYSIS_REPORT_FILENAME_BY_VARIANT.values(),
    HYBRID_ANALYSIS_REVIEWS_FILENAME,
)

# Repo directory holding the current published analysis, and its manifest path.
CURRENT_ANALYSIS_DIR = PurePosixPath("analysis", "current")
CURRENT_ANALYSIS_MANIFEST_PATH = CURRENT_ANALYSIS_DIR.joinpath(ROOT_MANIFEST_FILENAME).as_posix()
ANALYSIS_MANIFEST_SCHEMA_VERSION = 1
51
+
52
+
53
+ @dataclass(frozen=True, slots=True)
54
+ class ResolvedAnalysisReportPath:
55
+ path: Path
56
+ variant: str
57
+ source: str
58
+ snapshot_id: str | None = None
59
+ analysis_id: str | None = None
60
+
61
 
62
  def default_hf_materialize_dir(output_dir: Path, repo_id: str, revision: str | None) -> Path:
63
  suffix = repo_id.replace("/", "--")
 
66
  return output_dir.resolve() / "snapshots" / f"hf-{suffix}"
67
 
68
 
69
def repo_relative_path_to_local(base_dir: Path, repo_relative_path: str) -> Path:
    """Join a ``/``-separated repo path onto ``base_dir`` segment by segment."""
    segments = PurePosixPath(repo_relative_path).parts
    return base_dir.joinpath(*segments)
71
+
72
+
73
def snapshot_artifact_path(snapshot_id: str, filename: str) -> str:
    """Return the repo path ``snapshots/<snapshot_id>/<filename>`` as a string."""
    return PurePosixPath("snapshots", snapshot_id, filename).as_posix()
75
+
76
+
77
def archived_snapshot_manifest_path(snapshot_id: str) -> str:
    """Return the repo path of the manifest stored under a snapshot's directory."""
    return snapshot_artifact_path(snapshot_id=snapshot_id, filename=ROOT_MANIFEST_FILENAME)
79
+
80
+
81
def analysis_run_artifact_path(snapshot_id: str, analysis_id: str, filename: str) -> str:
    """Return ``snapshots/<snapshot_id>/analysis-runs/<analysis_id>/<filename>``.

    An empty ``filename`` yields the run directory itself, without a trailing slash.
    """
    return PurePosixPath(
        "snapshots", snapshot_id, "analysis-runs", analysis_id, filename
    ).as_posix()
83
+
84
+
85
def analysis_run_manifest_path(snapshot_id: str, analysis_id: str) -> str:
    """Return the repo path of the manifest inside one archived analysis run."""
    return analysis_run_artifact_path(
        snapshot_id=snapshot_id,
        analysis_id=analysis_id,
        filename=ROOT_MANIFEST_FILENAME,
    )
87
+
88
+
89
def current_analysis_artifact_path(filename: str) -> str:
    """Return the repo path of ``filename`` inside the current-analysis directory."""
    return CURRENT_ANALYSIS_DIR.joinpath(filename).as_posix()
91
+
92
+
93
def repo_key(repo_slug: str) -> str:
    """Return the path-safe key for a repository slug (see ``_path_key``)."""
    key = _path_key(repo_slug)
    return key
95
+
96
+
97
def model_key(model: str) -> str:
    """Return the path-safe key for a model name (see ``_path_key``)."""
    key = _path_key(model)
    return key
99
+
100
+
101
def build_current_analysis_manifest(
    *,
    repo: str,
    snapshot_id: str,
    analysis_id: str,
    variant: str,
    channel: str,
    model: str | None,
    published_at: str,
    include_hybrid_reviews: bool,
) -> dict[str, Any]:
    """Build and validate the manifest for the current published analysis.

    ``artifacts`` points at files in the current-analysis directory and
    ``archived_artifacts`` at the same files inside the archived analysis run.
    The hybrid report is always listed; the hybrid reviews file is listed only
    when ``include_hybrid_reviews`` is true.

    Returns:
        The payload as returned by ``validate_current_analysis_manifest``.
    """
    published_files = [("hybrid", ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"])]
    if include_hybrid_reviews:
        published_files.append(("hybrid_reviews", HYBRID_ANALYSIS_REVIEWS_FILENAME))

    artifacts = {key: current_analysis_artifact_path(name) for key, name in published_files}
    archived_artifacts = {
        key: analysis_run_artifact_path(snapshot_id, analysis_id, name)
        for key, name in published_files
    }
    return validate_current_analysis_manifest(
        {
            "schema_version": ANALYSIS_MANIFEST_SCHEMA_VERSION,
            "repo": repo,
            "snapshot_id": snapshot_id,
            "analysis_id": analysis_id,
            "variant": variant,
            "channel": channel,
            "model": model,
            "published_at": published_at,
            "artifacts": artifacts,
            "archived_artifacts": archived_artifacts,
        }
    )
144
+
145
+
146
def build_archived_analysis_run_manifest(
    *,
    repo: str,
    snapshot_id: str,
    analysis_id: str,
    variant: str,
    channel: str,
    model: str | None,
    published_at: str,
    include_hybrid_reviews: bool,
) -> dict[str, Any]:
    """Build and validate the manifest stored alongside an archived analysis run.

    ``artifacts`` lists the hybrid report inside the run directory, plus the
    hybrid reviews file when ``include_hybrid_reviews`` is true.

    Returns:
        The payload as returned by ``validate_archived_analysis_run_manifest``.
    """
    archived_files = [("hybrid", ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"])]
    if include_hybrid_reviews:
        archived_files.append(("hybrid_reviews", HYBRID_ANALYSIS_REVIEWS_FILENAME))

    artifacts = {
        key: analysis_run_artifact_path(snapshot_id, analysis_id, name)
        for key, name in archived_files
    }
    return validate_archived_analysis_run_manifest(
        {
            "schema_version": ANALYSIS_MANIFEST_SCHEMA_VERSION,
            "repo": repo,
            "snapshot_id": snapshot_id,
            "analysis_id": analysis_id,
            "variant": variant,
            "channel": channel,
            "model": model,
            "published_at": published_at,
            "artifacts": artifacts,
        }
    )
182
+
183
+
184
def load_current_analysis_manifest(path: Path) -> dict[str, Any]:
    """Read a current-analysis manifest from ``path`` and validate it.

    Raises:
        ValueError: If the file does not contain a JSON object, or validation fails.
    """
    payload = read_json(path)
    if isinstance(payload, dict):
        return validate_current_analysis_manifest(payload)
    raise ValueError(f"Current analysis manifest at {path} must contain a JSON object.")
189
+
190
+
191
def load_archived_analysis_run_manifest(path: Path) -> dict[str, Any]:
    """Read an archived analysis-run manifest from ``path`` and validate it.

    Raises:
        ValueError: If the file does not contain a JSON object, or validation fails.
    """
    payload = read_json(path)
    if isinstance(payload, dict):
        return validate_archived_analysis_run_manifest(payload)
    raise ValueError(f"Archived analysis manifest at {path} must contain a JSON object.")
196
+
197
+
198
def resolve_default_dashboard_analysis_report(
    snapshot_dir: Path,
) -> ResolvedAnalysisReportPath | None:
    """Pick the analysis report to use by default for ``snapshot_dir``.

    The published current analysis wins when it exists and its snapshot id
    matches this snapshot; otherwise fall back to a report stored directly in
    the snapshot directory (``variant="auto"``).
    """
    published = resolve_current_analysis_report(snapshot_dir)
    if published is None or not _analysis_matches_snapshot(snapshot_dir, published):
        return resolve_snapshot_local_analysis_report(snapshot_dir, variant="auto")
    return published
205
+
206
+
207
def resolve_current_analysis_report(
    snapshot_dir: Path,
    *,
    variant: str = "auto",
) -> ResolvedAnalysisReportPath | None:
    """Resolve the published current analysis report inside ``snapshot_dir``.

    Returns:
        ``None`` when no current-analysis manifest has been materialized,
        otherwise the resolved report with ``source="current"``.

    Raises:
        ValueError: For an unsupported variant, a manifest that lacks the
            requested artifact, or an artifact file missing on disk.
    """
    requested = _normalize_analysis_variant(variant)
    manifest_file = repo_relative_path_to_local(snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH)
    if not manifest_file.exists():
        return None

    manifest = load_current_analysis_manifest(manifest_file)
    key = _analysis_artifact_key_for_variant(requested, manifest_kind="current")
    relative_report = manifest.get("artifacts", {}).get(key)
    if not (isinstance(relative_report, str) and relative_report):
        if requested != "auto":
            raise ValueError(
                f"Published current analysis manifest does not provide the {requested} artifact."
            )
        raise ValueError(
            "Published current analysis manifest does not provide the canonical hybrid artifact."
        )

    report_file = repo_relative_path_to_local(snapshot_dir, relative_report)
    if not report_file.exists():
        raise ValueError(
            f"Published current analysis artifact {relative_report!r} is missing from the materialized snapshot."
        )

    resolved_variant = "hybrid" if key == "hybrid" else requested
    return ResolvedAnalysisReportPath(
        path=report_file,
        variant=resolved_variant,
        source="current",
        snapshot_id=str(manifest["snapshot_id"]),
        analysis_id=str(manifest["analysis_id"]),
    )
238
+
239
+
240
def resolve_snapshot_local_analysis_report(
    snapshot_dir: Path,
    *,
    variant: str = "auto",
) -> ResolvedAnalysisReportPath | None:
    """Find an analysis report stored directly in ``snapshot_dir``.

    With ``variant="auto"`` the hybrid report is preferred over the
    deterministic one; otherwise only the requested variant is considered.
    Returns ``None`` when no matching file exists.
    """
    requested = _normalize_analysis_variant(variant)
    # "auto" tries hybrid first, then deterministic.
    preference = ("hybrid", "deterministic") if requested == "auto" else (requested,)
    for candidate in preference:
        report_file = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT[candidate]
        if report_file.exists():
            return ResolvedAnalysisReportPath(
                path=report_file,
                variant=candidate,
                source="snapshot",
            )
    return None
270
+
271
+
272
def validate_current_analysis_manifest(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate a current-analysis manifest and return the normalized copy.

    Beyond the shared manifest checks, ``archived_artifacts`` must validate
    against this manifest's analysis-run location and carry exactly the same
    keys as ``artifacts``.

    Raises:
        ValueError: On any validation failure.
    """
    manifest = _validate_analysis_manifest(payload, require_archived_artifacts=True)
    run_prefix = analysis_run_artifact_path(
        str(manifest["snapshot_id"]),
        str(manifest["analysis_id"]),
        "",
    )
    archived = _validate_artifacts(
        dict(manifest["archived_artifacts"]),
        expected_prefix=run_prefix,
    )
    if set(archived) != set(manifest["artifacts"]):
        raise ValueError("Current analysis manifest artifacts and archived_artifacts must match.")
    manifest["archived_artifacts"] = archived
    return manifest
286
+
287
+
288
def validate_archived_analysis_run_manifest(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate an archived analysis-run manifest (no ``archived_artifacts`` required)."""
    validated = _validate_analysis_manifest(payload, require_archived_artifacts=False)
    return validated
290
+
291
+
292
def load_latest_snapshot_pointer(snapshots_root: Path) -> Path | None:
    """Resolve the snapshot directory named by ``<snapshots_root>/latest.json``.

    A relative ``snapshot_dir`` entry is interpreted against the parent of the
    resolved snapshots root; an absolute one is used as-is. Returns ``None``
    when the pointer file is missing or has no usable ``snapshot_dir``.
    """
    root = snapshots_root.resolve()
    pointer_file = root / "latest.json"
    if not pointer_file.exists():
        return None

    recorded_dir = read_json(pointer_file).get("snapshot_dir")
    if not (isinstance(recorded_dir, str) and recorded_dir):
        return None

    candidate = Path(recorded_dir)
    anchored = candidate if candidate.is_absolute() else root.parent / candidate
    return anchored.resolve()
305
 
306
 
 
327
  if snapshot_dirs:
328
  return snapshot_dirs[-1].resolve()
329
  raise FileNotFoundError(f"Could not resolve a snapshot directory from {latest_path}")
330
+
331
+
332
def _validate_analysis_manifest(
    payload: dict[str, Any],
    *,
    require_archived_artifacts: bool,
) -> dict[str, Any]:
    """Run the checks shared by current and archived analysis manifests.

    Returns a copy with string keys, stripped required string fields and a
    validated ``artifacts`` mapping. When ``require_archived_artifacts`` is
    true, ``artifacts`` must live under the current-analysis directory and
    ``archived_artifacts`` must be an object (copied with string keys);
    otherwise ``artifacts`` must live under the manifest's analysis-run path.

    Raises:
        ValueError: On an unsupported schema version or any malformed field.
    """
    manifest = {str(key): value for key, value in payload.items()}

    schema_version = manifest.get("schema_version")
    if schema_version != ANALYSIS_MANIFEST_SCHEMA_VERSION:
        raise ValueError(
            f"Unsupported analysis manifest schema version: {manifest.get('schema_version')!r}"
        )

    required_fields = ("repo", "snapshot_id", "analysis_id", "variant", "channel", "published_at")
    for field in required_fields:
        raw = manifest.get(field)
        cleaned = raw.strip() if isinstance(raw, str) else ""
        if not cleaned:
            raise ValueError(f"Analysis manifest field {field!r} must be a non-empty string.")
        manifest[field] = cleaned

    model = manifest.get("model")
    if not (model is None or isinstance(model, str)):
        raise ValueError("Analysis manifest field 'model' must be a string when present.")

    artifacts = manifest.get("artifacts")
    if not isinstance(artifacts, dict):
        raise ValueError("Analysis manifest field 'artifacts' must be an object.")

    if require_archived_artifacts:
        expected_prefix = current_analysis_artifact_path("")
    else:
        expected_prefix = analysis_run_artifact_path(
            str(manifest["snapshot_id"]),
            str(manifest["analysis_id"]),
            "",
        )
    manifest["artifacts"] = _validate_artifacts(dict(artifacts), expected_prefix=expected_prefix)

    if require_archived_artifacts:
        archived = manifest.get("archived_artifacts")
        if not isinstance(archived, dict):
            raise ValueError(
                "Current analysis manifest field 'archived_artifacts' must be an object."
            )
        manifest["archived_artifacts"] = {str(key): value for key, value in archived.items()}
    return manifest
372
+
373
+
374
+ def _validate_artifacts(artifacts: dict[str, Any], *, expected_prefix: str) -> dict[str, str]:
375
+ normalized = {str(key): value for key, value in artifacts.items()}
376
+ hybrid_path = normalized.get("hybrid")
377
+ if not isinstance(hybrid_path, str) or not hybrid_path:
378
+ raise ValueError("Analysis manifest must include artifacts.hybrid.")
379
+ validated = {"hybrid": hybrid_path}
380
+ hybrid_reviews_path = normalized.get("hybrid_reviews")
381
+ if hybrid_reviews_path is not None:
382
+ if not isinstance(hybrid_reviews_path, str) or not hybrid_reviews_path:
383
+ raise ValueError(
384
+ "Analysis manifest artifacts.hybrid_reviews must be a non-empty string."
385
+ )
386
+ validated["hybrid_reviews"] = hybrid_reviews_path
387
+ for key, value in validated.items():
388
+ if not value.startswith(expected_prefix):
389
+ raise ValueError(
390
+ f"Analysis manifest artifact {key!r} must live under {expected_prefix!r}, got {value!r}."
391
+ )
392
+ return validated
393
+
394
+
395
+ def _path_key(value: str) -> str:
396
+ normalized = re.sub(r"[^a-z0-9]+", "-", value.strip().lower())
397
+ normalized = re.sub(r"-+", "-", normalized).strip("-")
398
+ if not normalized:
399
+ raise ValueError("Expected a non-empty path key value.")
400
+ return normalized
401
+
402
+
403
def _analysis_matches_snapshot(
    snapshot_dir: Path,
    analysis_path: ResolvedAnalysisReportPath,
) -> bool:
    """Report whether ``analysis_path`` was produced for the snapshot in ``snapshot_dir``.

    The snapshot id comes from the directory's manifest when it records one;
    otherwise the directory name is used as the id.
    """
    expected_id = str(analysis_path.snapshot_id)
    manifest_file = snapshot_dir / ROOT_MANIFEST_FILENAME
    if manifest_file.exists():
        recorded_id = read_json(manifest_file).get("snapshot_id")
        if recorded_id is not None:
            return str(recorded_id) == expected_id
    # No manifest, or a manifest without an id: compare against the directory name.
    return snapshot_dir.name == expected_id
414
+
415
+
416
+ def _normalize_analysis_variant(variant: str) -> str:
417
+ normalized = variant.strip().lower()
418
+ if normalized not in {"auto", "deterministic", "hybrid"}:
419
+ raise ValueError(
420
+ f"Unsupported analysis variant {variant!r}; expected auto, hybrid, or deterministic."
421
+ )
422
+ return normalized
423
+
424
+
425
+ def _analysis_artifact_key_for_variant(variant: str, *, manifest_kind: str) -> str:
426
+ if variant in {"auto", "hybrid"}:
427
+ return "hybrid"
428
+ raise ValueError(
429
+ f"Published {manifest_kind} analysis only serves canonical hybrid artifacts; requested {variant!r}."
430
+ )
src/slop_farmer/reports/analysis.py CHANGED
@@ -19,15 +19,12 @@ from rank_bm25 import BM25Okapi
19
  from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
20
  from slop_farmer.data.links import build_text_link_rows
21
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
22
- from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
23
- from slop_farmer.data.snapshot_paths import (
24
- default_hf_materialize_dir,
25
- resolve_snapshot_dir_from_output,
26
- )
27
  from slop_farmer.reports.analysis_cache import (
28
  HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
29
  PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
30
  HybridReviewCacheEntry,
 
31
  HybridReviewCacheManifest,
32
  HybridReviewCacheStore,
33
  HybridReviewSettingsFingerprint,
@@ -89,12 +86,12 @@ LLM_PROVIDER_ENV_VARS = (
89
  "DEEPSEEK_API_KEY",
90
  )
91
  LLM_PACKET_CHARS_PER_TOKEN = 4
92
- LLM_MAX_INPUT_TOKENS = 12_000
93
- LLM_MAX_NODES_PER_PACKET = 18
94
- LLM_MAX_SOFT_PAIRS_PER_PACKET = 24
95
- LLM_MAX_DIFF_CHARS_PER_ITEM = 400
96
- LLM_MAX_FILENAMES_PER_ITEM = 8
97
- LLM_SKIP_EVALUATOR_ABOVE_TOKENS = 6_000
98
  LLM_OVERFLOW_POLICY = "truncate_then_skip"
99
  LLM_SHARED_TARGET_MAX_NEIGHBORS_PER_PR = 3
100
  LLM_SHARED_TARGET_MAX_EXTRA_PAIRS_PER_TARGET = 18
@@ -311,6 +308,42 @@ class AnalysisBuildResult:
311
  llm_reviews: list[dict[str, Any]]
312
 
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  def _hybrid_review_cache_manifest() -> HybridReviewCacheManifest:
315
  return HybridReviewCacheManifest(
316
  cache_schema_version=HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
@@ -766,18 +799,14 @@ def _artifact_suffix(row: dict[str, Any] | None, kind: str) -> str:
766
 
767
 
768
  def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
769
- if options.hf_repo_id:
770
- materialize_dir = options.hf_materialize_dir or default_hf_materialize_dir(
771
- options.output_dir,
772
- options.hf_repo_id,
773
- options.hf_revision,
774
- )
775
- return materialize_hf_dataset_snapshot(
776
- repo_id=options.hf_repo_id,
777
- local_dir=materialize_dir,
778
- revision=options.hf_revision,
779
- ).resolve()
780
- return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
781
 
782
 
783
  def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
@@ -927,39 +956,46 @@ async def _build_report(snapshot: SnapshotData, options: AnalysisOptions) -> Ana
927
  review_comment_map=review_comment_map,
928
  )
929
  issue_soft_candidates = _issue_soft_candidates(issue_map, features, issue_hard_pairs)
930
- accepted_issue_pairs, issue_llm_enabled, issue_llm_reviews = await _accepted_soft_pairs(
931
  options=options,
932
  snapshot=snapshot,
 
 
933
  features=features,
934
- hard_pairs=issue_hard_pairs,
935
- soft_candidates=issue_soft_candidates,
936
- label="issue",
937
- hybrid_review_cache=hybrid_review_cache,
938
- llm_available=llm_available,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
939
  )
940
  issue_pairs = dict(issue_hard_pairs)
941
  for pair, detail in accepted_issue_pairs.items():
942
  issue_pairs.setdefault(pair, set()).update(
943
  detail.get("evidence_types") or {"soft_similarity"}
944
  )
945
-
946
- pr_soft_candidates, pr_pair_target_issues = _pr_duplicate_candidates(
947
- options=options,
948
- snapshot=snapshot,
949
- issue_map=issue_map,
950
- pr_map=pr_map,
951
- features=features,
952
- )
953
- accepted_pr_pairs, pr_llm_enabled, pr_llm_reviews = await _accepted_soft_pairs(
954
- options=options,
955
- snapshot=snapshot,
956
- features=features,
957
- hard_pairs={},
958
- soft_candidates=pr_soft_candidates,
959
- label="pull_request",
960
- hybrid_review_cache=hybrid_review_cache,
961
- llm_available=llm_available,
962
- )
963
  pr_pairs: dict[tuple[str, str], set[str]] = {}
964
  for pair, detail in accepted_pr_pairs.items():
965
  pr_pairs.setdefault(pair, set()).update(detail.get("evidence_types") or {"soft_similarity"})
@@ -1873,28 +1909,21 @@ def _review_subpacket(packet: dict[str, Any], soft_pairs: list[dict[str, Any]])
1873
  }
1874
 
1875
 
1876
- def _split_packet_for_review(packet: dict[str, Any]) -> list[dict[str, Any]]:
1877
- if (
1878
- len(packet["nodes"]) <= LLM_MAX_NODES_PER_PACKET
1879
- and len(packet["soft_pairs"]) <= LLM_MAX_SOFT_PAIRS_PER_PACKET
1880
- ):
1881
  return [packet]
1882
  batches: list[list[dict[str, Any]]] = []
1883
  current_batch: list[dict[str, Any]] = []
1884
- current_nodes: set[str] = set()
1885
  for soft_pair in sorted(packet["soft_pairs"], key=_soft_pair_review_sort_key):
1886
- pair_nodes = {str(soft_pair["left"]), str(soft_pair["right"])}
1887
- next_nodes = current_nodes | pair_nodes
1888
- if current_batch and (
1889
- len(current_batch) >= LLM_MAX_SOFT_PAIRS_PER_PACKET
1890
- or len(next_nodes) > LLM_MAX_NODES_PER_PACKET
1891
- ):
1892
  batches.append(current_batch)
1893
  current_batch = [soft_pair]
1894
- current_nodes = set(pair_nodes)
1895
  continue
1896
- current_batch.append(soft_pair)
1897
- current_nodes = next_nodes
1898
  if current_batch:
1899
  batches.append(current_batch)
1900
  return [_review_subpacket(packet, batch) for batch in batches]
@@ -1985,7 +2014,8 @@ def _should_run_evaluator(
1985
  aggressively_trimmed: bool,
1986
  analyst_result: ClusterAnalystResponse,
1987
  ) -> bool:
1988
- if split or aggressively_trimmed:
 
1989
  return False
1990
  if budget.estimated_eval_tokens > LLM_SKIP_EVALUATOR_ABOVE_TOKENS:
1991
  return False
@@ -2020,6 +2050,166 @@ def _packet_soft_pair_ids(packet: dict[str, Any]) -> list[str]:
2020
  ]
2021
 
2022
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2023
  async def _accepted_soft_pairs(
2024
  *,
2025
  options: AnalysisOptions,
@@ -2030,6 +2220,7 @@ async def _accepted_soft_pairs(
2030
  label: str,
2031
  hybrid_review_cache: HybridReviewCacheStore,
2032
  llm_available: bool,
 
2033
  ) -> tuple[dict[tuple[str, str], dict[str, Any]], bool, list[dict[str, Any]]]:
2034
  del snapshot
2035
  if not soft_candidates:
@@ -2048,6 +2239,8 @@ async def _accepted_soft_pairs(
2048
  for pair in soft_candidates:
2049
  candidate_graph.setdefault(pair, set()).add("soft_similarity")
2050
  component_payloads = _component_packets(features, candidate_graph, soft_candidates)
 
 
2051
  accepted: dict[tuple[str, str], dict[str, Any]] = dict(deterministic_accepts)
2052
  llm_used = False
2053
  review_records: list[dict[str, Any]] = []
@@ -2055,7 +2248,7 @@ async def _accepted_soft_pairs(
2055
  for index, payload in enumerate(component_payloads, start=1):
2056
  component_budget = _estimate_packet_size(payload, options.model)
2057
  cluster_id = _cluster_id_from_nodes(payload["nodes"])
2058
- review_units = _split_packet_for_review(payload)
2059
  if len(review_units) > 1:
2060
  _analysis_log(
2061
  f"LLM {label} soft-edge review {index}/{total_components}: "
@@ -2064,10 +2257,6 @@ async def _accepted_soft_pairs(
2064
  f"est_tokens={component_budget.estimated_input_tokens})"
2065
  )
2066
  for unit_index, review_unit in enumerate(review_units, start=1):
2067
- prefix = (
2068
- f"LLM {label} soft-edge review {index}/{total_components}"
2069
- f" unit {unit_index}/{len(review_units)}"
2070
- )
2071
  prepared = _prepare_packet_for_llm(
2072
  review_unit,
2073
  options.model,
@@ -2075,41 +2264,29 @@ async def _accepted_soft_pairs(
2075
  )
2076
  if prepared is None:
2077
  unit_budget = _estimate_packet_size(review_unit, options.model)
2078
- _analysis_log(
2079
- f"{prefix}: skipped over-budget packet "
2080
- f"(nodes={unit_budget.node_count}, soft_pairs={unit_budget.soft_pair_count}, "
2081
- f"est_tokens={unit_budget.estimated_input_tokens}, overflow_policy={LLM_OVERFLOW_POLICY})"
2082
- )
2083
- review_records.append(
2084
- {
2085
- "label": label,
2086
- "component_index": index,
2087
- "component_count": total_components,
2088
- "review_unit_index": unit_index,
2089
- "review_unit_count": len(review_units),
2090
- "status": "skipped",
2091
- "reason": "over_budget_after_truncate",
2092
- "source": None,
2093
- "cache_hit": False,
2094
- "model": options.model,
2095
- "cluster_id": cluster_id,
2096
- "nodes": list(review_unit["nodes"]),
2097
- "soft_pairs": _packet_soft_pair_ids(review_unit),
2098
- "prepared_review_unit_hash": None,
2099
- "component_budget": _packet_budget_json(component_budget),
2100
- "budget": _packet_budget_json(unit_budget),
2101
- "overflow_policy": LLM_OVERFLOW_POLICY,
2102
- "trimmed": True,
2103
- "aggressively_trimmed": True,
2104
- "split": len(review_units) > 1,
2105
- "analyst_result": None,
2106
- "evaluator_result": None,
2107
- "evaluator_used": False,
2108
- "retried": False,
2109
- "accepted_nontrivial_soft_edge": False,
2110
- "error_kind": None,
2111
- "error_message": None,
2112
- }
2113
  )
2114
  continue
2115
  prepared_review_unit = _prepared_review_unit_payload(prepared)
@@ -2118,85 +2295,113 @@ async def _accepted_soft_pairs(
2118
  model=options.model,
2119
  prepared_review_unit=prepared_review_unit,
2120
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2121
  cached_entry = hybrid_review_cache.get(cache_key)
2122
- cache_hit = cached_entry is not None
2123
  if cached_entry is not None:
2124
- result = _cluster_analysis_call_result_from_payload(cached_entry.result)
2125
- _analysis_log(
2126
- f"{prefix}: cache hit "
2127
- f"(nodes={prepared.budget.node_count}, soft_pairs={prepared.budget.soft_pair_count}, "
2128
- f"est_tokens={prepared.budget.estimated_input_tokens}, model={options.model})"
2129
- )
2130
- else:
2131
- if not llm_available:
2132
- _analysis_log(
2133
- f"{prefix}: cache miss with fast-agent unavailable; "
2134
- "keeping deterministic-only soft edges"
 
2135
  )
2136
- review_records.append(
2137
- {
2138
- "label": label,
2139
- "component_index": index,
2140
- "component_count": total_components,
2141
- "review_unit_index": unit_index,
2142
- "review_unit_count": len(review_units),
2143
- "status": "skipped",
2144
- "reason": "llm_unavailable_cache_miss",
2145
- "source": None,
2146
- "cache_hit": False,
2147
- "model": options.model,
2148
- "cluster_id": cluster_id,
2149
- "nodes": list(prepared.packet["nodes"]),
2150
- "soft_pairs": _packet_soft_pair_ids(prepared.packet),
2151
- "prepared_review_unit_hash": cache_key.prepared_review_unit_hash,
2152
- "component_budget": _packet_budget_json(component_budget),
2153
- "budget": _packet_budget_json(prepared.budget),
2154
- "overflow_policy": LLM_OVERFLOW_POLICY,
2155
- "trimmed": prepared.trimmed,
2156
- "aggressively_trimmed": prepared.aggressively_trimmed,
2157
- "split": prepared.split,
2158
- "analyst_result": None,
2159
- "evaluator_result": None,
2160
- "evaluator_used": False,
2161
- "retried": False,
2162
- "accepted_nontrivial_soft_edge": False,
2163
- "error_kind": None,
2164
- "error_message": None,
2165
- }
2166
  )
2167
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2168
  _analysis_log(
2169
- f"{prefix}: {prepared.budget.node_count} nodes, {prepared.budget.soft_pair_count} soft pairs, "
2170
- f"est_tokens={prepared.budget.estimated_input_tokens}, model={options.model}"
2171
- f"{' trimmed' if prepared.trimmed else ''}"
2172
- f"{' split' if prepared.split else ''}"
2173
  )
2174
- result = await _fast_agent_cluster_analysis(prepared, options.model)
2175
- if _cacheable_cluster_analysis_result(result):
2176
- hybrid_review_cache.put(
2177
- HybridReviewCacheEntry(
2178
- key=cache_key,
2179
- result=_cluster_analysis_call_result_payload(result),
2180
- cached_at=_iso_now(),
2181
- nodes=tuple(prepared.packet["nodes"]),
2182
- soft_pairs=tuple(_packet_soft_pair_ids(prepared.packet)),
2183
- budget=_packet_budget_json(prepared.budget),
2184
- split=prepared.split,
2185
- trimmed=prepared.trimmed,
2186
- aggressively_trimmed=prepared.aggressively_trimmed,
2187
- )
2188
- )
2189
- accepted_nontrivial = False
2190
- if result.analyst_result is None:
2191
- if result.error_kind is not None:
2192
  _analysis_log(
2193
- f"{prefix}: {result.error_kind}"
2194
- f" (nodes={prepared.budget.node_count}, soft_pairs={prepared.budget.soft_pair_count}, "
2195
- f"est_tokens={prepared.budget.estimated_input_tokens}, "
2196
  f"overflow_policy={LLM_OVERFLOW_POLICY})"
2197
  )
2198
  else:
2199
- _analysis_log(f"{prefix}: no result")
2200
  else:
2201
  llm_used = True
2202
  verdicts = {
@@ -2205,18 +2410,28 @@ async def _accepted_soft_pairs(
2205
  }
2206
  accepted_count = sum(1 for verdict in verdicts.values() if verdict.accept)
2207
  rejected_count = sum(1 for verdict in verdicts.values() if not verdict.accept)
2208
- accepted_nontrivial = _accepted_nontrivial_soft_edge(
2209
- prepared.packet, result.analyst_result
 
 
 
 
 
 
 
 
2210
  )
2211
  evaluator_status = "used" if result.evaluator_used else "skipped"
2212
  _analysis_log(
2213
- f"{prefix}: {accepted_count} accepted, {rejected_count} rejected, "
2214
- f"evaluator={evaluator_status}, source={'cache' if cache_hit else 'llm'}"
2215
  )
2216
  if result.error_kind is not None:
2217
- _analysis_log(f"{prefix}: {result.error_kind}; keeping analyst result")
2218
- for pair in prepared.packet["soft_pairs"]:
2219
- normalized_pair = _pair_key(str(pair["left"]), str(pair["right"]))
 
 
2220
  verdict = verdicts.get(normalized_pair)
2221
  if verdict is None:
2222
  continue
@@ -2224,45 +2439,31 @@ async def _accepted_soft_pairs(
2224
  accepted[normalized_pair] = soft_candidates[normalized_pair]
2225
  else:
2226
  accepted.pop(normalized_pair, None)
2227
- review_records.append(
2228
- {
2229
- "label": label,
2230
- "component_index": index,
2231
- "component_count": total_components,
2232
- "review_unit_index": unit_index,
2233
- "review_unit_count": len(review_units),
2234
- "status": "reviewed" if result.analyst_result is not None else "error",
2235
- "reason": None,
2236
- "source": "cache" if cache_hit else "llm",
2237
- "cache_hit": cache_hit,
2238
- "model": options.model,
2239
- "cluster_id": cluster_id,
2240
- "nodes": list(prepared.packet["nodes"]),
2241
- "soft_pairs": _packet_soft_pair_ids(prepared.packet),
2242
- "prepared_review_unit_hash": cache_key.prepared_review_unit_hash,
2243
- "component_budget": _packet_budget_json(component_budget),
2244
- "budget": _packet_budget_json(prepared.budget),
2245
- "overflow_policy": LLM_OVERFLOW_POLICY,
2246
- "trimmed": prepared.trimmed,
2247
- "aggressively_trimmed": prepared.aggressively_trimmed,
2248
- "split": prepared.split,
2249
- "analyst_result": (
2250
- None
2251
- if result.analyst_result is None
2252
- else result.analyst_result.model_dump(mode="json")
2253
- ),
2254
- "evaluator_result": (
2255
- None
2256
- if result.evaluator_result is None
2257
- else result.evaluator_result.model_dump(mode="json")
2258
- ),
2259
- "evaluator_used": result.evaluator_used,
2260
- "retried": result.retried,
2261
- "accepted_nontrivial_soft_edge": accepted_nontrivial,
2262
- "error_kind": result.error_kind,
2263
- "error_message": result.error_message,
2264
- }
2265
  )
 
2266
  return accepted, llm_used, review_records
2267
 
2268
 
 
19
  from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
20
  from slop_farmer.data.links import build_text_link_rows
21
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
22
+ from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 
 
23
  from slop_farmer.reports.analysis_cache import (
24
  HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
25
  PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
26
  HybridReviewCacheEntry,
27
+ HybridReviewCacheKey,
28
  HybridReviewCacheManifest,
29
  HybridReviewCacheStore,
30
  HybridReviewSettingsFingerprint,
 
86
  "DEEPSEEK_API_KEY",
87
  )
88
  LLM_PACKET_CHARS_PER_TOKEN = 4
89
+ LLM_MAX_INPUT_TOKENS = 60_000
90
+ LLM_MAX_NODES_PER_PACKET = 48
91
+ LLM_MAX_SOFT_PAIRS_PER_PACKET = 72
92
+ LLM_MAX_DIFF_CHARS_PER_ITEM = 1_200
93
+ LLM_MAX_FILENAMES_PER_ITEM = 16
94
+ LLM_SKIP_EVALUATOR_ABOVE_TOKENS = 60_000
95
  LLM_OVERFLOW_POLICY = "truncate_then_skip"
96
  LLM_SHARED_TARGET_MAX_NEIGHBORS_PER_PR = 3
97
  LLM_SHARED_TARGET_MAX_EXTRA_PAIRS_PER_TARGET = 18
 
308
  llm_reviews: list[dict[str, Any]]
309
 
310
 
311
@dataclass(frozen=True, slots=True)
class SoftPairReviewUnitMeta:
    """Immutable bookkeeping for a single soft-pair review unit.

    Holds the identifiers, budgets and trimming flags that are used for log
    lines and copied into the serialized review record of the unit.
    """

    # Review label used in log prefixes and records.
    label: str
    # Position of the component among all components, plus the total.
    component_index: int
    component_count: int
    # Position of this unit among the units of its component, plus the total.
    review_unit_index: int
    review_unit_count: int
    cluster_id: str
    # Pre-formatted log prefix identifying the component and unit.
    prefix: str
    # Node ids of the unit's packet and the ids of its soft pairs.
    nodes: tuple[str, ...]
    soft_pairs: tuple[str, ...]
    # Size estimate for the whole component and for this unit.
    component_budget: PacketBudget
    budget: PacketBudget
    # None when no prepared packet (and therefore no cache key) exists.
    prepared_review_unit_hash: str | None
    trimmed: bool
    aggressively_trimmed: bool
    split: bool
328
+
329
+
330
@dataclass(frozen=True, slots=True)
class PendingSoftPairReview:
    """A review unit that still needs an LLM call.

    Bundles the unit's metadata with the prepared packet to send and the
    cache key under which a result can be stored afterwards.
    """

    meta: SoftPairReviewUnitMeta
    # Packet as prepared for the LLM.
    prepared: PreparedLlmPacket
    # Key used when writing the result to the hybrid review cache.
    cache_key: HybridReviewCacheKey
335
+
336
+
337
@dataclass(frozen=True, slots=True)
class CompletedSoftPairReview:
    """Outcome of one review unit, whether reviewed, failed or skipped."""

    meta: SoftPairReviewUnitMeta
    # None when the unit was skipped without any analysis result.
    result: ClusterAnalysisCallResult | None
    # Values used by this module: "reviewed", "error" or "skipped".
    status: str
    # Skip reason such as "over_budget_after_truncate"; None otherwise.
    reason: str | None
    # Values used by this module: "llm", "cache", or None for skipped units.
    source: str | None
    cache_hit: bool
345
+
346
+
347
  def _hybrid_review_cache_manifest() -> HybridReviewCacheManifest:
348
  return HybridReviewCacheManifest(
349
  cache_schema_version=HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
 
799
 
800
 
801
def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
    """Resolve the snapshot directory to analyze.

    Delegates to ``resolve_snapshot_source_dir``, passing the explicit
    snapshot directory, the ``snapshots`` folder under the resolved output
    directory, and the Hugging Face repo/revision/materialize settings taken
    from ``options``.
    """
    snapshots_root = options.output_dir.resolve() / "snapshots"
    return resolve_snapshot_source_dir(
        snapshot_dir=options.snapshot_dir,
        local_snapshots_root=snapshots_root,
        hf_repo_id=options.hf_repo_id,
        hf_revision=options.hf_revision,
        hf_materialize_dir=options.hf_materialize_dir,
        hf_output_dir=options.output_dir,
    )
 
 
 
 
810
 
811
 
812
  def _load_snapshot(snapshot_dir: Path) -> SnapshotData:
 
956
  review_comment_map=review_comment_map,
957
  )
958
  issue_soft_candidates = _issue_soft_candidates(issue_map, features, issue_hard_pairs)
959
+ pr_soft_candidates, pr_pair_target_issues = _pr_duplicate_candidates(
960
  options=options,
961
  snapshot=snapshot,
962
+ issue_map=issue_map,
963
+ pr_map=pr_map,
964
  features=features,
965
+ )
966
+ review_semaphore = asyncio.Semaphore(options.hybrid_llm_concurrency)
967
+ (
968
+ (accepted_issue_pairs, issue_llm_enabled, issue_llm_reviews),
969
+ (accepted_pr_pairs, pr_llm_enabled, pr_llm_reviews),
970
+ ) = await asyncio.gather(
971
+ _accepted_soft_pairs(
972
+ options=options,
973
+ snapshot=snapshot,
974
+ features=features,
975
+ hard_pairs=issue_hard_pairs,
976
+ soft_candidates=issue_soft_candidates,
977
+ label="issue",
978
+ hybrid_review_cache=hybrid_review_cache,
979
+ llm_available=llm_available,
980
+ review_semaphore=review_semaphore,
981
+ ),
982
+ _accepted_soft_pairs(
983
+ options=options,
984
+ snapshot=snapshot,
985
+ features=features,
986
+ hard_pairs={},
987
+ soft_candidates=pr_soft_candidates,
988
+ label="pull_request",
989
+ hybrid_review_cache=hybrid_review_cache,
990
+ llm_available=llm_available,
991
+ review_semaphore=review_semaphore,
992
+ ),
993
  )
994
  issue_pairs = dict(issue_hard_pairs)
995
  for pair, detail in accepted_issue_pairs.items():
996
  issue_pairs.setdefault(pair, set()).update(
997
  detail.get("evidence_types") or {"soft_similarity"}
998
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  pr_pairs: dict[tuple[str, str], set[str]] = {}
1000
  for pair, detail in accepted_pr_pairs.items():
1001
  pr_pairs.setdefault(pair, set()).update(detail.get("evidence_types") or {"soft_similarity"})
 
1909
  }
1910
 
1911
 
1912
def _split_packet_for_review(packet: dict[str, Any], model: str) -> list[dict[str, Any]]:
    """Break a component packet into review units sized by the budget check.

    A packet with no soft pairs, or one that is not over budget, is returned
    unchanged as the only unit. Otherwise soft pairs are visited in review
    order and packed greedily: a pair starts a new unit as soon as adding it
    to the current unit would push that unit over budget. A unit always holds
    at least one pair, even if that single pair is over budget on its own.
    """
    soft_pairs = packet["soft_pairs"]
    if not soft_pairs:
        return [packet]
    if not _packet_over_budget(_estimate_packet_size(packet, model)):
        return [packet]

    groups: list[list[dict[str, Any]]] = []
    pending: list[dict[str, Any]] = []
    for soft_pair in sorted(soft_pairs, key=_soft_pair_review_sort_key):
        grown = [*pending, soft_pair]
        trial_packet = _review_subpacket(packet, grown)
        # Only a non-empty unit can overflow; the first pair is always taken.
        overflow = bool(pending) and _packet_over_budget(
            _estimate_packet_size(trial_packet, model)
        )
        if overflow:
            groups.append(pending)
            pending = [soft_pair]
        else:
            pending = grown
    if pending:
        groups.append(pending)
    return [_review_subpacket(packet, group) for group in groups]
 
2014
  aggressively_trimmed: bool,
2015
  analyst_result: ClusterAnalystResponse,
2016
  ) -> bool:
2017
+ del split
2018
+ if aggressively_trimmed:
2019
  return False
2020
  if budget.estimated_eval_tokens > LLM_SKIP_EVALUATOR_ABOVE_TOKENS:
2021
  return False
 
2050
  ]
2051
 
2052
 
2053
def _soft_pair_review_meta(
    *,
    label: str,
    component_index: int,
    component_count: int,
    review_unit_index: int,
    review_unit_count: int,
    cluster_id: str,
    component_budget: PacketBudget,
    budget: PacketBudget,
    prepared_review_unit_hash: str | None,
    trimmed: bool,
    aggressively_trimmed: bool,
    split: bool,
    packet: dict[str, Any],
) -> SoftPairReviewUnitMeta:
    """Build the metadata record for one review unit.

    Besides copying the given fields, this derives the log prefix from the
    label and the component/unit positions, stringifies the packet's node
    ids, and collects the packet's soft-pair ids.
    """
    position = f"{component_index}/{component_count}"
    unit_position = f"{review_unit_index}/{review_unit_count}"
    node_ids = tuple(str(node) for node in packet["nodes"])
    pair_ids = tuple(_packet_soft_pair_ids(packet))
    return SoftPairReviewUnitMeta(
        label=label,
        component_index=component_index,
        component_count=component_count,
        review_unit_index=review_unit_index,
        review_unit_count=review_unit_count,
        cluster_id=cluster_id,
        prefix=f"LLM {label} soft-edge review {position} unit {unit_position}",
        nodes=node_ids,
        soft_pairs=pair_ids,
        component_budget=component_budget,
        budget=budget,
        prepared_review_unit_hash=prepared_review_unit_hash,
        trimmed=trimmed,
        aggressively_trimmed=aggressively_trimmed,
        split=split,
    )
2090
+
2091
+
2092
def _completed_soft_pair_review_sort_key(review: CompletedSoftPairReview) -> tuple[int, int]:
    """Sort key ordering reviews by component index, then by unit index."""
    meta = review.meta
    return meta.component_index, meta.review_unit_index
2097
+
2098
+
2099
def _soft_pair_review_record(
    *,
    review: CompletedSoftPairReview,
    model: str,
    accepted_nontrivial_soft_edge: bool,
) -> dict[str, Any]:
    """Serialize a completed review into its plain-dict record.

    Fields that come from the analysis result fall back to ``None`` (or
    ``False`` for the boolean flags) when the review carries no result.
    """
    meta = review.meta
    outcome = review.result
    has_outcome = outcome is not None
    analyst = outcome.analyst_result if has_outcome else None
    evaluator = outcome.evaluator_result if has_outcome else None
    return {
        "label": meta.label,
        "component_index": meta.component_index,
        "component_count": meta.component_count,
        "review_unit_index": meta.review_unit_index,
        "review_unit_count": meta.review_unit_count,
        "status": review.status,
        "reason": review.reason,
        "source": review.source,
        "cache_hit": review.cache_hit,
        "model": model,
        "cluster_id": meta.cluster_id,
        "nodes": list(meta.nodes),
        "soft_pairs": list(meta.soft_pairs),
        "prepared_review_unit_hash": meta.prepared_review_unit_hash,
        "component_budget": _packet_budget_json(meta.component_budget),
        "budget": _packet_budget_json(meta.budget),
        "overflow_policy": LLM_OVERFLOW_POLICY,
        "trimmed": meta.trimmed,
        "aggressively_trimmed": meta.aggressively_trimmed,
        "split": meta.split,
        "analyst_result": None if analyst is None else analyst.model_dump(mode="json"),
        "evaluator_result": None if evaluator is None else evaluator.model_dump(mode="json"),
        "evaluator_used": outcome.evaluator_used if has_outcome else False,
        "retried": outcome.retried if has_outcome else False,
        "accepted_nontrivial_soft_edge": accepted_nontrivial_soft_edge,
        "error_kind": outcome.error_kind if has_outcome else None,
        "error_message": outcome.error_message if has_outcome else None,
    }
2143
+
2144
+
2145
def _completed_soft_pair_review_from_result(
    pending: PendingSoftPairReview,
    result: ClusterAnalysisCallResult,
) -> CompletedSoftPairReview:
    """Wrap a fresh LLM result for ``pending`` as a completed review.

    The review is marked ``"error"`` when the result has no analyst output
    and ``"reviewed"`` otherwise; the source is always ``"llm"`` and it never
    counts as a cache hit.
    """
    outcome_status = "error" if result.analyst_result is None else "reviewed"
    return CompletedSoftPairReview(
        meta=pending.meta,
        result=result,
        status=outcome_status,
        reason=None,
        source="llm",
        cache_hit=False,
    )
2157
+
2158
+
2159
async def _run_pending_soft_pair_review(
    pending: PendingSoftPairReview,
    *,
    model: str,
    review_semaphore: asyncio.Semaphore,
) -> CompletedSoftPairReview:
    """Run the LLM analysis for one pending unit while holding the semaphore.

    Any exception raised by the analysis call is converted into an error
    result (classified kind plus summarized message) instead of propagating,
    so one failing unit does not abort the others.
    """
    async with review_semaphore:
        try:
            outcome = await _fast_agent_cluster_analysis(pending.prepared, model)
        except Exception as exc:
            # Deliberately broad: the failure is recorded on the result instead.
            outcome = ClusterAnalysisCallResult(
                analyst_result=None,
                evaluator_result=None,
                error_kind=_classify_llm_error(exc),
                error_message=_summarize_llm_error(exc),
                evaluator_used=False,
                retried=False,
            )
    return _completed_soft_pair_review_from_result(pending, outcome)
2178
+
2179
+
2180
async def _run_pending_soft_pair_reviews(
    pending_reviews: list[PendingSoftPairReview],
    *,
    concurrency: int,
    model: str,
    review_semaphore: asyncio.Semaphore,
) -> list[CompletedSoftPairReview]:
    """Run every pending review and return the results in input order.

    With ``concurrency`` of 1 or less the units are awaited one after
    another. Otherwise each unit gets its own task and the tasks are
    gathered; ``review_semaphore`` is handed to every unit run.
    """
    if not pending_reviews:
        return []
    if concurrency > 1:
        scheduled = [
            asyncio.create_task(
                _run_pending_soft_pair_review(
                    item,
                    model=model,
                    review_semaphore=review_semaphore,
                )
            )
            for item in pending_reviews
        ]
        return await asyncio.gather(*scheduled)
    # Sequential path: each unit finishes before the next one starts.
    return [
        await _run_pending_soft_pair_review(
            item,
            model=model,
            review_semaphore=review_semaphore,
        )
        for item in pending_reviews
    ]
2211
+
2212
+
2213
  async def _accepted_soft_pairs(
2214
  *,
2215
  options: AnalysisOptions,
 
2220
  label: str,
2221
  hybrid_review_cache: HybridReviewCacheStore,
2222
  llm_available: bool,
2223
+ review_semaphore: asyncio.Semaphore,
2224
  ) -> tuple[dict[tuple[str, str], dict[str, Any]], bool, list[dict[str, Any]]]:
2225
  del snapshot
2226
  if not soft_candidates:
 
2239
  for pair in soft_candidates:
2240
  candidate_graph.setdefault(pair, set()).add("soft_similarity")
2241
  component_payloads = _component_packets(features, candidate_graph, soft_candidates)
2242
+ pending_reviews: list[PendingSoftPairReview] = []
2243
+ completed_reviews: list[CompletedSoftPairReview] = []
2244
  accepted: dict[tuple[str, str], dict[str, Any]] = dict(deterministic_accepts)
2245
  llm_used = False
2246
  review_records: list[dict[str, Any]] = []
 
2248
  for index, payload in enumerate(component_payloads, start=1):
2249
  component_budget = _estimate_packet_size(payload, options.model)
2250
  cluster_id = _cluster_id_from_nodes(payload["nodes"])
2251
+ review_units = _split_packet_for_review(payload, options.model)
2252
  if len(review_units) > 1:
2253
  _analysis_log(
2254
  f"LLM {label} soft-edge review {index}/{total_components}: "
 
2257
  f"est_tokens={component_budget.estimated_input_tokens})"
2258
  )
2259
  for unit_index, review_unit in enumerate(review_units, start=1):
 
 
 
 
2260
  prepared = _prepare_packet_for_llm(
2261
  review_unit,
2262
  options.model,
 
2264
  )
2265
  if prepared is None:
2266
  unit_budget = _estimate_packet_size(review_unit, options.model)
2267
+ completed_reviews.append(
2268
+ CompletedSoftPairReview(
2269
+ meta=_soft_pair_review_meta(
2270
+ label=label,
2271
+ component_index=index,
2272
+ component_count=total_components,
2273
+ review_unit_index=unit_index,
2274
+ review_unit_count=len(review_units),
2275
+ cluster_id=cluster_id,
2276
+ component_budget=component_budget,
2277
+ budget=unit_budget,
2278
+ prepared_review_unit_hash=None,
2279
+ trimmed=True,
2280
+ aggressively_trimmed=True,
2281
+ split=len(review_units) > 1,
2282
+ packet=review_unit,
2283
+ ),
2284
+ result=None,
2285
+ status="skipped",
2286
+ reason="over_budget_after_truncate",
2287
+ source=None,
2288
+ cache_hit=False,
2289
+ )
 
 
 
 
 
 
 
 
 
 
 
 
2290
  )
2291
  continue
2292
  prepared_review_unit = _prepared_review_unit_payload(prepared)
 
2295
  model=options.model,
2296
  prepared_review_unit=prepared_review_unit,
2297
  )
2298
+ meta = _soft_pair_review_meta(
2299
+ label=label,
2300
+ component_index=index,
2301
+ component_count=total_components,
2302
+ review_unit_index=unit_index,
2303
+ review_unit_count=len(review_units),
2304
+ cluster_id=cluster_id,
2305
+ component_budget=component_budget,
2306
+ budget=prepared.budget,
2307
+ prepared_review_unit_hash=cache_key.prepared_review_unit_hash,
2308
+ trimmed=prepared.trimmed,
2309
+ aggressively_trimmed=prepared.aggressively_trimmed,
2310
+ split=prepared.split,
2311
+ packet=prepared.packet,
2312
+ )
2313
  cached_entry = hybrid_review_cache.get(cache_key)
 
2314
  if cached_entry is not None:
2315
+ completed_reviews.append(
2316
+ CompletedSoftPairReview(
2317
+ meta=meta,
2318
+ result=_cluster_analysis_call_result_from_payload(cached_entry.result),
2319
+ status=(
2320
+ "reviewed"
2321
+ if cached_entry.result.get("analyst_result") is not None
2322
+ else "error"
2323
+ ),
2324
+ reason=None,
2325
+ source="cache",
2326
+ cache_hit=True,
2327
  )
2328
+ )
2329
+ continue
2330
+ if not llm_available:
2331
+ completed_reviews.append(
2332
+ CompletedSoftPairReview(
2333
+ meta=meta,
2334
+ result=None,
2335
+ status="skipped",
2336
+ reason="llm_unavailable_cache_miss",
2337
+ source=None,
2338
+ cache_hit=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2339
  )
2340
+ )
2341
+ continue
2342
+ pending_reviews.append(
2343
+ PendingSoftPairReview(
2344
+ meta=meta,
2345
+ prepared=prepared,
2346
+ cache_key=cache_key,
2347
+ )
2348
+ )
2349
+ reviewed_from_cache = sum(1 for review in completed_reviews if review.cache_hit)
2350
+ skipped_reviews = sum(1 for review in completed_reviews if review.status == "skipped")
2351
+ _analysis_log(
2352
+ f"LLM {label} soft-edge review scheduling: "
2353
+ f"units={len(pending_reviews) + len(completed_reviews)}, "
2354
+ f"cache_hits={reviewed_from_cache}, "
2355
+ f"cache_misses={len(pending_reviews)}, "
2356
+ f"skipped={skipped_reviews}, "
2357
+ f"concurrency={options.hybrid_llm_concurrency}"
2358
+ )
2359
+ completed_reviews.extend(
2360
+ await _run_pending_soft_pair_reviews(
2361
+ pending_reviews,
2362
+ concurrency=options.hybrid_llm_concurrency,
2363
+ model=options.model,
2364
+ review_semaphore=review_semaphore,
2365
+ )
2366
+ )
2367
+ pending_by_position = {
2368
+ (pending.meta.component_index, pending.meta.review_unit_index): pending
2369
+ for pending in pending_reviews
2370
+ }
2371
+ for review in sorted(completed_reviews, key=_completed_soft_pair_review_sort_key):
2372
+ accepted_nontrivial = False
2373
+ pending = pending_by_position.get(
2374
+ (review.meta.component_index, review.meta.review_unit_index)
2375
+ )
2376
+ result = review.result
2377
+ if review.reason == "over_budget_after_truncate":
2378
+ _analysis_log(
2379
+ f"{review.meta.prefix}: skipped over-budget packet "
2380
+ f"(nodes={review.meta.budget.node_count}, soft_pairs={review.meta.budget.soft_pair_count}, "
2381
+ f"est_tokens={review.meta.budget.estimated_input_tokens}, overflow_policy={LLM_OVERFLOW_POLICY})"
2382
+ )
2383
+ elif review.reason == "llm_unavailable_cache_miss":
2384
+ _analysis_log(
2385
+ f"{review.meta.prefix}: cache miss with fast-agent unavailable; "
2386
+ "keeping deterministic-only soft edges"
2387
+ )
2388
+ else:
2389
+ if review.cache_hit:
2390
  _analysis_log(
2391
+ f"{review.meta.prefix}: cache hit "
2392
+ f"(nodes={review.meta.budget.node_count}, soft_pairs={review.meta.budget.soft_pair_count}, "
2393
+ f"est_tokens={review.meta.budget.estimated_input_tokens}, model={options.model})"
 
2394
  )
2395
+ if result is None or result.analyst_result is None:
2396
+ if result is not None and result.error_kind is not None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2397
  _analysis_log(
2398
+ f"{review.meta.prefix}: {result.error_kind}"
2399
+ f" (nodes={review.meta.budget.node_count}, soft_pairs={review.meta.budget.soft_pair_count}, "
2400
+ f"est_tokens={review.meta.budget.estimated_input_tokens}, "
2401
  f"overflow_policy={LLM_OVERFLOW_POLICY})"
2402
  )
2403
  else:
2404
+ _analysis_log(f"{review.meta.prefix}: no result")
2405
  else:
2406
  llm_used = True
2407
  verdicts = {
 
2410
  }
2411
  accepted_count = sum(1 for verdict in verdicts.values() if verdict.accept)
2412
  rejected_count = sum(1 for verdict in verdicts.values() if not verdict.accept)
2413
+ accepted_nontrivial = any(
2414
+ verdicts.get(_pair_key(*pair_id.split("|", 1))) is not None
2415
+ and verdicts[_pair_key(*pair_id.split("|", 1))].accept
2416
+ and not bool(
2417
+ soft_candidates[_pair_key(*pair_id.split("|", 1))].get(
2418
+ "deterministic_accept",
2419
+ True,
2420
+ )
2421
+ )
2422
+ for pair_id in review.meta.soft_pairs
2423
  )
2424
  evaluator_status = "used" if result.evaluator_used else "skipped"
2425
  _analysis_log(
2426
+ f"{review.meta.prefix}: {accepted_count} accepted, {rejected_count} rejected, "
2427
+ f"evaluator={evaluator_status}, source={review.source}"
2428
  )
2429
  if result.error_kind is not None:
2430
+ _analysis_log(
2431
+ f"{review.meta.prefix}: {result.error_kind}; keeping analyst result"
2432
+ )
2433
+ for pair_id in review.meta.soft_pairs:
2434
+ normalized_pair = _pair_key(*pair_id.split("|", 1))
2435
  verdict = verdicts.get(normalized_pair)
2436
  if verdict is None:
2437
  continue
 
2439
  accepted[normalized_pair] = soft_candidates[normalized_pair]
2440
  else:
2441
  accepted.pop(normalized_pair, None)
2442
+ if (
2443
+ pending is not None
2444
+ and review.source == "llm"
2445
+ and _cacheable_cluster_analysis_result(result)
2446
+ ):
2447
+ hybrid_review_cache.put(
2448
+ HybridReviewCacheEntry(
2449
+ key=pending.cache_key,
2450
+ result=_cluster_analysis_call_result_payload(result),
2451
+ cached_at=_iso_now(),
2452
+ nodes=tuple(pending.prepared.packet["nodes"]),
2453
+ soft_pairs=tuple(_packet_soft_pair_ids(pending.prepared.packet)),
2454
+ budget=_packet_budget_json(pending.prepared.budget),
2455
+ split=pending.prepared.split,
2456
+ trimmed=pending.prepared.trimmed,
2457
+ aggressively_trimmed=pending.prepared.aggressively_trimmed,
2458
+ )
2459
+ )
2460
+ review_records.append(
2461
+ _soft_pair_review_record(
2462
+ review=review,
2463
+ model=options.model,
2464
+ accepted_nontrivial_soft_edge=accepted_nontrivial,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2465
  )
2466
+ )
2467
  return accepted, llm_used, review_records
2468
 
2469
 
src/slop_farmer/reports/analysis_service.py CHANGED
@@ -6,12 +6,16 @@ from typing import Any
6
 
7
  from slop_farmer.data.parquet_io import read_json
8
  from slop_farmer.data.search_duckdb import connect_pr_search_db, resolve_active_run
 
 
 
 
 
 
 
 
9
 
10
  ANALYSIS_VARIANTS = {"auto", "deterministic", "hybrid"}
11
- ANALYSIS_REPORT_FILENAMES = {
12
- "deterministic": "analysis-report.json",
13
- "hybrid": "analysis-report-hybrid.json",
14
- }
15
 
16
 
17
  @dataclass(frozen=True, slots=True)
@@ -28,6 +32,7 @@ class AnalysisContext:
28
  report_source: str
29
  variant_requested: str
30
  variant_used: str
 
31
 
32
 
33
  def get_analysis_status(
@@ -35,14 +40,15 @@ def get_analysis_status(
35
  *,
36
  repo: str | None = None,
37
  variant: str = "auto",
38
- analysis_root: Path | None = None,
 
39
  ) -> dict[str, Any]:
40
  active = _resolve_active_snapshot_context(db_path, repo=repo)
41
- report_path, variant_used, report_source = _resolve_analysis_report_path(
42
  active.snapshot_dir,
43
- str(active.active_run["repo"]),
44
  variant,
45
- analysis_root=analysis_root,
 
46
  required=False,
47
  )
48
  payload = {
@@ -55,7 +61,7 @@ def get_analysis_status(
55
  if report_path is None or variant_used is None or report_source is None:
56
  return payload
57
  report = _load_report(report_path)
58
- return {
59
  **payload,
60
  "snapshot_id": str(report.get("snapshot_id") or active.active_run["snapshot_id"]),
61
  "variant_used": variant_used,
@@ -64,6 +70,9 @@ def get_analysis_status(
64
  "generated_at": report.get("generated_at"),
65
  "counts": _analysis_counts(report),
66
  }
 
 
 
67
 
68
 
69
  def get_pr_analysis(
@@ -72,13 +81,15 @@ def get_pr_analysis(
72
  pr_number: int,
73
  repo: str | None = None,
74
  variant: str = "auto",
75
- analysis_root: Path | None = None,
 
76
  ) -> dict[str, Any]:
77
  context = _load_analysis_context(
78
  db_path,
79
  repo=repo,
80
  variant=variant,
81
- analysis_root=analysis_root,
 
82
  )
83
  meta_bug, rank = _find_meta_bug_for_pr(context.report, pr_number)
84
  duplicate_pr = _find_duplicate_pr_for_pr(context.report, pr_number)
@@ -97,13 +108,15 @@ def list_analysis_meta_bugs(
97
  repo: str | None = None,
98
  variant: str = "auto",
99
  limit: int = 50,
100
- analysis_root: Path | None = None,
 
101
  ) -> dict[str, Any]:
102
  context = _load_analysis_context(
103
  db_path,
104
  repo=repo,
105
  variant=variant,
106
- analysis_root=analysis_root,
 
107
  )
108
  meta_bugs = [
109
  _meta_bug_payload(cluster, rank=index)
@@ -122,13 +135,15 @@ def get_analysis_meta_bug(
122
  cluster_id: str,
123
  repo: str | None = None,
124
  variant: str = "auto",
125
- analysis_root: Path | None = None,
 
126
  ) -> dict[str, Any]:
127
  context = _load_analysis_context(
128
  db_path,
129
  repo=repo,
130
  variant=variant,
131
- analysis_root=analysis_root,
 
132
  )
133
  for index, cluster in enumerate(context.report.get("meta_bugs", []), start=1):
134
  if str(cluster.get("cluster_id")) != cluster_id:
@@ -147,13 +162,15 @@ def list_analysis_duplicate_prs(
147
  repo: str | None = None,
148
  variant: str = "auto",
149
  limit: int = 50,
150
- analysis_root: Path | None = None,
 
151
  ) -> dict[str, Any]:
152
  context = _load_analysis_context(
153
  db_path,
154
  repo=repo,
155
  variant=variant,
156
- analysis_root=analysis_root,
 
157
  )
158
  duplicate_prs = [
159
  {"rank": index, **dict(entry)}
@@ -171,13 +188,15 @@ def get_analysis_best(
171
  *,
172
  repo: str | None = None,
173
  variant: str = "auto",
174
- analysis_root: Path | None = None,
 
175
  ) -> dict[str, Any]:
176
  context = _load_analysis_context(
177
  db_path,
178
  repo=repo,
179
  variant=variant,
180
- analysis_root=analysis_root,
 
181
  )
182
  return {
183
  **_analysis_base_payload(context),
@@ -217,14 +236,15 @@ def _load_analysis_context(
217
  *,
218
  repo: str | None,
219
  variant: str,
220
- analysis_root: Path | None,
 
221
  ) -> AnalysisContext:
222
  active = _resolve_active_snapshot_context(db_path, repo=repo)
223
- report_path, variant_used, report_source = _resolve_analysis_report_path(
224
  active.snapshot_dir,
225
- str(active.active_run["repo"]),
226
  variant,
227
- analysis_root=analysis_root,
 
228
  required=True,
229
  )
230
  assert report_path is not None
@@ -237,59 +257,143 @@ def _load_analysis_context(
237
  report_source=report_source,
238
  variant_requested=_normalize_analysis_variant(variant),
239
  variant_used=variant_used,
 
240
  )
241
 
242
 
243
  def _resolve_analysis_report_path(
244
  snapshot_dir: Path,
245
- repo: str,
246
  variant: str,
247
  *,
248
- analysis_root: Path | None,
 
249
  required: bool,
250
- ) -> tuple[Path | None, str | None, str | None]:
251
  normalized = _normalize_analysis_variant(variant)
252
- candidate_dirs = _candidate_analysis_dirs(
253
- snapshot_dir=snapshot_dir,
254
- repo=repo,
255
- analysis_root=analysis_root,
256
- )
257
- if normalized == "auto":
258
- for source, directory in candidate_dirs:
259
- hybrid_path = directory / ANALYSIS_REPORT_FILENAMES["hybrid"]
260
- if hybrid_path.exists():
261
- return hybrid_path, "hybrid", source
262
- deterministic_path = directory / ANALYSIS_REPORT_FILENAMES["deterministic"]
263
- if deterministic_path.exists():
264
- return deterministic_path, "deterministic", source
265
  if not required:
266
- return None, None, None
267
  raise ValueError(
268
- "No analysis report was found for the current analysis path or active snapshot."
269
  )
270
- for source, directory in candidate_dirs:
271
- report_path = directory / ANALYSIS_REPORT_FILENAMES[normalized]
272
- if report_path.exists():
273
- return report_path, normalized, source
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  if not required:
275
- return None, None, None
276
  raise ValueError(
277
- f"{normalized.capitalize()} analysis report was not found for the current analysis path or active snapshot."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  )
 
279
 
280
 
281
- def _candidate_analysis_dirs(
 
 
282
  *,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  snapshot_dir: Path,
284
- repo: str,
285
- analysis_root: Path | None,
286
- ) -> list[tuple[str, Path]]:
287
- owner, name = repo.split("/", 1)
288
- candidates: list[tuple[str, Path]] = []
289
- if analysis_root is not None:
290
- candidates.append(("current", analysis_root / owner / name / "current"))
291
- candidates.append(("snapshot", snapshot_dir))
292
- return candidates
 
 
 
 
 
 
293
 
294
 
295
  def _normalize_analysis_variant(variant: str) -> str:
@@ -304,7 +408,7 @@ def _normalize_analysis_variant(variant: str) -> str:
304
  def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
305
  active_snapshot_id = str(context.active_run["snapshot_id"])
306
  snapshot_id = str(context.report.get("snapshot_id") or active_snapshot_id)
307
- return {
308
  "repo": str(context.active_run["repo"]),
309
  "snapshot_id": snapshot_id,
310
  "active_snapshot_id": active_snapshot_id,
@@ -315,6 +419,9 @@ def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
315
  "llm_enrichment": bool(context.report.get("llm_enrichment")),
316
  "generated_at": context.report.get("generated_at"),
317
  }
 
 
 
318
 
319
 
320
  def _analysis_counts(report: dict[str, Any]) -> dict[str, int]:
 
6
 
7
  from slop_farmer.data.parquet_io import read_json
8
  from slop_farmer.data.search_duckdb import connect_pr_search_db, resolve_active_run
9
+ from slop_farmer.data.snapshot_paths import (
10
+ ANALYSIS_REPORT_FILENAME_BY_VARIANT,
11
+ CURRENT_ANALYSIS_MANIFEST_PATH,
12
+ analysis_run_manifest_path,
13
+ load_archived_analysis_run_manifest,
14
+ load_current_analysis_manifest,
15
+ repo_relative_path_to_local,
16
+ )
17
 
18
  ANALYSIS_VARIANTS = {"auto", "deterministic", "hybrid"}
 
 
 
 
19
 
20
 
21
  @dataclass(frozen=True, slots=True)
 
32
  report_source: str
33
  variant_requested: str
34
  variant_used: str
35
+ analysis_id: str | None
36
 
37
 
38
  def get_analysis_status(
 
40
  *,
41
  repo: str | None = None,
42
  variant: str = "auto",
43
+ snapshot_id: str | None = None,
44
+ analysis_id: str | None = None,
45
  ) -> dict[str, Any]:
46
  active = _resolve_active_snapshot_context(db_path, repo=repo)
47
+ report_path, variant_used, report_source, resolved_analysis_id = _resolve_analysis_report_path(
48
  active.snapshot_dir,
 
49
  variant,
50
+ snapshot_id=snapshot_id,
51
+ analysis_id=analysis_id,
52
  required=False,
53
  )
54
  payload = {
 
61
  if report_path is None or variant_used is None or report_source is None:
62
  return payload
63
  report = _load_report(report_path)
64
+ status = {
65
  **payload,
66
  "snapshot_id": str(report.get("snapshot_id") or active.active_run["snapshot_id"]),
67
  "variant_used": variant_used,
 
70
  "generated_at": report.get("generated_at"),
71
  "counts": _analysis_counts(report),
72
  }
73
+ if resolved_analysis_id is not None:
74
+ status["analysis_id"] = resolved_analysis_id
75
+ return status
76
 
77
 
78
  def get_pr_analysis(
 
81
  pr_number: int,
82
  repo: str | None = None,
83
  variant: str = "auto",
84
+ snapshot_id: str | None = None,
85
+ analysis_id: str | None = None,
86
  ) -> dict[str, Any]:
87
  context = _load_analysis_context(
88
  db_path,
89
  repo=repo,
90
  variant=variant,
91
+ snapshot_id=snapshot_id,
92
+ analysis_id=analysis_id,
93
  )
94
  meta_bug, rank = _find_meta_bug_for_pr(context.report, pr_number)
95
  duplicate_pr = _find_duplicate_pr_for_pr(context.report, pr_number)
 
108
  repo: str | None = None,
109
  variant: str = "auto",
110
  limit: int = 50,
111
+ snapshot_id: str | None = None,
112
+ analysis_id: str | None = None,
113
  ) -> dict[str, Any]:
114
  context = _load_analysis_context(
115
  db_path,
116
  repo=repo,
117
  variant=variant,
118
+ snapshot_id=snapshot_id,
119
+ analysis_id=analysis_id,
120
  )
121
  meta_bugs = [
122
  _meta_bug_payload(cluster, rank=index)
 
135
  cluster_id: str,
136
  repo: str | None = None,
137
  variant: str = "auto",
138
+ snapshot_id: str | None = None,
139
+ analysis_id: str | None = None,
140
  ) -> dict[str, Any]:
141
  context = _load_analysis_context(
142
  db_path,
143
  repo=repo,
144
  variant=variant,
145
+ snapshot_id=snapshot_id,
146
+ analysis_id=analysis_id,
147
  )
148
  for index, cluster in enumerate(context.report.get("meta_bugs", []), start=1):
149
  if str(cluster.get("cluster_id")) != cluster_id:
 
162
  repo: str | None = None,
163
  variant: str = "auto",
164
  limit: int = 50,
165
+ snapshot_id: str | None = None,
166
+ analysis_id: str | None = None,
167
  ) -> dict[str, Any]:
168
  context = _load_analysis_context(
169
  db_path,
170
  repo=repo,
171
  variant=variant,
172
+ snapshot_id=snapshot_id,
173
+ analysis_id=analysis_id,
174
  )
175
  duplicate_prs = [
176
  {"rank": index, **dict(entry)}
 
188
  *,
189
  repo: str | None = None,
190
  variant: str = "auto",
191
+ snapshot_id: str | None = None,
192
+ analysis_id: str | None = None,
193
  ) -> dict[str, Any]:
194
  context = _load_analysis_context(
195
  db_path,
196
  repo=repo,
197
  variant=variant,
198
+ snapshot_id=snapshot_id,
199
+ analysis_id=analysis_id,
200
  )
201
  return {
202
  **_analysis_base_payload(context),
 
236
  *,
237
  repo: str | None,
238
  variant: str,
239
+ snapshot_id: str | None,
240
+ analysis_id: str | None,
241
  ) -> AnalysisContext:
242
  active = _resolve_active_snapshot_context(db_path, repo=repo)
243
+ report_path, variant_used, report_source, resolved_analysis_id = _resolve_analysis_report_path(
244
  active.snapshot_dir,
 
245
  variant,
246
+ snapshot_id=snapshot_id,
247
+ analysis_id=analysis_id,
248
  required=True,
249
  )
250
  assert report_path is not None
 
257
  report_source=report_source,
258
  variant_requested=_normalize_analysis_variant(variant),
259
  variant_used=variant_used,
260
+ analysis_id=resolved_analysis_id,
261
  )
262
 
263
 
264
  def _resolve_analysis_report_path(
265
  snapshot_dir: Path,
 
266
  variant: str,
267
  *,
268
+ snapshot_id: str | None,
269
+ analysis_id: str | None,
270
  required: bool,
271
+ ) -> tuple[Path | None, str | None, str | None, str | None]:
272
  normalized = _normalize_analysis_variant(variant)
273
+ if (snapshot_id is None) != (analysis_id is None):
274
+ raise ValueError("snapshot_id and analysis_id must be provided together.")
275
+ if snapshot_id is not None and analysis_id is not None:
276
+ selection = _resolve_archived_analysis_report_path(
277
+ snapshot_dir,
278
+ snapshot_id=snapshot_id,
279
+ analysis_id=analysis_id,
280
+ variant=normalized,
281
+ )
282
+ if selection is not None:
283
+ return (*selection, analysis_id)
 
 
284
  if not required:
285
+ return None, None, None, None
286
  raise ValueError(
287
+ f"Published analysis run {analysis_id!r} for snapshot {snapshot_id!r} was not found."
288
  )
289
+
290
+ current_manifest_path = repo_relative_path_to_local(
291
+ snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH
292
+ )
293
+ if normalized == "deterministic":
294
+ selection = _resolve_snapshot_local_report_path(snapshot_dir, variant=normalized)
295
+ if selection is not None:
296
+ return (*selection, None)
297
+
298
+ if current_manifest_path.exists():
299
+ report_path, variant_used = _resolve_manifest_report_path(
300
+ snapshot_dir,
301
+ load_current_analysis_manifest(current_manifest_path),
302
+ variant=normalized,
303
+ manifest_kind="current",
304
+ )
305
+ return (
306
+ report_path,
307
+ variant_used,
308
+ "current",
309
+ str(load_current_analysis_manifest(current_manifest_path)["analysis_id"]),
310
+ )
311
+
312
+ selection = _resolve_snapshot_local_report_path(snapshot_dir, variant=normalized)
313
+ if selection is not None:
314
+ return (*selection, None)
315
  if not required:
316
+ return None, None, None, None
317
  raise ValueError(
318
+ "No analysis report was found for the current analysis view or active snapshot."
319
+ )
320
+
321
+
322
+ def _resolve_archived_analysis_report_path(
323
+ snapshot_dir: Path,
324
+ *,
325
+ snapshot_id: str,
326
+ analysis_id: str,
327
+ variant: str,
328
+ ) -> tuple[Path, str, str] | None:
329
+ manifest_path = repo_relative_path_to_local(
330
+ snapshot_dir,
331
+ analysis_run_manifest_path(snapshot_id, analysis_id),
332
+ )
333
+ if not manifest_path.exists():
334
+ return None
335
+ report_path, variant_used = _resolve_manifest_report_path(
336
+ snapshot_dir,
337
+ load_archived_analysis_run_manifest(manifest_path),
338
+ variant=variant,
339
+ manifest_kind="archived",
340
  )
341
+ return report_path, variant_used, "archived"
342
 
343
 
344
+ def _resolve_manifest_report_path(
345
+ snapshot_dir: Path,
346
+ manifest: dict[str, Any],
347
  *,
348
+ variant: str,
349
+ manifest_kind: str,
350
+ ) -> tuple[Path, str]:
351
+ artifact_key = _artifact_key_for_variant(variant, manifest_kind=manifest_kind)
352
+ artifacts = manifest.get("artifacts") or {}
353
+ artifact_path = artifacts.get(artifact_key)
354
+ if not isinstance(artifact_path, str) or not artifact_path:
355
+ message = (
356
+ f"Published {manifest_kind} analysis manifest does not provide the {variant} artifact."
357
+ if variant != "auto"
358
+ else f"Published {manifest_kind} analysis manifest does not provide the canonical hybrid artifact."
359
+ )
360
+ raise ValueError(message)
361
+ report_path = repo_relative_path_to_local(snapshot_dir, artifact_path)
362
+ if not report_path.exists():
363
+ raise ValueError(
364
+ f"Published {manifest_kind} analysis artifact {artifact_path!r} is missing from the materialized snapshot."
365
+ )
366
+ variant_used = "hybrid" if artifact_key == "hybrid" else variant
367
+ return report_path, variant_used
368
+
369
+
370
+ def _artifact_key_for_variant(variant: str, *, manifest_kind: str) -> str:
371
+ if variant == "auto":
372
+ return "hybrid"
373
+ if variant == "hybrid":
374
+ return "hybrid"
375
+ raise ValueError(
376
+ f"Published {manifest_kind} analysis only serves canonical hybrid artifacts; requested {variant!r}."
377
+ )
378
+
379
+
380
+ def _resolve_snapshot_local_report_path(
381
  snapshot_dir: Path,
382
+ *,
383
+ variant: str,
384
+ ) -> tuple[Path, str, str] | None:
385
+ if variant == "auto":
386
+ hybrid_path = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT["hybrid"]
387
+ if hybrid_path.exists():
388
+ return hybrid_path, "hybrid", "snapshot"
389
+ deterministic_path = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT["deterministic"]
390
+ if deterministic_path.exists():
391
+ return deterministic_path, "deterministic", "snapshot"
392
+ return None
393
+ report_path = snapshot_dir / ANALYSIS_REPORT_FILENAME_BY_VARIANT[variant]
394
+ if not report_path.exists():
395
+ return None
396
+ return report_path, variant, "snapshot"
397
 
398
 
399
  def _normalize_analysis_variant(variant: str) -> str:
 
408
  def _analysis_base_payload(context: AnalysisContext) -> dict[str, Any]:
409
  active_snapshot_id = str(context.active_run["snapshot_id"])
410
  snapshot_id = str(context.report.get("snapshot_id") or active_snapshot_id)
411
+ payload = {
412
  "repo": str(context.active_run["repo"]),
413
  "snapshot_id": snapshot_id,
414
  "active_snapshot_id": active_snapshot_id,
 
419
  "llm_enrichment": bool(context.report.get("llm_enrichment")),
420
  "generated_at": context.report.get("generated_at"),
421
  }
422
+ if context.analysis_id is not None:
423
+ payload["analysis_id"] = context.analysis_id
424
+ return payload
425
 
426
 
427
  def _analysis_counts(report: dict[str, Any]) -> dict[str, int]:
src/slop_farmer/reports/dashboard.py CHANGED
@@ -8,7 +8,11 @@ from typing import Any
8
 
9
  from slop_farmer.config import DashboardDataOptions
10
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
11
- from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_snapshots_root
 
 
 
 
12
 
13
 
14
  def run_dashboard_data(options: DashboardDataOptions) -> Path:
@@ -16,7 +20,8 @@ def run_dashboard_data(options: DashboardDataOptions) -> Path:
16
  manifest = _read_optional_json(snapshot_dir / "manifest.json")
17
  issues = read_parquet_rows(snapshot_dir / "issues.parquet")
18
  pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
19
- analysis = _read_optional_json(options.analysis_input or snapshot_dir / "analysis-report.json")
 
20
  contributor_report = _read_optional_json(
21
  options.contributors_input or snapshot_dir / "new-contributors-report.json"
22
  )
@@ -67,6 +72,21 @@ def run_dashboard_data(options: DashboardDataOptions) -> Path:
67
  "clustered_pr_count": sum(1 for row in prs if row["cluster_id"]),
68
  "contributor_count": len(contributors),
69
  "analysis_available": bool(analysis),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  "contributors_available": bool(contributor_report),
71
  "pr_scope_available": bool(pr_scope_report),
72
  "pr_scope_cluster_count": len(pr_scope_clusters),
@@ -88,7 +108,29 @@ def _resolve_snapshot_dir(options: DashboardDataOptions) -> Path:
88
  if options.snapshot_root is not None
89
  else (Path("data") / "snapshots").resolve()
90
  )
91
- return resolve_snapshot_dir_from_snapshots_root(snapshots_root, options.snapshot_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
 
94
  def _read_optional_json(path: Path) -> dict[str, Any]:
@@ -153,6 +195,14 @@ def _excerpt(value: Any, limit: int = 240) -> str | None:
153
  return compact[: limit - 1].rstrip() + "…"
154
 
155
 
 
 
 
 
 
 
 
 
156
  def _cluster_rows(
157
  analysis: dict[str, Any],
158
  issue_map: dict[int, dict[str, Any]],
 
8
 
9
  from slop_farmer.config import DashboardDataOptions
10
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
11
+ from slop_farmer.data.snapshot_paths import (
12
+ ResolvedAnalysisReportPath,
13
+ resolve_default_dashboard_analysis_report,
14
+ )
15
+ from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
16
 
17
 
18
  def run_dashboard_data(options: DashboardDataOptions) -> Path:
 
20
  manifest = _read_optional_json(snapshot_dir / "manifest.json")
21
  issues = read_parquet_rows(snapshot_dir / "issues.parquet")
22
  pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
23
+ analysis_path = _resolve_analysis_input(snapshot_dir, options.analysis_input)
24
+ analysis = _read_optional_json(analysis_path.path) if analysis_path is not None else {}
25
  contributor_report = _read_optional_json(
26
  options.contributors_input or snapshot_dir / "new-contributors-report.json"
27
  )
 
72
  "clustered_pr_count": sum(1 for row in prs if row["cluster_id"]),
73
  "contributor_count": len(contributors),
74
  "analysis_available": bool(analysis),
75
+ "analysis_source": None if analysis_path is None else analysis_path.source,
76
+ "analysis_variant": None if analysis_path is None else analysis_path.variant,
77
+ "analysis_snapshot_id": (
78
+ None
79
+ if analysis_path is None
80
+ else (
81
+ analysis_path.snapshot_id
82
+ or (
83
+ str(analysis.get("snapshot_id")).strip()
84
+ if analysis.get("snapshot_id") is not None
85
+ else None
86
+ )
87
+ )
88
+ ),
89
+ "analysis_id": None if analysis_path is None else analysis_path.analysis_id,
90
  "contributors_available": bool(contributor_report),
91
  "pr_scope_available": bool(pr_scope_report),
92
  "pr_scope_cluster_count": len(pr_scope_clusters),
 
108
  if options.snapshot_root is not None
109
  else (Path("data") / "snapshots").resolve()
110
  )
111
+ return resolve_snapshot_source_dir(
112
+ snapshot_dir=options.snapshot_dir,
113
+ local_snapshots_root=snapshots_root,
114
+ hf_repo_id=options.hf_repo_id,
115
+ hf_revision=options.hf_revision,
116
+ hf_materialize_dir=options.hf_materialize_dir,
117
+ hf_output_dir=snapshots_root.parent,
118
+ )
119
+
120
+
121
+ def _resolve_analysis_input(
122
+ snapshot_dir: Path, override_path: Path | None
123
+ ) -> ResolvedAnalysisReportPath | None:
124
+ if override_path is not None:
125
+ resolved = override_path.resolve()
126
+ if not resolved.exists():
127
+ raise FileNotFoundError(f"Dashboard analysis input not found: {resolved}")
128
+ return ResolvedAnalysisReportPath(
129
+ path=resolved,
130
+ variant=_analysis_variant_for_path(resolved),
131
+ source="override",
132
+ )
133
+ return resolve_default_dashboard_analysis_report(snapshot_dir)
134
 
135
 
136
  def _read_optional_json(path: Path) -> dict[str, Any]:
 
195
  return compact[: limit - 1].rstrip() + "…"
196
 
197
 
198
+ def _analysis_variant_for_path(path: Path) -> str:
199
+ if path.name == "analysis-report-hybrid.json":
200
+ return "hybrid"
201
+ if path.name == "analysis-report.json":
202
+ return "deterministic"
203
+ return "override"
204
+
205
+
206
  def _cluster_rows(
207
  analysis: dict[str, Any],
208
  issue_map: dict[int, dict[str, Any]],
src/slop_farmer/reports/new_contributor_report.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any
12
  from slop_farmer.config import NewContributorReportOptions, resolve_github_token
13
  from slop_farmer.data.http import urlopen_with_retry
14
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
15
- from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_output
16
  from slop_farmer.reports.user_activity import summarize_user
17
 
18
  GRAPHQL_URL = "https://api.github.com/graphql"
@@ -131,7 +131,14 @@ def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
131
 
132
 
133
  def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
134
- return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
 
 
 
 
 
 
 
135
 
136
 
137
  def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
@@ -244,7 +251,6 @@ def _report_contributors(
244
  previous_report_reusable
245
  and previous_entry is not None
246
  and not previous_entry.get("fetch_error")
247
- and not known_via_prior_merged_pr
248
  ):
249
  contributors.append(
250
  _reused_previous_report_entry(
@@ -256,6 +262,8 @@ def _report_contributors(
256
  )
257
  )
258
  reused_previous_report += 1
 
 
259
  continue
260
  try:
261
  summary = summarize_user(row["author_login"], options.window_days, None)
 
12
  from slop_farmer.config import NewContributorReportOptions, resolve_github_token
13
  from slop_farmer.data.http import urlopen_with_retry
14
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
15
+ from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
16
  from slop_farmer.reports.user_activity import summarize_user
17
 
18
  GRAPHQL_URL = "https://api.github.com/graphql"
 
131
 
132
 
133
  def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
134
+ return resolve_snapshot_source_dir(
135
+ snapshot_dir=options.snapshot_dir,
136
+ local_snapshots_root=options.output_dir.resolve() / "snapshots",
137
+ hf_repo_id=options.hf_repo_id,
138
+ hf_revision=options.hf_revision,
139
+ hf_materialize_dir=options.hf_materialize_dir,
140
+ hf_output_dir=options.output_dir,
141
+ )
142
 
143
 
144
  def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
 
251
  previous_report_reusable
252
  and previous_entry is not None
253
  and not previous_entry.get("fetch_error")
 
254
  ):
255
  contributors.append(
256
  _reused_previous_report_entry(
 
262
  )
263
  )
264
  reused_previous_report += 1
265
+ if known_via_prior_merged_pr:
266
+ reused_known_merged += 1
267
  continue
268
  try:
269
  summary = summarize_user(row["author_login"], options.window_days, None)
src/slop_farmer/reports/pr_scope.py CHANGED
@@ -42,11 +42,7 @@ from typing import Any
42
  from pydantic import BaseModel, Field
43
 
44
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
45
- from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
46
- from slop_farmer.data.snapshot_paths import (
47
- default_hf_materialize_dir,
48
- resolve_snapshot_dir_from_output,
49
- )
50
  from slop_farmer.reports.pr_heuristics import (
51
  compile_cluster_suppression_rules,
52
  suppressed_pull_request_reasons,
@@ -260,17 +256,14 @@ def run_pr_scope_report(options: Any) -> Path:
260
 
261
 
262
  def _resolve_snapshot_dir(options: Any) -> Path:
263
- if options.hf_repo_id:
264
- snapshot_dir = materialize_hf_dataset_snapshot(
265
- repo_id=options.hf_repo_id,
266
- local_dir=options.hf_materialize_dir
267
- or default_hf_materialize_dir(
268
- options.output_dir, options.hf_repo_id, options.hf_revision
269
- ),
270
- revision=options.hf_revision,
271
- )
272
- return snapshot_dir.resolve()
273
- return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
274
 
275
 
276
  def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
 
42
  from pydantic import BaseModel, Field
43
 
44
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
45
+ from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 
 
46
  from slop_farmer.reports.pr_heuristics import (
47
  compile_cluster_suppression_rules,
48
  suppressed_pull_request_reasons,
 
256
 
257
 
258
  def _resolve_snapshot_dir(options: Any) -> Path:
259
+ return resolve_snapshot_source_dir(
260
+ snapshot_dir=options.snapshot_dir,
261
+ local_snapshots_root=options.output_dir.resolve() / "snapshots",
262
+ hf_repo_id=options.hf_repo_id,
263
+ hf_revision=options.hf_revision,
264
+ hf_materialize_dir=options.hf_materialize_dir,
265
+ hf_output_dir=options.output_dir,
266
+ )
 
 
 
267
 
268
 
269
  def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:
src/slop_farmer/reports/pr_search_scope.py CHANGED
@@ -10,11 +10,7 @@ from typing import Any
10
 
11
  from slop_farmer.config import PrSearchRefreshOptions
12
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
13
- from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
14
- from slop_farmer.data.snapshot_paths import (
15
- default_hf_materialize_dir,
16
- resolve_snapshot_dir_from_output,
17
- )
18
  from slop_farmer.reports.pr_heuristics import (
19
  compile_cluster_suppression_rules,
20
  suppressed_pull_request_reasons,
@@ -36,17 +32,14 @@ DEFAULT_CANDIDATE_LIMIT = 5
36
 
37
 
38
  def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
39
- if options.hf_repo_id:
40
- snapshot_dir = materialize_hf_dataset_snapshot(
41
- repo_id=options.hf_repo_id,
42
- local_dir=options.hf_materialize_dir
43
- or default_hf_materialize_dir(
44
- options.output_dir, options.hf_repo_id, options.hf_revision
45
- ),
46
- revision=options.hf_revision,
47
- )
48
- return snapshot_dir.resolve()
49
- return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
50
 
51
 
52
  def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
@@ -54,6 +47,7 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
54
  manifest = read_json(manifest_path) if manifest_path.exists() else {}
55
  pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
56
  pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
 
57
  repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
58
  snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
59
  return {
@@ -62,6 +56,7 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
62
  "manifest": manifest,
63
  "pull_requests": pull_requests,
64
  "pr_files": pr_files,
 
65
  }
66
 
67
 
@@ -412,6 +407,7 @@ def _document_row(row: Mapping[str, Any]) -> dict[str, Any]:
412
  return {
413
  "pr_number": int(row["number"]),
414
  "github_id": row.get("github_id"),
 
415
  "state": row.get("state"),
416
  "draft": bool(row.get("draft")),
417
  "merged": bool(row.get("merged")),
 
10
 
11
  from slop_farmer.config import PrSearchRefreshOptions
12
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
13
+ from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 
 
 
 
14
  from slop_farmer.reports.pr_heuristics import (
15
  compile_cluster_suppression_rules,
16
  suppressed_pull_request_reasons,
 
32
 
33
 
34
  def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
35
+ return resolve_snapshot_source_dir(
36
+ snapshot_dir=options.snapshot_dir,
37
+ local_snapshots_root=options.output_dir.resolve() / "snapshots",
38
+ hf_repo_id=options.hf_repo_id,
39
+ hf_revision=options.hf_revision,
40
+ hf_materialize_dir=options.hf_materialize_dir,
41
+ hf_output_dir=options.output_dir,
42
+ )
 
 
 
43
 
44
 
45
  def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
 
47
  manifest = read_json(manifest_path) if manifest_path.exists() else {}
48
  pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
49
  pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
50
+ contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")
51
  repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
52
  snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
53
  return {
 
56
  "manifest": manifest,
57
  "pull_requests": pull_requests,
58
  "pr_files": pr_files,
59
+ "contributors": contributors,
60
  }
61
 
62
 
 
407
  return {
408
  "pr_number": int(row["number"]),
409
  "github_id": row.get("github_id"),
410
+ "author_login": row.get("author_login"),
411
  "state": row.get("state"),
412
  "draft": bool(row.get("draft")),
413
  "merged": bool(row.get("merged")),
src/slop_farmer/reports/pr_search_service.py CHANGED
@@ -1,7 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import json
4
- from collections.abc import Iterable, Mapping
5
  from contextlib import suppress
6
  from pathlib import Path
7
  from typing import Any, Protocol
@@ -17,6 +17,8 @@ from slop_farmer.data.search_duckdb import (
17
  get_cluster,
18
  get_cluster_ids_for_prs,
19
  get_cluster_members,
 
 
20
  get_document,
21
  get_feature,
22
  get_pair_neighbor_row,
@@ -99,6 +101,16 @@ def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
99
  "pr_search_documents",
100
  _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
101
  )
 
 
 
 
 
 
 
 
 
 
102
  insert_rows(
103
  connection,
104
  "pr_scope_features",
@@ -290,6 +302,85 @@ def get_pr_search_candidate_clusters(
290
  connection.close()
291
 
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  def get_pr_search_similar_lookup(
294
  db_path: Path,
295
  *,
@@ -801,6 +892,15 @@ def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[st
801
  return feature
802
 
803
 
 
 
 
 
 
 
 
 
 
804
  def _json_list(raw: Any) -> list[str]:
805
  if isinstance(raw, list):
806
  return [str(item) for item in raw]
@@ -838,6 +938,71 @@ def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
838
  return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
839
 
840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  def _normalize_lookup_mode(mode: str) -> str:
842
  normalized = mode.strip().lower()
843
  if normalized not in {"auto", "indexed", "live"}:
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ from collections.abc import Iterable, Mapping, Sequence
5
  from contextlib import suppress
6
  from pathlib import Path
7
  from typing import Any, Protocol
 
17
  get_cluster,
18
  get_cluster_ids_for_prs,
19
  get_cluster_members,
20
+ get_contributor,
21
+ get_contributor_pulls,
22
  get_document,
23
  get_feature,
24
  get_pair_neighbor_row,
 
101
  "pr_search_documents",
102
  _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
103
  )
104
+ insert_rows(
105
+ connection,
106
+ "pr_search_contributors",
107
+ _contributor_rows(
108
+ snapshot["contributors"],
109
+ run_id=run_id,
110
+ repo=repo,
111
+ snapshot_id=str(snapshot["snapshot_id"]),
112
+ ),
113
+ )
114
  insert_rows(
115
  connection,
116
  "pr_scope_features",
 
302
  connection.close()
303
 
304
 
305
+ def get_pr_search_contributor(
306
+ db_path: Path,
307
+ *,
308
+ author_login: str,
309
+ repo: str | None = None,
310
+ ) -> dict[str, Any]:
311
+ connection = connect_pr_search_db(db_path, read_only=True)
312
+ try:
313
+ active_run = resolve_active_run(connection, repo=repo)
314
+ run_id = str(active_run["id"])
315
+ contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
316
+ pulls = _document_rows(
317
+ get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
318
+ )
319
+ return {
320
+ "repo": active_run["repo"],
321
+ "snapshot_id": active_run["snapshot_id"],
322
+ "run_id": run_id,
323
+ "contributor": contributor,
324
+ "pulls": pulls,
325
+ "pull_count": len(pulls),
326
+ }
327
+ finally:
328
+ connection.close()
329
+
330
+
331
+ def get_pr_search_contributor_pulls(
332
+ db_path: Path,
333
+ *,
334
+ author_login: str,
335
+ repo: str | None = None,
336
+ limit: int = 20,
337
+ ) -> dict[str, Any]:
338
+ connection = connect_pr_search_db(db_path, read_only=True)
339
+ try:
340
+ active_run = resolve_active_run(connection, repo=repo)
341
+ run_id = str(active_run["id"])
342
+ contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
343
+ pulls = _document_rows(
344
+ get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
345
+ )
346
+ return {
347
+ "repo": active_run["repo"],
348
+ "snapshot_id": active_run["snapshot_id"],
349
+ "run_id": run_id,
350
+ "contributor": contributor,
351
+ "pulls": pulls,
352
+ "pull_count": len(pulls),
353
+ }
354
+ finally:
355
+ connection.close()
356
+
357
+
358
+ def get_pr_search_pull_contributor(
359
+ db_path: Path,
360
+ *,
361
+ pr_number: int,
362
+ repo: str | None = None,
363
+ ) -> dict[str, Any]:
364
+ connection = connect_pr_search_db(db_path, read_only=True)
365
+ try:
366
+ active_run = resolve_active_run(connection, repo=repo)
367
+ run_id = str(active_run["id"])
368
+ document = _require_document(connection, run_id=run_id, pr_number=pr_number)
369
+ author_login = str(document.get("author_login") or "").strip()
370
+ if not author_login:
371
+ raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
372
+ contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
373
+ return {
374
+ "repo": active_run["repo"],
375
+ "snapshot_id": active_run["snapshot_id"],
376
+ "run_id": run_id,
377
+ "pr": _without_json_fields(document),
378
+ "contributor": contributor,
379
+ }
380
+ finally:
381
+ connection.close()
382
+
383
+
384
  def get_pr_search_similar_lookup(
385
  db_path: Path,
386
  *,
 
892
  return feature
893
 
894
 
895
+ def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
896
+ contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
897
+ if contributor is None:
898
+ raise ValueError(
899
+ f"Contributor {author_login!r} was not found in the active indexed universe."
900
+ )
901
+ return _contributor_row(contributor)
902
+
903
+
904
  def _json_list(raw: Any) -> list[str]:
905
  if isinstance(raw, list):
906
  return [str(item) for item in raw]
 
938
  return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
939
 
940
 
941
+ def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
942
+ return [_without_json_fields(row) for row in rows]
943
+
944
+
945
+ def _contributor_rows(
946
+ rows: list[Mapping[str, Any]],
947
+ *,
948
+ run_id: str,
949
+ repo: str,
950
+ snapshot_id: str,
951
+ ) -> list[dict[str, Any]]:
952
+ return [
953
+ {
954
+ "run_id": run_id,
955
+ "repo": repo,
956
+ "snapshot_id": snapshot_id,
957
+ "report_generated_at": row.get("report_generated_at"),
958
+ "window_days": row.get("window_days"),
959
+ "author_login": row.get("author_login"),
960
+ "name": row.get("name"),
961
+ "profile_url": row.get("profile_url"),
962
+ "repo_pull_requests_url": row.get("repo_pull_requests_url"),
963
+ "repo_issues_url": row.get("repo_issues_url"),
964
+ "repo_first_seen_at": row.get("repo_first_seen_at"),
965
+ "repo_last_seen_at": row.get("repo_last_seen_at"),
966
+ "repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
967
+ "repo_artifact_count": row.get("repo_artifact_count"),
968
+ "snapshot_issue_count": row.get("snapshot_issue_count"),
969
+ "snapshot_pr_count": row.get("snapshot_pr_count"),
970
+ "snapshot_comment_count": row.get("snapshot_comment_count"),
971
+ "snapshot_review_count": row.get("snapshot_review_count"),
972
+ "snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
973
+ "repo_association": row.get("repo_association"),
974
+ "new_to_repo": row.get("new_to_repo"),
975
+ "first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
976
+ "report_reason": row.get("report_reason"),
977
+ "account_age_days": row.get("account_age_days"),
978
+ "young_account": row.get("young_account"),
979
+ "follow_through_score": row.get("follow_through_score"),
980
+ "breadth_score": row.get("breadth_score"),
981
+ "automation_risk_signal": row.get("automation_risk_signal"),
982
+ "heuristic_note": row.get("heuristic_note"),
983
+ "public_orgs_json": row.get("public_orgs"),
984
+ "visible_authored_pr_count": row.get("visible_authored_pr_count"),
985
+ "merged_pr_count": row.get("merged_pr_count"),
986
+ "closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
987
+ "open_pr_count": row.get("open_pr_count"),
988
+ "merged_pr_rate": row.get("merged_pr_rate"),
989
+ "closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
990
+ "still_open_pr_rate": row.get("still_open_pr_rate"),
991
+ "distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
992
+ "distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
993
+ "fetch_error": row.get("fetch_error"),
994
+ }
995
+ for row in rows
996
+ ]
997
+
998
+
999
+ def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
1000
+ return {
1001
+ **_without_json_fields(row),
1002
+ "public_orgs": _json_list(row.get("public_orgs_json")),
1003
+ }
1004
+
1005
+
1006
  def _normalize_lookup_mode(mode: str) -> str:
1007
  normalized = mode.strip().lower()
1008
  if normalized not in {"auto", "indexed", "live"}:
src/slop_farmer/reports/read_views.py CHANGED
@@ -5,6 +5,11 @@ from pathlib import Path
5
  from typing import Any, Literal
6
 
7
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
 
 
 
 
 
8
 
9
  AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
10
 
@@ -252,7 +257,8 @@ def get_issue_best(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str,
252
  def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
253
  metadata = _snapshot_metadata(snapshot_dir)
254
  report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
255
- contributors = report.get("contributors") if isinstance(report.get("contributors"), list) else []
 
256
  return {
257
  "repo": str(report.get("repo") or metadata.repo),
258
  "snapshot_id": str(report.get("snapshot_id") or metadata.snapshot_id),
@@ -321,7 +327,12 @@ def _analysis_context(
321
  snapshot_dir: Path,
322
  *,
323
  variant: AnalysisVariant,
324
- ) -> tuple[_SnapshotMetadata, _AnalysisSelection | None, dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
 
 
 
 
 
325
  metadata = _snapshot_metadata(snapshot_dir)
326
  selection = _select_analysis_report(_analysis_candidates(snapshot_dir), variant=variant)
327
  issue_map, pr_map = _artifact_maps(snapshot_dir)
@@ -395,16 +406,38 @@ def _select_analysis_report(
395
 
396
 
397
  def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
398
- ordered = [
399
- snapshot_dir / "analysis-report-hybrid.json",
400
- snapshot_dir / "analysis-report-deterministic.json",
401
- snapshot_dir / "analysis-report.json",
402
- ]
403
- seen = {path.name for path in ordered}
 
 
 
 
 
 
 
 
404
  ordered.extend(
405
- path for path in sorted(snapshot_dir.glob("analysis-report*.json")) if path.name not in seen
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  )
407
- return [path for path in ordered if path.exists()]
408
 
409
 
410
  def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
@@ -448,15 +481,25 @@ def _analysis_counts(payload: dict[str, Any]) -> dict[str, int]:
448
  }
449
 
450
 
451
- def _artifact_maps(snapshot_dir: Path) -> tuple[dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
452
- issue_rows = read_parquet_rows(snapshot_dir / "issues.parquet") if (snapshot_dir / "issues.parquet").exists() else []
 
 
 
 
 
 
453
  pr_rows = (
454
  read_parquet_rows(snapshot_dir / "pull_requests.parquet")
455
  if (snapshot_dir / "pull_requests.parquet").exists()
456
  else []
457
  )
458
- issue_map = {int(row["number"]): row for row in issue_rows if _coerce_int(row.get("number")) is not None}
459
- pr_map = {int(row["number"]): row for row in pr_rows if _coerce_int(row.get("number")) is not None}
 
 
 
 
460
  return issue_map, pr_map
461
 
462
 
@@ -474,7 +517,9 @@ def _issue_cluster_summary(
474
  return {
475
  "rank": rank,
476
  "cluster_id": str(cluster.get("cluster_id") or f"cluster-{rank or 0}"),
477
- "title": _cluster_title(cluster, issue_map, pr_map, canonical_issue_number, canonical_pr_number),
 
 
478
  "summary": cluster.get("summary"),
479
  "status": cluster.get("status"),
480
  "confidence": _coerce_float(cluster.get("confidence")),
@@ -518,7 +563,9 @@ def _cluster_url(
518
  issue_map: dict[int, dict[str, Any]],
519
  pr_map: dict[int, dict[str, Any]],
520
  ) -> str | None:
521
- return _url_for_issue(canonical_issue_number, issue_map) or _url_for_pr(canonical_pr_number, pr_map)
 
 
522
 
523
 
524
  def _duplicate_pr_summary(
@@ -605,7 +652,8 @@ def _pr_member_row(number: int, row: dict[str, Any] | None, *, role: str) -> dic
605
 
606
 
607
  def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
608
- activity = contributor.get("activity") if isinstance(contributor.get("activity"), dict) else {}
 
609
  return {
610
  "rank": rank,
611
  "author_login": contributor.get("author_login"),
@@ -629,7 +677,8 @@ def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None
629
 
630
 
631
  def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
632
- activity = contributor.get("activity") if isinstance(contributor.get("activity"), dict) else {}
 
633
  return {
634
  "automation_risk_signal": contributor.get("automation_risk_signal"),
635
  "heuristic_note": contributor.get("heuristic_note"),
 
5
  from typing import Any, Literal
6
 
7
  from slop_farmer.data.parquet_io import read_json, read_parquet_rows
8
+ from slop_farmer.data.snapshot_paths import (
9
+ CURRENT_ANALYSIS_MANIFEST_PATH,
10
+ load_current_analysis_manifest,
11
+ repo_relative_path_to_local,
12
+ )
13
 
14
  AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
15
 
 
257
  def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
258
  metadata = _snapshot_metadata(snapshot_dir)
259
  report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
260
+ raw_contributors = report.get("contributors")
261
+ contributors: list[Any] = raw_contributors if isinstance(raw_contributors, list) else []
262
  return {
263
  "repo": str(report.get("repo") or metadata.repo),
264
  "snapshot_id": str(report.get("snapshot_id") or metadata.snapshot_id),
 
327
  snapshot_dir: Path,
328
  *,
329
  variant: AnalysisVariant,
330
+ ) -> tuple[
331
+ _SnapshotMetadata,
332
+ _AnalysisSelection | None,
333
+ dict[int, dict[str, Any]],
334
+ dict[int, dict[str, Any]],
335
+ ]:
336
  metadata = _snapshot_metadata(snapshot_dir)
337
  selection = _select_analysis_report(_analysis_candidates(snapshot_dir), variant=variant)
338
  issue_map, pr_map = _artifact_maps(snapshot_dir)
 
406
 
407
 
408
  def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
409
+ ordered: list[Path] = []
410
+ current_manifest_path = repo_relative_path_to_local(
411
+ snapshot_dir, CURRENT_ANALYSIS_MANIFEST_PATH
412
+ )
413
+ if current_manifest_path.exists():
414
+ try:
415
+ current_manifest = load_current_analysis_manifest(current_manifest_path)
416
+ except ValueError:
417
+ current_manifest = None
418
+ if current_manifest is not None:
419
+ for artifact_path in (current_manifest.get("artifacts") or {}).values():
420
+ if not isinstance(artifact_path, str):
421
+ continue
422
+ ordered.append(repo_relative_path_to_local(snapshot_dir, artifact_path))
423
  ordered.extend(
424
+ [
425
+ snapshot_dir / "analysis-report-hybrid.json",
426
+ snapshot_dir / "analysis-report-deterministic.json",
427
+ snapshot_dir / "analysis-report.json",
428
+ ]
429
+ )
430
+ seen: set[Path] = set()
431
+ deduped: list[Path] = []
432
+ for path in ordered:
433
+ if path in seen:
434
+ continue
435
+ seen.add(path)
436
+ deduped.append(path)
437
+ deduped.extend(
438
+ path for path in sorted(snapshot_dir.glob("analysis-report*.json")) if path not in seen
439
  )
440
+ return [path for path in deduped if path.exists()]
441
 
442
 
443
  def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
 
481
  }
482
 
483
 
484
+ def _artifact_maps(
485
+ snapshot_dir: Path,
486
+ ) -> tuple[dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
487
+ issue_rows = (
488
+ read_parquet_rows(snapshot_dir / "issues.parquet")
489
+ if (snapshot_dir / "issues.parquet").exists()
490
+ else []
491
+ )
492
  pr_rows = (
493
  read_parquet_rows(snapshot_dir / "pull_requests.parquet")
494
  if (snapshot_dir / "pull_requests.parquet").exists()
495
  else []
496
  )
497
+ issue_map = {
498
+ int(row["number"]): row for row in issue_rows if _coerce_int(row.get("number")) is not None
499
+ }
500
+ pr_map = {
501
+ int(row["number"]): row for row in pr_rows if _coerce_int(row.get("number")) is not None
502
+ }
503
  return issue_map, pr_map
504
 
505
 
 
517
  return {
518
  "rank": rank,
519
  "cluster_id": str(cluster.get("cluster_id") or f"cluster-{rank or 0}"),
520
+ "title": _cluster_title(
521
+ cluster, issue_map, pr_map, canonical_issue_number, canonical_pr_number
522
+ ),
523
  "summary": cluster.get("summary"),
524
  "status": cluster.get("status"),
525
  "confidence": _coerce_float(cluster.get("confidence")),
 
563
  issue_map: dict[int, dict[str, Any]],
564
  pr_map: dict[int, dict[str, Any]],
565
  ) -> str | None:
566
+ return _url_for_issue(canonical_issue_number, issue_map) or _url_for_pr(
567
+ canonical_pr_number, pr_map
568
+ )
569
 
570
 
571
  def _duplicate_pr_summary(
 
652
 
653
 
654
  def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
655
+ raw_activity = contributor.get("activity")
656
+ activity: dict[str, Any] = raw_activity if isinstance(raw_activity, dict) else {}
657
  return {
658
  "rank": rank,
659
  "author_login": contributor.get("author_login"),
 
677
 
678
 
679
  def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
680
+ raw_activity = contributor.get("activity")
681
+ activity: dict[str, Any] = raw_activity if isinstance(raw_activity, dict) else {}
682
  return {
683
  "automation_risk_signal": contributor.get("automation_risk_signal"),
684
  "heuristic_note": contributor.get("heuristic_note"),
uv.lock CHANGED
@@ -561,7 +561,7 @@ wheels = [
561
 
562
  [[package]]
563
  name = "fast-agent-mcp"
564
- version = "0.6.18"
565
  source = { registry = "https://pypi.org/simple" }
566
  dependencies = [
567
  { name = "a2a-sdk" },
@@ -598,9 +598,9 @@ dependencies = [
598
  { name = "uvloop", marker = "sys_platform != 'win32'" },
599
  { name = "watchfiles" },
600
  ]
601
- sdist = { url = "https://files.pythonhosted.org/packages/68/9f/a66344581177eb70cd817a58a3305c4b2c2b5f98661129c2cecc4aa36e77/fast_agent_mcp-0.6.18.tar.gz", hash = "sha256:5ee5624890a9670b6f1a912998807e0fd451aa1c7205d189a964764a988c7bc0", size = 2091443, upload-time = "2026-04-17T20:52:25.84Z" }
602
  wheels = [
603
- { url = "https://files.pythonhosted.org/packages/49/63/d8942bde2e706c869f93835ea85a2015be0edf5772c4e9ec8939a1001172/fast_agent_mcp-0.6.18-py3-none-any.whl", hash = "sha256:67c0c011763a28b8d5779b5d4d5cdc61e6f3dbc8cd1a7227388229957429835f", size = 1573842, upload-time = "2026-04-17T20:52:28.807Z" },
604
  ]
605
 
606
  [[package]]
@@ -820,34 +820,34 @@ wheels = [
820
 
821
  [[package]]
822
  name = "hf-xet"
823
- version = "1.4.2"
824
- source = { registry = "https://pypi.org/simple" }
825
- sdist = { url = "https://files.pythonhosted.org/packages/09/08/23c84a26716382c89151b5b447b4beb19e3345f3a93d3b73009a71a57ad3/hf_xet-1.4.2.tar.gz", hash = "sha256:b7457b6b482d9e0743bd116363239b1fa904a5e65deede350fbc0c4ea67c71ea", size = 672357, upload-time = "2026-03-13T06:58:51.077Z" }
826
- wheels = [
827
- { url = "https://files.pythonhosted.org/packages/18/06/e8cf74c3c48e5485c7acc5a990d0d8516cdfb5fdf80f799174f1287cc1b5/hf_xet-1.4.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ac8202ae1e664b2c15cdfc7298cbb25e80301ae596d602ef7870099a126fcad4", size = 3796125, upload-time = "2026-03-13T06:58:33.177Z" },
828
- { url = "https://files.pythonhosted.org/packages/66/d4/b73ebab01cbf60777323b7de9ef05550790451eb5172a220d6b9845385ec/hf_xet-1.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6d2f8ee39fa9fba9af929f8c0d0482f8ee6e209179ad14a909b6ad78ffcb7c81", size = 3555985, upload-time = "2026-03-13T06:58:31.797Z" },
829
- { url = "https://files.pythonhosted.org/packages/ff/e7/ded6d1bd041c3f2bca9e913a0091adfe32371988e047dd3a68a2463c15a2/hf_xet-1.4.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4642a6cf249c09da8c1f87fe50b24b2a3450b235bf8adb55700b52f0ea6e2eb6", size = 4212085, upload-time = "2026-03-13T06:58:24.323Z" },
830
- { url = "https://files.pythonhosted.org/packages/97/c1/a0a44d1f98934f7bdf17f7a915b934f9fca44bb826628c553589900f6df8/hf_xet-1.4.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:769431385e746c92dc05492dde6f687d304584b89c33d79def8367ace06cb555", size = 3988266, upload-time = "2026-03-13T06:58:22.887Z" },
831
- { url = "https://files.pythonhosted.org/packages/7a/82/be713b439060e7d1f1d93543c8053d4ef2fe7e6922c5b31642eaa26f3c4b/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c9dd1c1bc4cc56168f81939b0e05b4c36dd2d28c13dc1364b17af89aa0082496", size = 4188513, upload-time = "2026-03-13T06:58:40.858Z" },
832
- { url = "https://files.pythonhosted.org/packages/21/a6/cbd4188b22abd80ebd0edbb2b3e87f2633e958983519980815fb8314eae5/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fca58a2ae4e6f6755cc971ac6fcdf777ea9284d7e540e350bb000813b9a3008d", size = 4428287, upload-time = "2026-03-13T06:58:42.601Z" },
833
- { url = "https://files.pythonhosted.org/packages/b2/4e/84e45b25e2e3e903ed3db68d7eafa96dae9a1d1f6d0e7fc85120347a852f/hf_xet-1.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:163aab46854ccae0ab6a786f8edecbbfbaa38fcaa0184db6feceebf7000c93c0", size = 3665574, upload-time = "2026-03-13T06:58:53.881Z" },
834
- { url = "https://files.pythonhosted.org/packages/ee/71/c5ac2b9a7ae39c14e91973035286e73911c31980fe44e7b1d03730c00adc/hf_xet-1.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:09b138422ecbe50fd0c84d4da5ff537d27d487d3607183cd10e3e53f05188e82", size = 3528760, upload-time = "2026-03-13T06:58:52.187Z" },
835
- { url = "https://files.pythonhosted.org/packages/1e/0f/fcd2504015eab26358d8f0f232a1aed6b8d363a011adef83fe130bff88f7/hf_xet-1.4.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:949dcf88b484bb9d9276ca83f6599e4aa03d493c08fc168c124ad10b2e6f75d7", size = 3796493, upload-time = "2026-03-13T06:58:39.267Z" },
836
- { url = "https://files.pythonhosted.org/packages/82/56/19c25105ff81731ca6d55a188b5de2aa99d7a2644c7aa9de1810d5d3b726/hf_xet-1.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41659966020d59eb9559c57de2cde8128b706a26a64c60f0531fa2318f409418", size = 3555797, upload-time = "2026-03-13T06:58:37.546Z" },
837
- { url = "https://files.pythonhosted.org/packages/bf/e3/8933c073186849b5e06762aa89847991d913d10a95d1603eb7f2c3834086/hf_xet-1.4.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c588e21d80010119458dd5d02a69093f0d115d84e3467efe71ffb2c67c19146", size = 4212127, upload-time = "2026-03-13T06:58:30.539Z" },
838
- { url = "https://files.pythonhosted.org/packages/eb/01/f89ebba4e369b4ed699dcb60d3152753870996f41c6d22d3d7cac01310e1/hf_xet-1.4.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a296744d771a8621ad1d50c098d7ab975d599800dae6d48528ba3944e5001ba0", size = 3987788, upload-time = "2026-03-13T06:58:29.139Z" },
839
- { url = "https://files.pythonhosted.org/packages/84/4d/8a53e5ffbc2cc33bbf755382ac1552c6d9af13f623ed125fe67cc3e6772f/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f563f7efe49588b7d0629d18d36f46d1658fe7e08dce3fa3d6526e1c98315e2d", size = 4188315, upload-time = "2026-03-13T06:58:48.017Z" },
840
- { url = "https://files.pythonhosted.org/packages/d1/b8/b7a1c1b5592254bd67050632ebbc1b42cc48588bf4757cb03c2ef87e704a/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5b2e0132c56d7ee1bf55bdb638c4b62e7106f6ac74f0b786fed499d5548c5570", size = 4428306, upload-time = "2026-03-13T06:58:49.502Z" },
841
- { url = "https://files.pythonhosted.org/packages/a0/0c/40779e45b20e11c7c5821a94135e0207080d6b3d76e7b78ccb413c6f839b/hf_xet-1.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2f45c712c2fa1215713db10df6ac84b49d0e1c393465440e9cb1de73ecf7bbf6", size = 3665826, upload-time = "2026-03-13T06:58:59.88Z" },
842
- { url = "https://files.pythonhosted.org/packages/51/4c/e2688c8ad1760d7c30f7c429c79f35f825932581bc7c9ec811436d2f21a0/hf_xet-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:6d53df40616f7168abfccff100d232e9d460583b9d86fa4912c24845f192f2b8", size = 3529113, upload-time = "2026-03-13T06:58:58.491Z" },
843
- { url = "https://files.pythonhosted.org/packages/b4/86/b40b83a2ff03ef05c4478d2672b1fc2b9683ff870e2b25f4f3af240f2e7b/hf_xet-1.4.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:71f02d6e4cdd07f344f6844845d78518cc7186bd2bc52d37c3b73dc26a3b0bc5", size = 3800339, upload-time = "2026-03-13T06:58:36.245Z" },
844
- { url = "https://files.pythonhosted.org/packages/64/2e/af4475c32b4378b0e92a587adb1aa3ec53e3450fd3e5fe0372a874531c00/hf_xet-1.4.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e9b38d876e94d4bdcf650778d6ebbaa791dd28de08db9736c43faff06ede1b5a", size = 3559664, upload-time = "2026-03-13T06:58:34.787Z" },
845
- { url = "https://files.pythonhosted.org/packages/3c/4c/781267da3188db679e601de18112021a5cb16506fe86b246e22c5401a9c4/hf_xet-1.4.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:77e8c180b7ef12d8a96739a4e1e558847002afe9ea63b6f6358b2271a8bdda1c", size = 4217422, upload-time = "2026-03-13T06:58:27.472Z" },
846
- { url = "https://files.pythonhosted.org/packages/68/47/d6cf4a39ecf6c7705f887a46f6ef5c8455b44ad9eb0d391aa7e8a2ff7fea/hf_xet-1.4.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c3b3c6a882016b94b6c210957502ff7877802d0dbda8ad142c8595db8b944271", size = 3992847, upload-time = "2026-03-13T06:58:25.989Z" },
847
- { url = "https://files.pythonhosted.org/packages/2d/ef/e80815061abff54697239803948abc665c6b1d237102c174f4f7a9a5ffc5/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d9a634cc929cfbaf2e1a50c0e532ae8c78fa98618426769480c58501e8c8ac2", size = 4193843, upload-time = "2026-03-13T06:58:44.59Z" },
848
- { url = "https://files.pythonhosted.org/packages/54/75/07f6aa680575d9646c4167db6407c41340cbe2357f5654c4e72a1b01ca14/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6b0932eb8b10317ea78b7da6bab172b17be03bbcd7809383d8d5abd6a2233e04", size = 4432751, upload-time = "2026-03-13T06:58:46.533Z" },
849
- { url = "https://files.pythonhosted.org/packages/cd/71/193eabd7e7d4b903c4aa983a215509c6114915a5a237525ec562baddb868/hf_xet-1.4.2-cp37-abi3-win_amd64.whl", hash = "sha256:ad185719fb2e8ac26f88c8100562dbf9dbdcc3d9d2add00faa94b5f106aea53f", size = 3671149, upload-time = "2026-03-13T06:58:57.07Z" },
850
- { url = "https://files.pythonhosted.org/packages/b4/7e/ccf239da366b37ba7f0b36095450efae4a64980bdc7ec2f51354205fdf39/hf_xet-1.4.2-cp37-abi3-win_arm64.whl", hash = "sha256:32c012286b581f783653e718c1862aea5b9eb140631685bb0c5e7012c8719a87", size = 3533426, upload-time = "2026-03-13T06:58:55.46Z" },
851
  ]
852
 
853
  [[package]]
@@ -902,7 +902,7 @@ wheels = [
902
 
903
  [[package]]
904
  name = "huggingface-hub"
905
- version = "1.7.2"
906
  source = { registry = "https://pypi.org/simple" }
907
  dependencies = [
908
  { name = "filelock" },
@@ -915,9 +915,9 @@ dependencies = [
915
  { name = "typer" },
916
  { name = "typing-extensions" },
917
  ]
918
- sdist = { url = "https://files.pythonhosted.org/packages/19/15/eafc1c57bf0f8afffb243dcd4c0cceb785e956acc17bba4d9bf2ae21fc9c/huggingface_hub-1.7.2.tar.gz", hash = "sha256:7f7e294e9bbb822e025bdb2ada025fa4344d978175a7f78e824d86e35f7ab43b", size = 724684, upload-time = "2026-03-20T10:36:08.767Z" }
919
  wheels = [
920
- { url = "https://files.pythonhosted.org/packages/08/de/3ad061a05f74728927ded48c90b73521b9a9328c85d841bdefb30e01fb85/huggingface_hub-1.7.2-py3-none-any.whl", hash = "sha256:288f33a0a17b2a73a1359e2a5fd28d1becb2c121748c6173ab8643fb342c850e", size = 618036, upload-time = "2026-03-20T10:36:06.824Z" },
921
  ]
922
 
923
  [[package]]
@@ -2366,7 +2366,7 @@ wheels = [
2366
 
2367
  [[package]]
2368
  name = "slop-farmer"
2369
- version = "0.1.0"
2370
  source = { editable = "." }
2371
  dependencies = [
2372
  { name = "duckdb" },
@@ -2398,7 +2398,7 @@ requires-dist = [
2398
  { name = "fast-agent-mcp", marker = "python_full_version >= '3.13.5' and extra == 'llm'", specifier = ">=0.6.16" },
2399
  { name = "fastapi", specifier = ">=0.115.0" },
2400
  { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
2401
- { name = "huggingface-hub", specifier = ">=0.30.0" },
2402
  { name = "pyarrow", specifier = ">=18.0.0" },
2403
  { name = "pydantic", specifier = ">=2.11" },
2404
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.0" },
 
561
 
562
  [[package]]
563
  name = "fast-agent-mcp"
564
+ version = "0.6.17"
565
  source = { registry = "https://pypi.org/simple" }
566
  dependencies = [
567
  { name = "a2a-sdk" },
 
598
  { name = "uvloop", marker = "sys_platform != 'win32'" },
599
  { name = "watchfiles" },
600
  ]
601
+ sdist = { url = "https://files.pythonhosted.org/packages/8c/a1/b6b1045345d38b342da3def7723a2dc6a44faff9c01fee6d81afbd272d62/fast_agent_mcp-0.6.17.tar.gz", hash = "sha256:a920113d47ef2ab82be1bd63b77d3bf78f8f862a5a6e91f1fd0aa931850fb25f", size = 2091401, upload-time = "2026-04-16T21:48:43.334Z" }
602
  wheels = [
603
+ { url = "https://files.pythonhosted.org/packages/b4/ef/47e05d6fa95e04ed8ad60afac3ae29d8205894fb220ffde193bd33578f3a/fast_agent_mcp-0.6.17-py3-none-any.whl", hash = "sha256:a23c5a5ed8924e38809dabd31f994e5cc81b8c084e84632bb1eb246b257c4752", size = 1573794, upload-time = "2026-04-16T21:48:38.999Z" },
604
  ]
605
 
606
  [[package]]
 
820
 
821
  [[package]]
822
  name = "hf-xet"
823
+ version = "1.4.3"
824
+ source = { registry = "https://pypi.org/simple" }
825
+ sdist = { url = "https://files.pythonhosted.org/packages/53/92/ec9ad04d0b5728dca387a45af7bc98fbb0d73b2118759f5f6038b61a57e8/hf_xet-1.4.3.tar.gz", hash = "sha256:8ddedb73c8c08928c793df2f3401ec26f95be7f7e516a7bee2fbb546f6676113", size = 670477, upload-time = "2026-03-31T22:40:07.874Z" }
826
+ wheels = [
827
+ { url = "https://files.pythonhosted.org/packages/72/43/724d307b34e353da0abd476e02f72f735cdd2bc86082dee1b32ea0bfee1d/hf_xet-1.4.3-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7551659ba4f1e1074e9623996f28c3873682530aee0a846b7f2f066239228144", size = 3800935, upload-time = "2026-03-31T22:39:49.618Z" },
828
+ { url = "https://files.pythonhosted.org/packages/2b/d2/8bee5996b699262edb87dbb54118d287c0e1b2fc78af7cdc41857ba5e3c4/hf_xet-1.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:bee693ada985e7045997f05f081d0e12c4c08bd7626dc397f8a7c487e6c04f7f", size = 3558942, upload-time = "2026-03-31T22:39:47.938Z" },
829
+ { url = "https://files.pythonhosted.org/packages/c3/a1/e993d09cbe251196fb60812b09a58901c468127b7259d2bf0f68bf6088eb/hf_xet-1.4.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21644b404bb0100fe3857892f752c4d09642586fd988e61501c95bbf44b393a3", size = 4207657, upload-time = "2026-03-31T22:39:39.69Z" },
830
+ { url = "https://files.pythonhosted.org/packages/64/44/9eb6d21e5c34c63e5e399803a6932fa983cabdf47c0ecbcfe7ea97684b8c/hf_xet-1.4.3-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:987f09cfe418237812896a6736b81b1af02a3a6dcb4b4944425c4c4fca7a7cf8", size = 3986765, upload-time = "2026-03-31T22:39:37.936Z" },
831
+ { url = "https://files.pythonhosted.org/packages/ea/7b/8ad6f16fdb82f5f7284a34b5ec48645bd575bdcd2f6f0d1644775909c486/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:60cf7fc43a99da0a853345cf86d23738c03983ee5249613a6305d3e57a5dca74", size = 4188162, upload-time = "2026-03-31T22:39:58.382Z" },
832
+ { url = "https://files.pythonhosted.org/packages/1b/c4/39d6e136cbeea9ca5a23aad4b33024319222adbdc059ebcda5fc7d9d5ff4/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2815a49a7a59f3e2edf0cf113ae88e8cb2ca2a221bf353fb60c609584f4884d4", size = 4424525, upload-time = "2026-03-31T22:40:00.225Z" },
833
+ { url = "https://files.pythonhosted.org/packages/46/f2/adc32dae6bdbc367853118b9878139ac869419a4ae7ba07185dc31251b76/hf_xet-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:42ee323265f1e6a81b0e11094564fb7f7e0ec75b5105ffd91ae63f403a11931b", size = 3671610, upload-time = "2026-03-31T22:40:10.42Z" },
834
+ { url = "https://files.pythonhosted.org/packages/e2/19/25d897dcc3f81953e0c2cde9ec186c7a0fee413eb0c9a7a9130d87d94d3a/hf_xet-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:27c976ba60079fb8217f485b9c5c7fcd21c90b0367753805f87cb9f3cdc4418a", size = 3528529, upload-time = "2026-03-31T22:40:09.106Z" },
835
+ { url = "https://files.pythonhosted.org/packages/ec/36/3e8f85ca9fe09b8de2b2e10c63b3b3353d7dda88a0b3d426dffbe7b8313b/hf_xet-1.4.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:5251d5ece3a81815bae9abab41cf7ddb7bcb8f56411bce0827f4a3071c92fdc6", size = 3801019, upload-time = "2026-03-31T22:39:56.651Z" },
836
+ { url = "https://files.pythonhosted.org/packages/b5/9c/defb6cb1de28bccb7bd8d95f6e60f72a3d3fa4cb3d0329c26fb9a488bfe7/hf_xet-1.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1feb0f3abeacee143367c326a128a2e2b60868ec12a36c225afb1d6c5a05e6d2", size = 3558746, upload-time = "2026-03-31T22:39:54.766Z" },
837
+ { url = "https://files.pythonhosted.org/packages/c1/bd/8d001191893178ff8e826e46ad5299446e62b93cd164e17b0ffea08832ec/hf_xet-1.4.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b301fc150290ca90b4fccd079829b84bb4786747584ae08b94b4577d82fb791", size = 4207692, upload-time = "2026-03-31T22:39:46.246Z" },
838
+ { url = "https://files.pythonhosted.org/packages/ce/48/6790b402803250e9936435613d3a78b9aaeee7973439f0918848dde58309/hf_xet-1.4.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:d972fbe95ddc0d3c0fc49b31a8a69f47db35c1e3699bf316421705741aab6653", size = 3986281, upload-time = "2026-03-31T22:39:44.648Z" },
839
+ { url = "https://files.pythonhosted.org/packages/51/56/ea62552fe53db652a9099eda600b032d75554d0e86c12a73824bfedef88b/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c5b48db1ee344a805a1b9bd2cda9b6b65fe77ed3787bd6e87ad5521141d317cd", size = 4187414, upload-time = "2026-03-31T22:40:04.951Z" },
840
+ { url = "https://files.pythonhosted.org/packages/7d/f5/bc1456d4638061bea997e6d2db60a1a613d7b200e0755965ec312dc1ef79/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:22bdc1f5fb8b15bf2831440b91d1c9bbceeb7e10c81a12e8d75889996a5c9da8", size = 4424368, upload-time = "2026-03-31T22:40:06.347Z" },
841
+ { url = "https://files.pythonhosted.org/packages/e4/76/ab597bae87e1f06d18d3ecb8ed7f0d3c9a37037fc32ce76233d369273c64/hf_xet-1.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:0392c79b7cf48418cd61478c1a925246cf10639f4cd9d94368d8ca1e8df9ea07", size = 3672280, upload-time = "2026-03-31T22:40:16.401Z" },
842
+ { url = "https://files.pythonhosted.org/packages/62/05/2e462d34e23a09a74d73785dbed71cc5dbad82a72eee2ad60a72a554155d/hf_xet-1.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:681c92a07796325778a79d76c67011764ecc9042a8c3579332b61b63ae512075", size = 3528945, upload-time = "2026-03-31T22:40:14.995Z" },
843
+ { url = "https://files.pythonhosted.org/packages/ac/9f/9c23e4a447b8f83120798f9279d0297a4d1360bdbf59ef49ebec78fe2545/hf_xet-1.4.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d0da85329eaf196e03e90b84c2d0aca53bd4573d097a75f99609e80775f98025", size = 3805048, upload-time = "2026-03-31T22:39:53.105Z" },
844
+ { url = "https://files.pythonhosted.org/packages/0b/f8/7aacb8e5f4a7899d39c787b5984e912e6c18b11be136ef13947d7a66d265/hf_xet-1.4.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e23717ce4186b265f69afa66e6f0069fe7efbf331546f5c313d00e123dc84583", size = 3562178, upload-time = "2026-03-31T22:39:51.295Z" },
845
+ { url = "https://files.pythonhosted.org/packages/df/9a/a24b26dc8a65f0ecc0fe5be981a19e61e7ca963b85e062c083f3a9100529/hf_xet-1.4.3-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc360b70c815bf340ed56c7b8c63aacf11762a4b099b2fe2c9bd6d6068668c08", size = 4212320, upload-time = "2026-03-31T22:39:42.922Z" },
846
+ { url = "https://files.pythonhosted.org/packages/53/60/46d493db155d2ee2801b71fb1b0fd67696359047fdd8caee2c914cc50c79/hf_xet-1.4.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:39f2d2e9654cd9b4319885733993807aab6de9dfbd34c42f0b78338d6617421f", size = 3991546, upload-time = "2026-03-31T22:39:41.335Z" },
847
+ { url = "https://files.pythonhosted.org/packages/bc/f5/067363e1c96c6b17256910830d1b54099d06287e10f4ec6ec4e7e08371fc/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:49ad8a8cead2b56051aa84d7fce3e1335efe68df3cf6c058f22a65513885baac", size = 4193200, upload-time = "2026-03-31T22:40:01.936Z" },
848
+ { url = "https://files.pythonhosted.org/packages/42/4b/53951592882d9c23080c7644542fda34a3813104e9e11fa1a7d82d419cb8/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7716d62015477a70ea272d2d68cd7cad140f61c52ee452e133e139abfe2c17ba", size = 4429392, upload-time = "2026-03-31T22:40:03.492Z" },
849
+ { url = "https://files.pythonhosted.org/packages/8a/21/75a6c175b4e79662ad8e62f46a40ce341d8d6b206b06b4320d07d55b188c/hf_xet-1.4.3-cp37-abi3-win_amd64.whl", hash = "sha256:6b591fcad34e272a5b02607485e4f2a1334aebf1bc6d16ce8eb1eb8978ac2021", size = 3677359, upload-time = "2026-03-31T22:40:13.619Z" },
850
+ { url = "https://files.pythonhosted.org/packages/8a/7c/44314ecd0e89f8b2b51c9d9e5e7a60a9c1c82024ac471d415860557d3cd8/hf_xet-1.4.3-cp37-abi3-win_arm64.whl", hash = "sha256:7c2c7e20bcfcc946dc67187c203463f5e932e395845d098cc2a93f5b67ca0b47", size = 3533664, upload-time = "2026-03-31T22:40:12.152Z" },
851
  ]
852
 
853
  [[package]]
 
902
 
903
  [[package]]
904
  name = "huggingface-hub"
905
+ version = "1.11.0"
906
  source = { registry = "https://pypi.org/simple" }
907
  dependencies = [
908
  { name = "filelock" },
 
915
  { name = "typer" },
916
  { name = "typing-extensions" },
917
  ]
918
+ sdist = { url = "https://files.pythonhosted.org/packages/dc/89/e7aa12d8a6b9259bed10671abb25ae6fa437c0f88a86ecbf59617bae7759/huggingface_hub-1.11.0.tar.gz", hash = "sha256:15fb3713c7f9cdff7b808a94fd91664f661ab142796bb48c9cd9493e8d166278", size = 761749, upload-time = "2026-04-16T13:07:39.73Z" }
919
  wheels = [
920
+ { url = "https://files.pythonhosted.org/packages/37/02/4f3f8997d1ea7fe0146b343e5e14bd065fa87af790d07e5576d31b31cc18/huggingface_hub-1.11.0-py3-none-any.whl", hash = "sha256:42a6de0afbfeb5e022222d36398f029679db4eb4778801aafda32257ae9131ab", size = 645499, upload-time = "2026-04-16T13:07:37.716Z" },
921
  ]
922
 
923
  [[package]]
 
2366
 
2367
  [[package]]
2368
  name = "slop-farmer"
2369
+ version = "0.1.1"
2370
  source = { editable = "." }
2371
  dependencies = [
2372
  { name = "duckdb" },
 
2398
  { name = "fast-agent-mcp", marker = "python_full_version >= '3.13.5' and extra == 'llm'", specifier = ">=0.6.16" },
2399
  { name = "fastapi", specifier = ">=0.115.0" },
2400
  { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.28.0" },
2401
+ { name = "huggingface-hub", specifier = ">=1.11.0" },
2402
  { name = "pyarrow", specifier = ">=18.0.0" },
2403
  { name = "pydantic", specifier = ">=2.11" },
2404
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.0" },