Spaces:

evalstate
/

openclaw-pr-api

Sleeping

App Files Community

evalstate HF Staff committed on Apr 17

Commit

114bead

verified ·

1 Parent(s): d09c394

Deploy PR search API with issues/contributors routes

Browse files

Files changed (26) hide show

README.md +5 -4
pyproject.toml +8 -1
src/slop_farmer.egg-info/PKG-INFO +42 -1
src/slop_farmer.egg-info/SOURCES.txt +1 -2
src/slop_farmer.egg-info/entry_points.txt +0 -1
src/slop_farmer.egg-info/requires.txt +1 -1
src/slop_farmer/__init__.py +1 -1
src/slop_farmer/app/cli.py +0 -307
src/slop_farmer/app/deploy.py +2 -11
src/slop_farmer/app/hf_checkpoint_import.py +70 -10
src/slop_farmer/app/pipeline.py +90 -12
src/slop_farmer/app/pr_search.py +0 -74
src/slop_farmer/app/pr_search_api.py +133 -83
src/slop_farmer/app/workflow.py +0 -3
src/slop_farmer/app_config.py +0 -22
src/slop_farmer/config.py +0 -38
src/slop_farmer/data/search_duckdb.py +0 -146
src/slop_farmer/data/snapshot_materialize.py +6 -0
src/slop_farmer/reports/analysis.py +17 -9
src/slop_farmer/reports/dashboard.py +2 -9
src/slop_farmer/reports/new_contributor_report.py +3 -11
src/slop_farmer/reports/pr_scope.py +16 -9
src/slop_farmer/reports/pr_search_scope.py +16 -12
src/slop_farmer/reports/pr_search_service.py +1 -166
src/slop_farmer/reports/read_views.py +742 -0
uv.lock +4 -4

README.md CHANGED Viewed

@@ -29,8 +29,9 @@ Defaults for this deployment:
 CLI examples:
 ```bash
-pr-search repo status
-pr-search pr similar 67096
-pr-search pr clusters 67096
-pr-search --json pr similar 67096
 ```

 CLI examples:
 ```bash
+pr-search status
+pr-search code similar 67096
+pr-search code clusters for-pr 67096
+pr-search issues list --limit 5
+pr-search contributors list --limit 10
 ```

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "slop-farmer"
-version = "0.1.1"
 description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
 readme = "README.md"
 requires-python = ">=3.13.5"
@@ -60,6 +60,13 @@ select = [
 ]
 ignore = ["E501"]
 [tool.slop-farmer.dashboard-data]
 output-dir = "web/public/data"
 window-days = 14

 [project]
 name = "slop-farmer"
+version = "0.1.0"
 description = "GitHub-to-Hub data pipeline for transformers issue and PR triage research."
 readme = "README.md"
 requires-python = ">=3.13.5"
 ]
 ignore = ["E501"]
+[tool.slop-farmer.analyze]
+output-dir = "eval_data"
+hf-repo-id = "evalstate/transformers-pr"
+ranking-backend = "hybrid"
+model = "gpt-5.4-mini"
+max-clusters = 10
 [tool.slop-farmer.dashboard-data]
 output-dir = "web/public/data"
 window-days = 14

src/slop_farmer.egg-info/PKG-INFO CHANGED Viewed

@@ -11,7 +11,7 @@ Requires-Dist: huggingface_hub>=0.30.0
 Requires-Dist: pydantic>=2.11
 Requires-Dist: PyYAML>=6.0.2
 Requires-Dist: rank-bm25>=0.2.2
-Requires-Dist: fast-agent-mcp>=0.6.16
 Requires-Dist: uvicorn>=0.34.0
 Provides-Extra: dev
 Requires-Dist: httpx>=0.28.0; extra == "dev"
@@ -409,3 +409,44 @@ Or use the CLI wrapper with a YAML config:
 ```bash
 uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
 ```

 Requires-Dist: pydantic>=2.11
 Requires-Dist: PyYAML>=6.0.2
 Requires-Dist: rank-bm25>=0.2.2
+Requires-Dist: fast-agent-mcp>=0.6.17
 Requires-Dist: uvicorn>=0.34.0
 Provides-Extra: dev
 Requires-Dist: httpx>=0.28.0; extra == "dev"
 ```bash
 uv run slop-farmer --config configs/diffusers.yaml deploy-dashboard --refresh-contributors
 ```
+## Deploy the PR similarity API to a Hugging Face Docker Space
+The repo includes the FastAPI service for the read-oriented PR similarity surface.
+The standalone `pr-search` client now lives in the downstream `pr-search-cli`
+package.
+Deploy the OpenClaw API Space with:
+```bash
+scripts/update_openclaw_pr_search_api.sh
+```
+Or use the generic deploy script directly:
+```bash
+SPACE_ID=evalstate/openclaw-pr-api \
+SPACE_TITLE="OpenClaw PR API" \
+DEFAULT_REPO=openclaw/openclaw \
+GHR_BASE_URL=https://ghreplica.dutiful.dev \
+HF_REPO_ID=evalstate/openclaw-pr \
+BUCKET_ID=evalstate/openclaw-pr-api-data \
+scripts/deploy_pr_search_space.sh
+```
+This deploy flow:
+- creates or updates a Docker Space
+- uploads a minimal app bundle with a generated Space `README.md`
+- sets runtime variables for the API
+- mounts the configured HF bucket at `/data`
+After the Space is live, you can query it either through the in-repo admin CLI:
+```bash
+uv run slop-farmer pr-search status --repo openclaw/openclaw
+uv run slop-farmer pr-search similar 67096 --repo openclaw/openclaw
+```
+Or through the downstream `pr-search-cli` package, which owns the standalone
+`pr-search` executable.

src/slop_farmer.egg-info/SOURCES.txt CHANGED Viewed

@@ -17,7 +17,6 @@ src/slop_farmer/app/hf_checkpoint_import.py
 src/slop_farmer/app/pipeline.py
 src/slop_farmer/app/pr_search.py
 src/slop_farmer/app/pr_search_api.py
-src/slop_farmer/app/pr_search_client.py
 src/slop_farmer/app/publish.py
 src/slop_farmer/app/snapshot_state.py
 src/slop_farmer/app/workflow.py
@@ -42,6 +41,7 @@ src/slop_farmer/reports/pr_heuristics.py
 src/slop_farmer/reports/pr_scope.py
 src/slop_farmer/reports/pr_search_scope.py
 src/slop_farmer/reports/pr_search_service.py
 src/slop_farmer/reports/user_activity.py
 tests/test_analysis.py
 tests/test_analysis_cache.py
@@ -61,7 +61,6 @@ tests/test_pipeline_checkpoint_resume.py
 tests/test_pr_scope.py
 tests/test_pr_search.py
 tests/test_pr_search_api.py
-tests/test_pr_search_client.py
 tests/test_publish.py
 tests/test_snapshot_state.py
 tests/test_update_transformers_dataset.py

 src/slop_farmer/app/pipeline.py
 src/slop_farmer/app/pr_search.py
 src/slop_farmer/app/pr_search_api.py
 src/slop_farmer/app/publish.py
 src/slop_farmer/app/snapshot_state.py
 src/slop_farmer/app/workflow.py
 src/slop_farmer/reports/pr_scope.py
 src/slop_farmer/reports/pr_search_scope.py
 src/slop_farmer/reports/pr_search_service.py
+src/slop_farmer/reports/read_views.py
 src/slop_farmer/reports/user_activity.py
 tests/test_analysis.py
 tests/test_analysis_cache.py
 tests/test_pr_scope.py
 tests/test_pr_search.py
 tests/test_pr_search_api.py
 tests/test_publish.py
 tests/test_snapshot_state.py
 tests/test_update_transformers_dataset.py

src/slop_farmer.egg-info/entry_points.txt CHANGED Viewed

@@ -1,3 +1,2 @@
 [console_scripts]
-pr-search = slop_farmer.app.pr_search_client:main
 slop-farmer = slop_farmer.app.cli:main


1	[console_scripts]

2	slop-farmer = slop_farmer.app.cli:main

src/slop_farmer.egg-info/requires.txt CHANGED Viewed

@@ -5,7 +5,7 @@ huggingface_hub>=0.30.0
 pydantic>=2.11
 PyYAML>=6.0.2
 rank-bm25>=0.2.2
-fast-agent-mcp>=0.6.16
 uvicorn>=0.34.0
 [dev]

 pydantic>=2.11
 PyYAML>=6.0.2
 rank-bm25>=0.2.2
+fast-agent-mcp>=0.6.17
 uvicorn>=0.34.0
 [dev]

src/slop_farmer/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 __all__ = ["__version__"]
-__version__ = "0.1.1"


1	__all__ = ["__version__"]
2
3	+ __version__ = "0.1.0"

src/slop_farmer/app/cli.py CHANGED Viewed

@@ -13,8 +13,6 @@ from slop_farmer.config import (
     AnalysisOptions,
     CheckpointImportOptions,
     DashboardDataOptions,
-    DatasetRefreshOptions,
-    DatasetStatusOptions,
     DeployDashboardOptions,
     FullPipelineOptions,
     MarkdownReportOptions,
@@ -43,7 +41,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
     subparsers = parser.add_subparsers(dest="command", required=True)
     _add_scrape_parser(subparsers, defaults["scrape"])
-    _add_refresh_dataset_parser(subparsers, defaults["refresh-dataset"])
     _add_analyze_parser(subparsers, defaults["analyze"])
     _add_pr_scope_parser(subparsers, defaults["pr-scope"])
     _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
@@ -55,7 +52,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
     _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
     _add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
     _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
-    _add_dataset_status_parser(subparsers, defaults["dataset-status"])
     _add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
     return parser
@@ -63,7 +59,6 @@ def build_parser(*, config_path: Path | None = None) -> argparse.ArgumentParser:
 def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
     commands = (
         "scrape",
-        "refresh-dataset",
         "analyze",
         "import-hf-checkpoint",
         "pr-scope",
@@ -73,7 +68,6 @@ def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]
         "dashboard-data",
         "publish-snapshot",
         "deploy-dashboard",
-        "dataset-status",
         "full-pipeline",
     )
     return {command: command_defaults(command, config_path=config_path) for command in commands}
@@ -190,80 +184,6 @@ def _add_scrape_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     )
-def _add_refresh_dataset_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
-    refresh = subparsers.add_parser(
-        "refresh-dataset",
-        help="Refresh the canonical Hugging Face dataset repo from remote watermark state.",
-    )
-    refresh.add_argument(
-        "--repo",
-        default=defaults.get("repo", "huggingface/transformers"),
-        help="GitHub repository in owner/name form.",
-    )
-    refresh.add_argument(
-        "--hf-repo-id",
-        default=defaults.get("hf-repo-id"),
-        required=defaults.get("hf-repo-id") is None,
-        help="Canonical Hugging Face dataset repo id to refresh.",
-    )
-    refresh.add_argument("--max-issues", type=int, default=defaults.get("max-issues"))
-    refresh.add_argument("--max-prs", type=int, default=defaults.get("max-prs"))
-    refresh.add_argument(
-        "--max-issue-comments", type=int, default=defaults.get("max-issue-comments")
-    )
-    refresh.add_argument(
-        "--max-reviews-per-pr", type=int, default=defaults.get("max-reviews-per-pr")
-    )
-    refresh.add_argument(
-        "--max-review-comments-per-pr",
-        type=int,
-        default=defaults.get("max-review-comments-per-pr"),
-    )
-    refresh.add_argument(
-        "--fetch-timeline",
-        action="store_true",
-        default=bool(defaults.get("fetch-timeline", False)),
-    )
-    refresh.add_argument(
-        "--new-contributor-report",
-        dest="new_contributor_report",
-        action="store_true",
-        default=bool(defaults.get("new-contributor-report", True)),
-    )
-    refresh.add_argument(
-        "--no-new-contributor-report",
-        dest="new_contributor_report",
-        action="store_false",
-    )
-    refresh.add_argument(
-        "--new-contributor-window-days",
-        type=int,
-        default=int(defaults.get("new-contributor-window-days", 42)),
-    )
-    refresh.add_argument(
-        "--new-contributor-max-authors",
-        type=int,
-        default=int(defaults.get("new-contributor-max-authors", 25)),
-    )
-    refresh.add_argument("--http-timeout", type=int, default=300)
-    refresh.add_argument("--http-max-retries", type=int, default=8)
-    refresh.add_argument("--checkpoint-every-comments", type=int, default=1000)
-    refresh.add_argument("--checkpoint-every-prs", type=int, default=25)
-    refresh.add_argument(
-        "--private-hf-repo",
-        dest="private_hf_repo",
-        action="store_true",
-        default=bool(defaults.get("private-hf-repo", False)),
-        help="Create the target dataset repo as private if needed.",
-    )
-    refresh.add_argument(
-        "--private",
-        dest="private_hf_repo",
-        action="store_true",
-        help=argparse.SUPPRESS,
-    )
 def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     analyze = subparsers.add_parser(
         "analyze", help="Analyze a local snapshot and write a shortlist JSON report."
@@ -717,61 +637,6 @@ def _add_pr_search_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     status.add_argument("--repo", help="Optional repo override.")
     status.add_argument("--json", action="store_true", help="Emit JSON.")
-    contributor = pr_search_subparsers.add_parser(
-        "contributor", help="Show indexed contributor summary for one author login."
-    )
-    contributor.add_argument("login", help="GitHub author login to query.")
-    contributor.add_argument(
-        "--db",
-        type=Path,
-        default=Path(defaults["db"]) if defaults.get("db") else None,
-        help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
-    )
-    contributor.add_argument(
-        "--output-dir",
-        type=Path,
-        default=Path(defaults.get("output-dir", "data")),
-    )
-    contributor.add_argument("--repo", help="Optional repo override.")
-    contributor.add_argument("--json", action="store_true", help="Emit JSON.")
-    contributor_prs = pr_search_subparsers.add_parser(
-        "contributor-prs", help="List indexed PRs for one contributor login."
-    )
-    contributor_prs.add_argument("login", help="GitHub author login to query.")
-    contributor_prs.add_argument(
-        "--db",
-        type=Path,
-        default=Path(defaults["db"]) if defaults.get("db") else None,
-        help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
-    )
-    contributor_prs.add_argument(
-        "--output-dir",
-        type=Path,
-        default=Path(defaults.get("output-dir", "data")),
-    )
-    contributor_prs.add_argument("--repo", help="Optional repo override.")
-    contributor_prs.add_argument("--limit", type=int, default=20, help="Maximum rows to show.")
-    contributor_prs.add_argument("--json", action="store_true", help="Emit JSON.")
-    pr_contributor = pr_search_subparsers.add_parser(
-        "pr-contributor", help="Show contributor summary for the author of one indexed PR."
-    )
-    pr_contributor.add_argument("pr_number", type=int, help="Pull request number to query.")
-    pr_contributor.add_argument(
-        "--db",
-        type=Path,
-        default=Path(defaults["db"]) if defaults.get("db") else None,
-        help="DuckDB file path. Defaults to <output-dir>/state/pr-search.duckdb.",
-    )
-    pr_contributor.add_argument(
-        "--output-dir",
-        type=Path,
-        default=Path(defaults.get("output-dir", "data")),
-    )
-    pr_contributor.add_argument("--repo", help="Optional repo override.")
-    pr_contributor.add_argument("--json", action="store_true", help="Emit JSON.")
 def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     new_contributor = subparsers.add_parser(
@@ -794,24 +659,6 @@ def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]
     new_contributor.add_argument(
         "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
     )
-    new_contributor.add_argument(
-        "--hf-repo-id",
-        default=defaults.get("hf-repo-id"),
-        help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
-    )
-    new_contributor.add_argument(
-        "--hf-revision",
-        default=defaults.get("hf-revision"),
-        help="Optional Hub revision for metadata and README download.",
-    )
-    new_contributor.add_argument(
-        "--hf-materialize-dir",
-        type=Path,
-        default=Path(defaults["hf-materialize-dir"])
-        if defaults.get("hf-materialize-dir")
-        else None,
-        help="Optional local directory used when materializing an HF dataset snapshot.",
-    )
     new_contributor.add_argument(
         "--window-days",
         type=int,
@@ -855,24 +702,6 @@ def _add_dashboard_data_parser(subparsers: Any, defaults: dict[str, Any]) -> Non
         type=Path,
         help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
     )
-    dashboard.add_argument(
-        "--hf-repo-id",
-        default=defaults.get("hf-repo-id"),
-        help="Analyze a Hugging Face dataset repo by materializing its parquet export locally.",
-    )
-    dashboard.add_argument(
-        "--hf-revision",
-        default=defaults.get("hf-revision"),
-        help="Optional Hub revision for metadata and README download.",
-    )
-    dashboard.add_argument(
-        "--hf-materialize-dir",
-        type=Path,
-        default=Path(defaults["hf-materialize-dir"])
-        if defaults.get("hf-materialize-dir")
-        else None,
-        help="Optional local directory used when materializing an HF dataset snapshot.",
-    )
     dashboard.add_argument(
         "--window-days",
         type=int,
@@ -932,24 +761,6 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
     deploy_dashboard.add_argument(
         "--contributors-input", type=Path, help="Optional contributor report JSON override."
     )
-    deploy_dashboard.add_argument(
-        "--hf-repo-id",
-        default=defaults.get("hf-repo-id"),
-        help="Materialize a Hugging Face dataset repo instead of using the latest local snapshot.",
-    )
-    deploy_dashboard.add_argument(
-        "--hf-revision",
-        default=defaults.get("hf-revision"),
-        help="Optional Hub revision for metadata and README download.",
-    )
-    deploy_dashboard.add_argument(
-        "--hf-materialize-dir",
-        type=Path,
-        default=Path(defaults["hf-materialize-dir"])
-        if defaults.get("hf-materialize-dir")
-        else None,
-        help="Optional local directory used when materializing an HF dataset snapshot.",
-    )
     deploy_dashboard.add_argument(
         "--refresh-contributors",
         action="store_true",
@@ -1006,31 +817,6 @@ def _add_deploy_dashboard_parser(subparsers: Any, defaults: dict[str, Any]) -> N
     )
-def _add_dataset_status_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
-    dataset_status = subparsers.add_parser(
-        "dataset-status",
-        help="Inspect canonical dataset freshness and the local latest pointer.",
-    )
-    dataset_status.add_argument("--repo", default=defaults.get("repo"))
-    dataset_status.add_argument(
-        "--output-dir",
-        type=Path,
-        default=Path(defaults.get("output-dir", "data")),
-        help="Local workspace root containing snapshots/latest.json.",
-    )
-    dataset_status.add_argument(
-        "--hf-repo-id",
-        default=defaults.get("hf-repo-id"),
-        help="Canonical Hugging Face dataset repo id to inspect.",
-    )
-    dataset_status.add_argument(
-        "--hf-revision",
-        default=defaults.get("hf-revision"),
-        help="Optional Hub revision for metadata and README download.",
-    )
-    dataset_status.add_argument("--json", action="store_true", help="Emit machine-readable JSON.")
 def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     full_pipeline = subparsers.add_parser(
         "full-pipeline",
@@ -1147,33 +933,6 @@ def _run_scrape(args: argparse.Namespace, config_path: Path | None) -> None:
     print(run_pipeline(options))
-def _run_refresh_dataset(args: argparse.Namespace, config_path: Path | None) -> None:
-    del config_path
-    from slop_farmer.app.dataset_refresh import run_dataset_refresh
-    result = run_dataset_refresh(
-        DatasetRefreshOptions(
-            repo=RepoRef.parse(args.repo),
-            hf_repo_id=args.hf_repo_id,
-            private_hf_repo=args.private_hf_repo,
-            max_issues=args.max_issues,
-            max_prs=args.max_prs,
-            max_issue_comments=args.max_issue_comments,
-            max_reviews_per_pr=args.max_reviews_per_pr,
-            max_review_comments_per_pr=args.max_review_comments_per_pr,
-            fetch_timeline=args.fetch_timeline,
-            new_contributor_report=args.new_contributor_report,
-            new_contributor_window_days=args.new_contributor_window_days,
-            new_contributor_max_authors=args.new_contributor_max_authors,
-            http_timeout=args.http_timeout,
-            http_max_retries=args.http_max_retries,
-            checkpoint_every_comments=args.checkpoint_every_comments,
-            checkpoint_every_prs=args.checkpoint_every_prs,
-        )
-    )
-    print(json.dumps(result, indent=2))
 def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
     from slop_farmer.reports.analysis import run_analysis
@@ -1282,18 +1041,12 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
         explain_pr_search_pair,
         format_pr_search_candidate_clusters,
         format_pr_search_cluster,
-        format_pr_search_contributor,
-        format_pr_search_contributor_pulls,
         format_pr_search_pair,
         format_pr_search_probe,
-        format_pr_search_pull_contributor,
         format_pr_search_similar,
         format_pr_search_status,
         get_pr_search_candidate_clusters,
         get_pr_search_cluster,
-        get_pr_search_contributor,
-        get_pr_search_contributor_pulls,
-        get_pr_search_pull_contributor,
         get_pr_search_similar,
         get_pr_search_status,
         probe_pr_search_github,
@@ -1387,36 +1140,6 @@ def _run_pr_search(args: argparse.Namespace, config_path: Path | None) -> None:
         print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
         return
-    if args.pr_search_command == "contributor":
-        result = get_pr_search_contributor(db_path, author_login=args.login, repo=args.repo)
-        print(json.dumps(result, indent=2) if args.json else format_pr_search_contributor(result))
-        return
-    if args.pr_search_command == "contributor-prs":
-        result = get_pr_search_contributor_pulls(
-            db_path,
-            author_login=args.login,
-            repo=args.repo,
-            limit=args.limit,
-        )
-        print(
-            json.dumps(result, indent=2)
-            if args.json
-            else format_pr_search_contributor_pulls(result)
-        )
-        return
-    if args.pr_search_command == "pr-contributor":
-        result = get_pr_search_pull_contributor(
-            db_path,
-            pr_number=args.pr_number,
-            repo=args.repo,
-        )
-        print(
-            json.dumps(result, indent=2) if args.json else format_pr_search_pull_contributor(result)
-        )
-        return
     raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
@@ -1458,7 +1181,6 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
     del config_path
     from slop_farmer.reports.new_contributor_report import run_new_contributor_report
-    hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
     print(
         run_new_contributor_report(
             NewContributorReportOptions(
@@ -1466,9 +1188,6 @@ def _run_new_contributor_report(args: argparse.Namespace, config_path: Path | No
                 output_dir=args.output_dir,
                 output=args.output,
                 json_output=args.json_output,
-                hf_repo_id=hf_repo_id,
-                hf_revision=hf_revision,
-                hf_materialize_dir=hf_materialize_dir,
                 window_days=args.window_days,
                 max_authors=args.max_authors,
             )
@@ -1480,7 +1199,6 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
     from slop_farmer.reports.dashboard import run_dashboard_data
     dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
-    hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
     print(
         run_dashboard_data(
             DashboardDataOptions(
@@ -1489,9 +1207,6 @@ def _run_dashboard_data(args: argparse.Namespace, config_path: Path | None) -> N
                 analysis_input=args.analysis_input,
                 contributors_input=args.contributors_input,
                 pr_scope_input=args.pr_scope_input,
-                hf_repo_id=hf_repo_id,
-                hf_revision=hf_revision,
-                hf_materialize_dir=hf_materialize_dir,
                 window_days=args.window_days,
                 snapshot_root=(
                     Path(dashboard_defaults["snapshot-root"])
@@ -1507,7 +1222,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
     del config_path
     from slop_farmer.app.deploy import run_deploy_dashboard
-    hf_repo_id, hf_revision, hf_materialize_dir = _resolve_hf_inputs(args)
     run_deploy_dashboard(
         DeployDashboardOptions(
             pipeline_data_dir=args.pipeline_data_dir,
@@ -1515,9 +1229,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
             snapshot_dir=args.snapshot_dir,
             analysis_input=args.analysis_input,
             contributors_input=args.contributors_input,
-            hf_repo_id=hf_repo_id,
-            hf_revision=hf_revision,
-            hf_materialize_dir=hf_materialize_dir,
             refresh_contributors=args.refresh_contributors,
             dashboard_window_days=args.dashboard_window_days,
             contributor_window_days=args.contributor_window_days,
@@ -1536,22 +1247,6 @@ def _run_deploy_dashboard(args: argparse.Namespace, config_path: Path | None) ->
     )
-def _run_dataset_status(args: argparse.Namespace, config_path: Path | None) -> None:
-    del config_path
-    from slop_farmer.app.dataset_status import format_dataset_status, get_dataset_status
-    result = get_dataset_status(
-        DatasetStatusOptions(
-            repo=args.repo,
-            output_dir=args.output_dir,
-            hf_repo_id=args.hf_repo_id,
-            hf_revision=args.hf_revision,
-            json_output=args.json,
-        )
-    )
-    print(json.dumps(result, indent=2) if args.json else format_dataset_status(result))
 def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
     del config_path
     from slop_farmer.app.publish import run_publish_snapshot
@@ -1601,7 +1296,6 @@ def main() -> None:
     handlers: dict[str, CommandHandler] = {
         "scrape": _run_scrape,
-        "refresh-dataset": _run_refresh_dataset,
         "analyze": _run_analyze,
         "markdown-report": _run_markdown_report,
         "duplicate-prs": _run_duplicate_prs,
@@ -1612,7 +1306,6 @@ def main() -> None:
         "new-contributor-report": _run_new_contributor_report,
         "dashboard-data": _run_dashboard_data,
         "deploy-dashboard": _run_deploy_dashboard,
-        "dataset-status": _run_dataset_status,
         "publish-snapshot": _run_publish_snapshot,
         "full-pipeline": _run_full_pipeline,
     }

     AnalysisOptions,
     CheckpointImportOptions,
     DashboardDataOptions,
     DeployDashboardOptions,
     FullPipelineOptions,
     MarkdownReportOptions,
     subparsers = parser.add_subparsers(dest="command", required=True)
     _add_scrape_parser(subparsers, defaults["scrape"])
     _add_analyze_parser(subparsers, defaults["analyze"])
     _add_pr_scope_parser(subparsers, defaults["pr-scope"])
     _add_checkpoint_import_parser(subparsers, defaults["import-hf-checkpoint"])
     _add_dashboard_data_parser(subparsers, defaults["dashboard-data"])
     _add_publish_snapshot_parser(subparsers, defaults["publish-snapshot"])
     _add_deploy_dashboard_parser(subparsers, defaults["deploy-dashboard"])
     _add_full_pipeline_parser(subparsers, defaults["full-pipeline"])
     return parser
 def _load_parser_defaults(config_path: Path | None) -> dict[str, dict[str, Any]]:
     commands = (
         "scrape",
         "analyze",
         "import-hf-checkpoint",
         "pr-scope",
         "dashboard-data",
         "publish-snapshot",
         "deploy-dashboard",
         "full-pipeline",
     )
     return {command: command_defaults(command, config_path=config_path) for command in commands}
     )
 def _add_analyze_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     analyze = subparsers.add_parser(
         "analyze", help="Analyze a local snapshot and write a shortlist JSON report."
     status.add_argument("--repo", help="Optional repo override.")
     status.add_argument("--json", action="store_true", help="Emit JSON.")
 def _add_new_contributor_report_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     new_contributor = subparsers.add_parser(
     new_contributor.add_argument(
         "--json-output", type=Path, help="Optional JSON output path. Defaults next to the snapshot."
     )
     new_contributor.add_argument(
         "--window-days",
         type=int,
         type=Path,
         help="Optional PR scope cluster JSON. Defaults to pr-scope-clusters.json in the snapshot.",
     )
     dashboard.add_argument(
         "--window-days",
         type=int,
     deploy_dashboard.add_argument(
         "--contributors-input", type=Path, help="Optional contributor report JSON override."
     )
     deploy_dashboard.add_argument(
         "--refresh-contributors",
         action="store_true",
     )
 def _add_full_pipeline_parser(subparsers: Any, defaults: dict[str, Any]) -> None:
     full_pipeline = subparsers.add_parser(
         "full-pipeline",
     print(run_pipeline(options))
 def _run_analyze(args: argparse.Namespace, config_path: Path | None) -> None:
     from slop_farmer.reports.analysis import run_analysis
         explain_pr_search_pair,
         format_pr_search_candidate_clusters,
         format_pr_search_cluster,
         format_pr_search_pair,
         format_pr_search_probe,
         format_pr_search_similar,
         format_pr_search_status,
         get_pr_search_candidate_clusters,
         get_pr_search_cluster,
         get_pr_search_similar,
         get_pr_search_status,
         probe_pr_search_github,
         print(json.dumps(result, indent=2) if args.json else format_pr_search_status(result))
         return
     raise ValueError(f"Unsupported pr-search command: {args.pr_search_command}")
     del config_path
     from slop_farmer.reports.new_contributor_report import run_new_contributor_report
     print(
         run_new_contributor_report(
             NewContributorReportOptions(
                 output_dir=args.output_dir,
                 output=args.output,
                 json_output=args.json_output,
                 window_days=args.window_days,
                 max_authors=args.max_authors,
             )
     from slop_farmer.reports.dashboard import run_dashboard_data
     dashboard_defaults = command_defaults("dashboard-data", config_path=config_path)
     print(
         run_dashboard_data(
             DashboardDataOptions(
                 analysis_input=args.analysis_input,
                 contributors_input=args.contributors_input,
                 pr_scope_input=args.pr_scope_input,
                 window_days=args.window_days,
                 snapshot_root=(
                     Path(dashboard_defaults["snapshot-root"])
     del config_path
     from slop_farmer.app.deploy import run_deploy_dashboard
     run_deploy_dashboard(
         DeployDashboardOptions(
             pipeline_data_dir=args.pipeline_data_dir,
             snapshot_dir=args.snapshot_dir,
             analysis_input=args.analysis_input,
             contributors_input=args.contributors_input,
             refresh_contributors=args.refresh_contributors,
             dashboard_window_days=args.dashboard_window_days,
             contributor_window_days=args.contributor_window_days,
     )
 def _run_publish_snapshot(args: argparse.Namespace, config_path: Path | None) -> None:
     del config_path
     from slop_farmer.app.publish import run_publish_snapshot
     handlers: dict[str, CommandHandler] = {
         "scrape": _run_scrape,
         "analyze": _run_analyze,
         "markdown-report": _run_markdown_report,
         "duplicate-prs": _run_duplicate_prs,
         "new-contributor-report": _run_new_contributor_report,
         "dashboard-data": _run_dashboard_data,
         "deploy-dashboard": _run_deploy_dashboard,
         "publish-snapshot": _run_publish_snapshot,
         "full-pipeline": _run_full_pipeline,
     }

src/slop_farmer/app/deploy.py CHANGED Viewed

@@ -5,7 +5,6 @@ import subprocess
 from pathlib import Path
 from slop_farmer.config import DeployDashboardOptions
-from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
@@ -18,16 +17,6 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
         {
             "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
             "WEB_DIR": str(options.web_dir),
-            "SNAPSHOT_DIR": str(
-                resolve_snapshot_source_dir(
-                    snapshot_dir=options.snapshot_dir,
-                    local_snapshots_root=options.pipeline_data_dir.resolve() / "snapshots",
-                    hf_repo_id=options.hf_repo_id,
-                    hf_revision=options.hf_revision,
-                    hf_materialize_dir=options.hf_materialize_dir,
-                    hf_output_dir=options.pipeline_data_dir,
-                )
-            ),
             "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
             "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
             "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
@@ -39,6 +28,8 @@ def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
             "SPACE_SHORT_DESCRIPTION": options.space_short_description,
         }
     )
     if options.analysis_input is not None:
         env["ANALYSIS_INPUT"] = str(options.analysis_input)
     if options.contributors_input is not None:

 from pathlib import Path
 from slop_farmer.config import DeployDashboardOptions
 def run_deploy_dashboard(options: DeployDashboardOptions) -> None:
         {
             "PIPELINE_DATA_DIR": str(options.pipeline_data_dir),
             "WEB_DIR": str(options.web_dir),
             "DASHBOARD_WINDOW_DAYS": str(options.dashboard_window_days),
             "CONTRIBUTOR_WINDOW_DAYS": str(options.contributor_window_days),
             "CONTRIBUTOR_MAX_AUTHORS": str(options.contributor_max_authors),
             "SPACE_SHORT_DESCRIPTION": options.space_short_description,
         }
     )
+    if options.snapshot_dir is not None:
+        env["SNAPSHOT_DIR"] = str(options.snapshot_dir)
     if options.analysis_input is not None:
         env["ANALYSIS_INPUT"] = str(options.analysis_input)
     if options.contributors_input is not None:

src/slop_farmer/app/hf_checkpoint_import.py CHANGED Viewed

@@ -28,7 +28,6 @@ from huggingface_hub import HfApi, hf_hub_download
 from slop_farmer.app.publish import publish_snapshot
 from slop_farmer.config import CheckpointImportOptions
-from slop_farmer.data.dataset_card import build_hf_dataset_card
 from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
 from slop_farmer.data.parquet_io import (
     SCHEMAS,
@@ -456,15 +455,76 @@ def _viewer_comment_rows(
 def _dataset_card(
     repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
 ) -> str:
-    return build_hf_dataset_card(
-        repo_slug,
-        snapshot_id,
-        notes=[
-            f"source HF dataset: `{source_repo_id}`",
-            f"source checkpoint root: `{checkpoint_root}`",
-            "links were regenerated locally from text references and timeline events",
-        ],
-    )
 def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:

 from slop_farmer.app.publish import publish_snapshot
 from slop_farmer.config import CheckpointImportOptions
 from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
 from slop_farmer.data.parquet_io import (
     SCHEMAS,
 def _dataset_card(  # Render the dataset card text (YAML config header + markdown body) for an imported checkpoint snapshot.
     repo_slug: str, snapshot_id: str, source_repo_id: str, checkpoint_root: str
 ) -> str:  # NOTE(review): the template emits two consecutive `---` lines after the configs list - confirm the second is intended.
+    return f"""---
+pretty_name: Transformers PR Slop Dataset
+configs:
+- config_name: issues
+  data_files:
+  - split: train
+    path: issues.parquet
+  default: true
+- config_name: prs
+  data_files:
+  - split: train
+    path: pull_requests.parquet
+- config_name: issue_comments
+  data_files:
+  - split: train
+    path: issue_comments.parquet
+- config_name: pr_comments
+  data_files:
+  - split: train
+    path: pr_comments.parquet
+- config_name: pr_reviews
+  data_files:
+  - split: train
+    path: reviews.parquet
+- config_name: pr_files
+  data_files:
+  - split: train
+    path: pr_files.parquet
+- config_name: pr_diffs
+  data_files:
+  - split: train
+    path: pr_diffs.parquet
+- config_name: review_comments
+  data_files:
+  - split: train
+    path: review_comments.parquet
+- config_name: links
+  data_files:
+  - split: train
+    path: links.parquet
+- config_name: events
+  data_files:
+  - split: train
+    path: events.parquet
+---
+---
+# Transformers PR Slop Dataset
+Imported checkpoint snapshot for `{repo_slug}`.
+Files:
+- `issues.parquet`
+- `pull_requests.parquet`
+- `comments.parquet`
+- `issue_comments.parquet`
+- `pr_comments.parquet`
+- `reviews.parquet`
+- `pr_files.parquet`
+- `pr_diffs.parquet`
+- `review_comments.parquet`
+- `links.parquet`
+- `events.parquet`
+Notes:
+- source HF dataset: `{source_repo_id}`
+- source checkpoint root: `{checkpoint_root}`
+- latest imported checkpoint: `{snapshot_id}`
+- links were regenerated locally from text references and timeline events
+"""
 def _snapshot_dir_name(source_repo_id: str, checkpoint_id: str) -> str:

src/slop_farmer/app/pipeline.py CHANGED Viewed

@@ -9,7 +9,6 @@ from typing import Any, Protocol
 from slop_farmer.app.publish import publish_snapshot
 from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
-from slop_farmer.data.dataset_card import build_hf_dataset_card
 from slop_farmer.data.github_api import GitHubClient
 from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
 from slop_farmer.data.normalize import (
@@ -113,14 +112,96 @@ def _reference_time_for_age_caps(crawl_started_at: str) -> datetime:
 def _dataset_card(
     repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
 ) -> str:
-    notes = ["new contributor reviewer artifacts are included"] if include_new_contributors else []
-    del manifest
-    return build_hf_dataset_card(
-        repo,
-        snapshot_id,
-        include_new_contributors=include_new_contributors,
-        notes=notes,
-    )
 def _viewer_comment_rows(
@@ -964,9 +1045,6 @@ def run_pipeline(options: PipelineOptions, client: GitHubClientLike | None = Non
                 output_dir=options.output_dir,
                 output=None,
                 json_output=None,
-                hf_repo_id=None,
-                hf_revision=None,
-                hf_materialize_dir=None,
                 window_days=options.new_contributor_window_days,
                 max_authors=options.new_contributor_max_authors,
             )

 from slop_farmer.app.publish import publish_snapshot
 from slop_farmer.config import NewContributorReportOptions, PipelineOptions, resolve_github_token
 from slop_farmer.data.github_api import GitHubClient
 from slop_farmer.data.links import build_pr_duplicate_candidate_rows, build_text_link_rows
 from slop_farmer.data.normalize import (
 def _dataset_card(  # Render the dataset card text (YAML config header + markdown body) for a pipeline snapshot of `repo`.
     repo: str, snapshot_id: str, manifest: dict[str, Any], *, include_new_contributors: bool = False
 ) -> str:  # NOTE(review): `manifest` is not referenced anywhere in this body - confirm it is kept only for the call signature.
+    new_contributor_config = ""  # optional extra YAML config entry; stays empty unless include_new_contributors is set
+    new_contributor_file = ""  # optional extra "Files:" bullets; stays empty unless include_new_contributors is set
+    if include_new_contributors:  # both fragments end with a newline so they splice into the template as whole lines
+        new_contributor_config = """- config_name: new_contributors
+  data_files:
+  - split: train
+    path: new_contributors.parquet
+"""
+        new_contributor_file = """- `new_contributors.parquet`
+- `new-contributors-report.json`
+- `new-contributors-report.md`
+"""
+    return f"""---
+pretty_name: Transformers PR Slop Dataset
+configs:
+- config_name: issues
+  data_files:
+  - split: train
+    path: issues.parquet
+  default: true
+- config_name: prs
+  data_files:
+  - split: train
+    path: pull_requests.parquet
+- config_name: issue_comments
+  data_files:
+  - split: train
+    path: issue_comments.parquet
+- config_name: pr_comments
+  data_files:
+  - split: train
+    path: pr_comments.parquet
+- config_name: pr_reviews
+  data_files:
+  - split: train
+    path: reviews.parquet
+- config_name: pr_files
+  data_files:
+  - split: train
+    path: pr_files.parquet
+- config_name: pr_diffs
+  data_files:
+  - split: train
+    path: pr_diffs.parquet
+- config_name: review_comments
+  data_files:
+  - split: train
+    path: review_comments.parquet
+- config_name: links
+  data_files:
+  - split: train
+    path: links.parquet
+- config_name: events
+  data_files:
+  - split: train
+    path: events.parquet
+{new_contributor_config}---
+---
+# Transformers PR Slop Dataset
+Normalized snapshots of issues, pull requests, comments, reviews, and linkage data from `{repo}`.
+Files:
+- `issues.parquet`
+- `pull_requests.parquet`
+- `comments.parquet`
+- `issue_comments.parquet` (derived view of issue discussion comments)
+- `pr_comments.parquet` (derived view of pull request discussion comments)
+- `reviews.parquet`
+- `pr_files.parquet`
+- `pr_diffs.parquet`
+- `review_comments.parquet`
+- `links.parquet`
+- `events.parquet`
+{new_contributor_file}
+Use:
+- duplicate PR and issue analysis
+- triage and ranking experiments
+- eval set creation
+Notes:
+- updated daily
+- latest snapshot: `{snapshot_id}`
+- raw data only; no labels or moderation decisions
+- PR metadata, file-level patch hunks, and full unified diffs are included
+- new contributor reviewer artifacts are included when generated for the snapshot
+- full file contents for changed files are not included
+"""
 def _viewer_comment_rows(
                 output_dir=options.output_dir,
                 output=None,
                 json_output=None,
                 window_days=options.new_contributor_window_days,
                 max_authors=options.new_contributor_max_authors,
             )

src/slop_farmer/app/pr_search.py CHANGED Viewed

@@ -10,12 +10,9 @@ get_pr_search_status = pr_search_service.get_pr_search_status
 get_pr_search_similar = pr_search_service.get_pr_search_similar
 get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
 get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
-get_pr_search_contributor = pr_search_service.get_pr_search_contributor
-get_pr_search_contributor_pulls = pr_search_service.get_pr_search_contributor_pulls
 get_pr_search_clusters = pr_search_service.get_pr_search_clusters
 list_pr_search_clusters = pr_search_service.list_pr_search_clusters
 get_pr_search_cluster = pr_search_service.get_pr_search_cluster
-get_pr_search_pull_contributor = pr_search_service.get_pr_search_pull_contributor
 explain_pr_search_pair = pr_search_service.explain_pr_search_pair
 probe_pr_search_live = pr_search_service.probe_pr_search_live
 probe_pr_search_github = pr_search_service.probe_pr_search_github
@@ -34,7 +31,6 @@ def format_pr_search_status(result: Mapping[str, Any]) -> str:
             (
                 "Rows: "
                 f"documents={counts['documents']} "
-                f"contributors={counts.get('contributors', 0)} "
                 f"features={counts['features']} "
                 f"neighbors={counts['neighbors']} "
                 f"clusters={counts['clusters']} "
@@ -249,73 +245,3 @@ def format_pr_search_probe(result: Mapping[str, Any]) -> str:
             if row.get("reason"):
                 lines.append(f"   reason: {row['reason']}")
     return "\n".join(lines)
-def format_pr_search_contributor(result: Mapping[str, Any]) -> str:
-    contributor = result["contributor"]
-    lines = [
-        f"Contributor {contributor['author_login']}",
-        f"Repo: {result['repo']}",
-        f"Snapshot: {result['snapshot_id']}",
-        f"Name: {contributor.get('name') or '-'}",
-        f"Profile: {contributor.get('profile_url') or '-'}",
-        f"Association: {contributor.get('repo_association') or '-'}",
-        f"First seen in snapshot: {'yes' if contributor.get('first_seen_in_snapshot') else 'no'}",
-        (
-            "Scores: "
-            f"follow-through={contributor.get('follow_through_score') or '-'} "
-            f"breadth={contributor.get('breadth_score') or '-'} "
-            f"risk={contributor.get('automation_risk_signal') or '-'}"
-        ),
-        f"Heuristic: {contributor.get('heuristic_note') or '-'}",
-        f"Public orgs: {', '.join(contributor.get('public_orgs') or []) or '-'}",
-        "",
-        "Recent indexed PRs:",
-    ]
-    pulls = result.get("pulls") or []
-    if not pulls:
-        lines.append("- none")
-        return "\n".join(lines)
-    for row in pulls:
-        lines.append(
-            f"- PR #{row['pr_number']}: {row.get('title') or ''} "
-            f"[state={row.get('state') or '-'} merged={'yes' if row.get('merged') else 'no'}]"
-        )
-    return "\n".join(lines)
-def format_pr_search_contributor_pulls(result: Mapping[str, Any]) -> str:
-    contributor = result["contributor"]
-    lines = [
-        f"Contributor PRs: {contributor['author_login']}",
-        f"Repo: {result['repo']}",
-        f"Snapshot: {result['snapshot_id']}",
-        f"Pull requests: {result.get('pull_count', len(result.get('pulls') or []))}",
-        "",
-    ]
-    pulls = result.get("pulls") or []
-    if not pulls:
-        lines.append("No indexed PRs found for that contributor.")
-        return "\n".join(lines)
-    for row in pulls:
-        lines.append(
-            f"- PR #{row['pr_number']}: {row.get('title') or ''} "
-            f"(updated={row.get('updated_at') or '-'}, state={row.get('state') or '-'})"
-        )
-    return "\n".join(lines)
-def format_pr_search_pull_contributor(result: Mapping[str, Any]) -> str:
-    pr = result["pr"]
-    contributor = result["contributor"]
-    return "\n".join(
-        [
-            f"PR #{pr['pr_number']}: {pr.get('title') or ''}",
-            f"Author: {contributor['author_login']}",
-            f"Risk: {contributor.get('automation_risk_signal') or '-'}",
-            f"Follow-through: {contributor.get('follow_through_score') or '-'}",
-            f"Breadth: {contributor.get('breadth_score') or '-'}",
-            f"Heuristic: {contributor.get('heuristic_note') or '-'}",
-            f"Profile: {contributor.get('profile_url') or '-'}",
-        ]
-    )

 get_pr_search_similar = pr_search_service.get_pr_search_similar
 get_pr_search_similar_lookup = pr_search_service.get_pr_search_similar_lookup
 get_pr_search_candidate_clusters = pr_search_service.get_pr_search_candidate_clusters
 get_pr_search_clusters = pr_search_service.get_pr_search_clusters
 list_pr_search_clusters = pr_search_service.list_pr_search_clusters
 get_pr_search_cluster = pr_search_service.get_pr_search_cluster
 explain_pr_search_pair = pr_search_service.explain_pr_search_pair
 probe_pr_search_live = pr_search_service.probe_pr_search_live
 probe_pr_search_github = pr_search_service.probe_pr_search_github
             (
                 "Rows: "
                 f"documents={counts['documents']} "
                 f"features={counts['features']} "
                 f"neighbors={counts['neighbors']} "
                 f"clusters={counts['clusters']} "
             if row.get("reason"):
                 lines.append(f"   reason: {row['reason']}")
     return "\n".join(lines)

src/slop_farmer/app/pr_search_api.py CHANGED Viewed

@@ -11,25 +11,30 @@ from fastapi.responses import JSONResponse
 from slop_farmer.config import PrSearchRefreshOptions
 from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
-from slop_farmer.reports.analysis_service import (
-    get_analysis_best,
-    get_analysis_meta_bug,
-    get_analysis_status,
-    get_pr_analysis,
-    list_analysis_duplicate_prs,
-    list_analysis_meta_bugs,
-)
 from slop_farmer.reports.pr_search_service import (
     get_pr_search_cluster,
     get_pr_search_clusters,
-    get_pr_search_contributor,
-    get_pr_search_contributor_pulls,
-    get_pr_search_pull_contributor,
     get_pr_search_similar_lookup,
     get_pr_search_status,
     list_pr_search_clusters,
     run_pr_search_refresh,
 )
 @dataclass(slots=True)
@@ -37,7 +42,6 @@ class PrSearchApiSettings:
     default_repo: str | None
     index_path: Path
     output_dir: Path
-    analysis_dir: Path | None = None
     snapshot_dir: Path | None = None
     hf_repo_id: str | None = None
     hf_revision: str | None = None
@@ -55,6 +59,10 @@ class PrSearchApiSettings:
     candidate_limit_max: int = 20
     cluster_list_limit_default: int = 50
     cluster_list_limit_max: int = 200
     probe_limit_default: int = 10
     probe_limit_max: int = 25
@@ -70,7 +78,6 @@ class PrSearchApiSettings:
             default_repo=os.environ.get("DEFAULT_REPO"),
             index_path=index_path,
             output_dir=output_dir,
-            analysis_dir=_env_path("ANALYSIS_DIR") or (output_dir / "analysis"),
             snapshot_dir=snapshot_dir,
             hf_repo_id=os.environ.get("HF_REPO_ID"),
             hf_revision=os.environ.get("HF_REVISION"),
@@ -88,6 +95,10 @@ class PrSearchApiSettings:
             candidate_limit_max=_env_int("CANDIDATE_LIMIT_MAX", 20),
             cluster_list_limit_default=_env_int("CLUSTER_LIST_LIMIT_DEFAULT", 50),
             cluster_list_limit_max=_env_int("CLUSTER_LIST_LIMIT_MAX", 200),
             probe_limit_default=_env_int("PROBE_LIMIT_DEFAULT", 10),
             probe_limit_max=_env_int("PROBE_LIMIT_MAX", 25),
         )
@@ -102,13 +113,14 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
         app.state.ready = False
         app.state.startup_error = None
         try:
             _bootstrap_index(api_settings)
             app.state.ready = _is_ready(api_settings)
         except Exception as exc:
             app.state.startup_error = str(exc)
         yield
-    app = FastAPI(title="slop PR search API", version="0.1.1", lifespan=lifespan)
     @app.exception_handler(ValueError)
     async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
@@ -139,7 +151,9 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
     async def repo_status(owner: str, repo: str, request: Request) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_pr_search_status(settings.index_path, repo=repo_slug)
     @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
     async def pr_similar(
@@ -217,80 +231,89 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
             ),
         )
-    @app.get("/v1/repos/{owner}/{repo}/contributors/{login}")
-    async def contributor_view(
-        owner: str, repo: str, login: str, request: Request
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_pr_search_contributor(settings.index_path, repo=repo_slug, author_login=login)
-    @app.get("/v1/repos/{owner}/{repo}/contributors/{login}/pulls")
-    async def contributor_pulls(
         owner: str,
         repo: str,
-        login: str,
         request: Request,
         limit: int | None = None,
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_pr_search_contributor_pulls(
-            settings.index_path,
-            repo=repo_slug,
-            author_login=login,
             limit=_limit(
-                limit, default=settings.similar_limit_default, maximum=settings.similar_limit_max
             ),
         )
-    @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/contributor")
-    async def pull_contributor(
         owner: str,
         repo: str,
-        number: int,
         request: Request,
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_pr_search_pull_contributor(settings.index_path, repo=repo_slug, pr_number=number)
-    @app.get("/v1/repos/{owner}/{repo}/analysis/status")
-    async def analysis_status(
         owner: str,
         repo: str,
         request: Request,
         variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_analysis_status(
-            settings.index_path,
-            repo=repo_slug,
             variant=variant,
-            analysis_root=settings.analysis_dir,
         )
-    @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/analysis")
-    async def pr_analysis(
         owner: str,
         repo: str,
         number: int,
         request: Request,
         variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_pr_analysis(
-            settings.index_path,
-            repo=repo_slug,
             pr_number=number,
             variant=variant,
-            analysis_root=settings.analysis_dir,
         )
-    @app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs")
-    async def analysis_meta_bugs(
         owner: str,
         repo: str,
         request: Request,
@@ -299,73 +322,76 @@ def create_app(settings: PrSearchApiSettings | None = None) -> FastAPI:
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return list_analysis_meta_bugs(
-            settings.index_path,
-            repo=repo_slug,
-            variant=variant,
-            analysis_root=settings.analysis_dir,
             limit=_limit(
                 limit,
-                default=settings.cluster_list_limit_default,
-                maximum=settings.cluster_list_limit_max,
             ),
         )
-    @app.get("/v1/repos/{owner}/{repo}/analysis/meta-bugs/{cluster_id}")
-    async def analysis_meta_bug(
         owner: str,
         repo: str,
-        cluster_id: str,
         request: Request,
         variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_analysis_meta_bug(
-            settings.index_path,
-            repo=repo_slug,
-            cluster_id=cluster_id,
-            variant=variant,
-            analysis_root=settings.analysis_dir,
-        )
-    @app.get("/v1/repos/{owner}/{repo}/analysis/duplicate-prs")
-    async def analysis_duplicate_prs(
         owner: str,
         repo: str,
         request: Request,
         limit: int | None = None,
-        variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return list_analysis_duplicate_prs(
-            settings.index_path,
-            repo=repo_slug,
-            variant=variant,
-            analysis_root=settings.analysis_dir,
             limit=_limit(
                 limit,
-                default=settings.cluster_list_limit_default,
-                maximum=settings.cluster_list_limit_max,
             ),
         )
-    @app.get("/v1/repos/{owner}/{repo}/analysis/best")
-    async def analysis_best(
         owner: str,
         repo: str,
         request: Request,
-        variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
-        return get_analysis_best(
-            settings.index_path,
-            repo=repo_slug,
-            variant=variant,
-            analysis_root=settings.analysis_dir,
-        )
     return app
@@ -391,6 +417,21 @@ def _bootstrap_index(settings: PrSearchApiSettings) -> None:
     )
 def _needs_refresh(settings: PrSearchApiSettings) -> bool:
     if settings.rebuild_on_start:
         return True
@@ -429,6 +470,17 @@ def _repo_slug(settings: PrSearchApiSettings, owner: str, repo: str) -> str:
     return repo_slug
 def _limit(value: int | None, *, default: int, maximum: int) -> int:
     limit = default if value is None else value
     if limit < 1:
@@ -452,8 +504,6 @@ def _looks_not_found(exc: ValueError) -> bool:
     message = str(exc).lower()
     return (
         "not found" in message
-        or "analysis report was not found" in message
-        or "no analysis report was found" in message
         or "no active pr search run" in message
         or "was not found in the active indexed universe" in message
     )

 from slop_farmer.config import PrSearchRefreshOptions
 from slop_farmer.data.ghreplica_api import GhReplicaProbeUnavailableError, GhrProbeClient
+from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
+from slop_farmer.data.snapshot_paths import default_hf_materialize_dir
 from slop_farmer.reports.pr_search_service import (
     get_pr_search_cluster,
     get_pr_search_clusters,
     get_pr_search_similar_lookup,
     get_pr_search_status,
     list_pr_search_clusters,
     run_pr_search_refresh,
 )
+from slop_farmer.reports.read_views import (
+    check_issue_cluster_membership,
+    get_contributor,
+    get_contributor_risk,
+    get_contributor_status,
+    get_issue_best,
+    get_issue_cluster,
+    get_issue_cluster_status,
+    get_issue_clusters_for_pr,
+    get_snapshot_surfaces,
+    list_contributors,
+    list_issue_clusters,
+    list_issue_duplicate_prs,
+)
 @dataclass(slots=True)
     default_repo: str | None
     index_path: Path
     output_dir: Path
     snapshot_dir: Path | None = None
     hf_repo_id: str | None = None
     hf_revision: str | None = None
     candidate_limit_max: int = 20
     cluster_list_limit_default: int = 50
     cluster_list_limit_max: int = 200
+    issue_list_limit_default: int = 50
+    issue_list_limit_max: int = 200
+    contributor_list_limit_default: int = 50
+    contributor_list_limit_max: int = 200
     probe_limit_default: int = 10
     probe_limit_max: int = 25
             default_repo=os.environ.get("DEFAULT_REPO"),
             index_path=index_path,
             output_dir=output_dir,
             snapshot_dir=snapshot_dir,
             hf_repo_id=os.environ.get("HF_REPO_ID"),
             hf_revision=os.environ.get("HF_REVISION"),
             candidate_limit_max=_env_int("CANDIDATE_LIMIT_MAX", 20),
             cluster_list_limit_default=_env_int("CLUSTER_LIST_LIMIT_DEFAULT", 50),
             cluster_list_limit_max=_env_int("CLUSTER_LIST_LIMIT_MAX", 200),
+            issue_list_limit_default=_env_int("ISSUE_LIST_LIMIT_DEFAULT", 50),
+            issue_list_limit_max=_env_int("ISSUE_LIST_LIMIT_MAX", 200),
+            contributor_list_limit_default=_env_int("CONTRIBUTOR_LIST_LIMIT_DEFAULT", 50),
+            contributor_list_limit_max=_env_int("CONTRIBUTOR_LIST_LIMIT_MAX", 200),
             probe_limit_default=_env_int("PROBE_LIMIT_DEFAULT", 10),
             probe_limit_max=_env_int("PROBE_LIMIT_MAX", 25),
         )
         app.state.ready = False
         app.state.startup_error = None
         try:
+            _bootstrap_snapshot_assets(api_settings)
             _bootstrap_index(api_settings)
             app.state.ready = _is_ready(api_settings)
         except Exception as exc:
             app.state.startup_error = str(exc)
         yield
+    app = FastAPI(title="slop PR search API", version="0.1.0", lifespan=lifespan)
     @app.exception_handler(ValueError)
     async def handle_value_error(_request: Request, exc: ValueError) -> JSONResponse:
     async def repo_status(owner: str, repo: str, request: Request) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        status = get_pr_search_status(settings.index_path, repo=repo_slug)
+        snapshot_dir = _status_snapshot_dir(status)
+        return {**status, "surfaces": get_snapshot_surfaces(snapshot_dir)}
     @app.get("/v1/repos/{owner}/{repo}/pulls/{number}/similar")
     async def pr_similar(
             ),
         )
+    @app.get("/v1/repos/{owner}/{repo}/issues/status")  # issue-cluster status for the repo's active snapshot
+    async def issue_status(
+        owner: str,
+        repo: str,
+        request: Request,
+        variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return get_issue_cluster_status(_active_snapshot_dir(settings, repo_slug), variant=variant)  # _active_snapshot_dir raises HTTP 503 when no snapshot dir is recorded
+    @app.get("/v1/repos/{owner}/{repo}/issues/clusters")  # list issue clusters from the repo's active snapshot
+    async def issue_clusters(
         owner: str,
         repo: str,
         request: Request,
         limit: int | None = None,
+        variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return list_issue_clusters(
+            _active_snapshot_dir(settings, repo_slug),
             limit=_limit(  # resolve the optional query value against the issue-list default and maximum
+                limit,
+                default=settings.issue_list_limit_default,
+                maximum=settings.issue_list_limit_max,
             ),
+            variant=variant,
         )
+    @app.get("/v1/repos/{owner}/{repo}/issues/clusters/{cluster_id}")  # fetch one issue cluster by id from the active snapshot
+    async def issue_cluster(
         owner: str,
         repo: str,
+        cluster_id: str,
         request: Request,
+        variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return get_issue_cluster(
+            _active_snapshot_dir(settings, repo_slug),  # raises HTTP 503 when no snapshot dir is recorded
+            cluster_id=cluster_id,
+            variant=variant,
+        )
+    @app.get("/v1/repos/{owner}/{repo}/issues/pulls/{number}")  # issue clusters associated with a given PR number
+    async def issue_clusters_for_pr(
         owner: str,
         repo: str,
+        number: int,
         request: Request,
         variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return get_issue_clusters_for_pr(
+            _active_snapshot_dir(settings, repo_slug),  # raises HTTP 503 when no snapshot dir is recorded
+            pr_number=number,
             variant=variant,
         )
+    @app.get("/v1/repos/{owner}/{repo}/issues/pulls/{number}/membership")  # check a PR's issue-cluster membership
+    async def issue_membership_for_pr(
         owner: str,
         repo: str,
         number: int,
         request: Request,
+        cluster_id: str | None = None,  # optional query parameter, forwarded unchanged (None when omitted)
         variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return check_issue_cluster_membership(
+            _active_snapshot_dir(settings, repo_slug),  # raises HTTP 503 when no snapshot dir is recorded
             pr_number=number,
+            cluster_id=cluster_id,
             variant=variant,
         )
+    @app.get("/v1/repos/{owner}/{repo}/issues/duplicate-prs")
+    async def issue_duplicate_prs(
         owner: str,
         repo: str,
         request: Request,
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return list_issue_duplicate_prs(
+            _active_snapshot_dir(settings, repo_slug),
             limit=_limit(
                 limit,
+                default=settings.issue_list_limit_default,
+                maximum=settings.issue_list_limit_max,
             ),
+            variant=variant,
         )
+    @app.get("/v1/repos/{owner}/{repo}/issues/best")  # delegates to read_views.get_issue_best for the active snapshot
+    async def issue_best(
         owner: str,
         repo: str,
         request: Request,
         variant: Literal["auto", "hybrid", "deterministic"] = "auto",
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return get_issue_best(_active_snapshot_dir(settings, repo_slug), variant=variant)  # _active_snapshot_dir raises HTTP 503 when no snapshot dir is recorded
+    @app.get("/v1/repos/{owner}/{repo}/contributors/status")  # registered before /contributors/{login}, so "status" is not captured as a login
+    async def contributor_status(
+        owner: str,
+        repo: str,
+        request: Request,
+    ) -> dict[str, Any]:
+        settings = request.app.state.settings
+        repo_slug = _repo_slug(settings, owner, repo)
+        return get_contributor_status(_active_snapshot_dir(settings, repo_slug))  # _active_snapshot_dir raises HTTP 503 when no snapshot dir is recorded
+    @app.get("/v1/repos/{owner}/{repo}/contributors")  # list contributors from the repo's active snapshot
+    async def contributors(
         owner: str,
         repo: str,
         request: Request,
         limit: int | None = None,
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return list_contributors(
+            _active_snapshot_dir(settings, repo_slug),
             limit=_limit(  # resolve the optional query value against the contributor-list default and maximum
                 limit,
+                default=settings.contributor_list_limit_default,
+                maximum=settings.contributor_list_limit_max,
             ),
         )
+    @app.get("/v1/repos/{owner}/{repo}/contributors/{login}")  # look up a single contributor by login in the active snapshot
+    async def contributor(
         owner: str,
         repo: str,
+        login: str,
         request: Request,
     ) -> dict[str, Any]:
         settings = request.app.state.settings
         repo_slug = _repo_slug(settings, owner, repo)
+        return get_contributor(_active_snapshot_dir(settings, repo_slug), author_login=login)  # _active_snapshot_dir raises HTTP 503 when no snapshot dir is recorded
+    @app.get("/v1/repos/{owner}/{repo}/contributors/{login}/risk")  # delegates to read_views.get_contributor_risk for one login
+    async def contributor_risk(
+        owner: str,
+        repo: str,
+        login: str,
+        request: Request,
+    ) -> dict[str, Any]:
+        settings = request.app.state.settings
+        repo_slug = _repo_slug(settings, owner, repo)
+        return get_contributor_risk(_active_snapshot_dir(settings, repo_slug), author_login=login)  # _active_snapshot_dir raises HTTP 503 when no snapshot dir is recorded
     return app
     )
+def _bootstrap_snapshot_assets(settings: PrSearchApiSettings) -> None:  # Materialize the configured HF dataset snapshot locally when no snapshot_dir is set.
+    if settings.snapshot_dir is not None or settings.hf_repo_id is None:  # nothing to do: a snapshot dir is already configured, or there is no HF repo to pull from
+        return
+    materialize_dir = settings.hf_materialize_dir or default_hf_materialize_dir(  # an explicit hf_materialize_dir wins; otherwise use the computed default
+        settings.output_dir,
+        settings.hf_repo_id,
+        settings.hf_revision,
+    )
+    materialize_hf_dataset_snapshot(  # return value is discarded; presumably called for its on-disk effect - TODO confirm
+        repo_id=settings.hf_repo_id,
+        local_dir=materialize_dir,
+        revision=settings.hf_revision,
+    )
 def _needs_refresh(settings: PrSearchApiSettings) -> bool:
     if settings.rebuild_on_start:
         return True
     return repo_slug
+def _active_snapshot_dir(settings: PrSearchApiSettings, repo_slug: str) -> Path:  # Snapshot directory reported by the repo's current PR-search status.
+    return _status_snapshot_dir(get_pr_search_status(settings.index_path, repo=repo_slug))  # status is looked up on every call; HTTP 503 if it carries no snapshot_dir
+def _status_snapshot_dir(status: dict[str, Any]) -> Path:  # Extract the "snapshot_dir" entry of a status payload as a Path.
+    snapshot_dir = status.get("snapshot_dir")
+    if not snapshot_dir:  # missing, None, or empty value: answer 503 instead of building a path from it
+        raise HTTPException(status_code=503, detail="active snapshot directory is unavailable")
+    return Path(str(snapshot_dir))  # str() first, so a non-string value is converted before Path() sees it
 def _limit(value: int | None, *, default: int, maximum: int) -> int:
     limit = default if value is None else value
     if limit < 1:
     message = str(exc).lower()
     return (
         "not found" in message
         or "no active pr search run" in message
         or "was not found in the active indexed universe" in message
     )

src/slop_farmer/app/workflow.py CHANGED Viewed

@@ -74,9 +74,6 @@ def run_full_pipeline(options: FullPipelineOptions) -> str:
             analysis_input=analysis_path,
             contributors_input=snapshot_dir / "new-contributors-report.json",
             pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
-            hf_repo_id=None,
-            hf_revision=None,
-            hf_materialize_dir=None,
             window_days=options.dashboard_window_days,
         )
     )

             analysis_input=analysis_path,
             contributors_input=snapshot_dir / "new-contributors-report.json",
             pr_scope_input=snapshot_dir / "pr-scope-clusters.json",
             window_days=options.dashboard_window_days,
         )
     )

src/slop_farmer/app_config.py CHANGED Viewed

@@ -184,18 +184,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
             "new-contributor-window-days": contributor_window_days,
             "new-contributor-max-authors": contributor_max_authors,
         },
-        "refresh-dataset": {
-            "repo": repo,
-            "hf-repo-id": dataset_id,
-            "fetch-timeline": scrape.get("fetch-timeline"),
-            "max-issues": scrape.get("max-issues"),
-            "max-prs": scrape.get("max-prs"),
-            "max-issue-comments": scrape.get("max-issue-comments"),
-            "max-reviews-per-pr": scrape.get("max-reviews-per-pr"),
-            "max-review-comments-per-pr": scrape.get("max-review-comments-per-pr"),
-            "new-contributor-window-days": contributor_window_days,
-            "new-contributor-max-authors": contributor_max_authors,
-        },
         "analyze": {
             "output-dir": str(data_dir) if data_dir else None,
             "hf-repo-id": analysis.get("hf-repo-id", dataset_id),
@@ -213,7 +201,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
         },
         "pr-scope": {
             "output-dir": str(data_dir) if data_dir else None,
-            "hf-repo-id": dataset_id,
             "cluster-suppression-rules": cluster_suppression_rules,
         },
         "pr-search": {
@@ -223,14 +210,12 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
         },
         "new-contributor-report": {
             "output-dir": str(data_dir) if data_dir else None,
-            "hf-repo-id": dataset_id,
             "window-days": contributor_window_days,
             "max-authors": contributor_max_authors,
         },
         "dashboard-data": {
             "output-dir": str(dashboard_dir) if dashboard_dir else None,
             "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
-            "hf-repo-id": dataset_id,
             "window-days": dashboard_window_days,
         },
         "publish-snapshot": {
@@ -251,7 +236,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
         "deploy-dashboard": {
             "pipeline-data-dir": str(data_dir) if data_dir else None,
             "web-dir": str(web_dir) if web_dir else None,
-            "hf-repo-id": dataset_id,
             "dashboard-window-days": dashboard_window_days,
             "contributor-window-days": contributor_window_days,
             "contributor-max-authors": contributor_max_authors,
@@ -264,11 +248,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
             "dataset-id": dataset_id,
             "space-tags": tags_value,
         },
-        "dataset-status": {
-            "repo": repo,
-            "output-dir": str(data_dir) if data_dir else None,
-            "hf-repo-id": dataset_id,
-        },
     }
     for command, values in defaults.items():
         defaults[command] = {key: value for key, value in values.items() if value is not None}
@@ -280,7 +259,6 @@ def _dashboard_config_defaults(config_path: Path) -> dict[str, dict[str, Any]]:
         defaults[command].update(_resolve_command_paths(config_path, values))
     defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
-    defaults["refresh-dataset"].update(_resolve_command_paths(config_path, scrape))
     defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
     defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
     return defaults

             "new-contributor-window-days": contributor_window_days,
             "new-contributor-max-authors": contributor_max_authors,
         },
         "analyze": {
             "output-dir": str(data_dir) if data_dir else None,
             "hf-repo-id": analysis.get("hf-repo-id", dataset_id),
         },
         "pr-scope": {
             "output-dir": str(data_dir) if data_dir else None,
             "cluster-suppression-rules": cluster_suppression_rules,
         },
         "pr-search": {
         },
         "new-contributor-report": {
             "output-dir": str(data_dir) if data_dir else None,
             "window-days": contributor_window_days,
             "max-authors": contributor_max_authors,
         },
         "dashboard-data": {
             "output-dir": str(dashboard_dir) if dashboard_dir else None,
             "snapshot-root": str(data_dir / "snapshots") if data_dir else None,
             "window-days": dashboard_window_days,
         },
         "publish-snapshot": {
         "deploy-dashboard": {
             "pipeline-data-dir": str(data_dir) if data_dir else None,
             "web-dir": str(web_dir) if web_dir else None,
             "dashboard-window-days": dashboard_window_days,
             "contributor-window-days": contributor_window_days,
             "contributor-max-authors": contributor_max_authors,
             "dataset-id": dataset_id,
             "space-tags": tags_value,
         },
     }
     for command, values in defaults.items():
         defaults[command] = {key: value for key, value in values.items() if value is not None}
         defaults[command].update(_resolve_command_paths(config_path, values))
     defaults["scrape"].update(_resolve_command_paths(config_path, scrape))
     defaults["analyze"].update(_resolve_command_paths(config_path, analysis))
     defaults["full-pipeline"].update(_resolve_command_paths(config_path, full_pipeline))
     return defaults

src/slop_farmer/config.py CHANGED Viewed

@@ -127,9 +127,6 @@ class NewContributorReportOptions:
     json_output: Path | None
     window_days: int
     max_authors: int
-    hf_repo_id: str | None = None
-    hf_revision: str | None = None
-    hf_materialize_dir: Path | None = None
 @dataclass(slots=True)
@@ -140,9 +137,6 @@ class DashboardDataOptions:
     contributors_input: Path | None
     pr_scope_input: Path | None
     window_days: int
-    hf_repo_id: str | None = None
-    hf_revision: str | None = None
-    hf_materialize_dir: Path | None = None
     snapshot_root: Path | None = None
@@ -161,9 +155,6 @@ class DeployDashboardOptions:
     snapshot_dir: Path | None
     analysis_input: Path | None
     contributors_input: Path | None
-    hf_repo_id: str | None
-    hf_revision: str | None
-    hf_materialize_dir: Path | None
     refresh_contributors: bool
     dashboard_window_days: int
     contributor_window_days: int
@@ -242,32 +233,3 @@ class FullPipelineOptions:
     max_issues: int | None
     max_prs: int | None
     open_prs_only: bool = False
-@dataclass(slots=True)
-class DatasetRefreshOptions:
-    repo: RepoRef
-    hf_repo_id: str
-    private_hf_repo: bool
-    max_issues: int | None
-    max_prs: int | None
-    max_issue_comments: int | None
-    max_reviews_per_pr: int | None
-    max_review_comments_per_pr: int | None
-    fetch_timeline: bool
-    new_contributor_report: bool
-    new_contributor_window_days: int
-    new_contributor_max_authors: int
-    http_timeout: int
-    http_max_retries: int
-    checkpoint_every_comments: int
-    checkpoint_every_prs: int
-@dataclass(slots=True)
-class DatasetStatusOptions:
-    output_dir: Path
-    hf_repo_id: str | None
-    hf_revision: str | None
-    repo: str | None = None
-    json_output: bool = False

     json_output: Path | None
     window_days: int
     max_authors: int
 @dataclass(slots=True)
     contributors_input: Path | None
     pr_scope_input: Path | None
     window_days: int
     snapshot_root: Path | None = None
     snapshot_dir: Path | None
     analysis_input: Path | None
     contributors_input: Path | None
     refresh_contributors: bool
     dashboard_window_days: int
     contributor_window_days: int
     max_issues: int | None
     max_prs: int | None
     open_prs_only: bool = False

src/slop_farmer/data/search_duckdb.py CHANGED Viewed

@@ -31,7 +31,6 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
         "repo",
         "pr_number",
         "github_id",
-        "author_login",
         "state",
         "draft",
         "merged",
@@ -47,48 +46,6 @@ TABLE_COLUMNS: dict[str, tuple[str, ...]] = {
         "review_comments_count",
         "html_url",
     ),
-    "pr_search_contributors": (
-        "run_id",
-        "repo",
-        "snapshot_id",
-        "report_generated_at",
-        "window_days",
-        "author_login",
-        "name",
-        "profile_url",
-        "repo_pull_requests_url",
-        "repo_issues_url",
-        "repo_first_seen_at",
-        "repo_last_seen_at",
-        "repo_primary_artifact_count",
-        "repo_artifact_count",
-        "snapshot_issue_count",
-        "snapshot_pr_count",
-        "snapshot_comment_count",
-        "snapshot_review_count",
-        "snapshot_review_comment_count",
-        "repo_association",
-        "new_to_repo",
-        "first_seen_in_snapshot",
-        "report_reason",
-        "account_age_days",
-        "young_account",
-        "follow_through_score",
-        "breadth_score",
-        "automation_risk_signal",
-        "heuristic_note",
-        "public_orgs_json",
-        "visible_authored_pr_count",
-        "merged_pr_count",
-        "closed_unmerged_pr_count",
-        "open_pr_count",
-        "merged_pr_rate",
-        "closed_unmerged_pr_rate",
-        "still_open_pr_rate",
-        "distinct_repos_with_authored_prs",
-        "distinct_repos_with_open_prs",
-        "fetch_error",
-    ),
     "pr_scope_features": (
         "run_id",
         "repo",
@@ -187,7 +144,6 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
     repo VARCHAR,
     pr_number BIGINT,
     github_id BIGINT,
-    author_login VARCHAR,
     state VARCHAR,
     draft BOOLEAN,
     merged BOOLEAN,
@@ -203,48 +159,6 @@ CREATE TABLE IF NOT EXISTS pr_search_documents (
     review_comments_count BIGINT,
     html_url VARCHAR
 );
-CREATE TABLE IF NOT EXISTS pr_search_contributors (
-    run_id VARCHAR,
-    repo VARCHAR,
-    snapshot_id VARCHAR,
-    report_generated_at VARCHAR,
-    window_days BIGINT,
-    author_login VARCHAR,
-    name VARCHAR,
-    profile_url VARCHAR,
-    repo_pull_requests_url VARCHAR,
-    repo_issues_url VARCHAR,
-    repo_first_seen_at VARCHAR,
-    repo_last_seen_at VARCHAR,
-    repo_primary_artifact_count BIGINT,
-    repo_artifact_count BIGINT,
-    snapshot_issue_count BIGINT,
-    snapshot_pr_count BIGINT,
-    snapshot_comment_count BIGINT,
-    snapshot_review_count BIGINT,
-    snapshot_review_comment_count BIGINT,
-    repo_association VARCHAR,
-    new_to_repo BOOLEAN,
-    first_seen_in_snapshot BOOLEAN,
-    report_reason VARCHAR,
-    account_age_days BIGINT,
-    young_account BOOLEAN,
-    follow_through_score VARCHAR,
-    breadth_score VARCHAR,
-    automation_risk_signal VARCHAR,
-    heuristic_note VARCHAR,
-    public_orgs_json VARCHAR,
-    visible_authored_pr_count BIGINT,
-    merged_pr_count BIGINT,
-    closed_unmerged_pr_count BIGINT,
-    open_pr_count BIGINT,
-    merged_pr_rate DOUBLE,
-    closed_unmerged_pr_rate DOUBLE,
-    still_open_pr_rate DOUBLE,
-    distinct_repos_with_authored_prs BIGINT,
-    distinct_repos_with_open_prs BIGINT,
-    fetch_error VARCHAR
-);
 CREATE TABLE IF NOT EXISTS pr_scope_features (
     run_id VARCHAR,
     repo VARCHAR,
@@ -318,8 +232,6 @@ CREATE TABLE IF NOT EXISTS pr_scope_cluster_candidates (
 CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
 CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
 CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
-CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_author ON pr_search_documents (run_id, author_login);
-CREATE INDEX IF NOT EXISTS idx_pr_search_contributors_run_author ON pr_search_contributors (run_id, author_login);
 CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
 CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
 CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
@@ -344,9 +256,6 @@ def connect_pr_search_db(path: Path, *, read_only: bool = False) -> duckdb.DuckD
 def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
     connection.execute(SCHEMA_SQL)
-    connection.execute(
-        "ALTER TABLE pr_search_documents ADD COLUMN IF NOT EXISTS author_login VARCHAR"
-    )
 def insert_rows(
@@ -444,7 +353,6 @@ def resolve_active_run(
 def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
     return {
         "documents": _count(connection, "pr_search_documents", run_id),
-        "contributors": _count(connection, "pr_search_contributors", run_id),
         "features": _count(connection, "pr_scope_features", run_id),
         "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
         "neighbors": _count(connection, "pr_scope_neighbors", run_id),
@@ -467,60 +375,6 @@ def get_document(
     )
-def get_contributor(
-    connection: duckdb.DuckDBPyConnection,
-    *,
-    run_id: str,
-    author_login: str,
-) -> dict[str, Any] | None:
-    return fetch_one(
-        connection,
-        """
-        SELECT *
-        FROM pr_search_contributors
-        WHERE run_id = ? AND lower(author_login) = lower(?)
-        """,
-        [run_id, author_login],
-    )
-def get_contributor_pulls(
-    connection: duckdb.DuckDBPyConnection,
-    *,
-    run_id: str,
-    author_login: str,
-    limit: int,
-) -> list[dict[str, Any]]:
-    return fetch_rows(
-        connection,
-        """
-        SELECT
-            pr_number,
-            github_id,
-            author_login,
-            state,
-            draft,
-            merged,
-            title,
-            base_ref,
-            created_at,
-            updated_at,
-            merged_at,
-            additions,
-            deletions,
-            changed_files,
-            comments_count,
-            review_comments_count,
-            html_url
-        FROM pr_search_documents
-        WHERE run_id = ? AND lower(author_login) = lower(?)
-        ORDER BY updated_at DESC NULLS LAST, pr_number DESC
-        LIMIT ?
-        """,
-        [run_id, author_login, limit],
-    )
 def get_feature(
     connection: duckdb.DuckDBPyConnection,
     *,

         "repo",
         "pr_number",
         "github_id",
         "state",
         "draft",
         "merged",
         "review_comments_count",
         "html_url",
     ),
     "pr_scope_features": (
         "run_id",
         "repo",
     repo VARCHAR,
     pr_number BIGINT,
     github_id BIGINT,
     state VARCHAR,
     draft BOOLEAN,
     merged BOOLEAN,
     review_comments_count BIGINT,
     html_url VARCHAR
 );
 CREATE TABLE IF NOT EXISTS pr_scope_features (
     run_id VARCHAR,
     repo VARCHAR,
 CREATE INDEX IF NOT EXISTS idx_pr_search_active_run_repo ON pr_search_active_run (repo);
 CREATE INDEX IF NOT EXISTS idx_pr_search_runs_repo_status ON pr_search_runs (repo, status);
 CREATE INDEX IF NOT EXISTS idx_pr_search_documents_run_pr ON pr_search_documents (run_id, pr_number);
 CREATE INDEX IF NOT EXISTS idx_pr_scope_features_run_pr ON pr_scope_features (run_id, pr_number);
 CREATE INDEX IF NOT EXISTS idx_pr_scope_run_artifacts_run ON pr_scope_run_artifacts (run_id);
 CREATE INDEX IF NOT EXISTS idx_pr_scope_neighbors_run_left ON pr_scope_neighbors (run_id, left_pr_number);
 def ensure_pr_search_schema(connection: duckdb.DuckDBPyConnection) -> None:
     connection.execute(SCHEMA_SQL)
 def insert_rows(
 def get_run_counts(connection: duckdb.DuckDBPyConnection, *, run_id: str) -> dict[str, int]:
     return {
         "documents": _count(connection, "pr_search_documents", run_id),
         "features": _count(connection, "pr_scope_features", run_id),
         "run_artifacts": _count(connection, "pr_scope_run_artifacts", run_id),
         "neighbors": _count(connection, "pr_scope_neighbors", run_id),
     )
 def get_feature(
     connection: duckdb.DuckDBPyConnection,
     *,

src/slop_farmer/data/snapshot_materialize.py CHANGED Viewed

@@ -74,6 +74,9 @@ def _materialize_hf_snapshot_repo_snapshot(
         "links.parquet",
         "events.parquet",
         "manifest.json",
         "new_contributors.parquet",
         "new-contributors-report.json",
         "new-contributors-report.md",
@@ -149,6 +152,9 @@ def _materialize_hf_root_snapshot(
         "links.parquet",
         "events.parquet",
         "manifest.json",
         "new_contributors.parquet",
         "new-contributors-report.json",
         "new-contributors-report.md",

         "links.parquet",
         "events.parquet",
         "manifest.json",
+        "analysis-report.json",
+        "analysis-report-hybrid.json",
+        "analysis-report-deterministic.json",
         "new_contributors.parquet",
         "new-contributors-report.json",
         "new-contributors-report.md",
         "links.parquet",
         "events.parquet",
         "manifest.json",
+        "analysis-report.json",
+        "analysis-report-hybrid.json",
+        "analysis-report-deterministic.json",
         "new_contributors.parquet",
         "new-contributors-report.json",
         "new-contributors-report.md",

src/slop_farmer/reports/analysis.py CHANGED Viewed

@@ -19,7 +19,11 @@ from rank_bm25 import BM25Okapi
 from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
 from slop_farmer.data.links import build_text_link_rows
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
-from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 from slop_farmer.reports.analysis_cache import (
     HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
     PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
@@ -762,14 +766,18 @@ def _artifact_suffix(row: dict[str, Any] | None, kind: str) -> str:
 def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
-    return resolve_snapshot_source_dir(
-        snapshot_dir=options.snapshot_dir,
-        local_snapshots_root=options.output_dir.resolve() / "snapshots",
-        hf_repo_id=options.hf_repo_id,
-        hf_revision=options.hf_revision,
-        hf_materialize_dir=options.hf_materialize_dir,
-        hf_output_dir=options.output_dir,
-    )
 def _load_snapshot(snapshot_dir: Path) -> SnapshotData:

 from slop_farmer.config import AnalysisOptions, MarkdownReportOptions
 from slop_farmer.data.links import build_text_link_rows
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_text
+from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
+from slop_farmer.data.snapshot_paths import (
+    default_hf_materialize_dir,
+    resolve_snapshot_dir_from_output,
+)
 from slop_farmer.reports.analysis_cache import (
     HYBRID_REVIEW_CACHE_SCHEMA_VERSION,
     PREPARED_REVIEW_UNIT_SCHEMA_VERSION,
 def _resolve_snapshot_dir(options: AnalysisOptions) -> Path:
+    """Pick the snapshot directory the analysis run should read.
+
+    A truthy ``options.hf_repo_id`` wins: the dataset snapshot is materialized
+    into ``options.hf_materialize_dir`` (or the directory returned by
+    ``default_hf_materialize_dir``) and the resolved result is returned;
+    ``options.snapshot_dir`` is not consulted on that path. Otherwise the
+    choice is delegated to ``resolve_snapshot_dir_from_output``.
+    """
+    if not options.hf_repo_id:
+        return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
+    local_dir = options.hf_materialize_dir
+    if not local_dir:
+        local_dir = default_hf_materialize_dir(
+            options.output_dir,
+            options.hf_repo_id,
+            options.hf_revision,
+        )
+    materialized = materialize_hf_dataset_snapshot(
+        repo_id=options.hf_repo_id,
+        local_dir=local_dir,
+        revision=options.hf_revision,
+    )
+    return materialized.resolve()
 def _load_snapshot(snapshot_dir: Path) -> SnapshotData:

src/slop_farmer/reports/dashboard.py CHANGED Viewed

@@ -8,7 +8,7 @@ from typing import Any
 from slop_farmer.config import DashboardDataOptions
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
-from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 def run_dashboard_data(options: DashboardDataOptions) -> Path:
@@ -88,14 +88,7 @@ def _resolve_snapshot_dir(options: DashboardDataOptions) -> Path:
         if options.snapshot_root is not None
         else (Path("data") / "snapshots").resolve()
     )
-    return resolve_snapshot_source_dir(
-        snapshot_dir=options.snapshot_dir,
-        local_snapshots_root=snapshots_root,
-        hf_repo_id=options.hf_repo_id,
-        hf_revision=options.hf_revision,
-        hf_materialize_dir=options.hf_materialize_dir,
-        hf_output_dir=snapshots_root.parent,
-    )
 def _read_optional_json(path: Path) -> dict[str, Any]:

 from slop_farmer.config import DashboardDataOptions
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
+from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_snapshots_root
 def run_dashboard_data(options: DashboardDataOptions) -> Path:
         if options.snapshot_root is not None
         else (Path("data") / "snapshots").resolve()
     )
+    return resolve_snapshot_dir_from_snapshots_root(snapshots_root, options.snapshot_dir)
 def _read_optional_json(path: Path) -> dict[str, Any]:

src/slop_farmer/reports/new_contributor_report.py CHANGED Viewed

@@ -12,7 +12,7 @@ from typing import Any
 from slop_farmer.config import NewContributorReportOptions, resolve_github_token
 from slop_farmer.data.http import urlopen_with_retry
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
-from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 from slop_farmer.reports.user_activity import summarize_user
 GRAPHQL_URL = "https://api.github.com/graphql"
@@ -131,14 +131,7 @@ def run_new_contributor_report(options: NewContributorReportOptions) -> Path:
 def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
-    return resolve_snapshot_source_dir(
-        snapshot_dir=options.snapshot_dir,
-        local_snapshots_root=options.output_dir.resolve() / "snapshots",
-        hf_repo_id=options.hf_repo_id,
-        hf_revision=options.hf_revision,
-        hf_materialize_dir=options.hf_materialize_dir,
-        hf_output_dir=options.output_dir,
-    )
 def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
@@ -251,6 +244,7 @@ def _report_contributors(
             previous_report_reusable
             and previous_entry is not None
             and not previous_entry.get("fetch_error")
         ):
             contributors.append(
                 _reused_previous_report_entry(
@@ -262,8 +256,6 @@ def _report_contributors(
                 )
             )
             reused_previous_report += 1
-            if known_via_prior_merged_pr:
-                reused_known_merged += 1
             continue
         try:
             summary = summarize_user(row["author_login"], options.window_days, None)

 from slop_farmer.config import NewContributorReportOptions, resolve_github_token
 from slop_farmer.data.http import urlopen_with_retry
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows, write_parquet, write_text
+from slop_farmer.data.snapshot_paths import resolve_snapshot_dir_from_output
 from slop_farmer.reports.user_activity import summarize_user
 GRAPHQL_URL = "https://api.github.com/graphql"
 def _resolve_snapshot_dir(options: NewContributorReportOptions) -> Path:
+    """Resolve the report's snapshot directory from the output/snapshot options.
+
+    Thin wrapper: the decision is made entirely by
+    ``resolve_snapshot_dir_from_output``.
+    """
+    output_dir = options.output_dir
+    explicit_snapshot_dir = options.snapshot_dir
+    return resolve_snapshot_dir_from_output(output_dir, explicit_snapshot_dir)
 def _load_snapshot(snapshot_dir: Path) -> dict[str, Any]:
             previous_report_reusable
             and previous_entry is not None
             and not previous_entry.get("fetch_error")
+            and not known_via_prior_merged_pr
         ):
             contributors.append(
                 _reused_previous_report_entry(
                 )
             )
             reused_previous_report += 1
             continue
         try:
             summary = summarize_user(row["author_login"], options.window_days, None)

src/slop_farmer/reports/pr_scope.py CHANGED Viewed

@@ -42,7 +42,11 @@ from typing import Any
 from pydantic import BaseModel, Field
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
-from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 from slop_farmer.reports.pr_heuristics import (
     compile_cluster_suppression_rules,
     suppressed_pull_request_reasons,
@@ -256,14 +260,17 @@ def run_pr_scope_report(options: Any) -> Path:
 def _resolve_snapshot_dir(options: Any) -> Path:
-    return resolve_snapshot_source_dir(
-        snapshot_dir=options.snapshot_dir,
-        local_snapshots_root=options.output_dir.resolve() / "snapshots",
-        hf_repo_id=options.hf_repo_id,
-        hf_revision=options.hf_revision,
-        hf_materialize_dir=options.hf_materialize_dir,
-        hf_output_dir=options.output_dir,
-    )
 def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:

 from pydantic import BaseModel, Field
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
+from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
+from slop_farmer.data.snapshot_paths import (
+    default_hf_materialize_dir,
+    resolve_snapshot_dir_from_output,
+)
 from slop_farmer.reports.pr_heuristics import (
     compile_cluster_suppression_rules,
     suppressed_pull_request_reasons,
 def _resolve_snapshot_dir(options: Any) -> Path:
+    """Return the snapshot directory for a PR-scope run.
+
+    With a truthy ``options.hf_repo_id`` the dataset snapshot is materialized
+    (into ``options.hf_materialize_dir`` or the default materialize directory)
+    and its resolved path is returned without looking at
+    ``options.snapshot_dir``. Without one, resolution falls back to
+    ``resolve_snapshot_dir_from_output``.
+    """
+    if not options.hf_repo_id:
+        return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
+    destination = options.hf_materialize_dir
+    if not destination:
+        destination = default_hf_materialize_dir(
+            options.output_dir, options.hf_repo_id, options.hf_revision
+        )
+    return materialize_hf_dataset_snapshot(
+        repo_id=options.hf_repo_id,
+        local_dir=destination,
+        revision=options.hf_revision,
+    ).resolve()
 def _load_snapshot_context(snapshot_dir: Path) -> dict[str, Any]:

src/slop_farmer/reports/pr_search_scope.py CHANGED Viewed

@@ -10,7 +10,11 @@ from typing import Any
 from slop_farmer.config import PrSearchRefreshOptions
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
-from slop_farmer.data.snapshot_source import resolve_snapshot_source_dir
 from slop_farmer.reports.pr_heuristics import (
     compile_cluster_suppression_rules,
     suppressed_pull_request_reasons,
@@ -32,14 +36,17 @@ DEFAULT_CANDIDATE_LIMIT = 5
 def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
-    return resolve_snapshot_source_dir(
-        snapshot_dir=options.snapshot_dir,
-        local_snapshots_root=options.output_dir.resolve() / "snapshots",
-        hf_repo_id=options.hf_repo_id,
-        hf_revision=options.hf_revision,
-        hf_materialize_dir=options.hf_materialize_dir,
-        hf_output_dir=options.output_dir,
-    )
 def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
@@ -47,7 +54,6 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
     manifest = read_json(manifest_path) if manifest_path.exists() else {}
     pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
     pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
-    contributors = read_parquet_rows(snapshot_dir / "new_contributors.parquet")
     repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
     snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
     return {
@@ -56,7 +62,6 @@ def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
         "manifest": manifest,
         "pull_requests": pull_requests,
         "pr_files": pr_files,
-        "contributors": contributors,
     }
@@ -407,7 +412,6 @@ def _document_row(row: Mapping[str, Any]) -> dict[str, Any]:
     return {
         "pr_number": int(row["number"]),
         "github_id": row.get("github_id"),
-        "author_login": row.get("author_login"),
         "state": row.get("state"),
         "draft": bool(row.get("draft")),
         "merged": bool(row.get("merged")),

 from slop_farmer.config import PrSearchRefreshOptions
 from slop_farmer.data.parquet_io import read_json, read_parquet_rows
+from slop_farmer.data.snapshot_materialize import materialize_hf_dataset_snapshot
+from slop_farmer.data.snapshot_paths import (
+    default_hf_materialize_dir,
+    resolve_snapshot_dir_from_output,
+)
 from slop_farmer.reports.pr_heuristics import (
     compile_cluster_suppression_rules,
     suppressed_pull_request_reasons,
 def resolve_pr_search_snapshot_dir(options: PrSearchRefreshOptions) -> Path:
+    """Return the snapshot directory a PR-search refresh should index.
+
+    When ``options.hf_repo_id`` is truthy the dataset snapshot is materialized
+    into ``options.hf_materialize_dir`` (falling back to the default
+    materialize directory) and the resolved path is returned;
+    ``options.snapshot_dir`` is ignored on that path. Otherwise the directory
+    comes from ``resolve_snapshot_dir_from_output``.
+    """
+    repo_id = options.hf_repo_id
+    if not repo_id:
+        return resolve_snapshot_dir_from_output(options.output_dir, options.snapshot_dir)
+    materialize_dir = options.hf_materialize_dir
+    if not materialize_dir:
+        materialize_dir = default_hf_materialize_dir(
+            options.output_dir, repo_id, options.hf_revision
+        )
+    materialized = materialize_hf_dataset_snapshot(
+        repo_id=repo_id,
+        local_dir=materialize_dir,
+        revision=options.hf_revision,
+    )
+    return materialized.resolve()
 def load_pr_search_snapshot(snapshot_dir: Path) -> dict[str, Any]:
     manifest = read_json(manifest_path) if manifest_path.exists() else {}
     pull_requests = read_parquet_rows(snapshot_dir / "pull_requests.parquet")
     pr_files = read_parquet_rows(snapshot_dir / "pr_files.parquet")
     repo = manifest.get("repo") or (pull_requests[0].get("repo") if pull_requests else None) or ""
     snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
     return {
         "manifest": manifest,
         "pull_requests": pull_requests,
         "pr_files": pr_files,
     }
     return {
         "pr_number": int(row["number"]),
         "github_id": row.get("github_id"),
         "state": row.get("state"),
         "draft": bool(row.get("draft")),
         "merged": bool(row.get("merged")),

src/slop_farmer/reports/pr_search_service.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 import json
-from collections.abc import Iterable, Mapping, Sequence
 from contextlib import suppress
 from pathlib import Path
 from typing import Any, Protocol
@@ -17,8 +17,6 @@ from slop_farmer.data.search_duckdb import (
     get_cluster,
     get_cluster_ids_for_prs,
     get_cluster_members,
-    get_contributor,
-    get_contributor_pulls,
     get_document,
     get_feature,
     get_pair_neighbor_row,
@@ -101,16 +99,6 @@ def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
             "pr_search_documents",
             _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
         )
-        insert_rows(
-            connection,
-            "pr_search_contributors",
-            _contributor_rows(
-                snapshot["contributors"],
-                run_id=run_id,
-                repo=repo,
-                snapshot_id=str(snapshot["snapshot_id"]),
-            ),
-        )
         insert_rows(
             connection,
             "pr_scope_features",
@@ -302,85 +290,6 @@ def get_pr_search_candidate_clusters(
         connection.close()
-def get_pr_search_contributor(
-    db_path: Path,
-    *,
-    author_login: str,
-    repo: str | None = None,
-) -> dict[str, Any]:
-    connection = connect_pr_search_db(db_path, read_only=True)
-    try:
-        active_run = resolve_active_run(connection, repo=repo)
-        run_id = str(active_run["id"])
-        contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
-        pulls = _document_rows(
-            get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
-        )
-        return {
-            "repo": active_run["repo"],
-            "snapshot_id": active_run["snapshot_id"],
-            "run_id": run_id,
-            "contributor": contributor,
-            "pulls": pulls,
-            "pull_count": len(pulls),
-        }
-    finally:
-        connection.close()
-def get_pr_search_contributor_pulls(
-    db_path: Path,
-    *,
-    author_login: str,
-    repo: str | None = None,
-    limit: int = 20,
-) -> dict[str, Any]:
-    connection = connect_pr_search_db(db_path, read_only=True)
-    try:
-        active_run = resolve_active_run(connection, repo=repo)
-        run_id = str(active_run["id"])
-        contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
-        pulls = _document_rows(
-            get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
-        )
-        return {
-            "repo": active_run["repo"],
-            "snapshot_id": active_run["snapshot_id"],
-            "run_id": run_id,
-            "contributor": contributor,
-            "pulls": pulls,
-            "pull_count": len(pulls),
-        }
-    finally:
-        connection.close()
-def get_pr_search_pull_contributor(
-    db_path: Path,
-    *,
-    pr_number: int,
-    repo: str | None = None,
-) -> dict[str, Any]:
-    connection = connect_pr_search_db(db_path, read_only=True)
-    try:
-        active_run = resolve_active_run(connection, repo=repo)
-        run_id = str(active_run["id"])
-        document = _require_document(connection, run_id=run_id, pr_number=pr_number)
-        author_login = str(document.get("author_login") or "").strip()
-        if not author_login:
-            raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
-        contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
-        return {
-            "repo": active_run["repo"],
-            "snapshot_id": active_run["snapshot_id"],
-            "run_id": run_id,
-            "pr": _without_json_fields(document),
-            "contributor": contributor,
-        }
-    finally:
-        connection.close()
 def get_pr_search_similar_lookup(
     db_path: Path,
     *,
@@ -892,15 +801,6 @@ def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[st
     return feature
-def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
-    contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
-    if contributor is None:
-        raise ValueError(
-            f"Contributor {author_login!r} was not found in the active indexed universe."
-        )
-    return _contributor_row(contributor)
 def _json_list(raw: Any) -> list[str]:
     if isinstance(raw, list):
         return [str(item) for item in raw]
@@ -938,71 +838,6 @@ def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
     return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
-def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
-    return [_without_json_fields(row) for row in rows]
-def _contributor_rows(
-    rows: list[Mapping[str, Any]],
-    *,
-    run_id: str,
-    repo: str,
-    snapshot_id: str,
-) -> list[dict[str, Any]]:
-    return [
-        {
-            "run_id": run_id,
-            "repo": repo,
-            "snapshot_id": snapshot_id,
-            "report_generated_at": row.get("report_generated_at"),
-            "window_days": row.get("window_days"),
-            "author_login": row.get("author_login"),
-            "name": row.get("name"),
-            "profile_url": row.get("profile_url"),
-            "repo_pull_requests_url": row.get("repo_pull_requests_url"),
-            "repo_issues_url": row.get("repo_issues_url"),
-            "repo_first_seen_at": row.get("repo_first_seen_at"),
-            "repo_last_seen_at": row.get("repo_last_seen_at"),
-            "repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
-            "repo_artifact_count": row.get("repo_artifact_count"),
-            "snapshot_issue_count": row.get("snapshot_issue_count"),
-            "snapshot_pr_count": row.get("snapshot_pr_count"),
-            "snapshot_comment_count": row.get("snapshot_comment_count"),
-            "snapshot_review_count": row.get("snapshot_review_count"),
-            "snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
-            "repo_association": row.get("repo_association"),
-            "new_to_repo": row.get("new_to_repo"),
-            "first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
-            "report_reason": row.get("report_reason"),
-            "account_age_days": row.get("account_age_days"),
-            "young_account": row.get("young_account"),
-            "follow_through_score": row.get("follow_through_score"),
-            "breadth_score": row.get("breadth_score"),
-            "automation_risk_signal": row.get("automation_risk_signal"),
-            "heuristic_note": row.get("heuristic_note"),
-            "public_orgs_json": row.get("public_orgs"),
-            "visible_authored_pr_count": row.get("visible_authored_pr_count"),
-            "merged_pr_count": row.get("merged_pr_count"),
-            "closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
-            "open_pr_count": row.get("open_pr_count"),
-            "merged_pr_rate": row.get("merged_pr_rate"),
-            "closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
-            "still_open_pr_rate": row.get("still_open_pr_rate"),
-            "distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
-            "distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
-            "fetch_error": row.get("fetch_error"),
-        }
-        for row in rows
-    ]
-def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
-    return {
-        **_without_json_fields(row),
-        "public_orgs": _json_list(row.get("public_orgs_json")),
-    }
 def _normalize_lookup_mode(mode: str) -> str:
     normalized = mode.strip().lower()
     if normalized not in {"auto", "indexed", "live"}:

 from __future__ import annotations
 import json
+from collections.abc import Iterable, Mapping
 from contextlib import suppress
 from pathlib import Path
 from typing import Any, Protocol
     get_cluster,
     get_cluster_ids_for_prs,
     get_cluster_members,
     get_document,
     get_feature,
     get_pair_neighbor_row,
             "pr_search_documents",
             _scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
         )
         insert_rows(
             connection,
             "pr_scope_features",
         connection.close()
 def get_pr_search_similar_lookup(
     db_path: Path,
     *,
     return feature
 def _json_list(raw: Any) -> list[str]:
     if isinstance(raw, list):
         return [str(item) for item in raw]
     return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
 def _normalize_lookup_mode(mode: str) -> str:
     normalized = mode.strip().lower()
     if normalized not in {"auto", "indexed", "live"}:

src/slop_farmer/reports/read_views.py ADDED Viewed

	@@ -0,0 +1,742 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal
+from slop_farmer.data.parquet_io import read_json, read_parquet_rows
+AnalysisVariant = Literal["auto", "hybrid", "deterministic"]
+@dataclass(frozen=True, slots=True)
+class _SnapshotMetadata:
+    """Immutable identity of a snapshot: the repo it covers and its id."""
+    # Repository name; an empty string when no source could supply one.
+    repo: str
+    # Manifest-provided id, or the snapshot directory name as a fallback.
+    snapshot_id: str
+@dataclass(frozen=True, slots=True)
+class _AnalysisSelection:
+    """The analysis report chosen for a request, plus how it was classified."""
+    # Location of the selected report file.
+    path: Path
+    # Parsed JSON body of the report.
+    payload: dict[str, Any]
+    # Variant label assigned to the report ("hybrid" or "deterministic").
+    variant_used: str
+    # Whether the report payload flagged LLM enrichment.
+    llm_enrichment: bool
+def get_snapshot_surfaces(snapshot_dir: Path) -> dict[str, Any]:
+    """Report which optional read surfaces a snapshot exposes.
+
+    Combines the issue-cluster status (resolved with the "auto" variant) and
+    the contributor-report status into one overview keyed by "issues" and
+    "contributors".
+    """
+    issues = get_issue_cluster_status(snapshot_dir, variant="auto")
+    contributors = get_contributor_status(snapshot_dir)
+    # "counts" may be absent or None; treat both as an empty mapping.
+    issue_counts = issues.get("counts") or {}
+    issue_surface = {
+        "available": issues["available"],
+        "variant_used": issues.get("variant_used"),
+        "llm_enrichment": issues.get("llm_enrichment"),
+        "generated_at": issues.get("generated_at"),
+        "cluster_count": issue_counts.get("meta_bugs", 0),
+        "duplicate_pr_count": issue_counts.get("duplicate_prs", 0),
+        "available_variants": issues.get("available_variants") or [],
+    }
+    contributor_surface = {
+        "available": contributors["available"],
+        "generated_at": contributors.get("generated_at"),
+        "contributor_count": contributors.get("contributor_count", 0),
+    }
+    return {"issues": issue_surface, "contributors": contributor_surface}
+def get_issue_cluster_status(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str, Any]:
+    """Describe the analysis report that would serve *variant* for a snapshot.
+
+    The result always carries the snapshot identity, the requested variant,
+    an "available" flag and the sorted set of variants found on disk. When no
+    report matches, the report-specific fields are None/False and every count
+    is zero.
+    """
+    snapshot = _snapshot_metadata(snapshot_dir)
+    reports = _analysis_candidates(snapshot_dir)
+    chosen = _select_analysis_report(reports, variant=variant)
+    variants_present = sorted({report["variant"] for report in reports})
+    if chosen is None:
+        details = {
+            "variant_used": None,
+            "llm_enrichment": False,
+            "generated_at": None,
+            "report_path": None,
+            "counts": {"meta_bugs": 0, "duplicate_issues": 0, "duplicate_prs": 0},
+        }
+    else:
+        details = {
+            "variant_used": chosen.variant_used,
+            "llm_enrichment": chosen.llm_enrichment,
+            "generated_at": chosen.payload.get("generated_at"),
+            # Only the file name is exposed, not the full path.
+            "report_path": chosen.path.name,
+            "counts": _analysis_counts(chosen.payload),
+        }
+    return {
+        "repo": snapshot.repo,
+        "snapshot_id": snapshot.snapshot_id,
+        "variant_requested": variant,
+        "available": chosen is not None,
+        "available_variants": variants_present,
+        **details,
+    }
+def list_issue_clusters(
+    snapshot_dir: Path,
+    *,
+    limit: int | None,
+    variant: AnalysisVariant,
+) -> dict[str, Any]:
+    """List issue-cluster summaries from the selected analysis report.
+
+    Args:
+        snapshot_dir: Snapshot directory holding the analysis reports.
+        limit: Maximum number of clusters to return; None returns all of them.
+            Negative values are treated as 0.
+        variant: Which analysis variant to read.
+
+    Returns:
+        The base analysis payload plus "clusters" (ranked from 1 in report
+        order, truncated to *limit*) and "cluster_count" (the untruncated
+        total). Both are empty/zero when no report is available.
+    """
+    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
+    base = _analysis_base_payload(metadata, selection, variant=variant)
+    if selection is None:
+        return {**base, "clusters": [], "cluster_count": 0}
+    clusters = [
+        _issue_cluster_summary(cluster, issue_map, pr_map, rank=index)
+        for index, cluster in enumerate(selection.payload.get("meta_bugs") or [], start=1)
+    ]
+    # A negative limit would otherwise slice from the end and silently drop
+    # the trailing clusters instead of returning none.
+    visible = clusters if limit is None else clusters[: max(limit, 0)]
+    return {
+        **base,
+        "clusters": visible,
+        "cluster_count": len(clusters),
+    }
+def get_issue_cluster(
+    snapshot_dir: Path,
+    *,
+    cluster_id: str,
+    variant: AnalysisVariant,
+) -> dict[str, Any]:
+    """Return one issue cluster with its member issues and pull requests.
+
+    Args:
+        snapshot_dir: Snapshot directory holding the analysis reports.
+        cluster_id: Id to look up. Stored ids are matched first; if none
+            matches, the "cluster-<rank>" label that listings assign to
+            clusters without a stored id is accepted as well.
+        variant: Which analysis variant to read.
+
+    Returns:
+        The base analysis payload plus "cluster_id", "found", "cluster",
+        "issues" and "pull_requests". When no report is available or the id
+        is unknown, "found" is False, "cluster" is None and both member lists
+        are empty.
+    """
+    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
+    base = _analysis_base_payload(metadata, selection, variant=variant)
+    not_found = {
+        **base,
+        "cluster_id": cluster_id,
+        "found": False,
+        "cluster": None,
+        "issues": [],
+        "pull_requests": [],
+    }
+    if selection is None:
+        return not_found
+    ranked = list(enumerate(selection.payload.get("meta_bugs") or [], start=1))
+    # Stored ids take precedence so existing lookups resolve exactly as before.
+    cluster = next(
+        (row for _, row in ranked if str(row.get("cluster_id") or "") == cluster_id),
+        None,
+    )
+    synthesized = False
+    if cluster is None:
+        # Listings label id-less clusters "cluster-<rank>"; accept that label
+        # here so every listed id can also be fetched.
+        cluster = next(
+            (
+                row
+                for rank, row in ranked
+                if not row.get("cluster_id") and f"cluster-{rank}" == cluster_id
+            ),
+            None,
+        )
+        synthesized = cluster is not None
+    if cluster is None:
+        return not_found
+    summary = _issue_cluster_summary(cluster, issue_map, pr_map)
+    if synthesized:
+        # Without a rank the summary would fall back to "cluster-0".
+        summary["cluster_id"] = cluster_id
+    issue_numbers = _ordered_ints(cluster.get("issue_numbers"))
+    pr_numbers = _ordered_ints(cluster.get("pr_numbers"))
+    canonical_pr_number = _coerce_int(cluster.get("canonical_pr_number"))
+    return {
+        **base,
+        "cluster_id": cluster_id,
+        "found": True,
+        "cluster": {
+            **summary,
+            "canonical_issue_reason": cluster.get("canonical_issue_reason"),
+            "canonical_pr_reason": cluster.get("canonical_pr_reason"),
+            "best_issue_reason": cluster.get("best_issue_reason"),
+            "best_pr_reason": cluster.get("best_pr_reason"),
+        },
+        "issues": [_issue_member_row(number, issue_map.get(number)) for number in issue_numbers],
+        "pull_requests": [
+            _pr_member_row(
+                number,
+                pr_map.get(number),
+                role="canonical" if canonical_pr_number == number else "member",
+            )
+            for number in pr_numbers
+        ],
+    }
+def get_issue_clusters_for_pr(
+    snapshot_dir: Path,
+    *,
+    pr_number: int,
+    variant: AnalysisVariant,
+) -> dict[str, Any]:
+    """Find every issue cluster whose PR list contains *pr_number*.
+
+    Each match is the cluster summary (ranked by its position in the report,
+    starting at 1) plus a "membership_role" of "canonical" or "member".
+    """
+    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
+    base = _analysis_base_payload(metadata, selection, variant=variant)
+    if selection is None:
+        return {**base, "pr_number": pr_number, "found": False, "clusters": [], "cluster_count": 0}
+    hits = [
+        {
+            **_issue_cluster_summary(cluster, issue_map, pr_map, rank=position),
+            "membership_role": (
+                "canonical"
+                if _coerce_int(cluster.get("canonical_pr_number")) == pr_number
+                else "member"
+            ),
+        }
+        for position, cluster in enumerate(selection.payload.get("meta_bugs") or [], start=1)
+        if pr_number in _ordered_ints(cluster.get("pr_numbers"))
+    ]
+    return {
+        **base,
+        "pr_number": pr_number,
+        "found": bool(hits),
+        "clusters": hits,
+        "cluster_count": len(hits),
+    }
+def check_issue_cluster_membership(
+    snapshot_dir: Path,
+    *,
+    pr_number: int,
+    cluster_id: str | None,
+    variant: AnalysisVariant,
+) -> dict[str, Any]:
+    """Check whether a PR belongs to a given cluster, or to any cluster.
+
+    With ``cluster_id=None`` the result is "matched" when the PR appears in at
+    least one cluster. Otherwise it is "matched" only when the named cluster
+    is among the PR's clusters, and a "membership" entry holds that cluster's
+    row (or None).
+    """
+    lookup = get_issue_clusters_for_pr(snapshot_dir, pr_number=pr_number, variant=variant)
+    clusters = list(lookup.get("clusters") or [])
+    member_of = [str(row.get("cluster_id")) for row in clusters if row.get("cluster_id")]
+    if cluster_id is None:
+        matched = bool(member_of)
+        extra: dict[str, Any] = {}
+    else:
+        membership = None
+        for row in clusters:
+            if row.get("cluster_id") == cluster_id:
+                membership = row
+                break
+        matched = membership is not None
+        extra = {"membership": membership}
+    return {
+        **lookup,
+        "cluster_id": cluster_id,
+        "matched": matched,
+        "matching_cluster_ids": member_of,
+        **extra,
+    }
+def list_issue_duplicate_prs(
+    snapshot_dir: Path,
+    *,
+    limit: int | None,
+    variant: AnalysisVariant,
+) -> dict[str, Any]:
+    """List duplicate-PR groups from the selected analysis report.
+
+    Args:
+        snapshot_dir: Snapshot directory holding the analysis reports.
+        limit: Maximum number of groups to return; None returns all of them.
+            Negative values are treated as 0.
+        variant: Which analysis variant to read.
+
+    Returns:
+        The base analysis payload plus "duplicate_prs" (ranked from 1 in
+        report order, truncated to *limit*) and "duplicate_pr_count" (the
+        untruncated total). Both are empty/zero when no report is available.
+    """
+    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
+    base = _analysis_base_payload(metadata, selection, variant=variant)
+    if selection is None:
+        return {**base, "duplicate_prs": [], "duplicate_pr_count": 0}
+    rows = [
+        _duplicate_pr_summary(entry, issue_map, pr_map, rank=index)
+        for index, entry in enumerate(selection.payload.get("duplicate_prs") or [], start=1)
+    ]
+    # A negative limit would otherwise slice from the end and silently drop
+    # the trailing rows instead of returning none.
+    visible = rows if limit is None else rows[: max(limit, 0)]
+    return {
+        **base,
+        "duplicate_prs": visible,
+        "duplicate_pr_count": len(rows),
+    }
+def get_issue_best(snapshot_dir: Path, *, variant: AnalysisVariant) -> dict[str, Any]:
+    """Return the report's "best issue" and "best PR" picks, if any.
+
+    Both entries are None when no analysis report is available for *variant*.
+    """
+    metadata, selection, issue_map, pr_map = _analysis_context(snapshot_dir, variant=variant)
+    result = _analysis_base_payload(metadata, selection, variant=variant)
+    if selection is None:
+        top_issue = None
+        top_pr = None
+    else:
+        report = selection.payload
+        top_issue = _best_issue_summary(report.get("best_issue"), issue_map)
+        top_pr = _best_pr_summary(report.get("best_pr"), pr_map)
+    return {**result, "best_issue": top_issue, "best_pr": top_pr}
+def get_contributor_status(snapshot_dir: Path) -> dict[str, Any]:
+    """Summarize the snapshot's new-contributors report.
+
+    Repo and snapshot id come from the report when it provides them and from
+    the snapshot metadata otherwise. "available" is False when the report is
+    missing or empty.
+    """
+    snapshot = _snapshot_metadata(snapshot_dir)
+    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
+    entries = report.get("contributors")
+    if not isinstance(entries, list):
+        # Anything other than a list counts as "no contributors".
+        entries = []
+    repo = report.get("repo") or snapshot.repo
+    snapshot_id = report.get("snapshot_id") or snapshot.snapshot_id
+    return {
+        "repo": str(repo),
+        "snapshot_id": str(snapshot_id),
+        "available": bool(report),
+        "generated_at": report.get("generated_at"),
+        "window_days": _coerce_int(report.get("window_days")),
+        "contributor_count": len(entries),
+    }
+def list_contributors(snapshot_dir: Path, *, limit: int | None) -> dict[str, Any]:
+    """List contributor summaries from the new-contributors report.
+
+    Args:
+        snapshot_dir: Snapshot directory holding the report.
+        limit: Maximum number of contributors to return; None returns all of
+            them. Negative values are treated as 0.
+
+    Returns:
+        The contributor status plus "contributors" (truncated to *limit*) and
+        "contributor_count" (the untruncated number of summarized entries).
+        Entries that are not dicts are skipped; ranks reflect each entry's
+        position in the report, starting at 1.
+    """
+    status = get_contributor_status(snapshot_dir)
+    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
+    rows = [
+        _contributor_summary(entry, rank=index)
+        for index, entry in enumerate(report.get("contributors") or [], start=1)
+        if isinstance(entry, dict)
+    ]
+    # A negative limit would otherwise slice from the end and silently drop
+    # the trailing rows instead of returning none.
+    visible = rows if limit is None else rows[: max(limit, 0)]
+    return {
+        **status,
+        "contributors": visible,
+        "contributor_count": len(rows),
+    }
+def get_contributor(snapshot_dir: Path, *, author_login: str) -> dict[str, Any]:
+    """Look up one contributor in the new-contributors report.
+
+    The login is matched case-insensitively. When the contributor is missing,
+    "found" is False and "summary", "risk" and "contributor" are None.
+    """
+    status = get_contributor_status(snapshot_dir)
+    report = _read_optional_json(snapshot_dir / "new-contributors-report.json")
+    entry = _find_contributor(report.get("contributors") or [], author_login)
+    found = entry is not None
+    return {
+        **status,
+        # Prefer the login as recorded in the report over the caller's spelling.
+        "author_login": str(entry.get("author_login") or author_login) if found else author_login,
+        "found": found,
+        "summary": _contributor_summary(entry) if found else None,
+        "risk": _contributor_risk(entry) if found else None,
+        "contributor": entry,
+    }
+def get_contributor_risk(snapshot_dir: Path, *, author_login: str) -> dict[str, Any]:
+    """Return only the risk view of a contributor lookup.
+
+    Copies the identifying fields from ``get_contributor`` and adds the risk
+    block together with a "risk_available" flag.
+    """
+    lookup = get_contributor(snapshot_dir, author_login=author_login)
+    carried = ("repo", "snapshot_id", "available", "generated_at", "author_login", "found")
+    result: dict[str, Any] = {field: lookup.get(field) for field in carried}
+    risk = lookup.get("risk")
+    result["risk_available"] = risk is not None
+    result["risk"] = risk
+    return result
+def _analysis_context(
+    snapshot_dir: Path,
+    *,
+    variant: AnalysisVariant,
+) -> tuple[_SnapshotMetadata, _AnalysisSelection | None, dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
+    """Gather everything the issue read views need for one snapshot.
+
+    Returns the snapshot metadata, the selected report (or None), and the
+    issue and pull-request rows keyed by number.
+    """
+    snapshot = _snapshot_metadata(snapshot_dir)
+    reports = _analysis_candidates(snapshot_dir)
+    chosen = _select_analysis_report(reports, variant=variant)
+    issues_by_number, prs_by_number = _artifact_maps(snapshot_dir)
+    return snapshot, chosen, issues_by_number, prs_by_number
+def _analysis_base_payload(
+    metadata: _SnapshotMetadata,
+    selection: _AnalysisSelection | None,
+    *,
+    variant: AnalysisVariant,
+) -> dict[str, Any]:
+    """Build the fields shared by every issue read-view response.
+
+    Without a selected report the report-specific fields are None/False.
+    """
+    has_report = selection is not None
+    return {
+        "repo": metadata.repo,
+        "snapshot_id": metadata.snapshot_id,
+        "variant_requested": variant,
+        "available": has_report,
+        "variant_used": selection.variant_used if has_report else None,
+        "llm_enrichment": selection.llm_enrichment if has_report else False,
+        "generated_at": selection.payload.get("generated_at") if has_report else None,
+    }
+def _analysis_candidates(snapshot_dir: Path) -> list[dict[str, Any]]:
+    """Load every non-empty analysis report in the snapshot and classify it.
+
+    Each candidate records the report path, its payload, the variant label
+    assigned to it and whether the payload flags LLM enrichment.
+    """
+    found: list[dict[str, Any]] = []
+    for report_path in _analysis_report_paths(snapshot_dir):
+        report = _read_optional_json(report_path)
+        if report:
+            enriched = bool(report.get("llm_enrichment"))
+            label = _analysis_variant(report_path.name, report, llm_enrichment=enriched)
+            found.append(
+                {
+                    "path": report_path,
+                    "payload": report,
+                    "variant": label,
+                    "llm_enrichment": enriched,
+                }
+            )
+    return found
+def _select_analysis_report(
+    candidates: list[dict[str, Any]],
+    *,
+    variant: AnalysisVariant,
+) -> _AnalysisSelection | None:
+    """Pick the best candidate report for *variant*, or None if none fits.
+
+    "auto" considers every candidate; any other variant considers only the
+    candidates carrying that label. Ties keep the earliest candidate.
+    """
+    if variant == "auto":
+        pool = candidates
+        ranker = _analysis_auto_priority
+    else:
+        pool = [candidate for candidate in candidates if candidate["variant"] == variant]
+        ranker = _analysis_specific_priority
+    if not pool:
+        return None
+    # min() returns the first of several equally ranked items, which is the
+    # same winner a stable sort would place first.
+    best = min(pool, key=ranker)
+    return _AnalysisSelection(
+        path=Path(best["path"]),
+        payload=dict(best["payload"]),
+        variant_used=str(best["variant"]),
+        llm_enrichment=bool(best["llm_enrichment"]),
+    )
+def _analysis_report_paths(snapshot_dir: Path) -> list[Path]:
+    """Return the analysis report files that exist in *snapshot_dir*.
+
+    The three well-known report names come first, in a fixed order, followed
+    by any other ``analysis-report*.json`` files in sorted order.
+    """
+    well_known = (
+        "analysis-report-hybrid.json",
+        "analysis-report-deterministic.json",
+        "analysis-report.json",
+    )
+    candidates = [snapshot_dir / name for name in well_known]
+    for extra in sorted(snapshot_dir.glob("analysis-report*.json")):
+        if extra.name not in well_known:
+            candidates.append(extra)
+    return [candidate for candidate in candidates if candidate.exists()]
+def _analysis_auto_priority(candidate: dict[str, Any]) -> tuple[int, str]:
+    """Sort key for "auto" selection; lower tuples win.
+
+    Tiers: the hybrid report file, then any LLM-enriched report, then the
+    plain ``analysis-report.json``, then everything else. The file name breaks
+    ties within a tier.
+    """
+    name = Path(candidate["path"]).name
+    if name == "analysis-report-hybrid.json":
+        tier = 0
+    elif bool(candidate.get("llm_enrichment")):
+        tier = 1
+    elif name == "analysis-report.json":
+        tier = 2
+    else:
+        tier = 3
+    return (tier, name)
+def _analysis_specific_priority(candidate: dict[str, Any]) -> tuple[int, str]:
+    """Sort key when a specific variant was requested; lower tuples win.
+
+    A file named after the candidate's own variant ranks first, the plain
+    ``analysis-report.json`` second, anything else last.
+    """
+    name = Path(candidate["path"]).name
+    if name.endswith(f"-{candidate['variant']}.json"):
+        return (0, name)
+    return (1 if name == "analysis-report.json" else 2, name)
+def _analysis_variant(path_name: str, payload: dict[str, Any], *, llm_enrichment: bool) -> str:
+    """Classify a report as "hybrid" or "deterministic".
+
+    The file name decides first, then a recognized ``variant_used`` string in
+    the payload, and finally the LLM-enrichment flag.
+    """
+    name = path_name.lower()
+    for label in ("hybrid", "deterministic"):
+        if label in name:
+            return label
+    declared = payload.get("variant_used")
+    if isinstance(declared, str):
+        declared = declared.strip().lower()
+        if declared in {"hybrid", "deterministic"}:
+            return declared
+    if llm_enrichment:
+        return "hybrid"
+    return "deterministic"
+def _analysis_counts(payload: dict[str, Any]) -> dict[str, int]:
+    """Count the entries in each top-level list of an analysis report.
+
+    A missing or empty section counts as zero.
+    """
+    sections = ("meta_bugs", "duplicate_issues", "duplicate_prs")
+    return {section: len(payload.get(section) or []) for section in sections}
+def _artifact_maps(snapshot_dir: Path) -> tuple[dict[int, dict[str, Any]], dict[int, dict[str, Any]]]:
+    """Load the snapshot's issue and pull-request rows, keyed by number.
+
+    A missing parquet file yields an empty map. Rows whose "number" cannot be
+    coerced to an int are left out.
+    """
+    def _load(filename: str) -> list[dict[str, Any]]:
+        table = snapshot_dir / filename
+        return read_parquet_rows(table) if table.exists() else []
+    def _by_number(rows: list[dict[str, Any]]) -> dict[int, dict[str, Any]]:
+        return {
+            int(row["number"]): row
+            for row in rows
+            if _coerce_int(row.get("number")) is not None
+        }
+    # Both tables are read before either is indexed.
+    issue_rows = _load("issues.parquet")
+    pr_rows = _load("pull_requests.parquet")
+    return _by_number(issue_rows), _by_number(pr_rows)
+def _issue_cluster_summary(
+    cluster: dict[str, Any],
+    issue_map: dict[int, dict[str, Any]],
+    pr_map: dict[int, dict[str, Any]],
+    *,
+    rank: int | None = None,
+) -> dict[str, Any]:
+    """Flatten one cluster record into a summary row.
+
+    Titles and URLs for the canonical issue and PR are resolved through the
+    snapshot maps. A cluster without a stored id is labelled
+    ``cluster-<rank>`` (``cluster-0`` when no rank is given).
+    """
+    lead_issue = _coerce_int(cluster.get("canonical_issue_number"))
+    lead_pr = _coerce_int(cluster.get("canonical_pr_number"))
+    member_issues = _ordered_ints(cluster.get("issue_numbers"))
+    member_prs = _ordered_ints(cluster.get("pr_numbers"))
+    label = str(cluster.get("cluster_id") or f"cluster-{rank or 0}")
+    # Falsy evidence entries are dropped; the rest are stringified.
+    evidence = [str(kind) for kind in (cluster.get("evidence_types") or []) if kind]
+    return {
+        "rank": rank,
+        "cluster_id": label,
+        "title": _cluster_title(cluster, issue_map, pr_map, lead_issue, lead_pr),
+        "summary": cluster.get("summary"),
+        "status": cluster.get("status"),
+        "confidence": _coerce_float(cluster.get("confidence")),
+        "canonical_issue_number": lead_issue,
+        "canonical_issue_title": _title_for_issue(lead_issue, issue_map),
+        "canonical_issue_url": _url_for_issue(lead_issue, issue_map),
+        "canonical_pr_number": lead_pr,
+        "canonical_pr_title": _title_for_pr(lead_pr, pr_map),
+        "canonical_pr_url": _url_for_pr(lead_pr, pr_map),
+        "issue_numbers": member_issues,
+        "issue_count": len(member_issues),
+        "pr_numbers": member_prs,
+        "pr_count": len(member_prs),
+        "evidence_types": evidence,
+        "github_url": _cluster_url(lead_issue, lead_pr, issue_map, pr_map),
+    }
+def _cluster_title(
+    cluster: dict[str, Any],
+    issue_map: dict[int, dict[str, Any]],
+    pr_map: dict[int, dict[str, Any]],
+    canonical_issue_number: int | None,
+    canonical_pr_number: int | None,
+) -> str:
+    """Choose a display title for a cluster.
+
+    Preference order: canonical issue title, canonical PR title, the stripped
+    cluster summary, then the cluster id (or the literal "cluster").
+    """
+    # ``or`` short-circuits, so later fallbacks are only evaluated when needed.
+    return (
+        _title_for_issue(canonical_issue_number, issue_map)
+        or _title_for_pr(canonical_pr_number, pr_map)
+        or str(cluster.get("summary") or "").strip()
+        or str(cluster.get("cluster_id") or "cluster")
+    )
+def _cluster_url(
+    canonical_issue_number: int | None,
+    canonical_pr_number: int | None,
+    issue_map: dict[int, dict[str, Any]],
+    pr_map: dict[int, dict[str, Any]],
+) -> str | None:
+    """Return the canonical issue's URL, falling back to the canonical PR's."""
+    issue_url = _url_for_issue(canonical_issue_number, issue_map)
+    if issue_url:
+        return issue_url
+    return _url_for_pr(canonical_pr_number, pr_map)
+def _duplicate_pr_summary(
+    entry: dict[str, Any],
+    issue_map: dict[int, dict[str, Any]],
+    pr_map: dict[int, dict[str, Any]],
+    *,
+    rank: int,
+) -> dict[str, Any]:
+    """Flatten one duplicate-PR record into a summary row.
+
+    An entry without a stored cluster id is labelled ``duplicate-pr-<rank>``.
+    """
+    keeper = _coerce_int(entry.get("canonical_pr_number"))
+    target = _coerce_int(entry.get("target_issue_number"))
+    redundant = _ordered_ints(entry.get("duplicate_pr_numbers"))
+    label = str(entry.get("cluster_id") or f"duplicate-pr-{rank}")
+    return {
+        "rank": rank,
+        "cluster_id": label,
+        "canonical_pr_number": keeper,
+        "canonical_pr_title": _title_for_pr(keeper, pr_map),
+        "canonical_pr_url": _url_for_pr(keeper, pr_map),
+        "target_issue_number": target,
+        "target_issue_title": _title_for_issue(target, issue_map),
+        "target_issue_url": _url_for_issue(target, issue_map),
+        "duplicate_pr_numbers": redundant,
+        "duplicate_pr_count": len(redundant),
+        "reason": entry.get("reason"),
+    }
+def _best_issue_summary(entry: Any, issue_map: dict[int, dict[str, Any]]) -> dict[str, Any] | None:
+    """Summarize the report's "best issue" entry, or None if it is not a dict."""
+    if isinstance(entry, dict):
+        number = _coerce_int(entry.get("issue_number"))
+        return {
+            "cluster_id": entry.get("cluster_id"),
+            "issue_number": number,
+            "title": _title_for_issue(number, issue_map),
+            "url": _url_for_issue(number, issue_map),
+            "reason": entry.get("reason"),
+            "score": _coerce_float(entry.get("score")),
+        }
+    return None
+def _best_pr_summary(entry: Any, pr_map: dict[int, dict[str, Any]]) -> dict[str, Any] | None:
+    """Summarize the report's "best PR" entry, or None if it is not a dict."""
+    if isinstance(entry, dict):
+        number = _coerce_int(entry.get("pr_number"))
+        return {
+            "cluster_id": entry.get("cluster_id"),
+            "pr_number": number,
+            "title": _title_for_pr(number, pr_map),
+            "url": _url_for_pr(number, pr_map),
+            "reason": entry.get("reason"),
+            "score": _coerce_float(entry.get("score")),
+        }
+    return None
+def _issue_member_row(number: int, row: dict[str, Any] | None) -> dict[str, Any]:
+    """Project an issue row onto the fields shown for cluster members.
+
+    A missing row produces the same keys with None values.
+    """
+    source = row or {}
+    copied = ("title", "state", "author_login", "created_at", "updated_at", "html_url")
+    member: dict[str, Any] = {"number": number}
+    for field in copied:
+        member[field] = source.get(field)
+    return member
+def _pr_member_row(number: int, row: dict[str, Any] | None, *, role: str) -> dict[str, Any]:
+    """Project a pull-request row onto the fields shown for cluster members.
+
+    A missing row produces the same keys with None values; "draft" and
+    "merged" are always booleans.
+    """
+    source = row or {}
+    member: dict[str, Any] = {"number": number, "role": role}
+    for field in ("title", "author_login", "state"):
+        member[field] = source.get(field)
+    for flag in ("draft", "merged"):
+        member[flag] = bool(source.get(flag))
+    for field in ("author_association", "created_at", "updated_at", "html_url"):
+        member[field] = source.get(field)
+    return member
+def _contributor_summary(contributor: dict[str, Any], *, rank: int | None = None) -> dict[str, Any]:
+    """Flatten one contributor report entry into a summary row.
+
+    The two public activity counts are read from the nested "activity"
+    mapping when the entry has one. The snapshot PR and issue counts default
+    to 0.
+    """
+    activity = contributor.get("activity")
+    if not isinstance(activity, dict):
+        activity = {}
+    pr_count = _coerce_int(contributor.get("snapshot_pr_count")) or 0
+    issue_count = _coerce_int(contributor.get("snapshot_issue_count")) or 0
+    return {
+        "rank": rank,
+        "author_login": contributor.get("author_login"),
+        "name": contributor.get("name"),
+        "profile_url": contributor.get("profile_url"),
+        "repo_association": contributor.get("repo_association"),
+        "first_seen_in_snapshot": contributor.get("first_seen_in_snapshot"),
+        "new_to_repo": contributor.get("new_to_repo"),
+        "snapshot_pr_count": pr_count,
+        "snapshot_issue_count": issue_count,
+        "follow_through_score": contributor.get("follow_through_score"),
+        "breadth_score": contributor.get("breadth_score"),
+        "automation_risk_signal": contributor.get("automation_risk_signal"),
+        "heuristic_note": contributor.get("heuristic_note"),
+        "account_age_days": _coerce_int(contributor.get("account_age_days")),
+        "public_pr_count_42d": _coerce_int(activity.get("visible_authored_pr_count")),
+        "public_repo_count_42d": _coerce_int(activity.get("distinct_repos_with_authored_prs")),
+        "repo_pull_requests_url": contributor.get("repo_pull_requests_url"),
+        "repo_issues_url": contributor.get("repo_issues_url"),
+    }
+def _contributor_risk(contributor: dict[str, Any]) -> dict[str, Any]:
+    """Extract the risk-related fields from one contributor report entry.
+
+    The two public activity counts are read from the nested "activity"
+    mapping when the entry has one.
+    """
+    activity = contributor.get("activity")
+    if not isinstance(activity, dict):
+        activity = {}
+    risk: dict[str, Any] = {
+        field: contributor.get(field)
+        for field in (
+            "automation_risk_signal",
+            "heuristic_note",
+            "follow_through_score",
+            "breadth_score",
+        )
+    }
+    risk["account_age_days"] = _coerce_int(contributor.get("account_age_days"))
+    risk["public_pr_count_42d"] = _coerce_int(activity.get("visible_authored_pr_count"))
+    risk["public_repo_count_42d"] = _coerce_int(activity.get("distinct_repos_with_authored_prs"))
+    risk["report_reason"] = contributor.get("report_reason")
+    return risk
+def _find_contributor(entries: list[Any], author_login: str) -> dict[str, Any] | None:
+    """Return the first dict entry whose login matches, ignoring case.
+
+    Entries that are not dicts are skipped; None is returned on no match.
+    """
+    wanted = author_login.casefold()
+    return next(
+        (
+            candidate
+            for candidate in entries
+            if isinstance(candidate, dict)
+            and str(candidate.get("author_login") or "").casefold() == wanted
+        ),
+        None,
+    )
+def _snapshot_metadata(snapshot_dir: Path) -> _SnapshotMetadata:
+    """Resolve the repo and snapshot id for a snapshot directory.
+
+    The manifest wins when it has the value. Otherwise the repo is inferred
+    from the snapshot's other files (empty string if that fails) and the
+    snapshot id falls back to the directory name.
+    """
+    manifest = _read_optional_json(snapshot_dir / "manifest.json")
+    repo = manifest.get("repo")
+    if not repo:
+        # Inference only runs when the manifest gives no repo.
+        repo = _infer_repo(snapshot_dir) or ""
+    snapshot_id = manifest.get("snapshot_id") or snapshot_dir.name
+    return _SnapshotMetadata(repo=str(repo), snapshot_id=str(snapshot_id))
+def _infer_repo(snapshot_dir: Path) -> str | None:
+    """Guess the repo name from the snapshot's data files.
+
+    Checks the first row of the pull-request and issue tables, then the
+    analysis reports, then the new-contributors report. Returns None when
+    none of them names a repo.
+    """
+    for table_name in ("pull_requests.parquet", "issues.parquet"):
+        table = snapshot_dir / table_name
+        if table.exists():
+            rows = read_parquet_rows(table)
+            if rows and rows[0].get("repo"):
+                return str(rows[0]["repo"])
+    json_sources = [
+        *_analysis_report_paths(snapshot_dir),
+        snapshot_dir / "new-contributors-report.json",
+    ]
+    for source in json_sources:
+        document = _read_optional_json(source)
+        if document.get("repo"):
+            return str(document["repo"])
+    return None
+def _title_for_issue(number: int | None, issue_map: dict[int, dict[str, Any]]) -> str | None:
+    """Return the title of issue *number*, or None if unknown or empty."""
+    if number is None:
+        return None
+    try:
+        issue = issue_map[number]
+    except KeyError:
+        return None
+    title = issue.get("title")
+    if not title:
+        return None
+    return str(title)
+def _url_for_issue(number: int | None, issue_map: dict[int, dict[str, Any]]) -> str | None:
+    """Return the ``html_url`` of issue *number*, or None if unknown or empty."""
+    if number is None:
+        return None
+    try:
+        issue = issue_map[number]
+    except KeyError:
+        return None
+    url = issue.get("html_url")
+    if not url:
+        return None
+    return str(url)
+def _title_for_pr(number: int | None, pr_map: dict[int, dict[str, Any]]) -> str | None:
+    """Return the title of pull request *number*, or None if unknown or empty."""
+    if number is None:
+        return None
+    try:
+        pull = pr_map[number]
+    except KeyError:
+        return None
+    title = pull.get("title")
+    if not title:
+        return None
+    return str(title)
+def _url_for_pr(number: int | None, pr_map: dict[int, dict[str, Any]]) -> str | None:
+    """Return the ``html_url`` of pull request *number*, or None if unknown or empty."""
+    if number is None:
+        return None
+    try:
+        pull = pr_map[number]
+    except KeyError:
+        return None
+    url = pull.get("html_url")
+    if not url:
+        return None
+    return str(url)
+def _ordered_ints(values: Any) -> list[int]:
+    """Coerce a list to ints, keeping input order and dropping bad entries.
+
+    Anything that is not a list yields an empty list.
+    """
+    if not isinstance(values, list):
+        return []
+    return [number for raw in values if (number := _coerce_int(raw)) is not None]
+def _coerce_int(value: Any) -> int | None:
+    """Convert *value* to ``int``, returning None when that is not possible.
+
+    None is returned for None and for any value ``int()`` rejects, including
+    non-numeric strings and non-finite floats (NaN, +/-infinity).
+    """
+    if value is None:
+        return None
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError covers int(float("inf")); json.loads produces such a
+        # float for the non-standard literal "Infinity".
+        return None
+def _coerce_float(value: Any) -> float | None:
+    """Convert *value* to ``float``, returning None when that is not possible.
+
+    None is returned for None and for any value ``float()`` rejects, including
+    non-numeric strings and integers too large to represent as a float.
+    """
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: float() of an int beyond the float range, e.g. 10**400.
+        return None
+def _read_optional_json(path: Path) -> dict[str, Any]:
+    """Read a JSON object from *path*, or return an empty dict.
+
+    The empty dict covers both a missing file and a top-level value that is
+    not a dict.
+    """
+    if path.exists():
+        document = read_json(path)
+        if isinstance(document, dict):
+            return document
+    return {}

uv.lock CHANGED Viewed

@@ -561,7 +561,7 @@ wheels = [
 [[package]]
 name = "fast-agent-mcp"
-version = "0.6.17"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "a2a-sdk" },
@@ -598,9 +598,9 @@ dependencies = [
     { name = "uvloop", marker = "sys_platform != 'win32'" },
     { name = "watchfiles" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8c/a1/b6b1045345d38b342da3def7723a2dc6a44faff9c01fee6d81afbd272d62/fast_agent_mcp-0.6.17.tar.gz", hash = "sha256:a920113d47ef2ab82be1bd63b77d3bf78f8f862a5a6e91f1fd0aa931850fb25f", size = 2091401, upload-time = "2026-04-16T21:48:43.334Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b4/ef/47e05d6fa95e04ed8ad60afac3ae29d8205894fb220ffde193bd33578f3a/fast_agent_mcp-0.6.17-py3-none-any.whl", hash = "sha256:a23c5a5ed8924e38809dabd31f994e5cc81b8c084e84632bb1eb246b257c4752", size = 1573794, upload-time = "2026-04-16T21:48:38.999Z" },
 ]
 [[package]]
@@ -2366,7 +2366,7 @@ wheels = [
 [[package]]
 name = "slop-farmer"
-version = "0.1.1"
 source = { editable = "." }
 dependencies = [
     { name = "duckdb" },

 [[package]]
 name = "fast-agent-mcp"
+version = "0.6.18"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "a2a-sdk" },
     { name = "uvloop", marker = "sys_platform != 'win32'" },
     { name = "watchfiles" },
 ]
+sdist = { url = "https://files.pythonhosted.org/packages/68/9f/a66344581177eb70cd817a58a3305c4b2c2b5f98661129c2cecc4aa36e77/fast_agent_mcp-0.6.18.tar.gz", hash = "sha256:5ee5624890a9670b6f1a912998807e0fd451aa1c7205d189a964764a988c7bc0", size = 2091443, upload-time = "2026-04-17T20:52:25.84Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/49/63/d8942bde2e706c869f93835ea85a2015be0edf5772c4e9ec8939a1001172/fast_agent_mcp-0.6.18-py3-none-any.whl", hash = "sha256:67c0c011763a28b8d5779b5d4d5cdc61e6f3dbc8cd1a7227388229957429835f", size = 1573842, upload-time = "2026-04-17T20:52:28.807Z" },
 ]
 [[package]]
 [[package]]
 name = "slop-farmer"
+version = "0.1.0"
 source = { editable = "." }
 dependencies = [
     { name = "duckdb" },