# src/slop_farmer/reports/pr_search_service.py
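"""Service layer for the PR scope-similarity search index.

Refreshes DuckDB-backed search runs from PR snapshots and serves
indexed lookups (similar PRs, clusters, contributors) as well as
live GitHub probes ranked against the active run.
"""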
from __future__ import annotations
import json
from collections.abc import Iterable, Mapping, Sequence
from contextlib import suppress
from pathlib import Path
from typing import Any, Protocol
from uuid import uuid4
from slop_farmer.config import PrSearchRefreshOptions, RepoRef, resolve_github_token
from slop_farmer.data.github_api import GitHubClient
from slop_farmer.data.normalize import normalize_pr_file, normalize_pull_request
from slop_farmer.data.search_duckdb import (
connect_pr_search_db,
fetch_rows,
get_candidate_cluster_rows,
get_cluster,
get_cluster_ids_for_prs,
get_cluster_members,
get_contributor,
get_contributor_pulls,
get_document,
get_feature,
get_pair_neighbor_row,
get_run_counts,
get_scope_run_artifact,
get_shared_cluster_ids,
get_similar_pr_rows,
insert_rows,
replace_active_run,
resolve_active_run,
update_run_status,
)
from slop_farmer.reports.pr_scope import PrScopeClusterOptions
from slop_farmer.reports.pr_search_scope import (
build_pr_scope_search_artifacts,
build_scope_feature_for_pull_request,
build_scope_feature_idf_for_indexed_documents,
iso_timestamp,
load_pr_search_snapshot,
rank_scope_cluster_candidates,
rank_scope_feature_matches,
resolve_pr_search_snapshot_dir,
scope_feature_pair_explanation,
scope_options_from_settings,
)
class ProbeClientLike(Protocol):
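    """Minimal client surface required for live PR probes."""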
def get_pull_request(self, owner: str, repo: str, number: int) -> dict[str, Any]: ...
def iter_pull_files(self, owner: str, repo: str, number: int) -> Iterable[dict[str, Any]]: ...
def run_pr_search_refresh(options: PrSearchRefreshOptions) -> dict[str, Any]:
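    """Build a new search run from a snapshot and persist it to DuckDB.

    Writes documents, contributors, scope features, neighbors, and cluster
    artifacts under a fresh run id, optionally promotes it to the active
    run, and returns a summary with row counts.
    """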
snapshot_dir = resolve_pr_search_snapshot_dir(options)
snapshot = load_pr_search_snapshot(snapshot_dir)
repo = str(snapshot["repo"])
db_path = resolve_pr_search_db_path(options.db, output_dir=options.output_dir)
started_at = iso_timestamp()
scope_options = PrScopeClusterOptions(
include_closed=options.include_closed,
include_drafts=options.include_drafts,
)
artifacts = build_pr_scope_search_artifacts(
snapshot["pull_requests"],
snapshot["pr_files"],
options=scope_options,
suppression_rules=options.cluster_suppression_rules,
limit_prs=options.limit_prs,
)
run_id = uuid4().hex
source_type = "hf_dataset_repo" if options.hf_repo_id else "local_snapshot"
connection = connect_pr_search_db(db_path)
try:
insert_rows(
connection,
"pr_search_runs",
[
{
"id": run_id,
"repo": repo,
"snapshot_id": snapshot["snapshot_id"],
"snapshot_dir": str(snapshot_dir),
"source_type": source_type,
"hf_repo_id": options.hf_repo_id,
"hf_revision": options.hf_revision,
"started_at": started_at,
"finished_at": None,
"status": "running",
"settings_json": artifacts["settings_json"],
"notes": None,
}
],
)
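        # The run row above is written before BEGIN so it survives a ROLLBACK,
        # letting the failure handler below mark the run as failed.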
connection.execute("BEGIN")
created_at = iso_timestamp()
insert_rows(
connection,
"pr_search_documents",
_scoped_rows(artifacts["documents"], run_id=run_id, repo=repo),
)
insert_rows(
connection,
"pr_search_contributors",
_contributor_rows(
snapshot["contributors"],
run_id=run_id,
repo=repo,
snapshot_id=str(snapshot["snapshot_id"]),
),
)
insert_rows(
connection,
"pr_scope_features",
_scoped_rows(
artifacts["features"],
run_id=run_id,
repo=repo,
computed_at=created_at,
),
)
insert_rows(
connection,
"pr_scope_run_artifacts",
_scoped_rows(
[artifacts["run_artifact"]],
run_id=run_id,
repo=repo,
computed_at=created_at,
),
)
insert_rows(
connection,
"pr_scope_neighbors",
_scoped_rows(
artifacts["neighbors"],
run_id=run_id,
repo=repo,
created_at=created_at,
),
)
insert_rows(
connection,
"pr_scope_clusters",
_scoped_rows(
artifacts["clusters"],
run_id=run_id,
repo=repo,
created_at=created_at,
),
)
insert_rows(
connection,
"pr_scope_cluster_members",
_scoped_rows(artifacts["cluster_members"], run_id=run_id, repo=repo),
)
insert_rows(
connection,
"pr_scope_cluster_candidates",
_scoped_rows(artifacts["cluster_candidates"], run_id=run_id, repo=repo),
)
finished_at = iso_timestamp()
update_run_status(
connection,
run_id=run_id,
status="succeeded",
finished_at=finished_at,
)
if options.replace_active:
replace_active_run(
connection,
repo=repo,
run_id=run_id,
activated_at=finished_at,
)
connection.execute("COMMIT")
counts = get_run_counts(connection, run_id=run_id)
return {
"db_path": str(db_path),
"run_id": run_id,
"repo": repo,
"snapshot_id": snapshot["snapshot_id"],
"snapshot_dir": str(snapshot_dir),
"source_type": source_type,
"active_updated": bool(options.replace_active),
"row_counts": counts,
}
except Exception as exc:
with suppress(Exception):
connection.execute("ROLLBACK")
update_run_status(
connection,
run_id=run_id,
status="failed",
finished_at=iso_timestamp(),
notes=str(exc),
)
raise
finally:
connection.close()
def get_pr_search_status(db_path: Path, *, repo: str | None = None) -> dict[str, Any]:
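    """Return metadata, parsed settings, and row counts for the active run."""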
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
return {
**_without_json_fields(active_run),
"settings": _json_dict(active_run.get("settings_json")),
"row_counts": get_run_counts(connection, run_id=str(active_run["id"])),
}
finally:
connection.close()
def get_pr_search_similar(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 10,
) -> dict[str, Any]:
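    """Return the top indexed neighbors for a PR, with shared-path evidence."""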
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
document = _require_document(connection, run_id=run_id, pr_number=pr_number)
similar_rows = get_similar_pr_rows(
connection, run_id=run_id, pr_number=pr_number, limit=limit
)
cluster_ids_by_pr = get_cluster_ids_for_prs(
connection,
run_id=run_id,
pr_numbers=[int(row["neighbor_pr_number"]) for row in similar_rows],
)
results = []
for row in similar_rows:
results.append(
{
**_without_json_fields(row),
"neighbor_title": _require_document(
connection,
run_id=run_id,
pr_number=int(row["neighbor_pr_number"]),
)["title"],
"cluster_ids": cluster_ids_by_pr.get(int(row["neighbor_pr_number"]), []),
"shared_filenames": _json_list(row.get("shared_filenames_json")),
"shared_directories": _json_list(row.get("shared_directories_json")),
}
)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"pr": document,
"similar_prs": results,
"similar_count": len(results),
}
finally:
connection.close()
def get_pr_search_candidate_clusters(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 5,
) -> dict[str, Any]:
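    """Return ranked candidate clusters for a PR, with decoded evidence."""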
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
document = _require_document(connection, run_id=run_id, pr_number=pr_number)
rows = get_candidate_cluster_rows(
connection, run_id=run_id, pr_number=pr_number, limit=limit
)
candidates = []
for row in rows:
evidence = _json_dict(row.get("evidence_json"))
candidates.append(
{
**_without_json_fields(row),
"shared_filenames": _json_list(row.get("shared_filenames_json")),
"shared_directories": _json_list(row.get("shared_directories_json")),
"evidence": evidence,
"matched_member_pr_numbers": evidence.get("matched_member_pr_numbers") or [],
"reason": evidence.get("reason") or "",
}
)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"pr": document,
"candidate_clusters": candidates,
"candidate_cluster_count": len(candidates),
}
finally:
connection.close()
def get_pr_search_contributor(
db_path: Path,
*,
author_login: str,
repo: str | None = None,
) -> dict[str, Any]:
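    """Return a contributor's profile plus up to 20 of their indexed pulls."""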
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
pulls = _document_rows(
get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=20)
)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"contributor": contributor,
"pulls": pulls,
"pull_count": len(pulls),
}
finally:
connection.close()
def get_pr_search_contributor_pulls(
db_path: Path,
*,
author_login: str,
repo: str | None = None,
limit: int = 20,
) -> dict[str, Any]:
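    """Like get_pr_search_contributor, but with a caller-supplied pull limit."""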
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
pulls = _document_rows(
get_contributor_pulls(connection, run_id=run_id, author_login=author_login, limit=limit)
)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"contributor": contributor,
"pulls": pulls,
"pull_count": len(pulls),
}
finally:
connection.close()
def get_pr_search_pull_contributor(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
) -> dict[str, Any]:
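    """Resolve a PR to its indexed author and return that contributor's profile."""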
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
document = _require_document(connection, run_id=run_id, pr_number=pr_number)
author_login = str(document.get("author_login") or "").strip()
if not author_login:
raise ValueError(f"PR #{pr_number} does not have an indexed author_login.")
contributor = _require_contributor(connection, run_id=run_id, author_login=author_login)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"pr": _without_json_fields(document),
"contributor": contributor,
}
finally:
connection.close()
def get_pr_search_similar_lookup(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 10,
mode: str = "auto",
client: ProbeClientLike | None = None,
) -> dict[str, Any]:
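    """Similar-PR lookup with mode routing.

    "indexed" and "live" force their respective paths; "auto" serves from
    the active index and falls back to a live probe on an index miss.
    """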
resolved_mode = _normalize_lookup_mode(mode)
if resolved_mode != "live":
try:
result = get_pr_search_similar(db_path, pr_number=pr_number, repo=repo, limit=limit)
except ValueError as exc:
if resolved_mode == "indexed" or not _is_index_miss(exc):
raise
else:
result["query"] = {
"pr_number": pr_number,
"mode_requested": resolved_mode,
"mode_used": "indexed",
"source": "active_index",
}
return result
live_result = probe_pr_search_live(
db_path,
pr_number=pr_number,
repo=repo,
limit=limit,
client=client,
)
return {
"repo": live_result["repo"],
"snapshot_id": live_result["snapshot_id"],
"run_id": live_result["run_id"],
"query": {
"pr_number": pr_number,
"mode_requested": resolved_mode,
"mode_used": "live",
"source": live_result["probe_source"]["provider"],
},
"pr": live_result["probe_pr"],
"probe_source": live_result["probe_source"],
"similar_prs": live_result["similar_prs"],
"similar_count": len(live_result["similar_prs"]),
}
def get_pr_search_clusters(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 5,
mode: str = "auto",
client: ProbeClientLike | None = None,
) -> dict[str, Any]:
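    """Cluster lookup with the same mode routing as the similar-PR lookup.

    The live path returns no assigned clusters, only ranked candidates.
    """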
resolved_mode = _normalize_lookup_mode(mode)
if resolved_mode != "live":
try:
result = _get_pr_search_clusters_indexed(
db_path,
pr_number=pr_number,
repo=repo,
limit=limit,
)
except ValueError as exc:
if resolved_mode == "indexed" or not _is_index_miss(exc):
raise
else:
result["query"] = {
"pr_number": pr_number,
"mode_requested": resolved_mode,
"mode_used": "indexed",
"source": "active_index",
}
return result
live_result = probe_pr_search_live(
db_path,
pr_number=pr_number,
repo=repo,
limit=limit,
client=client,
)
return {
"repo": live_result["repo"],
"snapshot_id": live_result["snapshot_id"],
"run_id": live_result["run_id"],
"query": {
"pr_number": pr_number,
"mode_requested": resolved_mode,
"mode_used": "live",
"source": live_result["probe_source"]["provider"],
},
"pr": live_result["probe_pr"],
"probe_source": live_result["probe_source"],
"assigned_clusters": [],
"candidate_clusters": live_result["candidate_clusters"],
"assigned_cluster_count": 0,
"candidate_cluster_count": len(live_result["candidate_clusters"]),
}
def get_pr_search_cluster(
db_path: Path,
*,
cluster_id: str,
repo: str | None = None,
) -> dict[str, Any]:
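    """Return a single cluster's summary and member rows from the active run."""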
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
cluster = get_cluster(connection, run_id=run_id, cluster_id=cluster_id)
if cluster is None:
raise ValueError(f"Cluster {cluster_id!r} was not found in the active run.")
members = get_cluster_members(connection, run_id=run_id, cluster_id=cluster_id)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"cluster": _cluster_summary(cluster),
"members": members,
"member_count": len(members),
}
finally:
connection.close()
def list_pr_search_clusters(
db_path: Path,
*,
repo: str | None = None,
limit: int = 50,
) -> dict[str, Any]:
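    """List the active run's clusters, largest and most cohesive first."""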
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
rows = fetch_rows(
connection,
"""
SELECT
cl.*,
d.title AS representative_title,
d.html_url AS representative_html_url,
d.state AS representative_state,
d.draft AS representative_draft
FROM pr_scope_clusters AS cl
LEFT JOIN pr_search_documents AS d
ON d.run_id = cl.run_id AND d.pr_number = cl.representative_pr_number
WHERE cl.run_id = ?
ORDER BY cl.cluster_size DESC, cl.average_similarity DESC, cl.cluster_id
LIMIT ?
""",
[run_id, limit],
)
clusters = []
for index, row in enumerate(rows, start=1):
clusters.append({"rank": index, **_cluster_summary(row)})
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"clusters": clusters,
"cluster_count": len(clusters),
}
finally:
connection.close()
def explain_pr_search_pair(
db_path: Path,
*,
left_pr_number: int,
right_pr_number: int,
repo: str | None = None,
) -> dict[str, Any]:
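    """Explain why two indexed PRs are similar.

    Uses the materialized neighbor row when present; otherwise recomputes
    the pairwise explanation from the stored scope features.
    """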
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
left_document = _require_document(connection, run_id=run_id, pr_number=left_pr_number)
right_document = _require_document(connection, run_id=run_id, pr_number=right_pr_number)
neighbor_row = get_pair_neighbor_row(
connection,
run_id=run_id,
left_pr_number=left_pr_number,
right_pr_number=right_pr_number,
)
shared_cluster_ids = get_shared_cluster_ids(
connection,
run_id=run_id,
left_pr_number=left_pr_number,
right_pr_number=right_pr_number,
)
if neighbor_row is not None:
shared_filenames = _json_list(neighbor_row.get("shared_filenames_json"))
shared_directories = _json_list(neighbor_row.get("shared_directories_json"))
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"materialized": True,
"left_pr": left_document,
"right_pr": right_document,
"pair": {
"similarity": neighbor_row["similarity"],
"content_similarity": neighbor_row["content_similarity"],
"size_similarity": neighbor_row["size_similarity"],
"breadth_similarity": neighbor_row["breadth_similarity"],
"concentration_similarity": neighbor_row["concentration_similarity"],
"shared_filenames": shared_filenames,
"shared_directories": shared_directories,
},
"shared_cluster_ids": shared_cluster_ids,
}
left_feature = _require_feature(connection, run_id=run_id, pr_number=left_pr_number)
right_feature = _require_feature(connection, run_id=run_id, pr_number=right_pr_number)
pair = scope_feature_pair_explanation(
left_feature,
right_feature,
options=scope_options_from_settings(_json_dict(active_run.get("settings_json"))),
)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"materialized": False,
"left_pr": left_document,
"right_pr": right_document,
"pair": pair,
"shared_cluster_ids": shared_cluster_ids,
}
finally:
connection.close()
def probe_pr_search_live(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 10,
client: ProbeClientLike | None = None,
) -> dict[str, Any]:
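    """Fetch a PR from the live provider and rank it against the active index.

    Rebuilds the query feature from fresh PR metadata and file listings,
    then scores it against the indexed features and cluster artifacts.
    """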
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
repo_slug = repo or str(active_run["repo"])
repo_ref = RepoRef.parse(repo_slug)
settings = scope_options_from_settings(_json_dict(active_run.get("settings_json")))
indexed_documents = fetch_rows(
connection,
"""
SELECT *
FROM pr_search_documents
WHERE run_id = ?
ORDER BY pr_number
""",
[run_id],
)
indexed_features = fetch_rows(
connection,
"""
SELECT *
FROM pr_scope_features
WHERE run_id = ?
ORDER BY pr_number
""",
[run_id],
)
run_artifact = get_scope_run_artifact(connection, run_id=run_id)
cluster_rows = fetch_rows(
connection,
"""
SELECT *
FROM pr_scope_clusters
WHERE run_id = ?
ORDER BY cluster_id
""",
[run_id],
)
cluster_member_rows = fetch_rows(
connection,
"""
SELECT cluster_id, pr_number
FROM pr_scope_cluster_members
WHERE run_id = ?
ORDER BY cluster_id, pr_number
""",
[run_id],
)
cluster_members: dict[str, list[int]] = {}
for row in cluster_member_rows:
cluster_members.setdefault(str(row["cluster_id"]), []).append(int(row["pr_number"]))
probe_client = client or GitHubClient(token=resolve_github_token())
extracted_at = iso_timestamp()
pr_detail = probe_client.get_pull_request(repo_ref.owner, repo_ref.name, pr_number)
pr_row = normalize_pull_request(
repo_ref.slug,
pr_detail,
pr_detail,
str(active_run["snapshot_id"]),
extracted_at,
)
pr_files = [
normalize_pr_file(
repo_ref.slug,
pr_number,
item,
str(active_run["snapshot_id"]),
extracted_at,
)
for item in probe_client.iter_pull_files(repo_ref.owner, repo_ref.name, pr_number)
]
feature_idf = (
_json_float_dict(run_artifact.get("idf_json")) if run_artifact is not None else {}
)
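        # Recompute the IDF table from the snapshot if the stored run
        # artifact does not carry one.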
if not feature_idf:
snapshot = load_pr_search_snapshot(Path(str(active_run["snapshot_dir"])))
feature_idf = build_scope_feature_idf_for_indexed_documents(
indexed_documents,
snapshot["pr_files"],
options=settings,
)
query_feature = build_scope_feature_for_pull_request(
pr_row,
pr_files,
feature_idf=feature_idf,
options=settings,
)
similarity_rows = rank_scope_feature_matches(
query_feature,
indexed_features,
options=settings,
limit=limit,
)
cluster_ids_by_pr = get_cluster_ids_for_prs(
connection,
run_id=run_id,
pr_numbers=[int(row["right_pr_number"]) for row in similarity_rows],
)
live_similar_prs = []
for row in similarity_rows:
indexed_document = _require_document(
connection,
run_id=run_id,
pr_number=int(row["right_pr_number"]),
)
live_similar_prs.append(
{
**row,
"neighbor_pr_number": int(row["right_pr_number"]),
"neighbor_title": indexed_document["title"],
"cluster_ids": cluster_ids_by_pr.get(int(row["right_pr_number"]), []),
}
)
assigned_cluster_ids = set(
get_cluster_ids_for_prs(connection, run_id=run_id, pr_numbers=[pr_number]).get(
pr_number, []
)
)
candidate_clusters = rank_scope_cluster_candidates(
similarity_rows=similarity_rows,
clusters=cluster_rows,
cluster_members=cluster_members,
assigned_cluster_ids=assigned_cluster_ids,
limit=min(5, max(limit, 1)),
)
cluster_by_id = {str(row["cluster_id"]): row for row in cluster_rows}
for row in candidate_clusters:
cluster = cluster_by_id[row["cluster_id"]]
row.update(
{
"representative_pr_number": cluster["representative_pr_number"],
"cluster_size": cluster["cluster_size"],
"average_similarity": cluster["average_similarity"],
"summary": cluster["summary"],
"shared_filenames": _json_list(cluster.get("shared_filenames_json")),
"shared_directories": _json_list(cluster.get("shared_directories_json")),
"matched_member_pr_numbers": row["evidence"].get("matched_member_pr_numbers")
or [],
"reason": row["evidence"].get("reason") or "",
}
)
return {
"repo": repo_slug,
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"probe_pr": {
"pr_number": pr_number,
"title": pr_row.get("title") or "",
"html_url": pr_row.get("html_url"),
"base_ref": pr_row.get("base_ref"),
"changed_files": int(pr_row.get("changed_files") or 0),
},
"probe_source": _probe_source_metadata(
probe_client,
owner=repo_ref.owner,
repo=repo_ref.name,
number=pr_number,
),
"similar_prs": live_similar_prs,
"candidate_clusters": candidate_clusters,
}
finally:
connection.close()
def probe_pr_search_github(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 10,
client: ProbeClientLike | None = None,
) -> dict[str, Any]:
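    """Thin wrapper that delegates to probe_pr_search_live."""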
return probe_pr_search_live(
db_path,
pr_number=pr_number,
repo=repo,
limit=limit,
client=client,
)
def resolve_pr_search_db_path(db_path: Path | None, *, output_dir: Path) -> Path:
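    """Resolve the search DB path, defaulting to <output_dir>/state/pr-search.duckdb."""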
return (db_path or output_dir / "state" / "pr-search.duckdb").resolve()
def _scoped_rows(rows: list[dict[str, Any]], **extra: Any) -> list[dict[str, Any]]:
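    """Merge scope columns into each row; per-row values win on key collisions."""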
return [{**extra, **row} for row in rows]
def _get_pr_search_clusters_indexed(
db_path: Path,
*,
pr_number: int,
repo: str | None = None,
limit: int = 5,
) -> dict[str, Any]:
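    """Indexed-only cluster lookup: assigned clusters plus ranked candidates."""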
connection = connect_pr_search_db(db_path, read_only=True)
try:
active_run = resolve_active_run(connection, repo=repo)
run_id = str(active_run["id"])
document = _require_document(connection, run_id=run_id, pr_number=pr_number)
candidate_rows = get_candidate_cluster_rows(
connection,
run_id=run_id,
pr_number=pr_number,
limit=limit,
)
assigned_cluster_ids = get_cluster_ids_for_prs(
connection,
run_id=run_id,
pr_numbers=[pr_number],
).get(pr_number, [])
assigned_clusters = []
for cluster_id in assigned_cluster_ids:
cluster = get_cluster(connection, run_id=run_id, cluster_id=cluster_id)
if cluster is None:
continue
assigned_clusters.append(_cluster_summary(cluster))
candidates = []
for row in candidate_rows:
evidence = _json_dict(row.get("evidence_json"))
candidates.append(
{
**_without_json_fields(row),
"shared_filenames": _json_list(row.get("shared_filenames_json")),
"shared_directories": _json_list(row.get("shared_directories_json")),
"evidence": evidence,
"matched_member_pr_numbers": evidence.get("matched_member_pr_numbers") or [],
"reason": evidence.get("reason") or "",
}
)
return {
"repo": active_run["repo"],
"snapshot_id": active_run["snapshot_id"],
"run_id": run_id,
"pr": document,
"assigned_clusters": assigned_clusters,
"candidate_clusters": candidates,
"assigned_cluster_count": len(assigned_clusters),
"candidate_cluster_count": len(candidates),
}
finally:
connection.close()
def _require_document(connection: Any, *, run_id: str, pr_number: int) -> dict[str, Any]:
document = get_document(connection, run_id=run_id, pr_number=pr_number)
if document is None:
raise ValueError(f"PR #{pr_number} was not found in the active indexed universe.")
return document
def _require_feature(connection: Any, *, run_id: str, pr_number: int) -> dict[str, Any]:
feature = get_feature(connection, run_id=run_id, pr_number=pr_number)
if feature is None:
raise ValueError(f"No scope feature row was found for PR #{pr_number}.")
return feature
def _require_contributor(connection: Any, *, run_id: str, author_login: str) -> dict[str, Any]:
contributor = get_contributor(connection, run_id=run_id, author_login=author_login)
if contributor is None:
raise ValueError(
f"Contributor {author_login!r} was not found in the active indexed universe."
)
return _contributor_row(contributor)
def _json_list(raw: Any) -> list[str]:
if isinstance(raw, list):
return [str(item) for item in raw]
if isinstance(raw, str) and raw:
payload = json.loads(raw)
if isinstance(payload, list):
return [str(item) for item in payload]
return []
def _json_dict(raw: Any) -> dict[str, Any]:
if isinstance(raw, dict):
return dict(raw)
if isinstance(raw, str) and raw:
payload = json.loads(raw)
if isinstance(payload, dict):
return payload
return {}
def _json_float_dict(raw: Any) -> dict[str, float]:
payload = _json_dict(raw)
return {str(key): float(value) for key, value in payload.items()}
def _cluster_summary(cluster: dict[str, Any]) -> dict[str, Any]:
return {
**_without_json_fields(cluster),
"shared_filenames": _json_list(cluster.get("shared_filenames_json")),
"shared_directories": _json_list(cluster.get("shared_directories_json")),
}
def _without_json_fields(row: Mapping[str, Any]) -> dict[str, Any]:
return {str(key): value for key, value in row.items() if not str(key).endswith("_json")}
def _document_rows(rows: Sequence[Mapping[str, Any]]) -> list[dict[str, Any]]:
return [_without_json_fields(row) for row in rows]
def _contributor_rows(
rows: list[Mapping[str, Any]],
*,
run_id: str,
repo: str,
snapshot_id: str,
) -> list[dict[str, Any]]:
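    """Project snapshot contributor records onto the pr_search_contributors columns.

    Note that the snapshot's ``public_orgs`` value is stored as the
    ``public_orgs_json`` column.
    """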
return [
{
"run_id": run_id,
"repo": repo,
"snapshot_id": snapshot_id,
"report_generated_at": row.get("report_generated_at"),
"window_days": row.get("window_days"),
"author_login": row.get("author_login"),
"name": row.get("name"),
"profile_url": row.get("profile_url"),
"repo_pull_requests_url": row.get("repo_pull_requests_url"),
"repo_issues_url": row.get("repo_issues_url"),
"repo_first_seen_at": row.get("repo_first_seen_at"),
"repo_last_seen_at": row.get("repo_last_seen_at"),
"repo_primary_artifact_count": row.get("repo_primary_artifact_count"),
"repo_artifact_count": row.get("repo_artifact_count"),
"snapshot_issue_count": row.get("snapshot_issue_count"),
"snapshot_pr_count": row.get("snapshot_pr_count"),
"snapshot_comment_count": row.get("snapshot_comment_count"),
"snapshot_review_count": row.get("snapshot_review_count"),
"snapshot_review_comment_count": row.get("snapshot_review_comment_count"),
"repo_association": row.get("repo_association"),
"new_to_repo": row.get("new_to_repo"),
"first_seen_in_snapshot": row.get("first_seen_in_snapshot"),
"report_reason": row.get("report_reason"),
"account_age_days": row.get("account_age_days"),
"young_account": row.get("young_account"),
"follow_through_score": row.get("follow_through_score"),
"breadth_score": row.get("breadth_score"),
"automation_risk_signal": row.get("automation_risk_signal"),
"heuristic_note": row.get("heuristic_note"),
"public_orgs_json": row.get("public_orgs"),
"visible_authored_pr_count": row.get("visible_authored_pr_count"),
"merged_pr_count": row.get("merged_pr_count"),
"closed_unmerged_pr_count": row.get("closed_unmerged_pr_count"),
"open_pr_count": row.get("open_pr_count"),
"merged_pr_rate": row.get("merged_pr_rate"),
"closed_unmerged_pr_rate": row.get("closed_unmerged_pr_rate"),
"still_open_pr_rate": row.get("still_open_pr_rate"),
"distinct_repos_with_authored_prs": row.get("distinct_repos_with_authored_prs"),
"distinct_repos_with_open_prs": row.get("distinct_repos_with_open_prs"),
"fetch_error": row.get("fetch_error"),
}
for row in rows
]
def _contributor_row(row: Mapping[str, Any]) -> dict[str, Any]:
return {
**_without_json_fields(row),
"public_orgs": _json_list(row.get("public_orgs_json")),
}
def _normalize_lookup_mode(mode: str) -> str:
normalized = mode.strip().lower()
if normalized not in {"auto", "indexed", "live"}:
raise ValueError(f"Unsupported mode {mode!r}; expected auto, indexed, or live.")
return normalized
def _is_index_miss(exc: ValueError) -> bool:
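    """Detect index misses via the message raised by the _require_* helpers."""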
return "active indexed universe" in str(exc)
def _probe_source_metadata(
client: Any,
*,
owner: str,
repo: str,
number: int,
) -> dict[str, Any]:
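    """Describe the probe client, including index status when it exposes one."""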
metadata: dict[str, Any] = {"provider": _probe_provider(client)}
base_url = getattr(client, "base_url", None)
if isinstance(base_url, str) and base_url:
metadata["base_url"] = base_url
status_method = getattr(client, "get_pull_request_status", None)
if not callable(status_method):
return metadata
try:
status = status_method(owner, repo, number)
except Exception as exc:
metadata["status_error"] = str(exc)
return metadata
metadata.update(_normalize_probe_status(status))
return metadata
def _normalize_probe_status(raw: Any) -> dict[str, Any]:
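    """Normalize provider status payloads onto indexed/freshness/last-indexed keys."""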
if not isinstance(raw, dict):
return {}
indexed = raw.get("indexed")
if indexed is None:
indexed = raw.get("is_indexed")
freshness = raw.get("index_freshness")
if freshness is None:
freshness = raw.get("freshness")
last_indexed_at = raw.get("last_indexed_at")
if last_indexed_at is None:
last_indexed_at = raw.get("indexed_at")
normalized: dict[str, Any] = {}
if indexed is not None:
normalized["indexed"] = bool(indexed)
if freshness is not None:
normalized["index_freshness"] = str(freshness)
if last_indexed_at is not None:
normalized["last_indexed_at"] = str(last_indexed_at)
return normalized
def _probe_provider(client: Any) -> str:
provider = getattr(client, "provider", None)
if isinstance(provider, str) and provider:
return provider
if isinstance(client, GitHubClient):
return "github"
return "live"