# sangsangfinder/scripts/build_eval_pinecone.py
# Captured from a web file viewer: commit 54656fc ("Initial clean deploy") by cksleigen, 4.94 kB.
"""
Build or refresh a Pinecone namespace for retrieval evaluation.
Default behavior:
- index qa_dataset_generation/data/test_notices_2025.json
For full 2025/2026 evaluation, pass --full. Use --force to re-embed notices even
when their manifest entries look unchanged.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
from pathlib import Path
# Repository root: the directory one level above this script's own directory.
# The path is resolved first because a bare relative ``__file__`` (for example
# "build_eval_pinecone.py") would otherwise collapse to "." and make ROOT point
# at the script's own directory instead of the repository root.
ROOT = Path(__file__).resolve().parent.parent

# Make the repository root importable so the ``api`` package used in main()
# can be found when this file is run directly as a script.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Corpus indexed when neither --corpus nor --full is given.
DEFAULT_CORPORA = [ROOT / "qa_dataset_generation" / "data" / "test_notices_2025.json"]

# Corpora indexed with --full: the default corpus followed by the 2026 data.
FULL_EVAL_CORPORA = [
    *DEFAULT_CORPORA,
    ROOT / "data" / "data_2026.json",
]

# Default Pinecone namespace and local state directory (see --namespace and
# --state-dir in main()).
DEFAULT_EVAL_NAMESPACE = "eval"
DEFAULT_EVAL_STATE = ROOT / "pinecone_index" / "eval"
def load_notices(paths: list[Path]) -> list[dict]:
    """Load notices from JSON corpus files, de-duplicated by URL.

    Each file is read as UTF-8 JSON and must hold a top-level list of notice
    objects. Files are processed in the order given, so when a URL appears
    more than once (within a file or across files) the first occurrence wins.
    Entries without a truthy ``url`` are skipped.

    Args:
        paths: Corpus JSON files to read.

    Returns:
        One dict per unique URL with exactly the keys ``title``, ``url``,
        ``date``, ``body`` and ``category`` (in that order). A field that is
        missing or ``null`` in the source is returned as ``""``.

    Raises:
        ValueError: If a file's top-level value is not a list, or one of its
            entries is not a JSON object.
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    fields = ("title", "url", "date", "body", "category")
    seen_urls = set()
    notices = []
    for path in paths:
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(
                f"{path}: expected a JSON list of notices, got {type(data).__name__}"
            )
        for index, item in enumerate(data):
            if not isinstance(item, dict):
                raise ValueError(
                    f"{path}: entry {index} is not a JSON object "
                    f"(got {type(item).__name__})"
                )
            url = item.get("url")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            # ``dict.get(key, "")`` only covers a missing key; an explicit JSON
            # null would still come through as None, so normalise both cases.
            notices.append(
                {key: "" if item.get(key) is None else item[key] for key in fields}
            )
    return notices
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the evaluation-namespace builder."""
    cli = argparse.ArgumentParser(description="Build a Pinecone evaluation namespace.")

    # Corpus selection.
    cli.add_argument(
        "--corpus",
        type=Path,
        action="append",
        default=None,
        help="Corpus JSON file. Can be passed multiple times.",
    )
    cli.add_argument(
        "--full",
        action="store_true",
        help="Index both the 2025 QA corpus and current 2026 notices.",
    )

    # Where the vectors and the local bookkeeping live.
    cli.add_argument(
        "--namespace",
        default=DEFAULT_EVAL_NAMESPACE,
        help="Pinecone namespace to index.",
    )
    cli.add_argument(
        "--state-dir",
        type=Path,
        default=DEFAULT_EVAL_STATE,
        help="Local manifest/cache directory for the evaluation namespace.",
    )

    # Indexing behaviour.
    cli.add_argument(
        "--force",
        action="store_true",
        help="Re-embed notices even when manifest entries look unchanged.",
    )
    cli.add_argument(
        "--sync-deletions",
        action="store_true",
        help="Delete vectors for notices no longer present in the selected corpora.",
    )

    # Embedding configuration.
    cli.add_argument(
        "--pooling",
        choices=["cls", "mean"],
        default="cls",
        help="Pooling mode used by SimCSEEmbedder.",
    )
    cli.add_argument(
        "--model",
        default=None,
        help="Embedding model name/path. Defaults to api.core.config.BASE_MODEL_EMBED.",
    )
    cli.add_argument(
        "--backend",
        choices=["simcse", "sentence-transformers"],
        default="simcse",
        help="Embedding backend.",
    )
    cli.add_argument("--notice-batch-size", type=int, default=20)
    cli.add_argument("--embed-batch-size", type=int, default=16)
    return cli


def main() -> None:
    """Parse CLI options, configure the environment, and index the corpora.

    Explicit ``--corpus`` paths take precedence; otherwise ``--full`` selects
    FULL_EVAL_CORPORA and the fallback is DEFAULT_CORPORA. The chosen options
    are exported as environment variables before ``api.core.models`` is
    imported, then the notices are loaded, a summary is printed, and
    ``index_notices`` is called.

    Raises:
        SystemExit: If the selected corpora yield no notices.
    """
    args = _build_arg_parser().parse_args()

    fallback_corpora = FULL_EVAL_CORPORA if args.full else DEFAULT_CORPORA
    corpus_paths = [p.resolve() for p in (args.corpus or fallback_corpora)]
    state_root = args.state_dir.resolve()

    env_overrides = {
        "VECTOR_DB": "pinecone",
        "PINECONE_NAMESPACE": args.namespace,
        "VECTOR_STATE_PATH": str(state_root),
        "PINECONE_CACHE_PATH": str(state_root / "pinecone_chunks.json"),
        "SIMCSE_POOLING": args.pooling,
        "EMBEDDER_BACKEND": args.backend,
    }
    if args.model:
        env_overrides["BASE_MODEL_EMBED"] = args.model
    os.environ.update(env_overrides)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Imported only after the environment is populated.
    # NOTE(review): presumably the api package reads these variables at import
    # time -- confirm before moving this import to module level.
    from api.core.models import get_vector_collection, index_notices

    notices = load_notices(corpus_paths)
    if not notices:
        sys.exit("No notices loaded.")

    rule = "=" * 72
    summary = [
        rule,
        "Building Pinecone evaluation namespace",
        rule,
        f"Namespace : {args.namespace}",
        f"State dir : {state_root}",
        f"Backend : {args.backend}",
        f"Model : {args.model or '(config default)'}",
        f"Pooling : {args.pooling}",
        "Corpora:",
    ]
    summary.extend(f" - {corpus}" for corpus in corpus_paths)
    summary.append(f"Notices : {len(notices)} unique URLs")
    print("\n".join(summary))

    indexed_count = index_notices(
        notices,
        force=args.force,
        sync_deletions=args.sync_deletions,
        notice_batch_size=args.notice_batch_size,
        embed_batch_size=args.embed_batch_size,
    )
    print(f"Indexed notices : {indexed_count}")
    print(f"Pinecone vectors: {get_vector_collection().count():,}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()