Spaces:
Sleeping
Sleeping
"""
Build or refresh a Pinecone namespace for retrieval evaluation.

Default behavior:
- index qa_dataset_generation/data/test_notices_2025.json

For full 2025/2026 evaluation, pass --full. Use --force to re-embed notices even
when their manifest entries look unchanged.
"""
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import logging | |
| import os | |
| import sys | |
| from pathlib import Path | |
# Directory two levels above this file; every default path below hangs off it.
ROOT = Path(__file__).parent.parent

# Put ROOT on the import path once.
# NOTE(review): presumably so the project-local `api` package imported in
# main() resolves when this script is run directly -- confirm.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Namespace name and local state directory used when no CLI override is given.
DEFAULT_EVAL_NAMESPACE = "eval"
DEFAULT_EVAL_STATE = ROOT / "pinecone_index" / "eval"

# Corpus indexed by default.
DEFAULT_CORPORA = [ROOT / "qa_dataset_generation" / "data" / "test_notices_2025.json"]

# Corpora indexed with --full: the default corpus followed by the 2026 data file.
FULL_EVAL_CORPORA = [
    *DEFAULT_CORPORA,
    ROOT / "data" / "data_2026.json",
]
def load_notices(paths: list[Path]) -> list[dict]:
    """Read notice records from JSON corpus files, keeping one record per URL.

    Args:
        paths: Corpus files to read, in order. Each file is opened as UTF-8,
            parsed as JSON and iterated; every element is queried with
            ``.get``, so elements are expected to be JSON objects.

    Returns:
        A list of dicts, each with exactly the keys ``title``, ``url``,
        ``date``, ``body`` and ``category``. Records whose ``url`` is missing
        or empty are dropped, and only the first record seen for a given URL
        (across all files) is kept. Absent fields default to ``""``.
    """
    # Insertion-ordered mapping: doubles as the "already seen" check and as
    # the ordered result collection.
    by_url: dict = {}
    for corpus_path in paths:
        with corpus_path.open(encoding="utf-8") as handle:
            records = json.load(handle)
        for record in records:
            link = record.get("url")
            if link and link not in by_url:
                by_url[link] = {
                    "title": record.get("title", ""),
                    "url": link,
                    "date": record.get("date", ""),
                    "body": record.get("body", ""),
                    "category": record.get("category", ""),
                }
    return list(by_url.values())
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the evaluation-namespace builder."""
    cli = argparse.ArgumentParser(description="Build a Pinecone evaluation namespace.")
    cli.add_argument(
        "--corpus", type=Path, action="append", default=None,
        help="Corpus JSON file. Can be passed multiple times.",
    )
    cli.add_argument(
        "--full", action="store_true",
        help="Index both the 2025 QA corpus and current 2026 notices.",
    )
    cli.add_argument(
        "--namespace", default=DEFAULT_EVAL_NAMESPACE,
        help="Pinecone namespace to index.",
    )
    cli.add_argument(
        "--state-dir", type=Path, default=DEFAULT_EVAL_STATE,
        help="Local manifest/cache directory for the evaluation namespace.",
    )
    cli.add_argument(
        "--force", action="store_true",
        help="Re-embed notices even when manifest entries look unchanged.",
    )
    cli.add_argument(
        "--sync-deletions", action="store_true",
        help="Delete vectors for notices no longer present in the selected corpora.",
    )
    cli.add_argument(
        "--pooling", choices=["cls", "mean"], default="cls",
        help="Pooling mode used by SimCSEEmbedder.",
    )
    cli.add_argument(
        "--model", default=None,
        help="Embedding model name/path. Defaults to api.core.config.BASE_MODEL_EMBED.",
    )
    cli.add_argument(
        "--backend", choices=["simcse", "sentence-transformers"], default="simcse",
        help="Embedding backend.",
    )
    cli.add_argument("--notice-batch-size", type=int, default=20)
    cli.add_argument("--embed-batch-size", type=int, default=16)
    return cli


def main() -> None:
    """Parse CLI options, configure the environment and index the notices.

    Steps, in order: resolve the corpus list (explicit ``--corpus`` values win,
    otherwise ``--full`` selects the full-evaluation list, otherwise the
    default list), export the vector-store settings as environment variables,
    configure logging, import the indexing helpers, load the notices, print a
    summary, run ``index_notices`` and report the resulting counts.

    Raises:
        SystemExit: If no notices could be loaded from the selected corpora.
    """
    args = _build_parser().parse_args()

    # Explicit --corpus values take precedence over --full / the default list.
    if args.corpus:
        selected = args.corpus
    elif args.full:
        selected = FULL_EVAL_CORPORA
    else:
        selected = DEFAULT_CORPORA
    corpora = [corpus_path.resolve() for corpus_path in selected]
    state_dir = args.state_dir.resolve()

    # Export settings before the project import below.
    # NOTE(review): presumably api.core reads these at import time -- confirm.
    os.environ.update(
        {
            "VECTOR_DB": "pinecone",
            "PINECONE_NAMESPACE": args.namespace,
            "VECTOR_STATE_PATH": str(state_dir),
            "PINECONE_CACHE_PATH": str(state_dir / "pinecone_chunks.json"),
            "SIMCSE_POOLING": args.pooling,
            "EMBEDDER_BACKEND": args.backend,
        }
    )
    if args.model:
        os.environ["BASE_MODEL_EMBED"] = args.model

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Deliberately deferred: must run after the environment is populated.
    from api.core.models import get_vector_collection, index_notices

    notices = load_notices(corpora)
    if not notices:
        raise SystemExit("No notices loaded.")

    rule = "=" * 72
    summary = [
        rule,
        "Building Pinecone evaluation namespace",
        rule,
        f"Namespace : {args.namespace}",
        f"State dir : {state_dir}",
        f"Backend : {args.backend}",
        f"Model : {args.model or '(config default)'}",
        f"Pooling : {args.pooling}",
        "Corpora:",
    ]
    summary.extend(f" - {corpus}" for corpus in corpora)
    summary.append(f"Notices : {len(notices)} unique URLs")
    for line in summary:
        print(line)

    indexed = index_notices(
        notices,
        force=args.force,
        sync_deletions=args.sync_deletions,
        notice_batch_size=args.notice_batch_size,
        embed_batch_size=args.embed_batch_size,
    )
    print(f"Indexed notices : {indexed}")
    print(f"Pinecone vectors: {get_vector_collection().count():,}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()