Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 2,624 Bytes
#!/usr/bin/env python3
import argparse, os, sys, time
from pathlib import Path
from dotenv import load_dotenv
# Load variables from a `.env` file located in the current working directory
# (not the script's own directory). This runs at import time, ahead of the
# environment lookups made further down in this file.
load_dotenv(dotenv_path=Path.cwd() / ".env")
# Append the parent of this script's directory to the import path.
# NOTE(review): presumably this is the project root, so that the
# `vectorstore` package imported inside main() resolves -- confirm layout.
sys.path.append(str(Path(__file__).resolve().parents[1]))
# Optional pipeline status tracking.
# A tracker is only set up when PIPELINE_CURRENT_SCRIPT is set to a non-empty
# value. Any failure while importing or constructing PipelineStatus is
# swallowed and leaves `_pipeline` as None, so standalone runs are unaffected.
_PIPELINE_SCRIPT = os.getenv("PIPELINE_CURRENT_SCRIPT")
_pipeline = None
if _PIPELINE_SCRIPT:
    try:
        from pipeline_status import PipelineStatus
    except Exception:
        # Tracker module could not be imported: run without status tracking.
        pass
    else:
        try:
            _pipeline = PipelineStatus()
        except Exception:
            # Tracker could not be constructed: run without status tracking.
            pass
def _update_pipeline_progress(current, total, unit="vectorstores", message=""):
    """Forward a progress update to the pipeline tracker, if there is one.

    Does nothing when the module-level ``_pipeline`` is falsy. Otherwise calls
    ``_pipeline.update_progress`` with the current script name followed by the
    given arguments. Any exception raised by the tracker is suppressed so that
    progress reporting can never abort the actual work.

    Args:
        current: Progress value passed through to the tracker.
        total: Total value passed through to the tracker.
        unit: Unit label passed through to the tracker.
        message: Status text passed through to the tracker.

    Returns:
        None.
    """
    if _pipeline:
        try:
            _pipeline.update_progress(
                _PIPELINE_SCRIPT, current, total, unit, message
            )
        except Exception:
            # Best effort only: a failing tracker must not break the caller.
            pass
def main():
    """Build or refresh the bills vectorstore from the command line.

    Parses the CLI options, calls ``upsert_from_bills_json`` from either the
    Pinecone or the Chroma implementation depending on ``--backend``, reports
    progress through ``_update_pipeline_progress`` and prints the returned
    stats mapping.

    Option routing, as wired below:
      * ``--persist`` and ``--collection`` are forwarded only to the
        non-Pinecone (Chroma) implementation.
      * ``--reset`` is forwarded only to the Pinecone implementation; a
        warning is printed when it is requested for any other backend.

    The default for ``--backend`` comes from the ``VECTOR_BACKEND`` environment
    variable (falling back to ``"pinecone"``). A value outside the supported
    choices still runs the Chroma implementation, as before, but now emits a
    warning instead of doing so silently.
    """
    backends = ["chroma", "pinecone"]

    p = argparse.ArgumentParser()
    p.add_argument("--source", default="data/known_bills_visualize.json")
    p.add_argument(
        "--backend",
        choices=backends,
        default=os.getenv("VECTOR_BACKEND", "pinecone"),
    )
    p.add_argument("--persist", default="data/bills_vectorstore")
    p.add_argument("--collection", default="bills")
    p.add_argument("--manifest", default="data/bills_vectorstore_manifest.json")
    p.add_argument("--model", default=None)
    p.add_argument("--batch", type=int, default=128)
    p.add_argument(
        "--reset",
        action="store_true",
        help="Delete all vectors and re-embed from scratch (fixes duplicate bloat).",
    )
    args = p.parse_args()

    # argparse only checks `choices` for values given on the command line, not
    # for defaults, so a mistyped VECTOR_BACKEND arrives here unchecked and
    # would otherwise silently take the non-Pinecone branch below.
    if args.backend not in backends:
        print(
            f"Warning: unrecognized backend {args.backend!r} "
            "(check VECTOR_BACKEND); using the chroma vectorstore.",
            file=sys.stderr,
        )

    # `reset` is only passed to the Pinecone implementation; tell the user
    # rather than silently dropping the flag.
    if args.reset and args.backend != "pinecone":
        print(
            "Warning: --reset is only applied with --backend pinecone; "
            "ignoring it for this run.",
            file=sys.stderr,
        )

    _update_pipeline_progress(0, 1, "vectorstores", "Building bills vectorstore...")

    # Backend modules are imported lazily so only the selected one is loaded.
    if args.backend == "pinecone":
        from vectorstore.pinecone_bills_vectorstore import upsert_from_bills_json

        stats = upsert_from_bills_json(
            source_json_path=args.source,
            manifest_path=args.manifest,
            embed_model=args.model,
            batch_size=args.batch,
            reset=args.reset,
        )
    else:
        from vectorstore.bills_vectorstore import upsert_from_bills_json

        stats = upsert_from_bills_json(
            source_json_path=args.source,
            persist_dir=args.persist,
            collection=args.collection,
            manifest_path=args.manifest,
            embed_model=args.model,
            batch_size=args.batch,
        )

    _update_pipeline_progress(
        1,
        1,
        "vectorstores",
        f"Done: {stats.get('embedded', 0)} embedded, {stats.get('skipped_unchanged', 0)} skipped",
    )

    print("✅ Vectorstore updated")
    for k, v in stats.items():
        print(f" {k}: {v}")
if __name__ == "__main__":
    main()
    # Exit explicitly with status 0 once main() returns; the intent is to keep
    # lingering Pinecone/LangChain background threads from hanging the process.
    # NOTE(review): raising SystemExit (which is all sys.exit does) still waits
    # for non-daemon threads at interpreter shutdown -- if hangs are still
    # observed, a harder exit may be needed; confirm before changing.
    raise SystemExit(0)
|