File size: 2,624 Bytes
98bf60c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
import argparse, os, sys, time
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from a ".env" file located in the current
# working directory (not next to this script).
load_dotenv(dotenv_path=Path.cwd() / ".env")
# Append the parent of this file's directory to the import search path.
# NOTE(review): presumably this is what makes the `vectorstore` package and
# `pipeline_status` module importable below — confirm against the repo layout.
sys.path.append(str(Path(__file__).resolve().parents[1]))

# Pipeline status tracking (no-op when running standalone).
# Tracking is attempted only when PIPELINE_CURRENT_SCRIPT is set in the
# environment; otherwise `_pipeline` stays None and progress updates are skipped.
_PIPELINE_SCRIPT = os.environ.get("PIPELINE_CURRENT_SCRIPT")
_pipeline = None
if _PIPELINE_SCRIPT:
    try:
        from pipeline_status import PipelineStatus
        _pipeline = PipelineStatus()
    except Exception as exc:
        # Still best-effort: a broken tracker must never stop the build.
        # But tracking was explicitly requested, so report why it is disabled
        # instead of failing silently.
        print(
            f"warning: pipeline status tracking disabled ({exc!r})",
            file=sys.stderr,
        )

def _update_pipeline_progress(current, total, unit="vectorstores", message=""):
    """Forward a progress update to the pipeline tracker, if one is active.

    Does nothing when no tracker was created at import time. Any exception
    raised by the tracker is suppressed, so progress reporting can never
    interrupt the caller. Always returns ``None``.
    """
    if _pipeline:
        try:
            _pipeline.update_progress(
                _PIPELINE_SCRIPT,
                current,
                total,
                unit,
                message,
            )
        except Exception:
            # Progress reporting is best-effort only.
            pass

def main(argv=None):
    """Parse CLI options and upsert bill embeddings into the chosen backend.

    The backend defaults to the ``VECTOR_BACKEND`` environment variable
    (falling back to ``"pinecone"``). The matching ``upsert_from_bills_json``
    implementation is imported lazily, called with the parsed options, and
    the stats mapping it returns is printed one ``key: value`` per line.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: with status 2 (via ``ArgumentParser.error``) when the
            backend is not a supported choice, including a value taken from
            ``VECTOR_BACKEND``.
    """
    backends = ("chroma", "pinecone")

    p = argparse.ArgumentParser()
    p.add_argument("--source", default="data/known_bills_visualize.json")
    p.add_argument("--backend", choices=backends, default=os.getenv("VECTOR_BACKEND", "pinecone"))
    p.add_argument("--persist", default="data/bills_vectorstore")
    p.add_argument("--collection", default="bills")
    p.add_argument("--manifest", default="data/bills_vectorstore_manifest.json")
    p.add_argument("--model", default=None)
    p.add_argument("--batch", type=int, default=128)
    p.add_argument("--reset", action="store_true",
                   help="Delete all vectors and re-embed from scratch (fixes duplicate bloat).")
    args = p.parse_args(argv)

    # argparse checks `choices` only for values given on the command line,
    # not for defaults. Without this guard an unexpected VECTOR_BACKEND value
    # would fall into the `else` branch below and silently run chroma.
    if args.backend not in backends:
        p.error(
            f"invalid backend {args.backend!r} (check VECTOR_BACKEND); "
            f"choose from: {', '.join(backends)}"
        )

    # --reset is only passed to the pinecone upsert call below; say so rather
    # than letting the flag be dropped without notice.
    if args.reset and args.backend != "pinecone":
        print(
            f"warning: --reset is not forwarded to the {args.backend} backend; ignoring it",
            file=sys.stderr,
        )

    _update_pipeline_progress(0, 1, "vectorstores", "Building bills vectorstore...")

    if args.backend == "pinecone":
        from vectorstore.pinecone_bills_vectorstore import upsert_from_bills_json
        stats = upsert_from_bills_json(
            source_json_path=args.source,
            manifest_path=args.manifest,
            embed_model=args.model,
            batch_size=args.batch,
            reset=args.reset,
        )
    else:
        from vectorstore.bills_vectorstore import upsert_from_bills_json
        stats = upsert_from_bills_json(
            source_json_path=args.source,
            persist_dir=args.persist,
            collection=args.collection,
            manifest_path=args.manifest,
            embed_model=args.model,
            batch_size=args.batch,
        )

    _update_pipeline_progress(1, 1, "vectorstores",
        f"Done: {stats.get('embedded', 0)} embedded, {stats.get('skipped_unchanged', 0)} skipped")

    print("✅ Vectorstore updated")
    for k, v in stats.items():
        print(f"  {k}: {v}")

# Script entry point; skipped when this module is imported.
if __name__ == "__main__":
    main()
    # Explicit zero-status exit after the build, intended to keep lingering
    # Pinecone/LangChain background threads from hanging the process.
    # NOTE(review): SystemExit alone does not stop non-daemon threads — the
    # interpreter still waits for them at shutdown; confirm this is enough.
    raise SystemExit(0)