ramanna commited on
Commit
b501a8e
·
verified ·
1 Parent(s): ee67113

Upload 7 files

Browse files
vectorstore/__init__.py ADDED
File without changes
vectorstore/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (192 Bytes). View file
 
vectorstore/__pycache__/bills_vectorstore.cpython-313.pyc ADDED
Binary file (13.1 kB). View file
 
vectorstore/__pycache__/pinecone_bills_vectorstore.cpython-313.pyc ADDED
Binary file (13.1 kB). View file
 
vectorstore/bills_vectorstore.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# vectorstore/bills_vectorstore.py
# Chroma-backed vector store for legislative bill documents.
from __future__ import annotations
import os, json, hashlib, time
from pathlib import Path
from typing import Dict, List, Optional, Iterable, Any

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())  # pull OPENAI_API_KEY etc. from the nearest .env

# Prefer the dedicated langchain-chroma package; fall back to the older
# community import for environments that have not upgraded yet.
try:
    from langchain_chroma import Chroma
except Exception:
    from langchain_community.vectorstores import Chroma

from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embedding model and on-disk locations (overridable via environment).
DEFAULT_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
DEFAULT_PERSIST_DIR = "data/bills_vectorstore"
DEFAULT_COLLECTION = "bills"
DEFAULT_MANIFEST = "data/bills_vectorstore_manifest.json"
def get_embeddings(model: Optional[str] = None) -> OpenAIEmbeddings:
    """Build an OpenAIEmbeddings client, failing fast when no API key is set."""
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return OpenAIEmbeddings(api_key=key, model=model or DEFAULT_EMBED_MODEL, chunk_size=32)
    raise RuntimeError("OPENAI_API_KEY is not set. Check your .env or environment.")
29
+
30
+ def _sha256(text: str) -> str:
31
+ import hashlib
32
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
33
+
34
+ def _bill_id(b: Dict[str, Any]) -> str:
35
+ return f"{b.get('state','Unknown')}_{b.get('bill_number','Unknown')}"
36
+
37
+ def _bill_text(b: Dict[str, Any]) -> str:
38
+ title = b.get("title") or ""
39
+ summary = b.get("description") or ""
40
+ txt = b.get("text") or ""
41
+ return f"Title: {title}\n\nSummary: {summary}\n\nFull Text:\n{txt}"
def _bill_hash(b: Dict[str, Any]) -> str:
    """Content fingerprint over the bill fields that matter for re-embedding."""
    tracked = ("title", "description", "text", "status", "last_action_date")
    payload = json.dumps(
        {field: b.get(field) for field in tracked},
        ensure_ascii=False,
        sort_keys=True,
    )
    return _sha256(payload)
52
+
53
+ def _manifest_load(path: str) -> Dict[str, Dict[str, str]]:
54
+ p = Path(path)
55
+ if not p.exists():
56
+ return {}
57
+ try:
58
+ return json.loads(p.read_text(encoding="utf-8"))
59
+ except Exception:
60
+ return {}
61
+
62
+ def _manifest_save(path: str, data: Dict[str, Dict[str, str]]) -> None:
63
+ Path(path).parent.mkdir(parents=True, exist_ok=True)
64
+ Path(path).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
65
+
66
+ def _clean_metadata(meta: Dict[str, Any]) -> Dict[str, Any]:
67
+ """Keep only metadata values that Chroma accepts: str/int/float/bool and not None."""
68
+ allowed_types = (str, int, float, bool)
69
+ cleaned: Dict[str, Any] = {}
70
+ for k, v in meta.items():
71
+ if v is None:
72
+ continue
73
+ if isinstance(v, allowed_types):
74
+ cleaned[k] = v
75
+ else:
76
+ # If you prefer to drop complex types instead of stringifying, replace with `continue`
77
+ cleaned[k] = str(v)
78
+ return cleaned
def _make_doc(b: Dict[str, Any]) -> Document:
    """Build a single (un-chunked) Document for a bill with Chroma-safe metadata.

    Fixed: removed two unreachable, duplicated `meta = ...; return Document(...)`
    tails that followed the first return statement (dead code).
    """
    sponsors_list = b.get("sponsors") or []
    if isinstance(sponsors_list, list):
        sponsors_str = "; ".join(map(str, sponsors_list))
    else:
        sponsors_str = str(sponsors_list) if sponsors_list else ""

    # Flatten {"category": ["sub", ...]} into "category:sub; ..." strings.
    flat_iapp = []
    iapp = b.get("iapp_categories")
    if isinstance(iapp, dict):
        for k, v in iapp.items():
            if isinstance(v, list):
                for sub in v:
                    flat_iapp.append(f"{k}:{sub}")
    iapp_str = "; ".join(flat_iapp) if flat_iapp else ""

    meta = {
        "doc_id": _bill_id(b),
        "state": b.get("state"),
        "session_year": b.get("session_year"),
        "legislative_body": b.get("chamber") or b.get("legislative_body") or None,
        "status": b.get("status"),
        "title": b.get("title"),
        "bill_number": b.get("bill_number"),
        "sponsors": sponsors_str,
        "last_action_date": b.get("last_action_date"),
        "iapp_flat": iapp_str,
    }

    # Drop None values / coerce non-scalars so Chroma accepts the metadata.
    meta = _clean_metadata(meta)

    return Document(page_content=_bill_text(b), metadata=meta)
124
+ def _load_bills(source_json_path: str) -> List[Dict[str, Any]]:
125
+ data = json.loads(Path(source_json_path).read_text(encoding="utf-8"))
126
+ if not isinstance(data, list):
127
+ raise ValueError(f"{source_json_path} must contain a list of bills")
128
+ return data
def load_vectorstore(
    persist_dir: str = DEFAULT_PERSIST_DIR,
    collection: str = DEFAULT_COLLECTION,
    embeddings: Optional[OpenAIEmbeddings] = None,
) -> Chroma:
    """Open (creating on first use) the persistent Chroma collection."""
    embeddings = embeddings or get_embeddings()
    Path(persist_dir).mkdir(parents=True, exist_ok=True)
    return Chroma(
        embedding_function=embeddings,
        collection_name=collection,
        persist_directory=persist_dir,
    )
def _chunk_bill(b: Dict[str, Any], *, size: int = 1500, overlap: int = 200) -> List[Document]:
    """Split one bill into overlapping text chunks, each carrying shared metadata."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    pieces = splitter.split_text(_bill_text(b)) or ["(no content)"]

    sponsors = b.get("sponsors")
    if isinstance(sponsors, list):
        sponsors_str = "; ".join(map(str, sponsors or []))
    else:
        sponsors_str = sponsors or ""

    base_meta = {
        "doc_id": _bill_id(b),
        "state": b.get("state"),
        "session_year": b.get("session_year"),
        "legislative_body": b.get("chamber") or b.get("legislative_body") or None,
        "status": b.get("status"),
        "title": b.get("title"),
        "bill_number": b.get("bill_number"),
        "sponsors": sponsors_str,
        "last_action_date": b.get("last_action_date"),
    }

    # Flatten {"category": ["sub", ...]} into "category:sub; ..." for filtering.
    iapp = b.get("iapp_categories") or {}
    pairs: List[str] = []
    if isinstance(iapp, dict):
        pairs = [
            f"{cat}:{sub}"
            for cat, subs in iapp.items()
            if isinstance(subs, list)
            for sub in subs
        ]
    base_meta["iapp_flat"] = "; ".join(pairs)

    # Strip None / coerce non-scalar values so Chroma accepts the metadata.
    base_meta = _clean_metadata(base_meta)

    chunk_docs: List[Document] = []
    total = len(pieces)
    for idx, piece in enumerate(pieces):
        meta = dict(base_meta, chunk_index=idx, chunk_total=total)
        chunk_docs.append(Document(page_content=piece, metadata=meta))
    return chunk_docs
def upsert_from_bills_json(
    source_json_path: str = "data/known_bills_visualize.json",
    persist_dir: str = DEFAULT_PERSIST_DIR,
    collection: str = DEFAULT_COLLECTION,
    manifest_path: str = DEFAULT_MANIFEST,
    embed_model: Optional[str] = None,
    batch_size: int = 128,
) -> Dict[str, int]:
    """Incrementally embed bills from a JSON file into Chroma.

    A JSON manifest of per-bill content hashes lets unchanged bills be
    skipped; changed bills have their old chunks deleted and re-added.
    Returns counters: total_bills, embedded, skipped_unchanged, elapsed_sec.
    """
    t0 = time.time()
    bills = _load_bills(source_json_path)
    embeddings = get_embeddings(embed_model)
    vs = load_vectorstore(persist_dir, collection, embeddings)
    manifest = _manifest_load(manifest_path)
    manifest_meta = manifest.get("_meta", {})
    # A different embedding model invalidates every stored hash: start over.
    if manifest_meta.get("embed_model") != (embed_model or DEFAULT_EMBED_MODEL):
        manifest = {}
    manifest["_meta"] = {"embed_model": embed_model or DEFAULT_EMBED_MODEL}

    to_docs, to_ids = [], []
    added, skipped = 0, 0

    for b in bills:
        # Nothing embeddable at all -> count as skipped.
        if not (b.get("text") or b.get("description") or b.get("title")):
            skipped += 1
            continue
        doc_id = _bill_id(b)
        hsh = _bill_hash(b)
        if manifest.get(doc_id, {}).get("hash") == hsh:
            skipped += 1
            continue
        # Best-effort removal of stale chunks for this bill before re-adding.
        try:
            vs.delete(where={"doc_id": doc_id})
        except Exception:
            pass
        chunks = _chunk_bill(b)
        for d in chunks:
            to_docs.append(d)
            to_ids.append(f"{doc_id}::c{d.metadata['chunk_index']}")
            # Flush in batches to bound memory / request size.
            if len(to_docs) >= batch_size:
                vs.add_documents(documents=to_docs, ids=to_ids)
                to_docs, to_ids = [], []
        manifest[doc_id] = {"hash": hsh}
        added += 1

    if to_docs:
        vs.add_documents(documents=to_docs, ids=to_ids)

    # Older langchain Chroma wrappers require an explicit persist().
    if hasattr(vs, "persist"):
        vs.persist()

    manifest["_meta"] = {"embed_model": embed_model or DEFAULT_EMBED_MODEL}
    _manifest_save(manifest_path, manifest)

    return {
        "total_bills": len(bills),
        "embedded": added,
        "skipped_unchanged": skipped,
        "elapsed_sec": int(time.time() - t0),
    }
def get_retriever(persist_dir=DEFAULT_PERSIST_DIR, collection=DEFAULT_COLLECTION, k=8, filter_kwargs=None):
    """Return a Chroma retriever with top-k and an optional metadata filter."""
    store = load_vectorstore(persist_dir=persist_dir, collection=collection)
    kwargs = {"k": k}
    if filter_kwargs:
        kwargs["filter"] = filter_kwargs
    return store.as_retriever(search_kwargs=kwargs)
def similarity_search(
    query: str,
    k: int = 5,
    where: Optional[Dict[str, Any]] = None,
    persist_dir: str = DEFAULT_PERSIST_DIR,
    collection: str = DEFAULT_COLLECTION,
):
    """Similarity search against Chroma; an empty `where` dict means no filter."""
    store = load_vectorstore(persist_dir=persist_dir, collection=collection)
    # Normalize {} -> None so Chroma is not handed an empty filter dict.
    active_filter = where or None
    return store.similarity_search(query, k=k, filter=active_filter)
vectorstore/pinecone_bills_vectorstore.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# vectorstore/pinecone_bills_vectorstore.py
# Pinecone-backed variant of the bills vector store.
from __future__ import annotations
import os, json, time
from pathlib import Path
from typing import Dict, List, Optional, Any

from datetime import datetime
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

from dotenv import load_dotenv
from pathlib import Path as _Path
load_dotenv(dotenv_path=_Path.cwd() / ".env")  # expects a .env in the working directory

# Embedding model, Pinecone index name, and manifest location (env-overridable).
DEFAULT_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
DEFAULT_COLLECTION = os.getenv("PINECONE_INDEX", "legislation-tracker")
DEFAULT_MANIFEST = "data/bills_vectorstore_manifest.json"
# Use empty string for namespace if not specified (Pinecone default)
DEFAULT_NAMESPACE = os.getenv("PINECONE_NAMESPACE", "")
def get_embeddings(model: Optional[str] = None) -> OpenAIEmbeddings:
    """OpenAI embeddings client; raises RuntimeError when the key is missing."""
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        return OpenAIEmbeddings(api_key=api_key, model=model or DEFAULT_EMBED_MODEL, chunk_size=32)
    raise RuntimeError("OPENAI_API_KEY not set")
30
+
31
+ def _clean_meta(m: dict) -> dict:
32
+ out = {}
33
+ for k, v in m.items():
34
+ if v is None:
35
+ continue
36
+ if isinstance(v, (str, bool, int, float)):
37
+ out[k] = v
38
+ elif isinstance(v, (list, tuple)):
39
+ out[k] = [str(x) for x in v if x is not None]
40
+ else:
41
+ out[k] = str(v)
42
+ return out
43
+
44
+ def _sha256(text: str) -> str:
45
+ import hashlib
46
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
47
+
48
+
49
+ def _bill_id(b: Dict[str, Any]) -> str:
50
+ return f"{b.get('state','Unknown')}_{b.get('bill_number','Unknown')}"
51
+
52
+ def _bill_text(b: Dict[str, Any]) -> str:
53
+ title = b.get("title") or ""
54
+ summary = b.get("description") or ""
55
+ txt = b.get("text") or ""
56
+ return f"Title: {title}\n\nSummary: {summary}\n\nFull Text:\n{txt}"
def _bill_hash(b: Dict[str, Any]) -> str:
    """Fingerprint of the fields whose change should trigger re-embedding."""
    tracked = ("title", "description", "text", "status", "last_action_date")
    serialized = json.dumps(
        {field: b.get(field) for field in tracked},
        ensure_ascii=False,
        sort_keys=True,
    )
    return _sha256(serialized)
67
+
68
+ def _manifest_load(path: str) -> Dict[str, Dict[str, str]]:
69
+ p = Path(path)
70
+ if not p.exists():
71
+ return {}
72
+ try:
73
+ return json.loads(p.read_text(encoding="utf-8"))
74
+ except Exception:
75
+ return {}
76
+
77
+ def _manifest_save(path: str, data: Dict[str, Dict[str, str]]) -> None:
78
+ Path(path).parent.mkdir(parents=True, exist_ok=True)
79
+ Path(path).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
80
+
81
+ def _flatten_iapp(iapp: Any) -> list[str]:
82
+ flat: List[str] = []
83
+ if isinstance(iapp, dict):
84
+ for k, v in iapp.items():
85
+ if isinstance(v, list):
86
+ for sub in v:
87
+ flat.append(f"{k}:{sub}")
88
+ return flat # keep as list[str] for Pinecone $in filters
89
+
90
+ def _parse_session_years(val) -> tuple[int | None, int | None]:
91
+ """
92
+ Accepts:
93
+ - string like '2023-2024'
94
+ - dict with keys 'year_start'/'year_end'
95
+ Returns (start, end) as ints or (None, None)
96
+ """
97
+ if isinstance(val, str) and "-" in val:
98
+ try:
99
+ a, b = val.split("-", 1)
100
+ return int(a), int(b)
101
+ except Exception:
102
+ return None, None
103
+ if isinstance(val, dict):
104
+ try:
105
+ return int(val.get("year_start")), int(val.get("year_end"))
106
+ except Exception:
107
+ return None, None
108
+ return None, None
109
+
110
+ def _to_epoch(date_str: str | None) -> int | None:
111
+ """
112
+ Accepts YYYY-MM-DD or ISO-8601; returns Unix epoch seconds or None
113
+ """
114
+ if not date_str:
115
+ return None
116
+ try:
117
+ ds = date_str.replace("Z", "")
118
+ return int(datetime.fromisoformat(ds).timestamp())
119
+ except Exception:
120
+ return None
def _chunk_bill(b: Dict[str, Any], *, size: int = 1500, overlap: int = 200) -> List[Document]:
    """Chunk one bill into Documents carrying Pinecone-ready metadata.

    Metadata includes numeric session years, last_action_ts, and the chunk
    text itself (under 'text') so hits can be displayed without a second fetch.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    pieces = splitter.split_text(_bill_text(b)) or ["(no content)"]

    raw_sponsors = b.get("sponsors")
    if isinstance(raw_sponsors, list):
        sponsors = "; ".join(map(str, raw_sponsors or []))
    else:
        sponsors = raw_sponsors or ""

    year_start, year_end = _parse_session_years(b.get("session_year"))
    last_action_date = b.get("last_action_date")

    base_meta = {
        "doc_id": _bill_id(b),
        "state": b.get("state"),
        "session_year": b.get("session_year"),
        "session_year_start": year_start,
        "session_year_end": year_end,
        "legislative_body": b.get("chamber") or b.get("legislative_body") or "",
        "status": b.get("status"),
        "title": b.get("title"),
        "bill_number": b.get("bill_number"),
        "sponsors": sponsors,
        "last_action_date": last_action_date,
        "last_action_ts": _to_epoch(last_action_date),
        "iapp_flat": _flatten_iapp(b.get("iapp_categories")),  # list[str] for $in filters
    }

    total = len(pieces)
    out: List[Document] = []
    for idx, piece in enumerate(pieces):
        meta = dict(base_meta)
        meta["chunk_index"] = idx
        meta["chunk_total"] = total
        meta["text"] = piece
        out.append(Document(page_content=piece, metadata=_clean_meta(meta)))
    return out
161
+
162
+ def _load_bills(path: str) -> List[Dict[str, Any]]:
163
+ import json as _json
164
+ from pathlib import Path as _P
165
+ data = _json.loads(_P(path).read_text(encoding="utf-8"))
166
+ if not isinstance(data, list):
167
+ raise ValueError(f"{path} must contain a list of bills")
168
+ return data
def _vectorstore(embeddings: OpenAIEmbeddings) -> PineconeVectorStore:
    """Connect to the configured Pinecone index and wrap it for LangChain."""
    client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index = client.Index(os.getenv("PINECONE_INDEX", DEFAULT_COLLECTION))
    # An empty namespace string means "use Pinecone's default namespace".
    return PineconeVectorStore(
        index=index,
        embedding=embeddings,
        namespace=DEFAULT_NAMESPACE or None,
    )
def upsert_from_bills_json(
    source_json_path: str = "data/known_bills_visualize.json",
    manifest_path: str = DEFAULT_MANIFEST,
    embed_model: Optional[str] = None,
    batch_size: int = 128,
) -> Dict[str, int]:
    """Incrementally embed bills from JSON into the Pinecone index.

    A manifest of per-bill content hashes lets unchanged bills be skipped;
    changed bills get their old vectors deleted (by metadata filter) and
    their fresh chunks re-added. Returns counters: total_bills, embedded,
    skipped_unchanged, elapsed_sec.
    """
    t0 = time.time()
    bills = _load_bills(source_json_path)
    embeddings = get_embeddings(embed_model)
    vs = _vectorstore(embeddings)

    manifest = _manifest_load(manifest_path)
    meta = manifest.get("_meta", {})
    model_in_use = embed_model or DEFAULT_EMBED_MODEL
    # Switching embedding models invalidates all stored hashes: start fresh.
    if meta.get("embed_model") != model_in_use:
        manifest = {"_meta": {"embed_model": model_in_use}}

    to_upsert: List[Document] = []
    added, skipped = 0, 0

    for b in bills:
        # Nothing embeddable at all -> treat as skipped.
        if not (b.get("text") or b.get("description") or b.get("title")):
            skipped += 1
            continue
        doc_id = _bill_id(b)
        hsh = _bill_hash(b)
        if manifest.get(doc_id, {}).get("hash") == hsh:
            skipped += 1
            continue

        # Best-effort delete of stale vectors for this bill before re-adding.
        try:
            vs.delete(filter={"doc_id": doc_id})
        except Exception:
            pass

        for d in _chunk_bill(b):
            to_upsert.append(d)
            # Flush in batches to bound request size.
            if len(to_upsert) >= batch_size:
                vs.add_documents(documents=to_upsert)
                to_upsert = []

        manifest[doc_id] = {"hash": hsh}
        added += 1

    if to_upsert:
        vs.add_documents(documents=to_upsert)

    manifest["_meta"] = {"embed_model": model_in_use}
    _manifest_save(manifest_path, manifest)

    return {
        "total_bills": len(bills),
        "embedded": added,
        "skipped_unchanged": skipped,
        "elapsed_sec": int(time.time() - t0),
    }
def get_retriever(k=8, filter_kwargs: Optional[Dict[str, Any]] = None):
    """Pinecone retriever with top-k and an optional metadata filter."""
    store = _vectorstore(get_embeddings())
    search_kwargs: Dict[str, Any] = {"k": k}
    if filter_kwargs:
        search_kwargs["filter"] = filter_kwargs
    return store.as_retriever(search_kwargs=search_kwargs)
def similarity_search(query: str, k: int = 5, where: Optional[Dict[str, Any]] = None):
    """Similarity search against Pinecone; an empty/None `where` means no filter."""
    store = _vectorstore(get_embeddings())
    return store.similarity_search(query, k=k, filter=where or None)
vectorstore/pinecone_delta_upsert.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# vectorstore/pinecone_delta_upsert.py
# Manifest-driven incremental (delta) upserts straight to a Pinecone index.
import os, json, hashlib, time
from pathlib import Path
from typing import Dict, List, Any, Callable
from dotenv import load_dotenv
from pinecone import Pinecone
from openai import OpenAI
from datetime import datetime

load_dotenv()

# Local hash manifest recording what has already been upserted.
MANIFEST_PATH = Path("data/pinecone_manifest.json")
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
INDEX_NAME = os.getenv("PINECONE_INDEX", "legislation-tracker")
DEFAULT_NAMESPACE = os.getenv("PINECONE_NAMESPACE", "default")
+
17
+ def _sha(s: str) -> str:
18
+ return hashlib.sha256(s.encode("utf-8")).hexdigest()
19
+
20
+ def _load_manifest() -> Dict[str, str]:
21
+ if MANIFEST_PATH.exists():
22
+ return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
23
+ return {}
24
+
25
+ def _save_manifest(m: Dict[str, str]) -> None:
26
+ MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
27
+ MANIFEST_PATH.write_text(json.dumps(m, indent=2, ensure_ascii=False), encoding="utf-8")
28
+
29
+ def make_embedder() -> Callable[[str], List[float]]:
30
+ api = os.getenv("OPENAI_API_KEY")
31
+ if not api:
32
+ raise RuntimeError("OPENAI_API_KEY not set")
33
+ client = OpenAI(api_key=api)
34
+
35
+ def _emb(text: str) -> List[float]:
36
+ return client.embeddings.create(input=text, model=EMBED_MODEL).data[0].embedding
37
+
38
+ return _emb
39
+
40
+ def _parse_session_years(val) -> tuple[int | None, int | None]:
41
+ """
42
+ Accepts:
43
+ - string like '2023-2024'
44
+ - dict with keys 'year_start'/'year_end'
45
+ Returns (start, end) as ints or (None, None)
46
+ """
47
+ if isinstance(val, str) and "-" in val:
48
+ try:
49
+ a, b = val.split("-", 1)
50
+ return int(a), int(b)
51
+ except Exception:
52
+ return None, None
53
+ if isinstance(val, dict):
54
+ try:
55
+ return int(val.get("year_start")), int(val.get("year_end"))
56
+ except Exception:
57
+ return None, None
58
+ return None, None
59
+
60
+ def _to_epoch(date_str: str | None) -> int | None:
61
+ """
62
+ Accepts YYYY-MM-DD or ISO-8601; returns Unix epoch seconds or None
63
+ """
64
+ if not date_str:
65
+ return None
66
+ try:
67
+ ds = date_str.replace("Z", "")
68
+ # If only date is given, fromisoformat still works (YYYY-MM-DD)
69
+ return int(datetime.fromisoformat(ds).timestamp())
70
+ except Exception:
71
+ return None
72
+
73
+ def upsert_changed_vectors(
74
+ records: List[Dict[str, Any]],
75
+ *,
76
+ index_name: str = INDEX_NAME,
77
+ namespace: str = DEFAULT_NAMESPACE,
78
+ id_key: str = "id",
79
+ text_key: str = "text"
80
+ ) -> int:
81
+ """
82
+ Incremental upsert using a manifest (id+text hash). Each record is:
83
+ { "id": "...", "text": "...", "metadata": {...} }
84
+ Only changed/new records are embedded and upserted.
85
+ """
86
+ api = os.getenv("PINECONE_API_KEY")
87
+ if not api:
88
+ raise RuntimeError("PINECONE_API_KEY not set")
89
+ pc = Pinecone(api_key=api)
90
+ index = pc.Index(index_name)
91
+ manifest = _load_manifest()
92
+ embed = make_embedder()
93
+
94
+ to_upsert = []
95
+ for r in records:
96
+ rid = r[id_key]
97
+ txt = r[text_key] or ""
98
+ h = _sha(rid + "|" + txt)
99
+ if manifest.get(rid) != h:
100
+ vec = {
101
+ "id": rid,
102
+ "values": embed(txt),
103
+ "metadata": r.get("metadata", {})
104
+ }
105
+ to_upsert.append(vec)
106
+ manifest[rid] = h
107
+
108
+ if to_upsert:
109
+ index.upsert(vectors=to_upsert, namespace=namespace)
110
+ _save_manifest(manifest)
111
+ return len(to_upsert)
112
+
113
+ # --- Tiny helper chunker --------------------
114
+ def chunk_bill(bill: Dict[str, Any], size: int = 1500, overlap: int = 200) -> List[Dict[str, Any]]:
115
+ """
116
+ Creates simple chunks with ids like 'STATE_BILL::c0', including chunk text in metadata for easy display.
117
+ Includes numeric fields: session_year_start, session_year_end, last_action_ts (epoch seconds).
118
+ Stores iapp_flat as list[str] for Pinecone $in filters.
119
+ """
120
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
121
+
122
+ doc_id = f"{bill.get('state','Unknown')}_{bill.get('bill_number','Unknown')}"
123
+ title = bill.get("title") or ""
124
+ summary = bill.get("description") or ""
125
+ txt = bill.get("text") or ""
126
+ full = f"Title: {title}\n\nSummary: {summary}\n\nFull Text:\n{txt}"
127
+
128
+ splitter = RecursiveCharacterTextSplitter(
129
+ chunk_size=size, chunk_overlap=overlap,
130
+ separators=["\n\n", "\n", ". ", " ", ""]
131
+ )
132
+
133
+ iapp = bill.get("iapp_categories") or {}
134
+ iapp_flat: List[str] = []
135
+ if isinstance(iapp, dict):
136
+ for k, v in iapp.items():
137
+ if isinstance(v, list):
138
+ for sub in v:
139
+ iapp_flat.append(f"{k}:{sub}")
140
+
141
+ # Numeric session years + timestamp for last action date
142
+ sy_start, sy_end = _parse_session_years(bill.get("session_year"))
143
+ last_action_date = bill.get("last_action_date")
144
+ last_action_ts = _to_epoch(last_action_date)
145
+
146
+ pieces = splitter.split_text(full) or ["(no content)"]
147
+ out: List[Dict[str, Any]] = []
148
+ total = len(pieces)
149
+
150
+ base_meta = {
151
+ "doc_id": doc_id,
152
+ "state": bill.get("state"),
153
+ "bill_number": bill.get("bill_number"),
154
+ "title": title,
155
+ "session_year": bill.get("session_year"),
156
+ "session_year_start": sy_start,
157
+ "session_year_end": sy_end,
158
+ "status": bill.get("status"),
159
+ "last_action_date": last_action_date,
160
+ "last_action_ts": last_action_ts,
161
+ "iapp_flat": iapp_flat,
162
+ }
163
+
164
+ for i, chunk in enumerate(pieces):
165
+ md = dict(base_meta)
166
+ md["chunk_index"] = i
167
+ md["chunk_total"] = total
168
+ md["text"] = chunk
169
+ out.append({
170
+ "id": f"{doc_id}::c{i}",
171
+ "text": chunk,
172
+ "metadata": md
173
+ })
174
+ return out