Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Upload 7 files
Browse files- vectorstore/__init__.py +0 -0
- vectorstore/__pycache__/__init__.cpython-313.pyc +0 -0
- vectorstore/__pycache__/bills_vectorstore.cpython-313.pyc +0 -0
- vectorstore/__pycache__/pinecone_bills_vectorstore.cpython-313.pyc +0 -0
- vectorstore/bills_vectorstore.py +263 -0
- vectorstore/pinecone_bills_vectorstore.py +245 -0
- vectorstore/pinecone_delta_upsert.py +174 -0
vectorstore/__init__.py
ADDED
|
File without changes
|
vectorstore/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (192 Bytes). View file
|
|
|
vectorstore/__pycache__/bills_vectorstore.cpython-313.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
vectorstore/__pycache__/pinecone_bills_vectorstore.cpython-313.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
vectorstore/bills_vectorstore.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# vectorstore/bills_vectorstore.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import os, json, hashlib, time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, List, Optional, Iterable, Any
|
| 6 |
+
|
| 7 |
+
from dotenv import load_dotenv, find_dotenv
|
| 8 |
+
load_dotenv(find_dotenv())
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from langchain_chroma import Chroma
|
| 12 |
+
except Exception:
|
| 13 |
+
from langchain_community.vectorstores import Chroma
|
| 14 |
+
|
| 15 |
+
from langchain_openai import OpenAIEmbeddings
|
| 16 |
+
from langchain.schema import Document
|
| 17 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 18 |
+
|
| 19 |
+
DEFAULT_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
|
| 20 |
+
DEFAULT_PERSIST_DIR = "data/bills_vectorstore"
|
| 21 |
+
DEFAULT_COLLECTION = "bills"
|
| 22 |
+
DEFAULT_MANIFEST = "data/bills_vectorstore_manifest.json"
|
| 23 |
+
|
| 24 |
+
def get_embeddings(model: Optional[str] = None) -> OpenAIEmbeddings:
    """Build an OpenAI embeddings client, failing fast when no API key is configured."""
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        raise RuntimeError("OPENAI_API_KEY is not set. Check your .env or environment.")
    chosen_model = model if model else DEFAULT_EMBED_MODEL
    # chunk_size=32 keeps each embeddings request small.
    return OpenAIEmbeddings(api_key=key, model=chosen_model, chunk_size=32)
|
| 29 |
+
|
| 30 |
+
def _sha256(text: str) -> str:
|
| 31 |
+
import hashlib
|
| 32 |
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
| 33 |
+
|
| 34 |
+
def _bill_id(b: Dict[str, Any]) -> str:
|
| 35 |
+
return f"{b.get('state','Unknown')}_{b.get('bill_number','Unknown')}"
|
| 36 |
+
|
| 37 |
+
def _bill_text(b: Dict[str, Any]) -> str:
|
| 38 |
+
title = b.get("title") or ""
|
| 39 |
+
summary = b.get("description") or ""
|
| 40 |
+
txt = b.get("text") or ""
|
| 41 |
+
return f"Title: {title}\n\nSummary: {summary}\n\nFull Text:\n{txt}"
|
| 42 |
+
|
| 43 |
+
def _bill_hash(b: Dict[str, Any]) -> str:
    """Content hash over the fields that matter for re-embedding decisions."""
    tracked = ("title", "description", "text", "status", "last_action_date")
    snapshot = {field: b.get(field) for field in tracked}
    # sort_keys makes the serialization (and hence the hash) order-independent.
    payload = json.dumps(snapshot, ensure_ascii=False, sort_keys=True)
    return _sha256(payload)
|
| 52 |
+
|
| 53 |
+
def _manifest_load(path: str) -> Dict[str, Dict[str, str]]:
|
| 54 |
+
p = Path(path)
|
| 55 |
+
if not p.exists():
|
| 56 |
+
return {}
|
| 57 |
+
try:
|
| 58 |
+
return json.loads(p.read_text(encoding="utf-8"))
|
| 59 |
+
except Exception:
|
| 60 |
+
return {}
|
| 61 |
+
|
| 62 |
+
def _manifest_save(path: str, data: Dict[str, Dict[str, str]]) -> None:
|
| 63 |
+
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
| 64 |
+
Path(path).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
|
| 65 |
+
|
| 66 |
+
def _clean_metadata(meta: Dict[str, Any]) -> Dict[str, Any]:
|
| 67 |
+
"""Keep only metadata values that Chroma accepts: str/int/float/bool and not None."""
|
| 68 |
+
allowed_types = (str, int, float, bool)
|
| 69 |
+
cleaned: Dict[str, Any] = {}
|
| 70 |
+
for k, v in meta.items():
|
| 71 |
+
if v is None:
|
| 72 |
+
continue
|
| 73 |
+
if isinstance(v, allowed_types):
|
| 74 |
+
cleaned[k] = v
|
| 75 |
+
else:
|
| 76 |
+
# If you prefer to drop complex types instead of stringifying, replace with `continue`
|
| 77 |
+
cleaned[k] = str(v)
|
| 78 |
+
return cleaned
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _make_doc(b: Dict[str, Any]) -> Document:
    """Build a single (unchunked) Document for a bill with Chroma-safe metadata."""
    sponsors = b.get("sponsors") or []
    if isinstance(sponsors, list):
        sponsors_str = "; ".join(map(str, sponsors))
    else:
        sponsors_str = str(sponsors) if sponsors else ""

    # Flatten {"category": ["sub1", "sub2"]} into "category:sub1; category:sub2".
    iapp = b.get("iapp_categories")
    flat_iapp: List[str] = []
    if isinstance(iapp, dict):
        for cat, subs in iapp.items():
            if isinstance(subs, list):
                flat_iapp.extend(f"{cat}:{sub}" for sub in subs)
    iapp_str = "; ".join(flat_iapp) if flat_iapp else ""

    meta = {
        "doc_id": _bill_id(b),
        "state": b.get("state"),
        "session_year": b.get("session_year"),
        "legislative_body": b.get("chamber") or b.get("legislative_body") or None,
        "status": b.get("status"),
        "title": b.get("title"),
        "bill_number": b.get("bill_number"),
        "sponsors": sponsors_str,
        "last_action_date": b.get("last_action_date"),
        "iapp_flat": iapp_str,
    }

    # Strip None / non-primitive values so Chroma accepts the metadata.
    meta = _clean_metadata(meta)

    # The original version carried two unreachable duplicated
    # "meta = ...; return ..." blocks after this return; removed as dead code.
    return Document(page_content=_bill_text(b), metadata=meta)
|
| 123 |
+
|
| 124 |
+
def _load_bills(source_json_path: str) -> List[Dict[str, Any]]:
|
| 125 |
+
data = json.loads(Path(source_json_path).read_text(encoding="utf-8"))
|
| 126 |
+
if not isinstance(data, list):
|
| 127 |
+
raise ValueError(f"{source_json_path} must contain a list of bills")
|
| 128 |
+
return data
|
| 129 |
+
|
| 130 |
+
def load_vectorstore(
    persist_dir: str = DEFAULT_PERSIST_DIR,
    collection: str = DEFAULT_COLLECTION,
    embeddings: Optional[OpenAIEmbeddings] = None,
) -> Chroma:
    """Open (or create) the persistent Chroma collection used for bills."""
    if embeddings is None:
        embeddings = get_embeddings()
    # Make sure the on-disk location exists before Chroma touches it.
    Path(persist_dir).mkdir(parents=True, exist_ok=True)
    return Chroma(
        embedding_function=embeddings,
        collection_name=collection,
        persist_directory=persist_dir,
    )
|
| 142 |
+
|
| 143 |
+
def _chunk_bill(b: Dict[str, Any], *, size: int = 1500, overlap: int = 200) -> List[Document]:
    """Split one bill into overlapping chunks, each tagged with bill-level metadata.

    Always returns at least one Document (a "(no content)" placeholder for
    empty bills), so every bill yields something upsert-able.
    """
    text = _bill_text(b)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size, chunk_overlap=overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    # Guarantee at least one chunk even for empty text.
    pieces = splitter.split_text(text) or ["(no content)"]

    docs: List[Document] = []
    base_meta = {
        "doc_id": _bill_id(b),
        "state": b.get("state"),
        "session_year": b.get("session_year"),
        "legislative_body": b.get("chamber") or b.get("legislative_body") or None,
        "status": b.get("status"),
        "title": b.get("title"),
        "bill_number": b.get("bill_number"),
        # Sponsors may be a list or a scalar; normalize to one "; "-joined string.
        "sponsors": (("; ".join(map(str, b.get("sponsors") or [])))
                     if isinstance(b.get("sponsors"), list)
                     else (b.get("sponsors") or "")),
        "last_action_date": b.get("last_action_date"),
    }

    # Flatten {"category": [subtopics]} into "category:sub" strings for filtering.
    iapp = b.get("iapp_categories") or {}
    flat = []
    if isinstance(iapp, dict):
        for k, v in iapp.items():
            if isinstance(v, list):
                for sub in v:
                    flat.append(f"{k}:{sub}")
    base_meta["iapp_flat"] = "; ".join(flat)

    # 🔑 Clean out None / bad types before using this as metadata
    base_meta = _clean_metadata(base_meta)

    total = len(pieces)
    for i, chunk in enumerate(pieces):
        # Each chunk gets its own copy of the shared metadata plus its position.
        m = dict(base_meta)
        m["chunk_index"] = i
        m["chunk_total"] = total
        docs.append(Document(page_content=chunk, metadata=m))
    return docs
|
| 186 |
+
|
| 187 |
+
def upsert_from_bills_json(
    source_json_path: str = "data/known_bills_visualize.json",
    persist_dir: str = DEFAULT_PERSIST_DIR,
    collection: str = DEFAULT_COLLECTION,
    manifest_path: str = DEFAULT_MANIFEST,
    embed_model: Optional[str] = None,
    batch_size: int = 128,
) -> Dict[str, int]:
    """Incrementally embed bills from JSON into the Chroma store.

    Uses a hash manifest to skip bills whose tracked fields are unchanged.
    Returns counters: total_bills, embedded, skipped_unchanged, elapsed_sec.
    """
    t0 = time.time()
    bills = _load_bills(source_json_path)
    embeddings = get_embeddings(embed_model)
    vs = load_vectorstore(persist_dir, collection, embeddings)
    manifest = _manifest_load(manifest_path)
    manifest_meta = manifest.get("_meta", {})
    if manifest_meta.get("embed_model") != (embed_model or DEFAULT_EMBED_MODEL):
        # Embedding model changed: stored hashes are stale, so reset the
        # manifest and force a full re-embed.
        manifest = {}
        manifest["_meta"] = {"embed_model": embed_model or DEFAULT_EMBED_MODEL}

    to_docs, to_ids = [], []
    added, skipped = 0, 0

    for b in bills:
        # Nothing embeddable at all -> skip.
        if not (b.get("text") or b.get("description") or b.get("title")):
            skipped += 1
            continue
        doc_id = _bill_id(b)
        hsh = _bill_hash(b)
        # Unchanged since last run: nothing to do.
        if manifest.get(doc_id, {}).get("hash") == hsh:
            skipped += 1
            continue
        # Best-effort removal of this bill's stale chunks before re-adding.
        try:
            vs.delete(where={"doc_id": doc_id})
        except Exception:
            pass
        chunks = _chunk_bill(b)
        for d in chunks:
            to_docs.append(d)
            to_ids.append(f"{doc_id}::c{d.metadata['chunk_index']}")
            # Flush a full batch; ids stay aligned with docs.
            if len(to_docs) >= batch_size:
                vs.add_documents(documents=to_docs, ids=to_ids)
                to_docs, to_ids = [], []
        manifest[doc_id] = {"hash": hsh}
        added += 1

    # Flush the final partial batch.
    if to_docs:
        vs.add_documents(documents=to_docs, ids=to_ids)

    # Older langchain Chroma wrappers require an explicit persist().
    if hasattr(vs, "persist"):
        vs.persist()

    manifest["_meta"] = {"embed_model": embed_model or DEFAULT_EMBED_MODEL}
    _manifest_save(manifest_path, manifest)

    return {
        "total_bills": len(bills),
        "embedded": added,
        "skipped_unchanged": skipped,
        "elapsed_sec": int(time.time() - t0),
    }
|
| 246 |
+
|
| 247 |
+
def get_retriever(persist_dir=DEFAULT_PERSIST_DIR, collection=DEFAULT_COLLECTION, k=8, filter_kwargs=None):
    """Return a Chroma retriever over the bills collection, optionally filtered."""
    store = load_vectorstore(persist_dir=persist_dir, collection=collection)
    # Only attach a filter when one was actually supplied.
    search_kwargs = {"k": k, "filter": filter_kwargs} if filter_kwargs else {"k": k}
    return store.as_retriever(search_kwargs=search_kwargs)
|
| 253 |
+
|
| 254 |
+
def similarity_search(
    query: str,
    k: int = 5,
    where: Optional[Dict[str, Any]] = None,
    persist_dir: str = DEFAULT_PERSIST_DIR,
    collection: str = DEFAULT_COLLECTION,
):
    """Run a plain similarity search; an empty `where` dict is treated as no filter."""
    store = load_vectorstore(persist_dir=persist_dir, collection=collection)
    # Chroma rejects an empty filter dict, so normalize it to None.
    effective_filter = where or None
    return store.similarity_search(query, k=k, filter=effective_filter)
|
vectorstore/pinecone_bills_vectorstore.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# vectorstore/pinecone_bills_vectorstore.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import os, json, time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, List, Optional, Any
|
| 6 |
+
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from langchain_openai import OpenAIEmbeddings
|
| 9 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
|
| 12 |
+
from pinecone import Pinecone
|
| 13 |
+
from langchain_pinecone import PineconeVectorStore
|
| 14 |
+
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
from pathlib import Path as _Path
|
| 17 |
+
load_dotenv(dotenv_path=_Path.cwd() / ".env")
|
| 18 |
+
|
| 19 |
+
DEFAULT_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
|
| 20 |
+
DEFAULT_COLLECTION = os.getenv("PINECONE_INDEX", "legislation-tracker")
|
| 21 |
+
DEFAULT_MANIFEST = "data/bills_vectorstore_manifest.json"
|
| 22 |
+
# Use empty string for namespace if not specified (Pinecone default)
|
| 23 |
+
DEFAULT_NAMESPACE = os.getenv("PINECONE_NAMESPACE", "")
|
| 24 |
+
|
| 25 |
+
def get_embeddings(model: Optional[str] = None) -> OpenAIEmbeddings:
    """Create an OpenAI embeddings client; raises if the API key is missing."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not set")
    return OpenAIEmbeddings(
        api_key=api_key,
        model=model if model else DEFAULT_EMBED_MODEL,
        chunk_size=32,
    )
|
| 30 |
+
|
| 31 |
+
def _clean_meta(m: dict) -> dict:
|
| 32 |
+
out = {}
|
| 33 |
+
for k, v in m.items():
|
| 34 |
+
if v is None:
|
| 35 |
+
continue
|
| 36 |
+
if isinstance(v, (str, bool, int, float)):
|
| 37 |
+
out[k] = v
|
| 38 |
+
elif isinstance(v, (list, tuple)):
|
| 39 |
+
out[k] = [str(x) for x in v if x is not None]
|
| 40 |
+
else:
|
| 41 |
+
out[k] = str(v)
|
| 42 |
+
return out
|
| 43 |
+
|
| 44 |
+
def _sha256(text: str) -> str:
|
| 45 |
+
import hashlib
|
| 46 |
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _bill_id(b: Dict[str, Any]) -> str:
|
| 50 |
+
return f"{b.get('state','Unknown')}_{b.get('bill_number','Unknown')}"
|
| 51 |
+
|
| 52 |
+
def _bill_text(b: Dict[str, Any]) -> str:
|
| 53 |
+
title = b.get("title") or ""
|
| 54 |
+
summary = b.get("description") or ""
|
| 55 |
+
txt = b.get("text") or ""
|
| 56 |
+
return f"Title: {title}\n\nSummary: {summary}\n\nFull Text:\n{txt}"
|
| 57 |
+
|
| 58 |
+
def _bill_hash(b: Dict[str, Any]) -> str:
    """Hash of the change-relevant fields; used to skip unchanged bills."""
    snapshot = {
        key: b.get(key)
        for key in ("description", "last_action_date", "status", "text", "title")
    }
    # sort_keys normalizes ordering, so insertion order above is irrelevant.
    return _sha256(json.dumps(snapshot, ensure_ascii=False, sort_keys=True))
|
| 67 |
+
|
| 68 |
+
def _manifest_load(path: str) -> Dict[str, Dict[str, str]]:
|
| 69 |
+
p = Path(path)
|
| 70 |
+
if not p.exists():
|
| 71 |
+
return {}
|
| 72 |
+
try:
|
| 73 |
+
return json.loads(p.read_text(encoding="utf-8"))
|
| 74 |
+
except Exception:
|
| 75 |
+
return {}
|
| 76 |
+
|
| 77 |
+
def _manifest_save(path: str, data: Dict[str, Dict[str, str]]) -> None:
|
| 78 |
+
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
| 79 |
+
Path(path).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
|
| 80 |
+
|
| 81 |
+
def _flatten_iapp(iapp: Any) -> list[str]:
|
| 82 |
+
flat: List[str] = []
|
| 83 |
+
if isinstance(iapp, dict):
|
| 84 |
+
for k, v in iapp.items():
|
| 85 |
+
if isinstance(v, list):
|
| 86 |
+
for sub in v:
|
| 87 |
+
flat.append(f"{k}:{sub}")
|
| 88 |
+
return flat # keep as list[str] for Pinecone $in filters
|
| 89 |
+
|
| 90 |
+
def _parse_session_years(val) -> tuple[int | None, int | None]:
|
| 91 |
+
"""
|
| 92 |
+
Accepts:
|
| 93 |
+
- string like '2023-2024'
|
| 94 |
+
- dict with keys 'year_start'/'year_end'
|
| 95 |
+
Returns (start, end) as ints or (None, None)
|
| 96 |
+
"""
|
| 97 |
+
if isinstance(val, str) and "-" in val:
|
| 98 |
+
try:
|
| 99 |
+
a, b = val.split("-", 1)
|
| 100 |
+
return int(a), int(b)
|
| 101 |
+
except Exception:
|
| 102 |
+
return None, None
|
| 103 |
+
if isinstance(val, dict):
|
| 104 |
+
try:
|
| 105 |
+
return int(val.get("year_start")), int(val.get("year_end"))
|
| 106 |
+
except Exception:
|
| 107 |
+
return None, None
|
| 108 |
+
return None, None
|
| 109 |
+
|
| 110 |
+
def _to_epoch(date_str: str | None) -> int | None:
|
| 111 |
+
"""
|
| 112 |
+
Accepts YYYY-MM-DD or ISO-8601; returns Unix epoch seconds or None
|
| 113 |
+
"""
|
| 114 |
+
if not date_str:
|
| 115 |
+
return None
|
| 116 |
+
try:
|
| 117 |
+
ds = date_str.replace("Z", "")
|
| 118 |
+
return int(datetime.fromisoformat(ds).timestamp())
|
| 119 |
+
except Exception:
|
| 120 |
+
return None
|
| 121 |
+
|
| 122 |
+
def _chunk_bill(b: Dict[str, Any], *, size: int = 1500, overlap: int = 200) -> List[Document]:
    """Split one bill into overlapping chunks with Pinecone-ready metadata.

    Each chunk's metadata carries the full bill context plus its own text
    (under "text") so search results can be displayed without a second lookup.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    ).split_text(_bill_text(b)) or ["(no content)"]

    year_start, year_end = _parse_session_years(b.get("session_year"))
    action_date = b.get("last_action_date")
    sponsors = b.get("sponsors")

    shared = {
        "doc_id": _bill_id(b),
        "state": b.get("state"),
        "session_year": b.get("session_year"),
        "session_year_start": year_start,
        "session_year_end": year_end,
        "legislative_body": b.get("chamber") or b.get("legislative_body") or "",
        "status": b.get("status"),
        "title": b.get("title"),
        "bill_number": b.get("bill_number"),
        "sponsors": "; ".join(map(str, sponsors or [])) if isinstance(sponsors, list) else (sponsors or ""),
        "last_action_date": action_date,
        "last_action_ts": _to_epoch(action_date),
        "iapp_flat": _flatten_iapp(b.get("iapp_categories")),  # list[str]
    }

    documents: List[Document] = []
    for idx, piece in enumerate(pieces):
        chunk_meta = dict(shared)
        chunk_meta["chunk_index"] = idx
        chunk_meta["chunk_total"] = len(pieces)
        chunk_meta["text"] = piece
        documents.append(Document(page_content=piece, metadata=_clean_meta(chunk_meta)))
    return documents
|
| 161 |
+
|
| 162 |
+
def _load_bills(path: str) -> List[Dict[str, Any]]:
|
| 163 |
+
import json as _json
|
| 164 |
+
from pathlib import Path as _P
|
| 165 |
+
data = _json.loads(_P(path).read_text(encoding="utf-8"))
|
| 166 |
+
if not isinstance(data, list):
|
| 167 |
+
raise ValueError(f"{path} must contain a list of bills")
|
| 168 |
+
return data
|
| 169 |
+
|
| 170 |
+
def _vectorstore(embeddings: OpenAIEmbeddings) -> PineconeVectorStore:
    """Connect to the configured Pinecone index and wrap it for LangChain."""
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # KeyError if unset
    # NOTE(review): DEFAULT_COLLECTION is itself os.getenv("PINECONE_INDEX", ...),
    # so this second lookup is redundant unless the env changes after import.
    index_name = os.getenv("PINECONE_INDEX", DEFAULT_COLLECTION)
    index = pc.Index(index_name)
    # An empty-string namespace means "use Pinecone's default" -> pass None.
    namespace = DEFAULT_NAMESPACE if DEFAULT_NAMESPACE else None
    return PineconeVectorStore(index=index, embedding=embeddings, namespace=namespace)
|
| 176 |
+
|
| 177 |
+
def upsert_from_bills_json(
    source_json_path: str = "data/known_bills_visualize.json",
    manifest_path: str = DEFAULT_MANIFEST,
    embed_model: Optional[str] = None,
    batch_size: int = 128,
) -> Dict[str, int]:
    """Incrementally embed bills into Pinecone, skipping unchanged ones via a manifest.

    Returns counters: total_bills, embedded, skipped_unchanged, elapsed_sec.
    """
    t0 = time.time()
    bills = _load_bills(source_json_path)
    embeddings = get_embeddings(embed_model)
    vs = _vectorstore(embeddings)

    manifest = _manifest_load(manifest_path)
    meta = manifest.get("_meta", {})
    model_in_use = embed_model or DEFAULT_EMBED_MODEL
    # A model switch invalidates all stored hashes: start the manifest over.
    if meta.get("embed_model") != model_in_use:
        manifest = {"_meta": {"embed_model": model_in_use}}

    to_upsert: List[Document] = []
    added, skipped = 0, 0

    for b in bills:
        # Nothing embeddable at all -> skip.
        if not (b.get("text") or b.get("description") or b.get("title")):
            skipped += 1
            continue
        doc_id = _bill_id(b)
        hsh = _bill_hash(b)
        # Unchanged since last run -> skip.
        if manifest.get(doc_id, {}).get("hash") == hsh:
            skipped += 1
            continue

        # Best-effort removal of this bill's stale chunks before re-adding.
        try:
            vs.delete(filter={"doc_id": doc_id})
        except Exception:
            pass

        for d in _chunk_bill(b):
            to_upsert.append(d)
            # Flush once a full batch has accumulated.
            if len(to_upsert) >= batch_size:
                vs.add_documents(documents=to_upsert)
                to_upsert = []

        manifest[doc_id] = {"hash": hsh}
        added += 1

    # Flush the final partial batch.
    if to_upsert:
        vs.add_documents(documents=to_upsert)

    manifest["_meta"] = {"embed_model": model_in_use}
    _manifest_save(manifest_path, manifest)

    return {
        "total_bills": len(bills),
        "embedded": added,
        "skipped_unchanged": skipped,
        "elapsed_sec": int(time.time() - t0),
    }
|
| 233 |
+
|
| 234 |
+
def get_retriever(k=8, filter_kwargs: Optional[Dict[str, Any]] = None):
    """Build a Pinecone-backed retriever, optionally with a metadata filter."""
    store = _vectorstore(get_embeddings())
    search_kwargs: Dict[str, Any] = {"k": k}
    if filter_kwargs:
        search_kwargs["filter"] = filter_kwargs
    return store.as_retriever(search_kwargs=search_kwargs)
|
| 241 |
+
|
| 242 |
+
def similarity_search(query: str, k: int = 5, where: Optional[Dict[str, Any]] = None):
    """Similarity search over the Pinecone index; empty filters are dropped."""
    store = _vectorstore(get_embeddings())
    # Normalize a falsy filter ({} or None) to None for the client.
    return store.similarity_search(query, k=k, filter=where if where else None)
|
vectorstore/pinecone_delta_upsert.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# vectorstore/pinecone_delta_upsert.py
|
| 2 |
+
import os, json, hashlib, time
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Dict, List, Any, Callable
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from pinecone import Pinecone
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
MANIFEST_PATH = Path("data/pinecone_manifest.json")
|
| 13 |
+
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
|
| 14 |
+
INDEX_NAME = os.getenv("PINECONE_INDEX", "legislation-tracker")
|
| 15 |
+
DEFAULT_NAMESPACE = os.getenv("PINECONE_NAMESPACE", "default")
|
| 16 |
+
|
| 17 |
+
def _sha(s: str) -> str:
|
| 18 |
+
return hashlib.sha256(s.encode("utf-8")).hexdigest()
|
| 19 |
+
|
| 20 |
+
def _load_manifest() -> Dict[str, str]:
    """Load the id->hash manifest; a missing or corrupt file yields {}.

    Consistent with the other vectorstore manifest loaders in this package,
    which treat a bad manifest as empty (forcing a full re-upsert) instead of
    crashing the whole run on a JSON parse error.
    """
    if MANIFEST_PATH.exists():
        try:
            return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            return {}
    return {}
|
| 24 |
+
|
| 25 |
+
def _save_manifest(m: Dict[str, str]) -> None:
    """Write the manifest JSON, creating its parent directory if needed."""
    MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(m, indent=2, ensure_ascii=False)
    MANIFEST_PATH.write_text(payload, encoding="utf-8")
|
| 28 |
+
|
| 29 |
+
def make_embedder() -> Callable[[str], List[float]]:
    """Return a closure that embeds a single string via the OpenAI API."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not set")
    client = OpenAI(api_key=api_key)

    def _embed_one(text: str) -> List[float]:
        response = client.embeddings.create(input=text, model=EMBED_MODEL)
        return response.data[0].embedding

    return _embed_one
|
| 39 |
+
|
| 40 |
+
def _parse_session_years(val) -> tuple[int | None, int | None]:
|
| 41 |
+
"""
|
| 42 |
+
Accepts:
|
| 43 |
+
- string like '2023-2024'
|
| 44 |
+
- dict with keys 'year_start'/'year_end'
|
| 45 |
+
Returns (start, end) as ints or (None, None)
|
| 46 |
+
"""
|
| 47 |
+
if isinstance(val, str) and "-" in val:
|
| 48 |
+
try:
|
| 49 |
+
a, b = val.split("-", 1)
|
| 50 |
+
return int(a), int(b)
|
| 51 |
+
except Exception:
|
| 52 |
+
return None, None
|
| 53 |
+
if isinstance(val, dict):
|
| 54 |
+
try:
|
| 55 |
+
return int(val.get("year_start")), int(val.get("year_end"))
|
| 56 |
+
except Exception:
|
| 57 |
+
return None, None
|
| 58 |
+
return None, None
|
| 59 |
+
|
| 60 |
+
def _to_epoch(date_str: str | None) -> int | None:
|
| 61 |
+
"""
|
| 62 |
+
Accepts YYYY-MM-DD or ISO-8601; returns Unix epoch seconds or None
|
| 63 |
+
"""
|
| 64 |
+
if not date_str:
|
| 65 |
+
return None
|
| 66 |
+
try:
|
| 67 |
+
ds = date_str.replace("Z", "")
|
| 68 |
+
# If only date is given, fromisoformat still works (YYYY-MM-DD)
|
| 69 |
+
return int(datetime.fromisoformat(ds).timestamp())
|
| 70 |
+
except Exception:
|
| 71 |
+
return None
|
| 72 |
+
|
| 73 |
+
def upsert_changed_vectors(
    records: List[Dict[str, Any]],
    *,
    index_name: str = INDEX_NAME,
    namespace: str = DEFAULT_NAMESPACE,
    id_key: str = "id",
    text_key: str = "text"
) -> int:
    """
    Incremental upsert using a manifest (id+text hash). Each record is:
      { "id": "...", "text": "...", "metadata": {...} }
    Only changed/new records are embedded and upserted.
    """
    api = os.getenv("PINECONE_API_KEY")
    if not api:
        raise RuntimeError("PINECONE_API_KEY not set")
    pc = Pinecone(api_key=api)
    index = pc.Index(index_name)
    manifest = _load_manifest()
    embed = make_embedder()

    to_upsert = []
    for r in records:
        rid = r[id_key]
        txt = r[text_key] or ""
        # Hash id+text together so a change to either forces a re-embed.
        h = _sha(rid + "|" + txt)
        if manifest.get(rid) != h:
            vec = {
                "id": rid,
                "values": embed(txt),
                "metadata": r.get("metadata", {})
            }
            to_upsert.append(vec)
            manifest[rid] = h

    if to_upsert:
        # NOTE(review): all changed vectors go up in one upsert call; very
        # large deltas may exceed Pinecone request limits — confirm batching.
        index.upsert(vectors=to_upsert, namespace=namespace)
        _save_manifest(manifest)
    return len(to_upsert)
|
| 112 |
+
|
| 113 |
+
# --- Tiny helper chunker --------------------
|
| 114 |
+
def chunk_bill(bill: Dict[str, Any], size: int = 1500, overlap: int = 200) -> List[Dict[str, Any]]:
    """
    Creates simple chunks with ids like 'STATE_BILL::c0', including chunk text in metadata for easy display.
    Includes numeric fields: session_year_start, session_year_end, last_action_ts (epoch seconds).
    Stores iapp_flat as list[str] for Pinecone $in filters.
    """
    # Local import keeps langchain optional for callers that never chunk.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    doc_id = f"{bill.get('state','Unknown')}_{bill.get('bill_number','Unknown')}"
    title = bill.get("title") or ""
    summary = bill.get("description") or ""
    txt = bill.get("text") or ""
    full = f"Title: {title}\n\nSummary: {summary}\n\nFull Text:\n{txt}"

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size, chunk_overlap=overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    # Flatten {"category": [subtopics]} into "category:sub" strings.
    iapp = bill.get("iapp_categories") or {}
    iapp_flat: List[str] = []
    if isinstance(iapp, dict):
        for k, v in iapp.items():
            if isinstance(v, list):
                for sub in v:
                    iapp_flat.append(f"{k}:{sub}")

    # Numeric session years + timestamp for last action date
    sy_start, sy_end = _parse_session_years(bill.get("session_year"))
    last_action_date = bill.get("last_action_date")
    last_action_ts = _to_epoch(last_action_date)

    # Guarantee at least one chunk even for an empty bill.
    pieces = splitter.split_text(full) or ["(no content)"]
    out: List[Dict[str, Any]] = []
    total = len(pieces)

    base_meta = {
        "doc_id": doc_id,
        "state": bill.get("state"),
        "bill_number": bill.get("bill_number"),
        "title": title,
        "session_year": bill.get("session_year"),
        "session_year_start": sy_start,
        "session_year_end": sy_end,
        "status": bill.get("status"),
        "last_action_date": last_action_date,
        "last_action_ts": last_action_ts,
        "iapp_flat": iapp_flat,
    }

    for i, chunk in enumerate(pieces):
        # Per-chunk metadata: shared bill fields plus position and raw text.
        md = dict(base_meta)
        md["chunk_index"] = i
        md["chunk_total"] = total
        md["text"] = chunk
        out.append({
            "id": f"{doc_id}::c{i}",
            "text": chunk,
            "metadata": md
        })
    return out
|