Spaces:

Pushkya
/

Financial_bot

Running

App Files Files Community

Financial_bot / src /embedder.py

Pushkya

Upload 30 files

8299003 verified 1 day ago

Raw

History Blame Contribute Delete

17.6 kB

	"""
	embedder.py
	===========
	Phase 4 – Embedding & Vector Store Ingestion

	Converts chunks produced by chunker.py into vector embeddings and persists
	them in a ChromaDB collection (or in a Qdrant collection of the same name
	when the QDRANT_URL environment variable is set).

	Model: sentence-transformers/all-MiniLM-L6-v2
	- 384-dimensional embeddings
	- 256-token context window (matches our chunking max_tokens exactly)
	- Fast inference, strong retrieval quality for English financial text
	- Same model for indexing AND retrieval → vectors are in the same space

	ChromaDB collection: financial_docs
	- Single collection for ALL document types (Morningstar + SEC filings)
	- Cosine similarity space (best for sentence transformers)
	- Upsert semantics → re-running is fully safe, no duplicate vectors
	- Metadata filters enable per-source / per-doc-type / per-ticker queries:
	{"source": "morningstar"}
	{"doc_type": "10-K", "ticker": "AAPL"}
	{"chunk_type": "table"}

	ChromaDB metadata constraints
	ChromaDB only accepts scalar values (str, int, float, bool).
	Lists (e.g., col_headers) are JSON-serialised to strings.
	None values are replaced with "" to avoid insertion errors.

	Output format per stored vector
	id : chunk_id (e.g. "ptc01302411420_text_0042")
	document : chunk text (prose or markdown table)
	metadata : all chunk metadata + chunk_type, is_atomic, doc_id

	Usage (as a module)
	-------------------
	from src.embedder import DocumentEmbedder
	emb = DocumentEmbedder()
	emb.embed_document("data/chunks/morningstar/ptc01302411420_chunks.json")
	results = emb.query("What is PTC's revenue growth?", n_results=5)

	Usage (as a script)
	-------------------
	python src/embedder.py
	python src/embedder.py --force # re-embed even if already stored
	"""

	import os
	import json
	import hashlib
	import logging
	from pathlib import Path
	from datetime import datetime, timezone

	from dotenv import load_dotenv
	load_dotenv()

	# "chromadb" (local default) or "qdrant" (set QDRANT_URL to auto-switch)
	# Evaluated once, at import time (after load_dotenv() has run above), so
	# changing QDRANT_URL later in the process does not switch the backend.
	_BACKEND = "qdrant" if os.getenv("QDRANT_URL") else "chromadb"

	# ── Logging ────────────────────────────────────────────────────────────────────
	# NOTE: basicConfig() is called on import, i.e. importing this module sets up
	# root-logger output (INFO level, timestamped format) unless the root logger
	# already has handlers.
	logging.basicConfig(
	level = logging.INFO,
	format = "%(asctime)s %(levelname)-8s %(message)s",
	)
	log = logging.getLogger(__name__)

	# ── Paths ──────────────────────────────────────────────────────────────────────
	# BASE_DIR is two directory levels above this file (presumably the project
	# root, given the file lives in src/ — confirm if the layout changes).
	BASE_DIR = Path(__file__).parent.parent
	CHUNKS_DIR = BASE_DIR / "data" / "chunks"            # default input tree scanned by embed_all()
	VECTORSTORE_DIR = BASE_DIR / "data" / "vectorstore"  # on-disk location of the ChromaDB store

	# ── Constants ──────────────────────────────────────────────────────────────────
	COLLECTION_NAME = "financial_docs"
	EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
	BATCH_SIZE = 100 # number of chunks per upsert call (used for both ChromaDB and Qdrant batches)


	# ══════════════════════════════════════════════════════════════════════════════
	# METADATA SANITISATION
	# ──────────────────────────────────────────────────────────────────────────────
	# ChromaDB rejects metadata values that are:
	# - None → replace with ""
	# - list → JSON-serialise to a string
	# - other → cast to str as a safety fallback
	# ══════════════════════════════════════════════════════════════════════════════

	def _sanitize_metadata(meta: dict) -> dict:
	"""
	Convert metadata dict to ChromaDB-compatible scalar values.

	ChromaDB accepted types: str \| int \| float \| bool
	"""
	clean = {}
	for k, v in meta.items():
	if isinstance(v, (str, int, float, bool)):
	clean[k] = v
	elif v is None:
	clean[k] = ""
	elif isinstance(v, list):
	clean[k] = json.dumps(v, ensure_ascii=False)
	else:
	clean[k] = str(v)
	return clean


	# ══════════════════════════════════════════════════════════════════════════════
	# MAIN EMBEDDER CLASS
	# ══════════════════════════════════════════════════════════════════════════════

	class DocumentEmbedder:
	    """
	    Embeds document chunks and persists them in a vector store.

	    The store is a ChromaDB persistent collection by default, or a Qdrant
	    collection when the module-level backend switch selects "qdrant".

	    All documents (Morningstar PDFs, SEC filings) share one collection so
	    cross-document similarity search works out of the box. Use metadata
	    filters to restrict retrieval to a specific source or document type.
	    """

	    def __init__(
	        self,
	        vectorstore_dir: Path = VECTORSTORE_DIR,
	        collection_name: str = COLLECTION_NAME,
	        embedding_model: str = EMBEDDING_MODEL,
	    ):
	        """
	        Connect to the configured backend and make sure the collection exists.

	        Args:
	            vectorstore_dir : directory of the on-disk ChromaDB store
	                              (ignored by the Qdrant backend)
	            collection_name : collection to create or reuse
	            embedding_model : sentence-transformers model name used for both
	                              indexing and querying
	        """
	        self._collection_name = collection_name
	        self._embedding_model = embedding_model
	        self._backend = _BACKEND

	        if self._backend == "qdrant":
	            self._init_qdrant(collection_name, embedding_model)
	        else:
	            self._init_chromadb(vectorstore_dir, collection_name, embedding_model)

	    def _init_chromadb(self, vectorstore_dir, collection_name, embedding_model):
	        """Open (or create) the persistent ChromaDB collection."""
	        # Imported lazily so the Qdrant backend does not require chromadb.
	        import chromadb
	        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

	        self.vectorstore_dir = Path(vectorstore_dir)
	        self.vectorstore_dir.mkdir(parents=True, exist_ok=True)

	        self.client = chromadb.PersistentClient(path=str(self.vectorstore_dir))
	        self.ef = SentenceTransformerEmbeddingFunction(model_name=embedding_model)

	        self.collection = self.client.get_or_create_collection(
	            name=collection_name,
	            embedding_function=self.ef,
	            metadata={"hnsw:space": "cosine"},
	        )
	        log.info(
	            f"[ChromaDB] collection '{collection_name}' ready "
	            f"({self.collection.count()} vectors already stored)"
	        )

	    def _init_qdrant(self, collection_name, embedding_model):
	        """Connect to Qdrant (QDRANT_URL / QDRANT_API_KEY) and ensure the collection exists."""
	        # Imported lazily so the ChromaDB backend does not require qdrant_client.
	        from qdrant_client import QdrantClient
	        from qdrant_client.models import VectorParams, Distance
	        from sentence_transformers import SentenceTransformer

	        self._qdrant = QdrantClient(
	            url=os.getenv("QDRANT_URL"),
	            api_key=os.getenv("QDRANT_API_KEY"),
	        )
	        self._embed_model = SentenceTransformer(embedding_model)

	        # Create collection if it doesn't exist
	        existing = [c.name for c in self._qdrant.get_collections().collections]
	        if collection_name not in existing:
	            # Size the vectors from the loaded model instead of hard-coding 384,
	            # so a non-default embedding_model gets a matching collection.
	            dim = self._embed_model.get_sentence_embedding_dimension()
	            self._qdrant.create_collection(
	                collection_name=collection_name,
	                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
	            )
	            log.info(f"[Qdrant] Created collection '{collection_name}'")

	        count = self._qdrant.count(collection_name).count
	        log.info(
	            f"[Qdrant] collection '{collection_name}' ready "
	            f"({count} vectors already stored)"
	        )

	    # ── Helpers ────────────────────────────────────────────────────────────────

	    @staticmethod
	    def _point_id(chunk_id: str) -> int:
	        """Map a chunk_id string to the deterministic integer point ID used in Qdrant."""
	        # Same formula for writing and for look-ups, so re-runs hit the same points.
	        return int(hashlib.md5(chunk_id.encode()).hexdigest(), 16) % (2 ** 63)

	    def _existing_chunk_ids(self, chunk_ids: list) -> set:
	        """
	        Return the subset of ``chunk_ids`` already present in the active backend.

	        The previous implementation always queried ``self.collection``, which
	        only exists for ChromaDB, so the non-force path crashed under Qdrant.
	        """
	        if self._backend == "qdrant":
	            by_point = {self._point_id(cid): cid for cid in chunk_ids}
	            point_ids = list(by_point)
	            found = set()
	            # Look up in batches to keep individual requests small.
	            for start in range(0, len(point_ids), BATCH_SIZE):
	                records = self._qdrant.retrieve(
	                    collection_name=self._collection_name,
	                    ids=point_ids[start : start + BATCH_SIZE],
	                    with_payload=False,
	                    with_vectors=False,
	                )
	                found.update(by_point[r.id] for r in records if r.id in by_point)
	            return found

	        existing = self.collection.get(ids=chunk_ids, include=[])
	        return set(existing["ids"])

	    def _to_qdrant_filter(self, where: dict):
	        """
	        Translate a simple ChromaDB-style where clause into a Qdrant Filter.

	        Supported forms: ``{"field": value}``, ``{"field": {"$eq": value}}``,
	        ``{"$and": [clause, ...]}`` and ``{"$or": [clause, ...]}``.

	        Raises:
	            ValueError: for any other operator.
	        """
	        from qdrant_client.models import Filter, FieldCondition, MatchValue

	        must = []
	        for key, value in where.items():
	            if key == "$and":
	                must.extend(self._to_qdrant_filter(sub) for sub in value)
	            elif key == "$or":
	                must.append(Filter(should=[self._to_qdrant_filter(sub) for sub in value]))
	            elif key.startswith("$"):
	                raise ValueError(f"Unsupported filter operator for Qdrant backend: {key}")
	            else:
	                if isinstance(value, dict):
	                    if set(value) != {"$eq"}:
	                        raise ValueError(
	                            f"Unsupported filter condition for Qdrant backend: {key}={value}"
	                        )
	                    value = value["$eq"]
	                must.append(FieldCondition(key=key, match=MatchValue(value=value)))
	        return Filter(must=must)

	    # ── Embed one document ─────────────────────────────────────────────────────

	    def embed_document(
	        self,
	        chunks_json_path: str | Path,
	        force: bool = False,
	    ) -> int:
	        """
	        Embed all chunks from one document and upsert them into the vector store.

	        Args:
	            chunks_json_path : path to the _chunks.json file (output of chunker.py)
	            force            : if True, re-embed even if already stored

	        Returns:
	            number of chunks upserted (0 if skipped)

	        Raises:
	            OSError, json.JSONDecodeError, KeyError: if the file cannot be read,
	            is not valid JSON, or lacks the expected keys.
	        """
	        chunks_json_path = Path(chunks_json_path)

	        # Explicit UTF-8: the platform default encoding is not guaranteed to be.
	        with open(chunks_json_path, encoding="utf-8") as f:
	            doc = json.load(f)

	        chunks = doc["chunks"]
	        if not chunks:
	            # Nothing to embed; avoid asking the store about an empty ID list.
	            log.info(f"SKIP {chunks_json_path.name} (no chunks in file)")
	            return 0

	        all_ids = [c["chunk_id"] for c in chunks]

	        # ── Check which chunks are already stored ─────────────────────────────
	        if not force:
	            already = self._existing_chunk_ids(all_ids)
	            to_embed = [c for c in chunks if c["chunk_id"] not in already]

	            if not to_embed:
	                log.info(
	                    f"SKIP {chunks_json_path.name} "
	                    f"(all {len(all_ids)} chunks already in store)"
	                )
	                return 0

	            log.info(
	                f"Embedding: {chunks_json_path.name} "
	                f"({len(to_embed)} new, {len(already)} already stored)"
	            )
	        else:
	            to_embed = chunks
	            log.info(f"Embedding: {chunks_json_path.name} ({len(to_embed)} chunks)")

	        # ── Build inputs ──────────────────────────────────────────────────────
	        ids = []
	        documents = []
	        metadatas = []

	        for chunk in to_embed:
	            ids.append(chunk["chunk_id"])
	            documents.append(chunk["text"])

	            meta = _sanitize_metadata(chunk["metadata"])
	            meta["chunk_type"] = chunk["chunk_type"]
	            meta["is_atomic"] = chunk["is_atomic"]
	            meta["doc_id"] = chunk["doc_id"]
	            metadatas.append(meta)

	        # ── Upsert in batches ─────────────────────────────────────────────────
	        if self._backend == "qdrant":
	            self._upsert_qdrant(ids, documents, metadatas)
	        else:
	            self._upsert_chromadb(ids, documents, metadatas)

	        return len(ids)

	    def _upsert_chromadb(self, ids, documents, metadatas):
	        """Upsert into ChromaDB in BATCH_SIZE slices; the collection embeds the text itself."""
	        for i in range(0, len(ids), BATCH_SIZE):
	            batch_ids = ids[i : i + BATCH_SIZE]
	            batch_docs = documents[i : i + BATCH_SIZE]
	            batch_meta = metadatas[i : i + BATCH_SIZE]

	            self.collection.upsert(
	                ids=batch_ids,
	                documents=batch_docs,
	                metadatas=batch_meta,
	            )
	            log.info(
	                f" Batch {i // BATCH_SIZE + 1} "
	                f"({len(batch_ids)} chunks) "
	                f"total in store: {self.collection.count()}"
	            )

	    def _upsert_qdrant(self, ids, documents, metadatas):
	        """Encode the texts locally, then upsert points into Qdrant in BATCH_SIZE slices."""
	        from qdrant_client.models import PointStruct

	        log.info(f" Encoding {len(documents)} chunks with SentenceTransformer...")
	        embeddings = self._embed_model.encode(
	            documents,
	            batch_size=32,
	            show_progress_bar=False,
	        )

	        for i in range(0, len(ids), BATCH_SIZE):
	            batch_ids = ids[i : i + BATCH_SIZE]
	            batch_docs = documents[i : i + BATCH_SIZE]
	            batch_meta = metadatas[i : i + BATCH_SIZE]
	            batch_embs = embeddings[i : i + BATCH_SIZE]

	            points = [
	                PointStruct(
	                    # deterministic integer ID from chunk_id string
	                    id=self._point_id(cid),
	                    vector=emb.tolist(),
	                    # The original chunk_id and text travel in the payload so
	                    # they can be recovered when reading points back.
	                    payload={"text": doc, "chunk_id": cid, **meta},
	                )
	                for cid, doc, meta, emb in zip(batch_ids, batch_docs, batch_meta, batch_embs)
	            ]

	            self._qdrant.upsert(
	                collection_name=self._collection_name,
	                points=points,
	            )
	            total = self._qdrant.count(self._collection_name).count
	            log.info(
	                f" Batch {i // BATCH_SIZE + 1} "
	                f"({len(points)} chunks) "
	                f"total in store: {total}"
	            )

	    # ── Batch embed all documents ──────────────────────────────────────────────

	    def embed_all(
	        self,
	        chunks_dir: Path = CHUNKS_DIR,
	        force: bool = False,
	    ) -> dict:
	        """
	        Embed all *_chunks.json files found (recursively) under chunks_dir.

	        A failure in one file is logged and does not stop the run.

	        Returns:
	            dict mapping filename → number of chunks upserted (-1 if the file failed)
	        """
	        chunk_files = sorted(Path(chunks_dir).rglob("*_chunks.json"))
	        log.info(f"Found {len(chunk_files)} chunk files under {chunks_dir}")

	        summary = {}
	        for cf in chunk_files:
	            try:
	                summary[cf.name] = self.embed_document(cf, force=force)
	            except Exception as e:
	                # Deliberate best-effort boundary: record the failure, keep the
	                # traceback in the log, and continue with the remaining files.
	                log.exception(f" FAILED {cf.name}: {e}")
	                summary[cf.name] = -1

	        return summary

	    # ── Query ──────────────────────────────────────────────────────────────────

	    def query(
	        self,
	        text: str,
	        n_results: int = 5,
	        filters: dict | None = None,
	    ) -> list[dict]:
	        """
	        Similarity search against the collection.

	        Args:
	            text      : natural language query (embedded on-the-fly)
	            n_results : number of top results to return
	            filters   : ChromaDB where clause, e.g. {"source": "morningstar"}
	                        or {"$and": [{"doc_type": "10-K"}, {"ticker": "AAPL"}]}.
	                        On the Qdrant backend only equality, $and and $or
	                        clauses are supported.

	        Returns:
	            list of dicts with keys: id, text, metadata, distance
	            (distance is cosine distance: 0 = identical, 2 = opposite)

	        Raises:
	            ValueError: on the Qdrant backend, if ``filters`` uses an
	            unsupported operator.
	        """
	        if self._backend == "qdrant":
	            return self._query_qdrant(text, n_results, filters)

	        kwargs = {
	            "query_texts": [text],
	            "n_results": n_results,
	            "include": ["documents", "metadatas", "distances"],
	        }
	        if filters:
	            kwargs["where"] = filters

	        result = self.collection.query(**kwargs)

	        # One query text was sent, so only the first result row is relevant.
	        return [
	            {"id": cid, "text": doc, "metadata": meta, "distance": dist}
	            for cid, doc, meta, dist in zip(
	                result["ids"][0],
	                result["documents"][0],
	                result["metadatas"][0],
	                result["distances"][0],
	            )
	        ]

	    def _query_qdrant(self, text, n_results, filters):
	        """Run the similarity search on Qdrant and return results in the same shape as ChromaDB."""
	        vector = self._embed_model.encode(text).tolist()
	        qfilter = self._to_qdrant_filter(filters) if filters else None

	        if hasattr(self._qdrant, "query_points"):
	            hits = self._qdrant.query_points(
	                collection_name=self._collection_name,
	                query=vector,
	                query_filter=qfilter,
	                limit=n_results,
	                with_payload=True,
	            ).points
	        else:
	            # Older qdrant-client releases only provide search().
	            hits = self._qdrant.search(
	                collection_name=self._collection_name,
	                query_vector=vector,
	                query_filter=qfilter,
	                limit=n_results,
	                with_payload=True,
	            )

	        results = []
	        for hit in hits:
	            payload = dict(hit.payload or {})
	            chunk_text = payload.pop("text", "")
	            chunk_id = payload.pop("chunk_id", str(hit.id))
	            results.append(
	                {
	                    "id": chunk_id,
	                    "text": chunk_text,
	                    "metadata": payload,
	                    # Qdrant reports cosine similarity; convert to cosine
	                    # distance so both backends return the same quantity.
	                    "distance": 1.0 - hit.score,
	                }
	            )
	        return results

	    # ── Collection stats ───────────────────────────────────────────────────────

	    def get_stats(self) -> dict:
	        """
	        Return a summary of what is stored in the collection.

	        Returns:
	            dict with total_vectors plus the sorted distinct values of the
	            source, doc_id and chunk_type metadata fields.
	        """
	        empty = {"total_vectors": 0, "sources": [], "doc_ids": [], "chunk_types": []}

	        if self._backend == "qdrant":
	            count = self._qdrant.count(self._collection_name).count
	            if count == 0:
	                return empty
	            # Page through the collection and fetch only the fields summarised
	            # below, instead of pulling every chunk's text in one request.
	            metas = []
	            offset = None
	            while True:
	                records, offset = self._qdrant.scroll(
	                    collection_name=self._collection_name,
	                    limit=1000,
	                    offset=offset,
	                    with_payload=["source", "doc_id", "chunk_type"],
	                    with_vectors=False,
	                )
	                metas.extend(r.payload or {} for r in records)
	                if offset is None:
	                    break
	        else:
	            count = self.collection.count()
	            if count == 0:
	                return empty
	            metas = self.collection.get(limit=count, include=["metadatas"])["metadatas"]

	        return {
	            "total_vectors": count,
	            "sources": sorted({m.get("source", "") for m in metas}),
	            "doc_ids": sorted({m.get("doc_id", "") for m in metas}),
	            "chunk_types": sorted({m.get("chunk_type", "") for m in metas}),
	        }


	# ── Entry point ────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Script mode: embed every chunk file under the default chunks directory,
	    # then log a per-file summary and the collection statistics.
	    import sys

	    rule = "=" * 60
	    # --force re-embeds chunks even when they are already stored.
	    reembed = "--force" in sys.argv

	    log.info(rule)
	    log.info("Phase 4 – Document Embedder (all-MiniLM-L6-v2 + ChromaDB)")
	    log.info(rule)

	    runner = DocumentEmbedder()
	    per_file = runner.embed_all(force=reembed)

	    log.info("\n" + rule)
	    log.info("Embedding complete.")
	    for fname, n in per_file.items():
	        # A negative count is the failure marker set by embed_all().
	        if n >= 0:
	            status = f"{n:>5} upserted"
	        else:
	            status = " FAILED"
	        log.info(f" {fname:55s} {status}")
	    # Only successful, non-empty runs contribute to the total.
	    new_vectors = sum(n for n in per_file.values() if n > 0)
	    log.info(f" {'TOTAL NEW VECTORS':55s} {new_vectors:>5}")

	    store_stats = runner.get_stats()
	    log.info(f"\nCollection total: {store_stats['total_vectors']} vectors")
	    log.info(f" Sources : {store_stats['sources']}")
	    log.info(f" Documents : {store_stats['doc_ids']}")
	    log.info(rule)