# AlzDetectAI / vector_store/embedder.py
# (repo header residue — commit 6d0d692 by tpriyadata: "add changes in embedded py")
"""
vector_store/embedder.py
=========================
ALZDETECT-AI β€” Enterprise Embedder + Pinecone Upserter.
WHAT: Converts 19,771 chunks β†’ 768-dim vectors β†’ Pinecone.
WHY: PubMedBERT understands biomedical language β€”
"pTau217" and "phosphorylated tau" map to nearby vectors.
Keyword search would miss this entirely.
WHO: Run once from Colab T4 GPU notebook.
Called by scripts/run_pipeline.py
WHERE: Reads β†’ data/processed/chunks.json
Writes β†’ Pinecone index (alzdetect)
WHEN: Once per plan. After chunking. Resumes from checkpoint.
WORST-CASE DESIGN:
- Colab timeout mid-run β†’ checkpoint every 1,000 vectors
- Pinecone upsert fail β†’ retry 3 times with backoff
- Wrong embedding dim β†’ caught before first upsert
- Chunk text empty β†’ skipped, logged, counted
- GPU not available β†’ falls back to CPU with warning
"""
import json
import time
import os
from pathlib import Path
from typing import Optional
import numpy as np
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from loguru import logger
from tqdm import tqdm
from configs.settings import get_settings
# ── Embed diagnostic model ────────────────────────────────────────
class EmbedDiagnostic(BaseModel):
    """
    RE inspector for embedding stage.
    Analogy: Barcode printer quality report.
    How many barcodes printed, how many failed,
    how many are in the scanner (Pinecone).
    """
    total_chunks: int        # chunks loaded from chunks.json
    total_embedded: int      # chunks that produced an embedding
    total_upserted: int      # vectors confirmed written to Pinecone
    total_failed: int        # chunks that failed embedding or upsert
    embedding_model: str     # model identifier used for this run
    embedding_dim: int       # vector dimensionality
    pinecone_index: str      # target index name
    duration_secs: float     # wall-clock time of the run
    vectors_in_index: int    # index vector count after the run

    def log_summary(self) -> None:
        """Emit the fixed-format run report, one log line per metric."""
        rule = "=" * 60
        report = (
            rule,
            "[EMBED-DIAGNOSTIC] Run complete",
            f" Total chunks : {self.total_chunks:,}",
            f" Embedded : {self.total_embedded:,}",
            f" Upserted : {self.total_upserted:,}",
            f" Failed : {self.total_failed:,}",
            f" Model : {self.embedding_model}",
            f" Dimensions : {self.embedding_dim}",
            f" Pinecone index : {self.pinecone_index}",
            f" Vectors in index : {self.vectors_in_index:,}",
            f" Duration : {self.duration_secs:.1f}s",
            rule,
        )
        for line in report:
            logger.info(line)
# ── Core embedder class ───────────────────────────────────────────
class ChunkEmbedder:
"""
Enterprise embedder + Pinecone upserter.
Analogy: The barcode printing and filing team.
Takes index cards (chunks), prints barcodes (embeddings),
files them in the barcode library (Pinecone).
Usage:
embedder = ChunkEmbedder()
diagnostic = embedder.run()
"""
_MAX_RETRIES: int = 3
_RETRY_BACKOFF: float = 2.0
_UPSERT_BATCH: int = 100 # Pinecone max upsert batch
_CHECKPOINT_EVERY: int = 1000 # save progress every N vectors
def __init__(self) -> None:
self.settings = get_settings()
self._setup_paths()
self._setup_model()
self._setup_pinecone()
def _setup_paths(self) -> None:
"""Paths for chunks input and checkpoint."""
self.chunks_path = self.settings.processed_data_path
self.checkpoint_path = self.chunks_path.parent / "embed_checkpoint.json"
logger.info(f"[EMBEDDER] Chunks path: {self.chunks_path}")
def _setup_model(self) -> None:
"""
Load PubMedBERT embedding model.
Worst-case: GPU not available β†’ CPU fallback with warning.
"""
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
logger.warning(
"[EMBEDDER] GPU not available β€” running on CPU. "
"This will be slow. Use Colab T4 for faster embedding."
)
else:
logger.info(f"[EMBEDDER] GPU detected: {torch.cuda.get_device_name(0)}")
logger.info(f"[EMBEDDER] Loading model: {self.settings.embedding_model}")
self.model = SentenceTransformer(
self.settings.embedding_model,
device=device
)
self.device = device
logger.info(f"[EMBEDDER] Model loaded on {device}")
def _setup_pinecone(self) -> None:
"""Connect to Pinecone and verify index exists."""
pc = Pinecone(api_key=self.settings.pinecone_api_key)
self.index = pc.Index(self.settings.pinecone_index_name)
stats = self.index.describe_index_stats()
logger.info(
f"[EMBEDDER] Pinecone connected | "
f"index: {self.settings.pinecone_index_name} | "
f"existing vectors: {stats.total_vector_count:,}"
)
# Verify dimension matches β€” worst-case: wrong model used before
if stats.dimension != self.settings.embedding_dim:
raise ValueError(
f"[EMBEDDER] Dimension mismatch! "
f"Index has {stats.dimension} dims, "
f"model produces {self.settings.embedding_dim} dims. "
f"Delete index and recreate with correct dimensions."
)
def _load_chunks(self) -> list[dict]:
"""Load chunks from JSON β€” fail fast if missing."""
if not self.chunks_path.exists():
logger.error(f"[EMBEDDER] Chunks not found: {self.chunks_path}")
raise FileNotFoundError(
f"Run chunker first. No file at: {self.chunks_path}"
)
with open(self.chunks_path, encoding="utf-8") as f:
chunks = json.load(f)
logger.info(f"[EMBEDDER] Loaded {len(chunks):,} chunks")
return chunks
def _load_checkpoint(self) -> set[str]:
"""
Load set of already-upserted chunk IDs.
Allows resuming after Colab timeout.
"""
if self.checkpoint_path.exists():
try:
with open(self.checkpoint_path, encoding="utf-8") as f:
done = set(json.load(f))
logger.info(
f"[EMBEDDER] Checkpoint found β€” "
f"{len(done):,} chunks already upserted"
)
return done
except Exception as e:
logger.warning(f"[EMBEDDER] Checkpoint corrupt: {e} β€” starting fresh")
logger.info("[EMBEDDER] No checkpoint β€” starting fresh")
return set()
def _save_checkpoint(self, done_ids: set[str]) -> None:
"""Save checkpoint β€” called every 1,000 upserted chunks."""
try:
with open(self.checkpoint_path, "w", encoding="utf-8") as f:
json.dump(list(done_ids), f)
logger.debug(f"[EMBEDDER] Checkpoint saved: {len(done_ids):,} done")
except Exception as e:
logger.error(f"[EMBEDDER] Checkpoint save failed: {e}")
def _upsert_batch(
self,
batch: list[dict],
embeddings: np.ndarray,
) -> int:
"""
Upsert one batch to Pinecone with retry.
Worst-case: network error β†’ retry 3 times with backoff.
Returns number of successfully upserted vectors.
"""
vectors = []
for chunk, embedding in zip(batch, embeddings):
vectors.append({
"id": chunk["chunk_id"],
"values": embedding.tolist(),
"metadata": {
"pmid": chunk.get("pmid", ""),
"chunk_idx": chunk.get("chunk_idx", 0),
"text": chunk.get("text", "")[:1000],
"title": chunk.get("title", "")[:200],
"year": chunk.get("year"),
"keywords": chunk.get("keywords", [])[:10],
"journal": chunk.get("journal", ""),
"source_query": chunk.get("source_query", "unknown"),
"source": chunk.get("source", "pubmed"),
"word_count": chunk.get("word_count", 0),
}
})
for attempt in range(1, self._MAX_RETRIES + 1):
try:
self.index.upsert(vectors=vectors)
return len(vectors)
except Exception as e:
logger.warning(
f"[EMBEDDER] Upsert attempt {attempt}/{self._MAX_RETRIES} "
f"failed: {e}"
)
if attempt < self._MAX_RETRIES:
time.sleep(self._RETRY_BACKOFF * attempt)
else:
logger.error("[EMBEDDER] Upsert failed after max retries")
return 0
def run(self) -> EmbedDiagnostic:
"""
Main entry point β€” embed all chunks and upsert to Pinecone.
Flow:
1. Load chunks
2. Load checkpoint (resume if crashed)
3. Filter already-done chunks
4. Embed in batches using GPU
5. Upsert each batch to Pinecone
6. Checkpoint every 1,000 vectors
7. Return EmbedDiagnostic
"""
start_time = time.time()
logger.info("[EMBEDDER] Starting enterprise embedding + upsert")
chunks = self._load_chunks()
done_ids = self._load_checkpoint()
# Filter already upserted chunks
remaining = [c for c in chunks if c["chunk_id"] not in done_ids]
logger.info(
f"[EMBEDDER] {len(remaining):,} chunks to embed "
f"({len(done_ids):,} already done)"
)
total_upserted = len(done_ids)
total_failed = 0
# Process in embedding batches
batch_size = self.settings.embedding_batch_size
for i in tqdm(
range(0, len(remaining), batch_size),
desc="Embedding",
unit="batch"
):
batch = remaining[i : i + batch_size]
texts = [c["text"] for c in batch]
# Embed batch
try:
embeddings = self.model.encode(
texts,
batch_size=batch_size,
show_progress_bar=False,
convert_to_numpy=True,
device=self.device,
)
except Exception as e:
logger.error(f"[EMBEDDER] Embedding failed for batch {i}: {e}")
total_failed += len(batch)
continue
# Upsert to Pinecone in sub-batches of 100
for j in range(0, len(batch), self._UPSERT_BATCH):
sub_batch = batch[j : j + self._UPSERT_BATCH]
sub_embeds = embeddings[j : j + self._UPSERT_BATCH]
upserted = self._upsert_batch(sub_batch, sub_embeds)
total_upserted += upserted
if upserted == 0:
total_failed += len(sub_batch)
else:
for chunk in sub_batch:
done_ids.add(chunk["chunk_id"])
# Checkpoint every 1,000
if total_upserted % self._CHECKPOINT_EVERY < batch_size:
self._save_checkpoint(done_ids)
logger.info(
f"[EMBEDDER] Progress: {total_upserted:,} / "
f"{len(chunks):,} upserted"
)
# Final checkpoint
self._save_checkpoint(done_ids)
# Get final vector count from Pinecone
final_stats = self.index.describe_index_stats()
duration = round(time.time() - start_time, 1)
diagnostic = EmbedDiagnostic(
total_chunks = len(chunks),
total_embedded = len(chunks) - total_failed,
total_upserted = total_upserted,
total_failed = total_failed,
embedding_model = self.settings.embedding_model,
embedding_dim = self.settings.embedding_dim,
pinecone_index = self.settings.pinecone_index_name,
duration_secs = duration,
vectors_in_index = final_stats.total_vector_count,
)
diagnostic.log_summary()
return diagnostic
# ── RE probe ──────────────────────────────────────────────────────
def diagnose_embedder() -> None:
    """
    RE probe — check Pinecone index health without re-embedding.
    Connects to the configured index, fetches its stats, and logs
    a short fixed-format health report.
    Usage:
        python -c "from vector_store.embedder import diagnose_embedder; diagnose_embedder()"
    """
    settings = get_settings()
    client = Pinecone(api_key=settings.pinecone_api_key)
    index = client.Index(settings.pinecone_index_name)
    stats = index.describe_index_stats()
    rule = "=" * 60
    report = [
        rule,
        "[RE-EMBEDDER] Pinecone index health check",
        f" Index name : {settings.pinecone_index_name}",
        f" Total vectors : {stats.total_vector_count:,}",
        f" Dimension : {stats.dimension}",
        f" Namespaces : {dict(stats.namespaces)}",
        rule,
    ]
    for line in report:
        logger.info(line)