Spaces:

AbdullahKhanSherwani
/

BlackBox

Sleeping

File size: 4,274 Bytes

"""
Upsert enriched section chunks to Pinecone using pre-built embeddings.

Run enrich_section_chunks.py first to produce chunks_md_section_enriched.json.

Reads  : data/processed/chunks_md_section_enriched.json
         data/processed/embeddings_md_section.npz
Upserts: Pinecone index 'ntsb-rag' with strategy='section'
"""
import json
import os
import sys
from pathlib import Path

import numpy as np
from dotenv import load_dotenv
from pinecone import Pinecone

# Directory two levels above this file's resolved location; every data path
# below is built from it.
BASE_DIR = Path(__file__).resolve().parent.parent
# Put that directory first on the module search path.
sys.path.insert(0, str(BASE_DIR))

# Name of the Pinecone index that receives the vectors.
INDEX_NAME = "ntsb-rag"
# Enriched chunk records (JSON) and their pre-built embeddings (NumPy archive).
CHUNK_FILE = BASE_DIR.joinpath("data", "processed", "chunks_md_section_enriched.json")
EMBEDDING_FILE = BASE_DIR.joinpath("data", "processed", "embeddings_md_section.npz")
# Number of vectors sent per upsert call.
BATCH_SIZE = 100


def init_pinecone() -> "pinecone.Index":
    """Return a handle to the Pinecone index named by ``INDEX_NAME``.

    Loads ``data/processed/env`` and then ``.env`` (both under ``BASE_DIR``)
    via ``load_dotenv`` before reading ``PINECONE_API_KEY`` from the
    environment.

    Raises:
        RuntimeError: If ``PINECONE_API_KEY`` is unset or empty.
    """
    env_files = (
        BASE_DIR / "data" / "processed" / "env",
        BASE_DIR / ".env",
    )
    # NOTE(review): load order is preserved from the original; with
    # python-dotenv's default (no override) the first file presumably wins
    # on conflicting keys - confirm if that matters.
    for env_file in env_files:
        load_dotenv(env_file)

    key = os.environ.get("PINECONE_API_KEY")
    if key:
        client = Pinecone(api_key=key)
        return client.Index(INDEX_NAME)
    raise RuntimeError("PINECONE_API_KEY not found. Check .env or data/processed/env.")


def load_artifacts() -> tuple[dict[str, dict], np.ndarray, list[str]]:
    """Load the enriched chunk records and their pre-built embeddings.

    Reads ``CHUNK_FILE`` (a JSON list of chunk dicts, each with a
    ``chunk_id`` key) and ``EMBEDDING_FILE`` (a NumPy ``.npz`` archive with
    ``chunk_ids`` and ``embeddings`` arrays).

    Returns:
        A ``(by_id, embeddings, chunk_ids)`` tuple: the chunk dicts keyed by
        ``chunk_id``, the ``embeddings`` array from the archive, and the
        archive's ``chunk_ids`` converted to a Python list.

    Raises:
        FileNotFoundError: If either input file does not exist.
        ValueError: If the archive holds a different number of chunk IDs
            than embedding rows.
    """
    if not CHUNK_FILE.exists():
        raise FileNotFoundError(
            f"{CHUNK_FILE.name} not found. Run enrich_section_chunks.py first."
        )
    if not EMBEDDING_FILE.exists():
        raise FileNotFoundError(f"{EMBEDDING_FILE.name} not found.")

    print(f"Loading enriched chunks from {CHUNK_FILE.name}...")
    with open(CHUNK_FILE, "r", encoding="utf-8") as f:
        chunks = json.load(f)
    by_id = {c["chunk_id"]: c for c in chunks}
    print(f"  {len(by_id)} chunks loaded")

    print(f"Loading embeddings from {EMBEDDING_FILE.name}...")
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it. Indexing reads each array into memory,
    # so both results stay valid after the archive is closed.
    with np.load(EMBEDDING_FILE) as npz:
        chunk_ids = npz["chunk_ids"].tolist()
        embeddings = npz["embeddings"]
    print(f"  {len(chunk_ids)} embeddings, shape={embeddings.shape}")

    # IDs and rows are paired positionally downstream; a length mismatch
    # would otherwise be silently truncated by zip().
    if len(chunk_ids) != len(embeddings):
        raise ValueError(
            f"{EMBEDDING_FILE.name} has {len(chunk_ids)} chunk IDs but "
            f"{len(embeddings)} embedding rows."
        )

    return by_id, embeddings, chunk_ids


def _metadata_value(chunk: dict, key: str, default=""):
    """Return ``chunk[key]``, or *default* when the key is missing or None."""
    value = chunk.get(key)
    return default if value is None else value


def build_vectors(by_id: dict, embeddings: np.ndarray, chunk_ids: list[str]) -> list[dict]:
    """Pair each embedding with its chunk's metadata in upsert-ready form.

    Args:
        by_id: Chunk dicts keyed by chunk ID.
        embeddings: Embedding rows, positionally aligned with ``chunk_ids``.
        chunk_ids: Chunk IDs, one per embedding row.

    Returns:
        One ``{"id", "values", "metadata"}`` dict per chunk ID found in
        ``by_id``. IDs with no matching chunk are skipped and counted in a
        printed warning.
    """
    vectors = []
    skipped = 0
    for cid, emb in zip(chunk_ids, embeddings):
        chunk = by_id.get(cid)
        if chunk is None:
            skipped += 1
            continue
        # A key present with a JSON null comes back from dict.get() as None,
        # not the default. Pinecone metadata does not accept null values, so
        # None is treated the same as a missing key.
        report_id = _metadata_value(chunk, "report_id")
        vectors.append({
            "id": cid,
            "values": emb.tolist(),
            "metadata": {
                "ntsb_no":        _metadata_value(chunk, "ntsb_no"),
                "report_id":      report_id,
                # Fall back to the report ID when no entity ID is recorded.
                "entity_id":      _metadata_value(chunk, "entity_id", report_id),
                "event_date":     _metadata_value(chunk, "event_date"),
                "state":          _metadata_value(chunk, "state"),
                "make":           _metadata_value(chunk, "make"),
                "model":          _metadata_value(chunk, "model"),
                "phase_of_flight": _metadata_value(chunk, "phase_of_flight"),
                "weather":        _metadata_value(chunk, "weather"),
                "section_title":  _metadata_value(chunk, "section_title"),
                "source_filename": _metadata_value(chunk, "source_filename"),
                "context_summary": _metadata_value(chunk, "context_summary"),
                "strategy":       "section",
            },
        })
    if skipped:
        print(f"  WARNING: {skipped} chunk IDs in embeddings had no match in chunks file")
    return vectors


def upsert_vectors(index, vectors: list[dict]) -> None:
    """Send ``vectors`` to ``index`` in slices of ``BATCH_SIZE``.

    Calls ``index.upsert(vectors=...)`` once per slice and prints running
    progress after each call.
    """
    n_vectors = len(vectors)
    print(f"Upserting {n_vectors} vectors in batches of {BATCH_SIZE}...")
    for start in range(0, n_vectors, BATCH_SIZE):
        stop = start + BATCH_SIZE
        index.upsert(vectors=vectors[start:stop])
        # The final slice may be short, so cap the reported count at the total.
        sent = stop if stop < n_vectors else n_vectors
        percent = sent / n_vectors * 100
        print(f"  {sent}/{n_vectors} ({percent:.1f}%)", flush=True)
    print("Upsert complete.")


def main():
    """Run the pipeline: connect, load artifacts, build vectors, upsert.

    Finishes by printing the total vector count reported by the index.
    """
    print("=== Upsert Section Chunks to Pinecone ===\n")

    pinecone_index = init_pinecone()
    print(f"Connected to index '{INDEX_NAME}'")

    # load_artifacts returns (by_id, embeddings, chunk_ids), which is the
    # positional order build_vectors expects.
    vectors = build_vectors(*load_artifacts())
    print(f"Built {len(vectors)} vectors with enriched metadata\n")

    upsert_vectors(pinecone_index, vectors)

    total_vectors = pinecone_index.describe_index_stats().total_vector_count
    print(f"\nIndex total vectors: {total_vectors}")


# Run the upsert pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()