File size: 4,274 Bytes
b295ac7
9426b65
 
 
b295ac7
9426b65
 
 
 
b295ac7
 
 
 
 
 
 
9426b65
b295ac7
 
 
 
9426b65
 
 
 
b295ac7
 
9426b65
 
b295ac7
 
 
9426b65
b295ac7
9426b65
 
 
 
b295ac7
9426b65
 
 
 
 
 
 
b295ac7
 
9426b65
 
 
 
 
 
 
 
 
 
 
 
 
b295ac7
9426b65
b295ac7
9426b65
 
 
b295ac7
 
 
 
 
9426b65
 
 
 
 
 
 
b295ac7
9426b65
 
 
 
 
b295ac7
 
9426b65
 
b295ac7
 
 
9426b65
 
 
 
 
 
 
 
 
b295ac7
 
 
9426b65
 
b295ac7
9426b65
 
 
 
 
 
 
 
 
 
b295ac7
 
 
9426b65
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Upsert enriched section chunks to Pinecone using pre-built embeddings.

Run enrich_section_chunks.py first to produce chunks_md_section_enriched.json.

Reads  : data/processed/chunks_md_section_enriched.json
         data/processed/embeddings_md_section.npz
Upserts: Pinecone index 'ntsb-rag' with strategy='section'
"""
import json
import os
import sys
from pathlib import Path

import numpy as np
from dotenv import load_dotenv
from pinecone import Pinecone

BASE_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(BASE_DIR))

INDEX_NAME     = "ntsb-rag"
CHUNK_FILE     = BASE_DIR / "data" / "processed" / "chunks_md_section_enriched.json"
EMBEDDING_FILE = BASE_DIR / "data" / "processed" / "embeddings_md_section.npz"
BATCH_SIZE     = 100


def init_pinecone() -> "pinecone.Index":
    load_dotenv(BASE_DIR / "data" / "processed" / "env")
    load_dotenv(BASE_DIR / ".env")
    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError("PINECONE_API_KEY not found. Check .env or data/processed/env.")
    pc = Pinecone(api_key=api_key)
    return pc.Index(INDEX_NAME)


def load_artifacts() -> tuple[dict[str, dict], np.ndarray, list[str]]:
    if not CHUNK_FILE.exists():
        raise FileNotFoundError(
            f"{CHUNK_FILE.name} not found. Run enrich_section_chunks.py first."
        )
    if not EMBEDDING_FILE.exists():
        raise FileNotFoundError(f"{EMBEDDING_FILE.name} not found.")

    print(f"Loading enriched chunks from {CHUNK_FILE.name}...")
    with open(CHUNK_FILE, "r", encoding="utf-8") as f:
        chunks = json.load(f)
    by_id = {c["chunk_id"]: c for c in chunks}
    print(f"  {len(by_id)} chunks loaded")

    print(f"Loading embeddings from {EMBEDDING_FILE.name}...")
    npz = np.load(EMBEDDING_FILE)
    chunk_ids = npz["chunk_ids"].tolist()
    embeddings = npz["embeddings"]
    print(f"  {len(chunk_ids)} embeddings, shape={embeddings.shape}")

    return by_id, embeddings, chunk_ids


def build_vectors(by_id: dict, embeddings: np.ndarray, chunk_ids: list[str]) -> list[dict]:
    vectors = []
    skipped = 0
    for cid, emb in zip(chunk_ids, embeddings):
        chunk = by_id.get(cid)
        if chunk is None:
            skipped += 1
            continue
        vectors.append({
            "id": cid,
            "values": emb.tolist(),
            "metadata": {
                "ntsb_no":        chunk.get("ntsb_no", ""),
                "report_id":      chunk.get("report_id", ""),
                "entity_id":      chunk.get("entity_id", chunk.get("report_id", "")),
                "event_date":     chunk.get("event_date", ""),
                "state":          chunk.get("state", ""),
                "make":           chunk.get("make", ""),
                "model":          chunk.get("model", ""),
                "phase_of_flight": chunk.get("phase_of_flight", ""),
                "weather":        chunk.get("weather", ""),
                "section_title":  chunk.get("section_title", ""),
                "source_filename": chunk.get("source_filename", ""),
                "context_summary": chunk.get("context_summary", ""),
                "strategy":       "section",
            },
        })
    if skipped:
        print(f"  WARNING: {skipped} chunk IDs in embeddings had no match in chunks file")
    return vectors


def upsert_vectors(index, vectors: list[dict]) -> None:
    total = len(vectors)
    print(f"Upserting {total} vectors in batches of {BATCH_SIZE}...")
    for i in range(0, total, BATCH_SIZE):
        batch = vectors[i : i + BATCH_SIZE]
        index.upsert(vectors=batch)
        done = min(i + BATCH_SIZE, total)
        print(f"  {done}/{total} ({done/total*100:.1f}%)", flush=True)
    print("Upsert complete.")


def main():
    print("=== Upsert Section Chunks to Pinecone ===\n")

    index = init_pinecone()
    print(f"Connected to index '{INDEX_NAME}'")

    by_id, embeddings, chunk_ids = load_artifacts()
    vectors = build_vectors(by_id, embeddings, chunk_ids)
    print(f"Built {len(vectors)} vectors with enriched metadata\n")

    upsert_vectors(index, vectors)

    stats = index.describe_index_stats()
    print(f"\nIndex total vectors: {stats.total_vector_count}")


if __name__ == "__main__":
    main()