File size: 2,763 Bytes
0e99494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import re
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from dotenv import load_dotenv
from google import genai
from pinecone import Pinecone

# Called before the os.getenv() lookups below.
# NOTE(review): presumably this loads a local .env file into the process
# environment — confirm against the python-dotenv docs.
load_dotenv()

# Initialize clients
# These are built at import time. os.getenv() returns None when a variable is
# unset, so a missing key is passed to the client constructor as None.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# Handle for the "portfolio-chat" index; main() upserts vectors through it.
index = pc.Index("portfolio-chat")
# Client used by get_embedding() to request embeddings.
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

# Constants
# Model name passed to client.models.embed_content().
EMBEDDING_MODEL = "gemini-embedding-001"
# Requested "output_dimensionality" for every embedding.
# NOTE(review): presumably must equal the dimension the Pinecone index was
# created with — confirm.
EMBEDDING_DIMENSION = 768
# Knowledge base location: <parent of this file's directory>/data/knowledge_base.txt
DATA_FILE = Path(__file__).parent.parent / "data" / "knowledge_base.txt"


def get_embedding(text: str) -> list[float]:
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``client.models.embed_content`` with EMBEDDING_MODEL,
    requesting EMBEDDING_DIMENSION output dimensions, and returns the values
    of the first embedding in the response.
    """
    embed_config = {"output_dimensionality": EMBEDDING_DIMENSION}
    result = client.models.embed_content(
        model=EMBEDDING_MODEL,
        contents=text,
        config=embed_config,
    )
    # A single input is sent, so only the first embedding is read.
    first_embedding = result.embeddings[0]
    return first_embedding.values


def load_and_chunk(file_path: Path) -> list[str]:
    """Load a text file and split it into chunks separated by blank lines.

    A blank line is one that is empty or holds only whitespace, so a stray
    space or tab on a separator line still ends the chunk. A run of several
    blank lines counts as a single separator.

    Args:
        file_path: Path of the UTF-8 text file to read. A leading byte-order
            mark, if present, is discarded.

    Returns:
        The non-empty chunks in file order, each stripped of surrounding
        whitespace. An empty or whitespace-only file yields an empty list.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # "utf-8-sig" also reads plain UTF-8, but drops a leading BOM that would
    # otherwise stay attached to the first chunk (str.strip() keeps U+FEFF).
    with open(file_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    # "\s*" between the two newlines lets whitespace-only lines act as
    # separators; a literal "\n\n" split would merge the chunks around them.
    pieces = (piece.strip() for piece in re.split(r"\n\s*\n", content))
    return [piece for piece in pieces if piece]


def main():
    """Run the three-step ingestion: load chunks, embed them, upsert them.

    Progress is reported on stdout. The process exits with status 1 when
    DATA_FILE does not exist.
    """
    banner = "=" * 50
    print(banner)
    print("Portfolio Knowledge Base Ingestion Script")
    print(banner)

    # Step 1: read the knowledge base and split it into chunks
    print(f"\n[1/3] Loading data from: {DATA_FILE}")

    if not DATA_FILE.exists():
        print(f"ERROR: File not found: {DATA_FILE}")
        sys.exit(1)

    chunks = load_and_chunk(DATA_FILE)
    total = len(chunks)
    print(f"      Loaded {total} chunks")

    # Step 2: embed every chunk and build the records to upsert
    print("\n[2/3] Generating embeddings...")
    vectors = []

    for position, text in enumerate(chunks, start=1):
        # end="\r" keeps the progress counter on a single terminal line
        print(f"      Processing chunk {position}/{total}...", end="\r")

        record = {
            # The id is the chunk's zero-based position in the file
            "id": str(position - 1),
            "values": get_embedding(text),
            "metadata": {"text": text},
        }
        vectors.append(record)

    # Trailing spaces overwrite what is left of the longer progress line
    print(f"      Generated {len(vectors)} embeddings" + " " * 20)

    # Step 3: send the records to the index
    print("\n[3/3] Upserting to Pinecone...")

    # Upsert in batches of 100 (Pinecone best practice)
    batch_size = 100
    batch_starts = range(0, len(vectors), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        index.upsert(vectors=vectors[start:start + batch_size])
        print(f"      Upserted batch {batch_number}")

    print("\n" + banner)
    print("SUCCESS: Knowledge base ingested!")
    print(f"Total vectors: {len(vectors)}")
    print(banner)


if __name__ == "__main__":
    main()