digital-twin / scripts /build_vectors.py
LSmithPMP's picture
Deploy: security-hardened RAG digital twin
6ca927d
Raw
History Blame Contribute Delete
1.26 kB
#!/usr/bin/env python3
"""
Build (or rebuild) the ChromaDB vector store from data/biography.txt.

The script reads the biography text at ``config.BIOGRAPHY_TXT``, splits it
into chunks, embeds the chunks through an OpenAI client, and stores them in
a persistent ChromaDB collection at ``config.CHROMA_PATH``.

Usage:
    python scripts/build_vectors.py
"""
import sys
from pathlib import Path
# Add project root to path: the directory one level above this script's own
# directory goes to the front of sys.path, ahead of the local imports below.
# NOTE(review): presumably `config` and `rag` live in that root — confirm.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import chromadb
from openai import OpenAI
from dotenv import load_dotenv
import config
import rag
# Load variables from a .env file (if one is found) into the process
# environment at import time, before main() runs.
# NOTE(review): presumably this supplies the API key OpenAI() needs — confirm.
load_dotenv()
def main():
    """Rebuild the vector store from the biography text, end to end.

    Runs four stages in order and reports progress on stdout after each one:
    read the biography file, chunk it, embed the chunks, and store them in a
    persistent ChromaDB collection. Paths, the embedding model name and the
    collection settings all come from ``config``; chunking, embedding and
    storage are delegated to ``rag``.
    """
    # Stage 1: read the raw biography text.
    biography_path = config.BIOGRAPHY_TXT
    print(f"Loading biography from {biography_path}...")
    biography = biography_path.read_text(encoding="utf-8")

    # Stage 2: split the text into chunks.
    print("Chunking...")
    pieces = rag.chunk_curated_lines(biography)
    print(f" → {len(pieces)} chunks")

    # Stage 3: embed the chunks. The return value is ignored and the same
    # list is handed to the storage step below.
    # NOTE(review): presumably embed_chunks attaches embeddings in place — confirm.
    print(f"Embedding with {config.EMBEDDING_MODEL}...")
    rag.embed_chunks(OpenAI(), pieces)
    print(f" → {len(pieces)} embeddings generated")

    # Stage 4: persist the chunks into the on-disk ChromaDB collection.
    store_path = config.CHROMA_PATH
    print(f"Storing in ChromaDB at {store_path}...")
    store = chromadb.PersistentClient(store_path, config.CHROMA_CLIENT_SETTINGS)
    collection_name = config.CHROMA_COLLECTION_NAME
    stored = rag.db_store_embeds(store, collection_name, pieces)
    print(f" → Collection '{collection_name}' created with {stored.count()} entries")
    print("Done!")


if __name__ == "__main__":
    main()