Spaces:
Running
Running
#!/usr/bin/env python3
"""
Build (or rebuild) the ChromaDB vector store from data/biography.txt.

Usage:
    python scripts/build_vectors.py
"""
| import sys | |
| from pathlib import Path | |
| # Add project root to path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| import chromadb | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| import config | |
| import rag | |
| load_dotenv() | |
def main():
    """Run the vector-store build pipeline and report progress on stdout.

    Stages, in order:
      1. Read ``config.BIOGRAPHY_TXT`` as UTF-8 text.
      2. Split the text with ``rag.chunk_curated_lines``.
      3. Embed the chunks via ``rag.embed_chunks`` using a fresh OpenAI client.
      4. Persist them with ``rag.db_store_embeds`` into a ChromaDB
         ``PersistentClient`` opened at ``config.CHROMA_PATH``.

    Takes no arguments and returns ``None``. Any exception raised by a stage
    propagates to the caller unchanged.
    """
    # Stage 1: load the source text.
    biography_path = config.BIOGRAPHY_TXT
    print(f"Loading biography from {biography_path}...")
    biography = biography_path.read_text(encoding="utf-8")

    # Stage 2: chunk it.
    print("Chunking...")
    pieces = rag.chunk_curated_lines(biography)
    piece_total = len(pieces)
    print(f" → {piece_total} chunks")

    # Stage 3: embed. The return value is ignored, so the helper presumably
    # attaches embeddings to the chunk objects in place — TODO confirm in rag.
    print(f"Embedding with {config.EMBEDDING_MODEL}...")
    rag.embed_chunks(OpenAI(), pieces)
    print(f" → {len(pieces)} embeddings generated")

    # Stage 4: persist. The path is announced before the client is opened so
    # a failure while opening the store is preceded by the location on stdout.
    store_path = config.CHROMA_PATH
    print(f"Storing in ChromaDB at {store_path}...")
    store = chromadb.PersistentClient(store_path, config.CHROMA_CLIENT_SETTINGS)
    collection_name = config.CHROMA_COLLECTION_NAME
    stored = rag.db_store_embeds(store, collection_name, pieces)
    entry_total = stored.count()
    print(f" → Collection '{collection_name}' created with {entry_total} entries")

    print("Done!")


# Run the build only when this file is executed as a script.
if __name__ == "__main__":
    main()