locus-rag-bot / scripts /index_data.py
khagu's picture
fix: updated to automatically clear pinecone index before re-indexing
bf637d9
import os
import glob
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
load_dotenv()
def index_data():
    """Rebuild the Pinecone index from the Markdown files in ``data/cleaned``.

    Steps, in order:

    1. Load every file matching ``data/cleaned/*.md`` (relative to the current
       working directory) with ``UnstructuredMarkdownLoader``.
    2. Split the loaded documents into 1500-character chunks with a
       300-character overlap.
    3. Load the ``all-MiniLM-L6-v2`` embedding model.
    4. Delete every vector in the target index, then upload the new chunks.

    Configuration is read from the environment: ``PINECONE_API_KEY`` is
    required, ``PINECONE_INDEX_NAME`` defaults to ``"locus-rag"``.

    The index is cleared only after all non-destructive preparation has
    succeeded, so a wrong working directory, an empty data folder, a missing
    API key or a failed model load never leaves the index wiped with nothing
    to replace its contents.

    Returns:
        None. Problems are reported on stdout and abort the run early.
    """
    data_path = "data/cleaned/*.md"
    # glob returns matches in arbitrary order; sort so runs are reproducible.
    files = sorted(glob.glob(data_path))
    if not files:
        print(f"Error: no files matched {data_path}; index left untouched.")
        return

    documents = []
    for file in files:
        print(f"Loading {file}...")
        loader = UnstructuredMarkdownLoader(file)
        documents.extend(loader.load())
    print(f"Loaded {len(documents)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")
    if not chunks:
        # Nothing to upload: clearing the index now would only destroy data.
        print("Error: no chunks were produced; index left untouched.")
        return

    api_key = os.getenv("PINECONE_API_KEY")
    index_name = os.getenv("PINECONE_INDEX_NAME", "locus-rag")
    if not api_key:
        print("Error: PINECONE_API_KEY not found in environment variables.")
        return

    # Load the embedding model BEFORE the destructive step: if this fails,
    # the existing index contents are still intact.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    pc = Pinecone(api_key=api_key)

    # Clear the index before re-indexing so stale chunks do not linger.
    print(f"Clearing index: {index_name}...")
    index = pc.Index(index_name)
    index.delete(delete_all=True)

    print(f"Indexing to Pinecone index: {index_name}...")
    PineconeVectorStore.from_documents(
        chunks,
        embeddings,
        index_name=index_name,
    )
    print("Indexing complete!")
# Run the indexing job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    index_data()