"""Rebuild a Pinecone index from the cleaned markdown files under data/cleaned/."""

import glob
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone

# Load environment variables (e.g. from a .env file) before index_data reads
# PINECONE_API_KEY / PINECONE_INDEX_NAME.
load_dotenv()


def index_data():
    """Load, chunk, embed and upload the markdown corpus to Pinecone.

    Steps:
      1. Read ``PINECONE_API_KEY`` and ``PINECONE_INDEX_NAME`` (default
         ``"locus-rag"``) from the environment.
      2. Load every file matching ``data/cleaned/*.md``.
      3. Split the documents into 1500-character chunks with a 300-character
         overlap, recording each chunk's start index in its metadata.
      4. Delete all vectors currently in the index.
      5. Embed the chunks with ``all-MiniLM-L6-v2`` and upload them.

    The function prints progress to stdout and returns ``None``. It returns
    early, without touching the index, when the API key is missing or when
    no chunks were produced.
    """
    # Fail fast: no point loading and splitting the corpus if we cannot
    # talk to Pinecone afterwards.
    api_key = os.getenv("PINECONE_API_KEY")
    index_name = os.getenv("PINECONE_INDEX_NAME", "locus-rag")
    if not api_key:
        print("Error: PINECONE_API_KEY not found in environment variables.")
        return

    data_path = "data/cleaned/*.md"
    # glob order is arbitrary; sort so load order and logs are reproducible.
    files = sorted(glob.glob(data_path))

    documents = []
    for file in files:
        print(f"Loading {file}...")
        documents.extend(UnstructuredMarkdownLoader(file).load())
    print(f"Loaded {len(documents)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")

    # Guard the destructive step: with nothing to upload (wrong working
    # directory, empty data folder, ...) clearing the index would simply
    # wipe the existing data and leave it empty.
    if not chunks:
        print(
            f"Error: no content found for {data_path}; "
            "leaving the existing index untouched."
        )
        return

    # Build the embedding model BEFORE clearing the index so that a model
    # download/load failure cannot leave the index empty.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    pc = Pinecone(api_key=api_key)

    # Clear the index before re-indexing so stale chunks do not linger.
    # NOTE(review): on an index that has never been written to, delete_all
    # may raise a not-found error — confirm against the Pinecone client
    # version in use.
    print(f"Clearing index: {index_name}...")
    pc.Index(index_name).delete(delete_all=True)

    print(f"Indexing to Pinecone index: {index_name}...")
    PineconeVectorStore.from_documents(
        chunks,
        embeddings,
        index_name=index_name,
    )
    print("Indexing complete!")


if __name__ == "__main__":
    index_data()