Spaces:
Sleeping
Sleeping
| import os | |
| import glob | |
| from dotenv import load_dotenv | |
| from langchain_community.document_loaders import UnstructuredMarkdownLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_pinecone import PineconeVectorStore | |
| from pinecone import Pinecone | |
| load_dotenv() | |
def index_data() -> None:
    """Rebuild the Pinecone index from the markdown files in ``data/cleaned``.

    Steps, in order:

    1. Read ``PINECONE_API_KEY`` and ``PINECONE_INDEX_NAME`` (default
       ``"locus-rag"``) from the environment.
    2. Load every file matching ``data/cleaned/*.md`` with
       ``UnstructuredMarkdownLoader``.
    3. Split the loaded documents into chunks (``chunk_size=1500``,
       ``chunk_overlap=300``, start index recorded in the metadata).
    4. Create the ``all-MiniLM-L6-v2`` embedding model.
    5. Clear the index with ``delete(delete_all=True)`` and upload the chunks
       through ``PineconeVectorStore.from_documents``.

    The destructive clear in step 5 only happens once there is something to
    upload: if the API key is missing, no file matches the glob, or splitting
    produces no chunks, a message is printed and the function returns without
    contacting Pinecone, so an existing index is never emptied by accident
    (for example when the script is started from the wrong directory).

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    index_name = os.getenv("PINECONE_INDEX_NAME", "locus-rag")

    # Validate configuration first so a missing key fails immediately instead
    # of after all files have been loaded and split.
    if not api_key:
        print("Error: PINECONE_API_KEY not found in environment variables.")
        return

    data_path = "data/cleaned/*.md"
    # glob order is arbitrary; sort so the loading log is deterministic.
    files = sorted(glob.glob(data_path))
    if not files:
        print(
            f"Error: no files matched {data_path}; "
            f"index {index_name} left unchanged."
        )
        return

    documents = []
    for file in files:
        print(f"Loading {file}...")
        documents.extend(UnstructuredMarkdownLoader(file).load())
    print(f"Loaded {len(documents)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")
    if not chunks:
        print(
            f"Error: no chunks were produced from {data_path}; "
            f"index {index_name} left unchanged."
        )
        return

    # Build the embedding model before touching the index: if the model cannot
    # be loaded, the existing vectors are still intact.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    pc = Pinecone(api_key=api_key)

    # Clear the index before re-indexing so stale chunks do not linger.
    print(f"Clearing index: {index_name}...")
    index = pc.Index(index_name)
    index.delete(delete_all=True)

    print(f"Indexing to Pinecone index: {index_name}...")
    PineconeVectorStore.from_documents(
        chunks,
        embeddings,
        index_name=index_name,
    )
    print("Indexing complete!")
# Script entry point: run the indexing pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    index_data()