Spaces:
Sleeping
Sleeping
File size: 1,671 Bytes
4b55bd6 bf637d9 4b55bd6 bf637d9 4b55bd6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import os
import glob
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
# Load variables from a local .env file into the process environment at import
# time, so the os.getenv() lookups in index_data() can see them.
load_dotenv()
def index_data(data_path: str = "data/cleaned/*.md") -> None:
    """Rebuild the Pinecone index from the markdown files matching *data_path*.

    Steps, in order:

    1. Read ``PINECONE_API_KEY`` and ``PINECONE_INDEX_NAME`` (default
       ``"locus-rag"``) from the environment; abort if the key is missing.
    2. Load every file matched by the glob with ``UnstructuredMarkdownLoader``.
    3. Split the documents with ``RecursiveCharacterTextSplitter``
       (chunk_size=1500, chunk_overlap=300, start index recorded in metadata).
    4. Instantiate the ``all-MiniLM-L6-v2`` embedding model.
    5. Clear the existing index and upload the freshly embedded chunks.

    The index is only cleared once there is something to put back: if the
    key is missing, no files match, or no chunks are produced, the function
    prints a message and returns without touching Pinecone.

    Args:
        data_path: Glob pattern selecting the markdown files to index.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # Fail fast: check credentials before doing any expensive loading work.
    api_key = os.getenv("PINECONE_API_KEY")
    index_name = os.getenv("PINECONE_INDEX_NAME", "locus-rag")
    if not api_key:
        print("Error: PINECONE_API_KEY not found in environment variables.")
        return

    # Sorted so the load/upload order is deterministic across platforms.
    files = sorted(glob.glob(data_path))
    if not files:
        # Wrong working directory or missing data must not wipe the index.
        print(f"Error: no files matched {data_path!r}; index left untouched.")
        return

    documents = []
    for file in files:
        print(f"Loading {file}...")
        documents.extend(UnstructuredMarkdownLoader(file).load())
    print(f"Loaded {len(documents)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")
    if not chunks:
        print("Error: no chunks were produced; index left untouched.")
        return

    # Build the embedding model BEFORE clearing the index, so a failed model
    # download/load cannot leave the index empty.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    pc = Pinecone(api_key=api_key)

    # Clear the index before re-indexing
    print(f"Clearing index: {index_name}...")
    index = pc.Index(index_name)
    # Imported here so the block stays self-contained; `pinecone` is already
    # a dependency of this file.
    from pinecone.exceptions import NotFoundException

    try:
        index.delete(delete_all=True)
    except NotFoundException:
        # A brand-new (serverless) index has no namespace yet and answers
        # delete-all with 404; there is simply nothing to clear.
        print("Index is already empty; nothing to clear.")

    print(f"Indexing to Pinecone index: {index_name}...")
    PineconeVectorStore.from_documents(
        chunks,
        embeddings,
        index_name=index_name,
    )
    print("Indexing complete!")
# Run the indexing job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    index_data()
|