import os from pinecone import Pinecone, ServerlessSpec from langchain_pinecone import PineconeVectorStore from langchain_huggingface import HuggingFaceEmbeddings # Changed to HuggingFaceEmbeddings from langchain_text_splitters import RecursiveCharacterTextSplitter # Import API keys from config (only Pinecone is needed here now) from config import PINECONE_API_KEY # Set environment variables for Pinecone os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY # Initialize Pinecone client pc = Pinecone(api_key=PINECONE_API_KEY) # Define Hugging Face embedding model # This will download the model the first time it's used. # The default model for HuggingFaceEmbeddings is 'sentence-transformers/all-MiniLM-L6-v2' # which has a dimension of 384. embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # Define Pinecone index name INDEX_NAME = "rag-index" # Make sure this matches your actual index name # --- Retriever (Existing function) --- def get_retriever(): """Initializes and returns the Pinecone vector store retriever.""" # Ensure the index exists, create if not if INDEX_NAME not in pc.list_indexes().names(): print(f"Creating new Pinecone index: {INDEX_NAME}...") pc.create_index( name=INDEX_NAME, dimension=384, # Changed dimension for 'sentence-transformers/all-MiniLM-L6-v2' metric="cosine", spec=ServerlessSpec(cloud='aws', region='us-east-1') # Adjust cloud/region as per your Pinecone setup ) print(f"Created new Pinecone index: {INDEX_NAME}") vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings) return vectorstore.as_retriever() # --- Function to add documents to the vector store --- def add_document_to_vectorstore(text_content: str): """ Adds a single text document to the Pinecone vector store. Splits the text into chunks before embedding and upserting. """ if not text_content: raise ValueError("Document content cannot be empty.") text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, add_start_index=True, ) # Create Langchain Document objects from the raw text documents = text_splitter.create_documents([text_content]) print(f"Splitting document into {len(documents)} chunks for indexing...") # Get the vectorstore instance (not the retriever) to add documents vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings) # Add documents to the vector store vectorstore.add_documents(documents) print(f"Successfully added {len(documents)} chunks to Pinecone index '{INDEX_NAME}'.")