# chatbot/vectorstore.py
# Source: Hugging Face Space by user Keshabwi66, commit 4184ffc.
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings # Changed to HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Import API keys from config (only Pinecone is needed here now)
from config import PINECONE_API_KEY
# Expose the key via the environment as well, since some Pinecone/LangChain
# code paths read PINECONE_API_KEY from os.environ rather than taking it
# as an argument.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# Initialize the Pinecone client at import time (module-level side effect:
# importing this module requires a valid PINECONE_API_KEY).
pc = Pinecone(api_key=PINECONE_API_KEY)
# Hugging Face embedding model, shared by the retriever and the ingestion
# helper below. The model weights are downloaded the first time it is used.
# 'sentence-transformers/all-MiniLM-L6-v2' produces 384-dimensional vectors,
# which must match the `dimension=384` used when creating the Pinecone index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Name of the Pinecone index used by both functions below.
INDEX_NAME = "rag-index" # Make sure this matches your actual index name
# --- Retriever (Existing function) ---
def get_retriever():
    """Initialize and return a retriever backed by the Pinecone index.

    Ensures the index exists first: if it is missing, creates a serverless
    index (cosine metric, dimension 384 to match the
    'sentence-transformers/all-MiniLM-L6-v2' embeddings) and waits until
    Pinecone reports it ready. A freshly created serverless index is not
    immediately queryable, so connecting without waiting can fail on the
    very first use.

    Returns:
        A LangChain retriever over the Pinecone vector store.
    """
    import time  # local import: only needed while waiting for index creation

    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating new Pinecone index: {INDEX_NAME}...")
        pc.create_index(
            name=INDEX_NAME,
            # Must equal the embedding model's output size (384 for
            # 'sentence-transformers/all-MiniLM-L6-v2').
            dimension=384,
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1')  # Adjust cloud/region as per your Pinecone setup
        )
        # Poll until the new index is ready before handing it to the
        # vector store; querying too early raises errors.
        while not pc.describe_index(INDEX_NAME).status["ready"]:
            time.sleep(1)
        print(f"Created new Pinecone index: {INDEX_NAME}")
    vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    return vectorstore.as_retriever()
# --- Function to add documents to the vector store ---
def add_document_to_vectorstore(text_content: str) -> int:
    """Add a single text document to the Pinecone vector store.

    Splits the text into overlapping chunks, embeds them with the
    module-level HuggingFace model, and upserts them into the index.

    Args:
        text_content: Raw text of the document to index.

    Returns:
        The number of chunks written to the index.

    Raises:
        ValueError: If the document content is empty or whitespace-only.
    """
    # Reject whitespace-only input too: the bare `not text_content` check
    # only catches "" / None and would otherwise index useless chunks.
    if not text_content or not text_content.strip():
        raise ValueError("Document content cannot be empty.")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,  # record each chunk's character offset in metadata
    )
    # Wrap the raw string in LangChain Document objects, one per chunk.
    documents = text_splitter.create_documents([text_content])
    print(f"Splitting document into {len(documents)} chunks for indexing...")
    # Use the vector store directly (not a retriever) so we can upsert.
    vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    vectorstore.add_documents(documents)
    print(f"Successfully added {len(documents)} chunks to Pinecone index '{INDEX_NAME}'.")
    return len(documents)