Spaces:

Keshabwi66
/

chatbot

Sleeping

File size: 2,740 Bytes

4184ffc

import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings # Changed to HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Import API keys from config (only Pinecone is needed here now)
from config import PINECONE_API_KEY

# Export the key through the process environment as well as passing it to the
# client below. NOTE(review): presumably this is for PineconeVectorStore,
# which is constructed in this module without an explicit API key - confirm.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# Shared Pinecone client, used by get_retriever() to list and create indexes.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Hugging Face embedding model shared by retrieval and indexing.
# The model is named explicitly rather than relying on the library default.
# It will be downloaded the first time it is used.
# 'sentence-transformers/all-MiniLM-L6-v2' has a dimension of 384, which must
# match the `dimension` passed when the index is created in get_retriever().
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Name of the Pinecone index that every function in this module reads/writes.
INDEX_NAME = "rag-index" # Make sure this matches your actual index name

# --- Retriever (Existing function) ---
def get_retriever():
    """Build a retriever backed by the Pinecone index named ``INDEX_NAME``.

    If the index is not among the names reported by the Pinecone client, it
    is created first as a serverless, cosine-metric index with 384
    dimensions (the size used for 'sentence-transformers/all-MiniLM-L6-v2').

    Returns:
        The object produced by ``PineconeVectorStore.as_retriever()``.
    """
    known_indexes = pc.list_indexes().names()
    index_missing = INDEX_NAME not in known_indexes

    if index_missing:
        print(f"Creating new Pinecone index: {INDEX_NAME}...")
        # Adjust cloud/region to match your own Pinecone setup.
        serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
        pc.create_index(
            name=INDEX_NAME,
            dimension=384,  # must equal the embedding model's output size
            metric="cosine",
            spec=serverless_spec,
        )
        print(f"Created new Pinecone index: {INDEX_NAME}")

    store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    retriever = store.as_retriever()
    return retriever

# --- Function to add documents to the vector store ---
def add_document_to_vectorstore(text_content: str) -> None:
    """Split a text document into chunks and add them to the Pinecone index.

    The text is cut into chunks of up to 1000 characters with a 200-character
    overlap, wrapped in LangChain ``Document`` objects, and added to the
    vector store for ``INDEX_NAME`` using the module-level ``embeddings``.

    Args:
        text_content: Raw text of the document to index.

    Raises:
        ValueError: If ``text_content`` is empty or contains only whitespace.
    """
    # Whitespace-only text has no content worth embedding, so it is rejected
    # up front with the same error as an empty string instead of being passed
    # on to the splitter and the vector store.
    if not text_content or not text_content.strip():
        raise ValueError("Document content cannot be empty.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        # Ask the splitter to record where each chunk starts in the source
        # text (stored in the chunk's metadata).
        add_start_index=True,
    )

    # Create LangChain Document objects from the raw text.
    documents = text_splitter.create_documents([text_content])

    print(f"Splitting document into {len(documents)} chunks for indexing...")

    # Use the vector store itself (not the retriever) to add documents.
    vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)

    # Embed the chunks and write them to the index.
    vectorstore.add_documents(documents)
    print(f"Successfully added {len(documents)} chunks to Pinecone index '{INDEX_NAME}'.")