# chatbot/vectorstore.py
# Source: Hugging Face Space by user Keshabwi66, commit 4184ffc.
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings # Changed to HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Import API keys from config (only Pinecone is needed here now)
from config import PINECONE_API_KEY
# Expose the key via the environment as well, since some Pinecone/LangChain
# code paths read PINECONE_API_KEY from os.environ rather than taking it
# as an argument.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# Initialize the Pinecone client at import time (module-level side effect:
# importing this module requires a valid PINECONE_API_KEY).
pc = Pinecone(api_key=PINECONE_API_KEY)
# Hugging Face embedding model, shared by the retriever and the ingestion
# helper below. The model weights are downloaded the first time it is used.
# 'sentence-transformers/all-MiniLM-L6-v2' produces 384-dimensional vectors,
# which must match the `dimension=384` used when creating the Pinecone index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Name of the Pinecone index used by both functions below.
INDEX_NAME = "rag-index" # Make sure this matches your actual index name
# --- Retriever (Existing function) ---
def get_retriever():
    """Initialize and return a retriever backed by the Pinecone index.

    Ensures the index exists first: if it is missing, creates a serverless
    index (cosine metric, dimension 384 to match the
    'sentence-transformers/all-MiniLM-L6-v2' embeddings) and waits until
    Pinecone reports it ready. A freshly created serverless index is not
    immediately queryable, so connecting without waiting can fail on the
    very first use.

    Returns:
        A LangChain retriever over the Pinecone vector store.
    """
    import time  # local import: only needed while waiting for index creation

    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating new Pinecone index: {INDEX_NAME}...")
        pc.create_index(
            name=INDEX_NAME,
            # Must equal the embedding model's output size (384 for
            # 'sentence-transformers/all-MiniLM-L6-v2').
            dimension=384,
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1')  # Adjust cloud/region as per your Pinecone setup
        )
        # Poll until the new index is ready before handing it to the
        # vector store; querying too early raises errors.
        while not pc.describe_index(INDEX_NAME).status["ready"]:
            time.sleep(1)
        print(f"Created new Pinecone index: {INDEX_NAME}")
    vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    return vectorstore.as_retriever()
# --- Function to add documents to the vector store ---
def add_document_to_vectorstore(text_content: str) -> int:
    """Add a single text document to the Pinecone vector store.

    Splits the text into overlapping chunks, embeds them with the
    module-level HuggingFace model, and upserts them into the index.

    Args:
        text_content: Raw text of the document to index.

    Returns:
        The number of chunks written to the index.

    Raises:
        ValueError: If the document content is empty or whitespace-only.
    """
    # Reject whitespace-only input too: the bare `not text_content` check
    # only catches "" / None and would otherwise index useless chunks.
    if not text_content or not text_content.strip():
        raise ValueError("Document content cannot be empty.")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,  # record each chunk's character offset in metadata
    )
    # Wrap the raw string in LangChain Document objects, one per chunk.
    documents = text_splitter.create_documents([text_content])
    print(f"Splitting document into {len(documents)} chunks for indexing...")
    # Use the vector store directly (not a retriever) so we can upsert.
    vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    vectorstore.add_documents(documents)
    print(f"Successfully added {len(documents)} chunks to Pinecone index '{INDEX_NAME}'.")
    return len(documents)