"""Streamlit page: chunk an uploaded PDF, embed it and upsert it into Pinecone.

The page asks for a Pinecone API key and index name, creates the index if it
does not exist yet, and on request embeds the PDF text with the
``paraphrase-multilingual-mpnet-base-v2`` sentence-transformers model and
stores the chunks in the index through LangChain's Pinecone vector store.
"""

import os
import time

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_core.prompts import ChatPromptTemplate
from pinecone import Pinecone, ServerlessSpec
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Single source of truth for the embedding model and its output size, so the
# index dimension can never drift away from the model that fills it.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
EMBEDDING_DIMENSION = 768
# Upper bound (seconds) on how long we wait for a freshly created index.
INDEX_READY_TIMEOUT = 60.0

st.set_page_config(page_title="Upsert to Pinecone", page_icon="📤")


def load_css(file_path):
    """Return the stylesheet at *file_path* wrapped in a ``<style>`` tag.

    The result is meant to be passed to ``st.markdown(..., unsafe_allow_html=True)``.
    Raises ``OSError`` (e.g. ``FileNotFoundError``) if the file cannot be read.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # The file content must actually be interpolated; returning an empty
        # string silently disabled all custom styling.
        return f"<style>{f.read()}</style>"


# Load and inject CSS
css = load_css("style.css")
st.markdown(css, unsafe_allow_html=True)

# Load environment variables
load_dotenv()

st.title('Upsert to Pinecone using \r paraphrase-multilingual-mpnet-base-v2\rEmbeddings📤')

# PDF file uploader
uploaded_file = st.file_uploader("Choose a PDF file📁", type="pdf")


@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Build the HuggingFace embedding model once and reuse it across reruns."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages for which the reader yields no text (``None`` or ``""``) contribute
    nothing instead of raising ``TypeError``.
    """
    pdf_reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def get_text_chunks(text):
    """Split *text* into chunks of at most 1000 characters with 100 overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    return text_splitter.split_text(text)


def get_embeddings(text_chunks):
    """Return one embedding vector (list of floats) per entry in *text_chunks*."""
    return _load_embedding_model().embed_documents(text_chunks)


def get_vectorstore(text_chunks, index_name):
    """Embed *text_chunks* and upsert them into the Pinecone index *index_name*.

    Returns the LangChain Pinecone vector store bound to that index. The
    Pinecone API key is read from the ``PINECONE_API_KEY`` environment variable.
    """
    # Create Document objects
    documents = [Document(page_content=chunk) for chunk in text_chunks]

    # Create and return the vector store
    return LangchainPinecone.from_documents(
        documents,
        _load_embedding_model(),
        index_name=index_name,
    )


def _wait_until_index_ready(client, name, timeout=INDEX_READY_TIMEOUT):
    """Poll Pinecone until index *name* reports ready; return False on timeout."""
    deadline = time.monotonic() + timeout
    while not client.describe_index(name).status["ready"]:
        if time.monotonic() >= deadline:
            return False
        time.sleep(1)
    return True


# Pinecone setup
key = st.text_input("Enter your Pinecone API key:", type="password")
index_name = st.text_input("Enter your Pinecone Index name:")

if key and index_name:
    # Set the Pinecone API key as an environment variable; both the Pinecone
    # client below and the LangChain vector store read it from there.
    os.environ['PINECONE_API_KEY'] = key

    # Initialize Pinecone
    pc = Pinecone()
    spec = ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    )

    # Check if the index exists, if not create it
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=EMBEDDING_DIMENSION,  # Dimension for paraphrase-multilingual-mpnet-base-v2 model
            metric='cosine',
            spec=spec,
        )
        # A serverless index is not usable the instant create_index returns;
        # upserting too early fails, so wait for it to come up first.
        if not _wait_until_index_ready(pc, index_name):
            st.error(f"Pinecone index {index_name} was created but is not ready yet. Please retry shortly.")
            st.stop()
        st.info(f"Created new Pinecone index: {index_name}")

    # Get the index
    index = pc.Index(index_name)

    if uploaded_file is not None:
        text = extract_text_from_pdf(uploaded_file)
        text_chunks = get_text_chunks(text)

        if not text_chunks:
            # Scanned/image-only PDFs yield no text; there is nothing to embed.
            st.error("No extractable text was found in the uploaded PDF.")
        elif st.button("Generate Embeddings and Create Vectorstore"):
            with st.spinner("Processing..."):
                # Only the vector size is reported, so embed a single chunk
                # here; the vector store embeds the full document itself.
                embeddings = get_embeddings(text_chunks[:1])
                vectorstore = get_vectorstore(text_chunks, index_name)

            st.success("Embeddings generated and vectorstore created successfully!")
            st.write(f"Number of chunks: {len(text_chunks)}")
            st.write(f"Embedding dimension: {len(embeddings[0])}")

            # You can add more functionality here, such as querying the vectorstore
else:
    st.warning("Please enter your Pinecone API key and Index Name to proceed.")

footer = """
1. Upload the PDF file you want to vectorize and upload to the Pinecone Database.
2. Enter your Pinecone API key.
3. Enter your Pinecone Index name.
4. Selected environment by default is

us-east-1

if you want a different one make changes in app.py.
"""
st.markdown(footer, unsafe_allow_html=True)