| """ | |
| Indexing with vector database - updated for Weaviate, FAISS, Qdrant, Pinecone | |
| Compatible with latest LangChain and HuggingFaceEmbeddings | |
| """ | |
import os
import re
from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from unidecode import unidecode
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load PDF files and split their pages into overlapping text chunks.

    Args:
        list_file_path: Paths of the PDF files to load.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        List of LangChain Document chunks from all input files.
    """
    pages = []
    for file_path in list_file_path:
        pages.extend(PyPDFLoader(file_path).load())
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)
def create_collection_name(filepath):
    """Derive a vector-store-safe collection name from a file path.

    Transliterates to ASCII, collapses non-alphanumeric runs to "-",
    caps the length at 50, pads names shorter than 3 characters, and
    forces the first and last characters to be alphanumeric (typical
    vector-DB collection-naming constraints).

    Args:
        filepath: Path of the source file; its stem seeds the name.

    Returns:
        Sanitized collection name string.
    """
    name = Path(filepath).stem.replace(" ", "-")
    name = re.sub(r"[^A-Za-z0-9]+", "-", unidecode(name))[:50]
    # Pad first so the boundary-character fixes below never index an
    # empty string.
    if len(name) < 3:
        name += "xyz"
    if not name[0].isalnum():
        name = "A" + name[1:]
    if not name[-1].isalnum():
        name = name[:-1] + "Z"
    print("\n\nFilepath:", filepath)
    print("Collection name:", name)
    return name
def create_db(splits, collection_name, db_type="ChromaDB"):
    """Build a vector database over the given document splits.

    Embeds the splits with a multilingual sentence-transformers model and
    stores them in the selected backend. Imports are deferred per branch so
    only the chosen backend's packages are required.

    Args:
        splits: LangChain Document chunks to index.
        collection_name: Collection/index name (see create_collection_name).
        db_type: One of "ChromaDB", "Weaviate", "FAISS", "Qdrant", "Pinecone".

    Returns:
        The constructed LangChain vector store.

    Raises:
        ValueError: If db_type is not one of the supported backends.
    """
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    if db_type == "ChromaDB":
        import chromadb
        from langchain_chroma import Chroma

        # Clear cached client state so repeated calls in one process
        # don't collide on stale system clients.
        chromadb.api.client.SharedSystemClient.clear_system_cache()
        return Chroma.from_documents(
            documents=splits,
            embedding=embedding,
            client=chromadb.EphemeralClient(),
            collection_name=collection_name,
        )
    if db_type == "Weaviate":
        import weaviate
        from langchain_weaviate.vectorstores import WeaviateVectorStore

        # connect_to_local expects a bare host name and port, not a URL.
        client = weaviate.connect_to_local(
            host="localhost", port=8080, grpc_port=50051
        )
        return WeaviateVectorStore.from_documents(
            splits,
            embedding,
            client=client,
            index_name=collection_name,
            text_key="text",
        )
    if db_type == "FAISS":
        # langchain.vectorstores is deprecated; use langchain_community
        # (consistent with this module's document-loader import).
        from langchain_community.vectorstores import FAISS

        vectordb = FAISS.from_documents(splits, embedding)
        vectordb.save_local(f"{collection_name}_index")
        return vectordb
    if db_type == "Qdrant":
        from langchain_community.vectorstores import Qdrant

        # Qdrant's in-process mode sentinel is ":memory:" (single colons);
        # "::memory::" is not recognized. from_documents builds its own
        # client from `location`, so no separate QdrantClient is needed.
        return Qdrant.from_documents(
            splits,
            embedding,
            location=":memory:",
            collection_name=collection_name,
        )
    if db_type == "Pinecone":
        import pinecone
        from langchain_pinecone import PineconeVectorStore

        pc = pinecone.Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
        index_name = collection_name
        # Probe the embedding model once to size the index dimension.
        dim = len(embedding.embed_query("test"))
        if index_name not in [i.name for i in pc.list_indexes()]:
            # The v3+ Pinecone client requires an explicit spec.
            pc.create_index(
                name=index_name,
                dimension=dim,
                metric="cosine",
                spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1"),
            )
        # from_documents takes `documents` and `index_name`
        # (the original `docs=`/`index=` kwargs are rejected).
        return PineconeVectorStore.from_documents(
            documents=splits,
            embedding=embedding,
            index_name=index_name,
        )
    raise ValueError(f"Unsupported vector DB type: {db_type}")