gardio_test / llm /vectorization.py
dev-yuje's picture
Add
d4ecde8
raw
history blame contribute delete
822 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from config import EMBEDDING_MODEL, FAISS_PATH, PDF_SOURCE
def make_vectorization() -> FAISS:
    """Build a FAISS index from the configured PDF and save it to disk.

    Loads the PDF at ``PDF_SOURCE``, splits it into chunks of at most 1000
    characters with a 100-character overlap, embeds the chunks with the
    HuggingFace model named by ``EMBEDDING_MODEL``, and saves the resulting
    index under ``FAISS_PATH``.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        ValueError: If splitting the PDF produces no chunks, so there is
            nothing to index.
    """
    # Read and split the PDF before building the embedding model, so that a
    # bad or empty source is reported without first paying for model loading.
    pages = PyPDFLoader(PDF_SOURCE).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,  # chunk sizes are measured in characters
        is_separator_regex=False,
    )
    chunks = text_splitter.split_documents(pages)

    # An empty chunk list cannot be indexed; fail here with a clear message
    # instead of letting the vector-store construction fail obscurely.
    if not chunks:
        raise ValueError(
            f"No text chunks could be extracted from {PDF_SOURCE!r}; "
            "cannot build a FAISS index from an empty document set."
        )

    embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local(FAISS_PATH)
    return vectorstore