File size: 1,470 Bytes
cfd509f
2c9aa3f
 
 
cfd509f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
# NOTE: dotenv loading is disabled, so the key is not read from a .env file
# here; it must already be present in the process environment.
# from dotenv import load_dotenv
# load_dotenv()
# Read eagerly at import time: importing this module raises KeyError when
# the OPENAI_API_KEY environment variable is not set.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
import tempfile
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.csv_loader import CSVLoader 
from langchain.document_loaders import PyPDFLoader 



def check_file_type(file_path):
    """Classify a file by its extension.

    The comparison is case-insensitive and only looks at the final
    extension of ``file_path``.

    Returns:
        1 for a ``.pdf`` file, 2 for a ``.csv`` file, and ``None`` for
        anything else (including paths without an extension).
    """
    # Extension -> numeric type code understood by the callers.
    type_codes = {'.pdf': 1, '.csv': 2}
    suffix = os.path.splitext(file_path)[1].lower()
    return type_codes.get(suffix)
    

def configure_retriever(uploaded_files):
    """Build an MMR retriever over the contents of the given PDF/CSV files.

    Each file is loaded with the loader matching its extension, the loaded
    documents are split into overlapping chunks (1500 characters, 200
    overlap), embedded with ``OpenAIEmbeddings`` and indexed in a FAISS
    vector store.

    Args:
        uploaded_files: Iterable of file paths. Each path must end in
            ``.pdf`` or ``.csv`` (case-insensitive).

    Returns:
        The retriever returned by ``vectordb.as_retriever`` with
        ``search_type="mmr"`` and ``search_kwargs={"k": 2, "fetch_k": 4}``.

    Raises:
        ValueError: If a file has an unsupported extension, or if no
            documents were loaded (e.g. ``uploaded_files`` is empty).
    """
    docs = []
    for file in uploaded_files:
        check = check_file_type(file)
        if check == 1:
            loader = PyPDFLoader(file)
        elif check == 2:
            loader = CSVLoader(file)
        else:
            # Without this branch `loader` would be unbound for a first
            # unsupported file, or would still point at the previous file's
            # loader and silently load that file a second time.
            raise ValueError(f"Unsupported file type: {file!r}")
        docs.extend(loader.load())

    # Fail early with a clear message instead of handing zero chunks to the
    # embedding / vector-store layer.
    if not docs:
        raise ValueError("No documents were loaded from the uploaded files")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Create embeddings and store in vectordb
    embeddings = OpenAIEmbeddings()
    vectordb = FAISS.from_documents(splits, embeddings)

    # Define retriever
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4})
    print("embeddings created")
    return retriever