File size: 1,191 Bytes
d9f3078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os

from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document


def create_cache_dir(directory=None):
    """Ensure the cache directory exists and return its path.

    Args:
        directory: Path of the cache directory to create. Any falsy value
            (``None``, ``''``) selects the default ``'./.cache'``.

    Returns:
        The path that was created (or already existed).
    """
    if not directory:
        directory = './.cache'

    # Create the directory that is actually returned, not just the default,
    # so callers passing a custom path get a usable directory back.
    os.makedirs(directory, exist_ok=True)
    return directory


def load_pdf(file_path):
    """Load one PDF through ``PyPDFLoader``.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader.load()`` returns for that file.
    """
    return PyPDFLoader(file_path).load()


def load_pdf_directory(directory):
    """Load the PDFs found in a directory through ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the directory, passed straight to
            ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` returns for that directory.
    """
    loader = PyPDFDirectoryLoader(directory)
    # Load exactly once; a second load() would re-parse every file only to
    # throw the first result away.
    documents = loader.load()

    return documents


def split_pdf(pdfs: list[Document]):
    """Split loaded documents into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with a chunk size of 512 and an
    overlap of 64, both measured with ``len``; separators are treated as
    plain strings rather than regular expressions.

    Args:
        pdfs: Documents to split.

    Returns:
        The result of ``split_documents`` on ``pdfs``.
    """
    splitter_options = {
        'chunk_size': 512,
        'chunk_overlap': 64,
        'length_function': len,
        'is_separator_regex': False,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(pdfs)

    return chunks


def extract_pdf(uploaded_pdf):
    """Write uploaded files into the ``temp_files`` cache folder.

    Args:
        uploaded_pdf: Iterable of upload objects exposing a ``name``
            attribute and a ``getvalue()`` method returning bytes
            (presumably Streamlit ``UploadedFile`` objects; confirm
            against the caller).

    Returns:
        Path of the ``temp_files`` directory the files were written to.
    """
    cache_dir = create_cache_dir()
    cache_dir = os.path.join(cache_dir, 'temp_files')
    os.makedirs(cache_dir, exist_ok=True)

    for file in uploaded_pdf:
        # The name comes from the uploader, so keep only its final component:
        # directory parts or an absolute path would otherwise let the write
        # land outside cache_dir.
        safe_name = os.path.basename(file.name)
        file_path = os.path.join(cache_dir, safe_name)

        with open(file_path, 'wb') as w:
            w.write(file.getvalue())

    return cache_dir