Al1Abdullah committed on
Commit
d9f3078
·
verified ·
1 Parent(s): b29ce3b

Create pdf_handler.py

Browse files
Files changed (1) hide show
  1. src/pdf_handler.py +51 -0
src/pdf_handler.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
4
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
5
+ from langchain.schema.document import Document
6
+
7
+
8
def create_cache_dir(directory=None):
    """Ensure the cache directory exists and return its path.

    Args:
        directory: Path of the cache directory to create. When falsy
            (``None`` or an empty string), ``'./.cache'`` is used.

    Returns:
        The directory path that now exists on disk.
    """
    if not directory:
        directory = './.cache'

    # Create the directory that is actually returned. The hard-coded
    # './.cache' used here before meant a caller-supplied path was
    # returned without ever being created.
    os.makedirs(directory, exist_ok=True)
    return directory
14
+
15
+
16
def load_pdf(file_path):
    """Load a single PDF file.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` returns.
    """
    return PyPDFLoader(file_path).load()
20
+
21
+
22
def load_pdf_directory(directory):
    """Load the PDFs found in a directory.

    Args:
        directory: Path of the directory, passed straight to
            ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns.
    """
    loader = PyPDFDirectoryLoader(directory)

    # Load exactly once: the earlier version called load() twice and threw
    # the first result away, parsing every PDF in the directory twice.
    return loader.load()
27
+
28
+
29
def split_pdf(
    pdfs: list[Document],
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[Document]:
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        pdfs: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: Maximum chunk length as measured by ``len``.
            Defaults to 512, the value that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, measured the
            same way. Defaults to 64, the previously hard-coded value.

    Returns:
        The result of ``splitter.split_documents(pdfs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    return splitter.split_documents(pdfs)
38
+
39
+
40
def extract_pdf(uploaded_pdf):
    """Write uploaded PDF files to a temporary cache directory.

    Args:
        uploaded_pdf: Iterable of uploaded-file objects. Each must expose a
            ``name`` attribute and a ``getvalue()`` method returning the
            file's bytes.

    Returns:
        Path of the directory (``<cache>/temp_files``) the files were
        written into.
    """
    cache_dir = os.path.join(create_cache_dir(), 'temp_files')
    os.makedirs(cache_dir, exist_ok=True)

    for file in uploaded_pdf:
        # The name comes from the uploader and is untrusted: keep only the
        # final path component so a name such as '../../x.pdf' cannot
        # write outside cache_dir.
        safe_name = os.path.basename(file.name)
        file_path = os.path.join(cache_dir, safe_name)

        with open(file_path, 'wb') as out:
            out.write(file.getvalue())

    return cache_dir