shubhendu-ghosh committed on
Commit
e4a4226
·
verified ·
1 Parent(s): 3a3e144

Create pdf_utils.py

Browse files
Files changed (1) hide show
  1. pdf_utils.py +14 -0
pdf_utils.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+
4
async def extract_text_from_pdfs(files):
    """Return the concatenated text of every page of every given PDF.

    The function is declared ``async`` so callers can ``await`` it, but the
    body contains no ``await``: the PDF parsing itself runs synchronously.

    Args:
        files: Iterable of upload-like objects. Each item must expose a
            binary file-like object as its ``.file`` attribute, which is
            handed to ``PdfReader`` (presumably web-framework upload
            objects -- confirm against the caller).

    Returns:
        One string holding the text of all pages, in input order, joined
        with no separator. An empty string when ``files`` is empty.
    """
    page_texts = []
    for file in files:
        pdf = PdfReader(file.file)
        for page in pdf.pages:
            # Defensive: treat a None/empty extraction result (e.g. a page
            # with no extractable text) as "" so it cannot raise TypeError
            # when the pieces are combined.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += inside the loops.
    return "".join(page_texts)
11
+
12
def split_text(text, *, chunk_size=10000, chunk_overlap=1000):
    """Split ``text`` into overlapping chunks.

    Thin wrapper around LangChain's ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            10000, the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (a list of string chunks per the LangChain documentation).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)