Spaces:
Sleeping
Sleeping
Create pdf_utils.py
Browse files- pdf_utils.py +14 -0
pdf_utils.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PyPDF2 import PdfReader
|
| 2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
+
|
| 4 |
+
async def extract_text_from_pdfs(files) -> str:
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        files: Iterable of upload objects, each exposing a binary file-like
            ``.file`` attribute that ``PdfReader`` can read (presumably
            FastAPI ``UploadFile`` instances -- confirm against the caller).

    Returns:
        The text of all pages, in file order and then page order, joined
        with no separator. An empty string when ``files`` is empty.

    Note:
        The function is declared ``async`` but awaits nothing; the PDF
        parsing itself runs synchronously.
    """
    page_texts = []
    for file in files:
        pdf = PdfReader(file.file)
        # Pages without extractable text may yield an empty/None result;
        # ``or ""`` keeps the final join from failing on such pages.
        page_texts.extend(page.extract_text() or "" for page in pdf.pages)
    # A single join avoids the quadratic cost of repeated ``+=`` on a str.
    return "".join(page_texts)
|
| 11 |
+
|
| 12 |
+
def split_text(text, chunk_size=10000, chunk_overlap=1000):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 10000, the value previously hard-coded here.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 1000, the value previously hard-coded here.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (a list of chunk strings, per the LangChain documentation).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
|