Spaces:
Sleeping
Sleeping
File size: 2,416 Bytes
46917c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import glob
import os
from pypdf import PdfReader
def get_pdf_files(file_path):
    """
    Get all PDF files from the specified file path.

    NOTE(review): this module defines ``get_pdf_files`` a second time further
    down. At import time that later definition rebinds the name, so this
    version only takes effect if the later one is removed.

    Args:
        file_path (str): The directory path containing the PDF files.

    Returns:
        list: Sorted paths of the ``*.pdf`` files located directly inside the
            directory, or an empty list when the path is not an existing
            directory.
    """
    if not os.path.isdir(file_path):
        return []
    # Escape the directory part: glob would otherwise interpret "[", "]",
    # "*" and "?" inside the directory name as wildcards and miss the folder.
    pattern = os.path.join(glob.escape(file_path), "*.pdf")
    # glob returns matches in arbitrary order; sort for reproducible results.
    return sorted(glob.glob(pattern))
def read_multiple_pdf(file_path: str) -> list:
    """
    Read every PDF file in a directory and extract the text of each page.

    Files are discovered with ``get_pdf_files``. A file that cannot be opened
    or parsed is reported on stdout and skipped, so one bad PDF does not
    abort the whole batch. Text from pages read before the failure is kept.

    Args:
        file_path (str): The directory path containing the PDF files.

    Returns:
        list: One entry per page, in the order the files and their pages
            were read. The list is empty when no page could be extracted.
    """
    output = []
    for file in get_pdf_files(file_path):
        try:
            with open(file, "rb") as f:
                pdf_reader = PdfReader(f)
                # Use the current pypdf API (`pages` / `extract_text`). The
                # legacy camelCase calls (getNumPages, getPage, extractText)
                # are no longer supported by pypdf, so every file used to
                # fall into the except branch below.
                for page in pdf_reader.pages:
                    output.append(page.extract_text())
        except Exception as e:
            # Best-effort batch read: report the failure and continue.
            print(f"Error reading file '{file}': {str(e)}")
    return output
def read_single_pdf(file_path: str) -> str:
    """
    Extract the text of one PDF file as a single string.

    Any error raised while opening or parsing the file is printed and
    swallowed. Whatever text was collected up to that point is returned.

    Args:
        file_path (str): The path of the PDF file.

    Returns:
        str: The text of all pages joined with single spaces, or an empty
            string when nothing could be read.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as handle:
            reader = PdfReader(handle)
            # Walk the pages directly instead of indexing by position.
            for page in reader.pages:
                page_texts.append(page.extract_text())
    except Exception as e:
        print(f"Error reading file '{file_path}': {str(e)}")
    return " ".join(page_texts)
def get_pdf_files(file_path: str) -> list:
    """
    Get a list of PDF files from the specified directory path.

    NOTE(review): ``get_pdf_files`` is also defined earlier in this module.
    This later definition is the one bound to the name after import.

    Args:
        file_path (str): The directory path containing the PDF files.

    Returns:
        list: Sorted paths of the ``*.pdf`` files located directly inside the
            directory. Empty when the directory does not exist, holds no
            PDFs, or the lookup fails (the failure is printed, not raised).
    """
    pdf_files = []
    try:
        # Escape the directory part: glob would otherwise interpret "[",
        # "]", "*" and "?" inside the directory name as wildcards and miss
        # the real folder.
        pattern = os.path.join(glob.escape(file_path), "*.pdf")
        # glob returns matches in arbitrary order; sort for reproducibility.
        pdf_files = sorted(glob.glob(pattern))
    except Exception as e:
        # Deliberately best-effort: report the problem and return [].
        print(f"Error getting PDF files from '{file_path}': {str(e)}")
    return pdf_files
|