# resumematcher/scripts/ReadPdf.py
# March - first - 46917c3
import glob
import os
from pypdf import PdfReader
def get_pdf_files(file_path):
    """
    Collect the paths of the ``*.pdf`` files located directly in a directory.

    Note: this module defines ``get_pdf_files`` a second time further down;
    that later definition replaces this one once the module is imported.

    Args:
        file_path (str): Directory to look in.

    Returns:
        list: Paths matching ``<file_path>/*.pdf``, or an empty list when
        ``file_path`` does not exist.
    """
    # Guard clause: nothing to search when the path is missing.
    if not os.path.exists(file_path):
        return []

    pdf_pattern = os.path.join(file_path, "*.pdf")
    return glob.glob(pdf_pattern)
def read_multiple_pdf(file_path: str) -> list:
    """
    Read every PDF file in a directory and extract the text of each page.

    Files are discovered with ``get_pdf_files``. A file that cannot be opened
    or parsed is reported with ``print`` and skipped; pages extracted from it
    before the failure are kept, and processing continues with the next file.

    Args:
        file_path (str): The directory path containing the PDF files.

    Returns:
        list: One string per page, across all PDF files, in the order the
        files were returned by ``get_pdf_files``.
    """
    pdf_files = get_pdf_files(file_path)
    output = []
    for file in pdf_files:
        try:
            with open(file, "rb") as f:
                pdf_reader = PdfReader(f)
                # Use the pypdf API (`pages` / `extract_text`), matching
                # read_single_pdf. The PyPDF2-era camelCase methods
                # (getNumPages / getPage / extractText) are not supported by
                # current pypdf and sent every file into the except branch.
                for page in pdf_reader.pages:
                    output.append(page.extract_text())
        except Exception as e:
            # Best-effort: report the problem and move on to the next file.
            print(f"Error reading file '{file}': {str(e)}")
    return output
def read_single_pdf(file_path: str) -> str:
    """
    Read a single PDF file and return the text of all its pages as one string.

    Args:
        file_path (str): The path of the PDF file.

    Returns:
        str: The text of every page, joined with single spaces. If the file
        cannot be opened or parsed, the error is printed and whatever text was
        extracted before the failure is returned (an empty string when
        nothing was read).
    """
    output = []
    try:
        with open(file_path, "rb") as f:
            pdf_reader = PdfReader(f)
            # Append page by page so text gathered before a mid-file failure
            # is still returned.
            for page in pdf_reader.pages:
                output.append(page.extract_text())
    except Exception as e:
        # Best-effort: report the problem and fall through to the return.
        print(f"Error reading file '{file_path}': {str(e)}")
    return " ".join(output)
def get_pdf_files(file_path: str) -> list:
    """
    List the ``*.pdf`` files found directly inside a directory.

    Args:
        file_path (str): The directory path containing the PDF files.

    Returns:
        list: Paths matching ``<file_path>/*.pdf``. An empty list is returned
        when nothing matches, or when building the pattern or globbing raises
        (the error is printed in that case).
    """
    try:
        return glob.glob(os.path.join(file_path, "*.pdf"))
    except Exception as e:
        print(f"Error getting PDF files from '{file_path}': {str(e)}")
        return []