# HR-Resume-Analyzer / text_extraction.py
# File-viewer residue from the hosting site, kept as a comment:
# DreamStream-1; commit 3dfe527 (verified) "Create text_extraction.py"; 961 bytes.
import PyPDF2
import re
def extract_text_from_pdf(pdf_file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``.
    The text of each page is joined with a single space and the result is
    stripped of leading/trailing whitespace.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The extracted text. If anything goes wrong (missing file, unreadable
        or malformed PDF, ...) no exception is raised; instead a string of
        the form ``"Error reading PDF: <details>"`` is returned, so callers
        must inspect the result to detect failure.
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # extract_text() may yield a falsy value for a page; substitute
            # '' so the join never sees a non-string.
            page_texts = (page.extract_text() or '' for page in pdf_reader.pages)
            # Pages must be consumed while the file is still open.
            text = ' '.join(page_texts)
        return text.strip()
    except Exception as e:
        # Deliberate catch-all: this function reports failures through its
        # return value rather than by raising.
        return f"Error reading PDF: {e}"
def extract_text_from_txt(txt_file_path: str) -> str:
    """Read a text file, trying a short list of encodings in order.

    The file is opened in text mode with each candidate encoding in turn;
    the first one that decodes the whole file wins and its content is
    returned with leading/trailing whitespace stripped.

    Args:
        txt_file_path: Path of the text file to read.

    Returns:
        The decoded, stripped file content. If the file cannot be opened or
        read for a reason other than decoding, no exception is raised;
        instead a string of the form ``"Error reading TXT: <details>"`` is
        returned, so callers must inspect the result to detect failure.
    """
    # 'utf-8-sig' decodes ordinary UTF-8 and additionally drops a leading
    # byte-order mark, which str.strip() would not remove.
    # 'latin-1' maps every possible byte to a character, so it never raises
    # UnicodeDecodeError and must stay last; any encoding listed after it
    # could never be reached.
    encodings = ('utf-8-sig', 'latin-1')
    for encoding in encodings:
        try:
            with open(txt_file_path, 'r', encoding=encoding) as txt_file:
                return txt_file.read().strip()
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next candidate encoding.
            continue
        except Exception as e:
            # Deliberate catch-all: failures are reported via the return
            # value rather than by raising.
            return f"Error reading TXT: {e}"
    # Not reachable while 'latin-1' is in the list; kept so the function
    # still returns a string if the candidate encodings are ever changed.
    return "Error: Unable to decode file with supported encodings"