Spaces:

midlajvalappil
/

Resume-Ranker-AI

Sleeping

Resume-Ranker-AI / src /utils /document_parser.py

Upload 8 files

35fa114 verified 9 months ago

1.8 kB

	import PyPDF2
	import pdfplumber
	import docx
	import os

	class DocumentParser:
	"""Parse PDF and DOCX documents to extract text content."""

	@staticmethod
	def parse_pdf(file_path):
	"""Extract text from PDF files using pdfplumber for better text extraction."""
	text = ""
	try:
	with pdfplumber.open(file_path) as pdf:
	for page in pdf.pages:
	text += page.extract_text() or ""

	# If pdfplumber fails to extract text, try PyPDF2 as fallback
	if not text.strip():
	with open(file_path, 'rb') as file:
	reader = PyPDF2.PdfReader(file)
	for page_num in range(len(reader.pages)):
	text += reader.pages[page_num].extract_text() or ""
	except Exception as e:
	print(f"Error parsing PDF: {e}")
	return None

	return text.strip()

	@staticmethod
	def parse_docx(file_path):
	"""Extract text from DOCX files."""
	try:
	doc = docx.Document(file_path)
	text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
	return text.strip()
	except Exception as e:
	print(f"Error parsing DOCX: {e}")
	return None

	@staticmethod
	def parse_document(file_path):
	"""Parse document based on file extension."""
	_, file_extension = os.path.splitext(file_path)

	if file_extension.lower() == '.pdf':
	return DocumentParser.parse_pdf(file_path)
	elif file_extension.lower() == '.docx':
	return DocumentParser.parse_docx(file_path)
	else:
	print(f"Unsupported file format: {file_extension}")
	return None