Spaces:

devmalik-official
/

resume-analyzer

Sleeping

App Files Files Community

resume-analyzer / utils /resume_parser.py

devmalik-official

Deploy Gradio app

1aea493 27 days ago

raw

history blame contribute delete

2.23 kB

	import PyPDF2
	from docx import Document
	from pathlib import Path
	import io


	def extract_text_from_pdf(file_content):
	    """Extract the text of every page of a PDF.

	    Args:
	        file_content: Raw bytes of the PDF file.

	    Returns:
	        The text of all pages concatenated in page order, each page
	        followed by a newline, with leading and trailing whitespace
	        stripped from the final result.

	    Raises:
	        Exception: If the PDF cannot be opened or a page cannot be read.
	            The message is prefixed with "Error reading PDF: " and the
	            original error is attached as the cause.
	    """
	    try:
	        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
	        # `or ""` keeps a page that yields no text from breaking the
	        # concatenation. NOTE(review): confirm whether extract_text()
	        # can return None in the PyPDF2 version this project pins.
	        page_texts = [
	            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
	        ]
	        # Join once instead of growing a string with += per page.
	        return "".join(page_texts).strip()
	    except Exception as e:
	        # Chain explicitly so the underlying parser error stays visible.
	        raise Exception(f"Error reading PDF: {str(e)}") from e


	def extract_text_from_docx(file_content):
	    """Extract paragraph and table text from a DOCX document.

	    Paragraph text comes first, one paragraph per line. Table text
	    follows: the cells of a row are written on one line, each cell
	    followed by a single space, and every row ends with a newline.

	    Args:
	        file_content: Raw bytes of the DOCX file.

	    Returns:
	        The collected text with leading and trailing whitespace stripped.

	    Raises:
	        Exception: If the document cannot be opened or read. The message
	            is prefixed with "Error reading DOCX: " and the original
	            error is attached as the cause.
	    """
	    try:
	        doc = Document(io.BytesIO(file_content))
	        # Collect the pieces and join once instead of growing a string
	        # with += inside three nested loops.
	        pieces = [para.text + "\n" for para in doc.paragraphs]
	        for table in doc.tables:
	            for row in table.rows:
	                pieces.extend(cell.text + " " for cell in row.cells)
	                pieces.append("\n")
	        return "".join(pieces).strip()
	    except Exception as e:
	        # Chain explicitly so the underlying parser error stays visible.
	        raise Exception(f"Error reading DOCX: {str(e)}") from e


	def extract_text_from_txt(file_content):
	    """Decode a UTF-8 text file and return its stripped content.

	    Args:
	        file_content: Raw bytes of the text file, UTF-8 encoded with or
	            without a leading byte-order mark.

	    Returns:
	        The decoded text with any byte-order mark removed and leading and
	        trailing whitespace stripped.

	    Raises:
	        Exception: If the bytes cannot be decoded. The message is prefixed
	            with "Error reading TXT: " and the original error is attached
	            as the cause.
	    """
	    try:
	        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
	        # which str.strip() would otherwise leave at the start of the text.
	        return file_content.decode('utf-8-sig').strip()
	    except Exception as e:
	        raise Exception(f"Error reading TXT: {str(e)}") from e


	def parse_resume(file_content, file_extension):
	    """Extract text from a resume, choosing the reader by file extension.

	    Args:
	        file_content: Binary file content.
	        file_extension: File extension including the leading dot
	            (".pdf", ".docx", ".doc" or ".txt"); matched case-insensitively.

	    Returns:
	        Extracted text from the resume.

	    Raises:
	        ValueError: If the extension is not one of the supported ones.
	        Exception: If the selected reader fails to read the content.
	    """
	    file_extension = file_extension.lower()

	    if file_extension == ".pdf":
	        return extract_text_from_pdf(file_content)
	    if file_extension == ".docx":
	        return extract_text_from_docx(file_content)
	    if file_extension == ".doc":
	        # The DOCX reader only understands .docx packages. Still try it,
	        # so a .docx file saved with a .doc name keeps working, but report
	        # a failure as a .doc problem rather than a "DOCX" error.
	        try:
	            return extract_text_from_docx(file_content)
	        except Exception as e:
	            raise Exception(
	                "Error reading DOC: legacy .doc files are not supported, "
	                f"please convert the file to .docx ({str(e)})"
	            ) from e
	    if file_extension == ".txt":
	        return extract_text_from_txt(file_content)
	    raise ValueError(f"Unsupported file format: {file_extension}")


	def extract_from_uploaded_file(uploaded_file):
	    """Read an uploaded file and return the text extracted from it.

	    Args:
	        uploaded_file: File-like object exposing a ``name`` attribute and
	            a ``read()`` method (e.g. a Streamlit uploaded file object).

	    Returns:
	        Extracted text, as produced by ``parse_resume`` for the file's
	        lower-cased extension.
	    """
	    # Take the extension from the file name first, then read the bytes.
	    suffix = Path(uploaded_file.name).suffix.lower()
	    return parse_resume(uploaded_file.read(), suffix)