Spaces:

Alpha108
/

MatchHive-ai

Sleeping

App Files Files Community

MatchHive-ai / backend /agents /resume_parser.py

Alpha108

Create resume_parser.py

be56e96 verified 4 months ago

raw

history blame contribute delete

1.95 kB

	import PyPDF2
	import docx
	import io

	def parse_pdf(file_stream):
	    """Extract the text of every page from a PDF file stream.

	    Args:
	        file_stream: A file-like object (e.g., from st.file_uploader) that
	            is handed to ``PyPDF2.PdfReader``.

	    Returns:
	        str: The text of all pages concatenated in page order. A page whose
	        ``extract_text()`` result is falsy contributes nothing, so the
	        result can be an empty string.

	    Raises:
	        ValueError: If any error occurs while opening or reading the PDF.
	            The underlying exception is chained as ``__cause__``.
	    """
	    try:
	        reader = PyPDF2.PdfReader(file_stream)
	        # Join once instead of growing a string with += for every page.
	        text = "".join(page.extract_text() or "" for page in reader.pages)
	    except Exception as e:
	        print(f"Error reading PDF: {e}")
	        # Chain the original error so the root cause stays in the traceback.
	        raise ValueError("Could not parse the PDF file. It might be corrupted or image-based.") from e
	    return text

	def parse_docx(file_stream):
	    """Extract the paragraph text from a DOCX file stream.

	    Args:
	        file_stream: A file-like object that is handed to ``docx.Document``.

	    Returns:
	        str: The text of each entry in the document's ``paragraphs``, every
	        one followed by a newline. Only ``paragraphs`` is read here.

	    Raises:
	        ValueError: If any error occurs while opening or reading the
	            document. The underlying exception is chained as ``__cause__``.
	    """
	    try:
	        doc = docx.Document(file_stream)
	        # Each paragraph keeps its trailing newline; join once rather than
	        # growing a string with += per paragraph.
	        text = "".join(para.text + "\n" for para in doc.paragraphs)
	    except Exception as e:
	        print(f"Error reading DOCX: {e}")
	        # Chain the original error so the root cause stays in the traceback.
	        raise ValueError("Could not parse the DOCX file.") from e
	    return text

	def parse_resume(uploaded_file):
	    """Parse an uploaded resume file (PDF or DOCX) and return its text.

	    The file type is taken from the text after the last dot in
	    ``uploaded_file.name``, compared case-insensitively. A name with no dot
	    has no extension and is rejected as unsupported.

	    Args:
	        uploaded_file: The file object from Streamlit's file_uploader. It
	            must provide a ``name`` attribute and a ``getvalue()`` method,
	            or be None.

	    Returns:
	        str: The text content of the resume.

	    Raises:
	        ValueError: If no file was given, the file type is not supported,
	            or parsing fails.
	    """
	    if uploaded_file is None:
	        raise ValueError("No file uploaded.")

	    # rpartition returns an empty separator when the name contains no dot,
	    # so a bare filename such as "pdf" is not mistaken for an extension.
	    _, dot, suffix = uploaded_file.name.rpartition('.')
	    file_extension = suffix.lower() if dot else ""

	    # Reject unsupported types before copying the upload into memory.
	    if file_extension not in ('pdf', 'docx'):
	        raise ValueError(f"Unsupported file type: '{file_extension}'. Please upload a PDF or DOCX file.")

	    # We use BytesIO to handle the file in memory
	    file_stream = io.BytesIO(uploaded_file.getvalue())

	    if file_extension == 'pdf':
	        return parse_pdf(file_stream)
	    return parse_docx(file_stream)