Spaces:

chirag1121
/

Resume_Screening_Model

Sleeping

App Files Files Community

Resume_Screening_Model / utils /parser.py

chirag1121

Update utils/parser.py

ef89ade verified about 1 month ago

raw

history blame contribute delete

3.14 kB

	"""
	parser.py — Resume file parsing module.

	Handles text extraction from PDF and DOCX files.
	Uses PyMuPDF for PDFs and python-docx for Word documents.
	"""

	import io
	import fitz # PyMuPDF
	from docx import Document


	def extract_text_from_pdf(file_bytes: bytes) -> str:
	    """
	    Extract all text from a PDF file given its raw bytes.

	    Pages are read in document order and their plain text is joined with
	    newlines; leading and trailing whitespace of the result is stripped.
	    Any failure while opening or reading the document is printed to stdout
	    and converted into an empty string.

	    Args:
	        file_bytes: Raw bytes of the PDF file.

	    Returns:
	        Extracted text as a single string, or empty string on failure.
	    """
	    try:
	        # The context manager closes the document even when reading a page
	        # raises, so the handle is never leaked on the error path.
	        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
	            page_texts = [page.get_text("text") for page in pdf_doc]
	    except Exception as e:
	        # Deliberate catch-all: the contract is to return "" on any failure.
	        print(f"[parser] PDF extraction error: {e}")
	        return ""
	    return "\n".join(page_texts).strip()


	def extract_text_from_docx(file_bytes: bytes) -> str:
	    """
	    Extract all text from a DOCX file given its raw bytes.

	    Non-blank body paragraphs are collected first, followed by the
	    non-blank text of every table cell. The pieces are joined with
	    newlines. A merged table cell contributes its text only once.
	    Any failure while opening or reading the document is printed to stdout
	    and converted into an empty string.

	    Args:
	        file_bytes: Raw bytes of the DOCX file.

	    Returns:
	        Extracted text as a single string, or empty string on failure.
	    """
	    try:
	        doc = Document(io.BytesIO(file_bytes))
	        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

	        # Also extract text from tables. python-docx reports a merged cell
	        # once per grid position it spans, so remember the underlying cell
	        # elements already handled to avoid repeating their text.
	        # NOTE(review): relies on the private `_tc` attribute of python-docx
	        # cells; cells without it are simply not de-duplicated.
	        seen_elements = set()
	        for table in doc.tables:
	            for row in table.rows:
	                for cell in row.cells:
	                    element = getattr(cell, "_tc", None)
	                    if element is not None:
	                        if element in seen_elements:
	                            continue
	                        # Keeping a reference also keeps the element's
	                        # identity stable for the membership test above.
	                        seen_elements.add(element)
	                    cell_text = cell.text.strip()
	                    if cell_text:
	                        paragraphs.append(cell_text)
	        return "\n".join(paragraphs).strip()
	    except Exception as e:
	        # Deliberate catch-all: the contract is to return "" on any failure.
	        print(f"[parser] DOCX extraction error: {e}")
	        return ""


	def parse_resume(uploaded_file) -> dict:
	    """
	    Main entry point: parse an uploaded Streamlit file object.

	    Detects the file type from the file name extension (case-insensitive)
	    and routes to the correct extractor. The stream is rewound before it is
	    read, so a file object that was already consumed elsewhere is still
	    parsed instead of being reported as empty.

	    Args:
	        uploaded_file: Streamlit UploadedFile object. Only its ``name``
	            attribute and its ``read()`` (and, when available, ``seek()``)
	            methods are used.

	    Returns:
	        dict with keys:
	        - 'text'     : extracted resume text (str)
	        - 'filename' : original file name (str)
	        - 'file_type': 'pdf' | 'docx' | 'unknown'
	        - 'error'    : error message if extraction failed (str | None)
	    """
	    result = {
	        "text": "",
	        "filename": uploaded_file.name,
	        "file_type": "unknown",
	        "error": None,
	    }

	    # read() starts at the current stream position; rewind first so an
	    # earlier read of the same object does not make the file look empty.
	    try:
	        uploaded_file.seek(0)
	    except (AttributeError, OSError, ValueError):
	        # Non-seekable input: fall back to reading from the current position.
	        pass

	    file_bytes = uploaded_file.read()

	    if not file_bytes:
	        result["error"] = "Uploaded file is empty."
	        return result

	    filename_lower = uploaded_file.name.lower()

	    if filename_lower.endswith(".pdf"):
	        result["file_type"] = "pdf"
	        result["text"] = extract_text_from_pdf(file_bytes)
	    elif filename_lower.endswith(".docx"):
	        result["file_type"] = "docx"
	        result["text"] = extract_text_from_docx(file_bytes)
	    else:
	        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
	        return result

	    # Both extractors return "" on failure, so an empty text here means the
	    # file could not be read or contained no extractable text.
	    if not result["text"]:
	        result["error"] = (
	            "Could not extract text from the file. "
	            "The file may be image-based or corrupted."
	        )

	    return result