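# Page-header residue from the hosting site, kept as comments so the module parses:
# uploaded by: chirag1121
# commit: ef89ade (verified) - "Update utils/parser.py"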
"""
parser.py — Resume file parsing module.
Handles text extraction from PDF and DOCX files.
Uses PyMuPDF for PDFs and python-docx for Word documents.
"""
import io
import fitz # PyMuPDF
from docx import Document
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Pages are read in document order and their plain-text content is joined
    with newlines; leading and trailing whitespace of the result is removed.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        # The context manager closes the document even when extracting a page
        # raises, so a single bad page cannot leak the open document handle.
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            text_parts = [page.get_text("text") for page in pdf_doc]
        return "\n".join(text_parts).strip()
    except Exception as e:
        # Deliberately best-effort: any failure is reported on stdout and
        # surfaced to the caller as an empty string rather than an exception.
        print(f"[parser] PDF extraction error: {e}")
        return ""
def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Pull the readable text out of a DOCX file supplied as raw bytes.

    Body paragraphs come first, followed by the contents of every table cell
    (walked table by table, row by row). Blank paragraphs and blank cells are
    skipped, and the collected pieces are joined with newlines.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        document = Document(io.BytesIO(file_bytes))

        chunks = []
        for paragraph in document.paragraphs:
            body = paragraph.text
            # Whitespace-only paragraphs are dropped; kept ones stay unstripped.
            if body.strip():
                chunks.append(body)

        # Table content is appended after the paragraphs, with each cell's
        # text trimmed before it is stored.
        cell_texts = (
            cell.text.strip()
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        chunks.extend(content for content in cell_texts if content)

        return "\n".join(chunks).strip()
    except Exception as e:
        print(f"[parser] DOCX extraction error: {e}")
        return ""
def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects the file type from the file name's extension (case-insensitive)
    and routes the raw bytes to the matching extractor.

    Args:
        uploaded_file: Streamlit UploadedFile object. Any object exposing a
            ``name`` attribute and either ``getvalue()`` or ``read()``
            returning bytes is accepted.

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # Prefer getvalue(): it returns the whole buffer regardless of the current
    # stream position, so an upload object that has already been read once is
    # not misreported as empty. Plain file-like objects fall back to read().
    getvalue = getattr(uploaded_file, "getvalue", None)
    file_bytes = getvalue() if callable(getvalue) else uploaded_file.read()

    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()
    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # The extractors signal failure with an empty string, so an empty result
    # here means the file could not be read as text.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )
    return result