"""
parser.py — Resume file parsing module.

Handles text extraction from PDF and DOCX files.
Uses PyMuPDF for PDFs and python-docx for Word documents.
"""

import io

import fitz  # PyMuPDF
from docx import Document


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Pages are extracted in document order and joined with newlines.
    Extraction is best-effort: any failure is printed and reported to the
    caller as an empty string rather than raised.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text as a single stripped string, or empty string on failure.
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so a bad page can no longer leak the open document.
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            text_parts = [page.get_text("text") for page in pdf_doc]
    except Exception as e:
        # Deliberately broad: this is the boundary that turns any parser
        # failure into the "no text" result that parse_resume checks for.
        print(f"[parser] PDF extraction error: {e}")
        return ""
    return "\n".join(text_parts).strip()


def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Extract all text from a DOCX file given its raw bytes.

    Collects every non-blank body paragraph first, then the stripped text of
    every non-blank table cell, and joins them with newlines. Extraction is
    best-effort: any failure is printed and reported as an empty string.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single stripped string, or empty string on failure.
    """
    try:
        doc = Document(io.BytesIO(file_bytes))

        # Blank paragraphs are dropped; kept paragraphs retain their own text.
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table content is not part of doc.paragraphs, so walk it separately.
        # NOTE(review): python-docx appears to yield a merged cell once per
        # grid column it spans, which would repeat its text here — confirm
        # whether de-duplication is wanted.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if cell_text:
                        paragraphs.append(cell_text)
    except Exception as e:
        # Deliberately broad: see extract_text_from_pdf.
        print(f"[parser] DOCX extraction error: {e}")
        return ""
    return "\n".join(paragraphs).strip()


def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects the file type from the file name's extension (case-insensitive)
    and routes to the matching extractor.

    Args:
        uploaded_file: Streamlit UploadedFile object. Only its ``name``
            attribute and ``read()`` method are required; ``seek()`` is used
            when available.

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # Rewind first: if the caller already read the upload, read() would
    # return b"" and a perfectly valid file would be reported as empty.
    try:
        uploaded_file.seek(0)
    except (AttributeError, OSError, ValueError):
        # Object has no usable seek(); read from the current position instead.
        pass

    file_bytes = uploaded_file.read()
    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()

    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # The extractors signal failure with an empty string, so an empty result
    # covers both a parser error and a file with no extractable text.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )

    return result