Spaces:
Sleeping
Sleeping
| """ | |
| parser.py — Resume file parsing module. | |
| Handles text extraction from PDF and DOCX files. | |
| Uses PyMuPDF for PDFs and python-docx for Word documents. | |
| """ | |
| import io | |
| import fitz # PyMuPDF | |
| from docx import Document | |
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Pages are read in document order and joined with newlines.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        # The context manager closes the document even when a page fails to
        # render its text, so the handle is never leaked on the error path.
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            page_texts = [page.get_text("text") for page in pdf_doc]
        return "\n".join(page_texts).strip()
    except Exception as e:
        # Deliberate best-effort boundary: callers treat "" as
        # "nothing could be extracted" rather than handling exceptions.
        print(f"[parser] PDF extraction error: {e}")
        return ""
def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Extract all text from a DOCX file given its raw bytes.

    Body paragraphs are collected first, followed by the text of every
    cell in the document's top-level tables. Blank paragraphs and blank
    cells are skipped.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        doc = Document(io.BytesIO(file_bytes))
        chunks = [para.text for para in doc.paragraphs if para.text.strip()]

        # A merged cell is reported once per grid position it spans, so the
        # same cell can show up several times across row.cells. Track the
        # underlying XML element to emit each cell's text only once. The
        # dict keeps a reference to every element so its id() stays unique.
        seen_cells = {}
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    # Fall back to the cell object itself if the private
                    # element attribute is unavailable.
                    element = getattr(cell, "_tc", cell)
                    if id(element) in seen_cells:
                        continue
                    seen_cells[id(element)] = element
                    cell_text = cell.text.strip()
                    if cell_text:
                        chunks.append(cell_text)
        return "\n".join(chunks).strip()
    except Exception as e:
        # Deliberate best-effort boundary: callers treat "" as
        # "nothing could be extracted" rather than handling exceptions.
        print(f"[parser] DOCX extraction error: {e}")
        return ""
def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects the file type from the file name's extension (case-insensitive)
    and routes to the matching extractor.

    Args:
        uploaded_file: Streamlit UploadedFile object (any object exposing
            ``name`` plus ``getvalue()`` or ``read()`` also works).

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # read() depends on the current stream position: if the upload was already
    # consumed (e.g. on an earlier pass over the same object) it returns b""
    # and a valid file would be reported as empty. getvalue() returns the
    # whole buffer regardless of position, so prefer it when available.
    getvalue = getattr(uploaded_file, "getvalue", None)
    file_bytes = getvalue() if callable(getvalue) else uploaded_file.read()
    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()
    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # The extractors signal failure with an empty string instead of raising.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )
    return result