"""
parser.py — Resume file parsing module.
Handles text extraction from PDF and DOCX files.
Uses PyMuPDF for PDFs and python-docx for Word documents.
"""
import io
import fitz # PyMuPDF
from docx import Document
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The text of every page joined with newlines and stripped, or an
        empty string if the input is empty or extraction fails.
    """
    if not file_bytes:
        # Nothing to parse; avoid handing an empty stream to PyMuPDF.
        return ""
    try:
        # The context manager closes the document even when a page raises
        # mid-extraction, so the handle is never leaked on the error path.
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            text_parts = [page.get_text("text") for page in pdf_doc]
    except Exception as e:
        # Deliberate best-effort: callers treat "" as "could not extract".
        print(f"[parser] PDF extraction error: {e}")
        return ""
    return "\n".join(text_parts).strip()
def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Extract all text from a DOCX file given its raw bytes.

    Body paragraphs come first, followed by the text of every table cell.
    Blank paragraphs and blank cells are skipped.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single newline-joined string, or an empty
        string if the input is empty or extraction fails.
    """
    if not file_bytes:
        # Nothing to parse; avoid handing an empty stream to python-docx.
        return ""
    try:
        doc = Document(io.BytesIO(file_bytes))
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        # Also extract text from tables.
        for table in doc.tables:
            # python-docx reports a merged cell once per grid position it
            # spans, so track the underlying cell element to emit its text
            # only once. Holding the elements in the set keeps them alive,
            # which keeps the identity comparison stable.
            # NOTE(review): relies on the private `_tc` attribute of
            # python-docx cells; falls back to the cell object itself.
            seen_cells = set()
            for row in table.rows:
                for cell in row.cells:
                    cell_key = getattr(cell, "_tc", cell)
                    if cell_key in seen_cells:
                        continue
                    seen_cells.add(cell_key)
                    cell_text = cell.text.strip()
                    if cell_text:
                        paragraphs.append(cell_text)
        return "\n".join(paragraphs).strip()
    except Exception as e:
        # Deliberate best-effort: callers treat "" as "could not extract".
        print(f"[parser] DOCX extraction error: {e}")
        return ""
def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects the file type from the file name's extension
    (case-insensitive) and routes to the matching extractor.

    Args:
        uploaded_file: Streamlit UploadedFile object (any file-like object
            exposing ``name`` and ``read()`` works).

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # Rewind first: if the stream was already consumed (e.g. read earlier in
    # the same script run), read() would return b"" and a valid upload would
    # be misreported as empty.
    rewind = getattr(uploaded_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # Non-seekable stream: fall back to reading from the current
            # position, which is what a plain read() would do anyway.
            pass

    file_bytes = uploaded_file.read()
    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()
    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # The extractors signal failure by returning an empty string.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )
    return result