Spaces:

chirag1121
/

Resume_Screening_Model

Sleeping

File size: 3,139 Bytes

ef89ade

"""
parser.py — Resume file parsing module.
 
Handles text extraction from PDF and DOCX files.
Uses PyMuPDF for PDFs and python-docx for Word documents.
"""
 
import io
import fitz  # PyMuPDF
from docx import Document
 
 
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Pages are read in document order and their plain-text content is joined
    with newlines. Any exception raised while opening or reading the
    document is caught, reported on stdout, and turned into an empty string
    so callers never have to handle parser-specific errors.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        # The context manager closes the document even if extracting a page
        # raises, so the handle is not leaked on the error path.
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            text_parts = [page.get_text("text") for page in pdf_doc]
        return "\n".join(text_parts).strip()
    except Exception as e:
        # Deliberate best-effort boundary: report and fall back to "".
        print(f"[parser] PDF extraction error: {e}")
        return ""
 
 
def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Extract all text from a DOCX file given its raw bytes.

    Non-blank body paragraphs are collected first, followed by the non-blank
    text of every table cell. A merged cell is reported by python-docx once
    per grid position it spans; such repeats are skipped so the merged
    cell's text is emitted only once. Any exception is caught, reported on
    stdout, and turned into an empty string.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        doc = Document(io.BytesIO(file_bytes))
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

        # Also extract text from tables. Cells are identified by their
        # underlying XML element so that horizontally or vertically merged
        # cells (which show up repeatedly in ``row.cells``) are added once.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    # Fall back to the cell object itself if the private
                    # element attribute is unavailable.
                    cell_key = getattr(cell, "_tc", cell)
                    if cell_key in seen_cells:
                        continue
                    seen_cells.add(cell_key)

                    cell_text = cell.text.strip()
                    if cell_text:
                        paragraphs.append(cell_text)
        return "\n".join(paragraphs).strip()
    except Exception as e:
        # Deliberate best-effort boundary: report and fall back to "".
        print(f"[parser] DOCX extraction error: {e}")
        return ""
 
 
def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects the file type from the file name's extension
    (case-insensitive) and routes the raw bytes to the matching extractor.
    The stream is rewound before reading when it supports seeking, so a
    file object that has already been consumed is still parsed instead of
    being reported as empty.

    Args:
        uploaded_file: Streamlit UploadedFile object (any object exposing
            ``name`` and ``read()`` works; ``seek()`` is used if present).

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # Rewind first: read() starts at the current position, so a stream that
    # was already read once would otherwise yield b"" and look empty.
    seek = getattr(uploaded_file, "seek", None)
    if callable(seek):
        try:
            seek(0)
        except (OSError, ValueError):
            # Non-seekable stream: read from the current position, which is
            # what happened before the rewind was introduced.
            pass

    file_bytes = uploaded_file.read()

    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()

    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # The extractors return "" on any failure, so an empty result here means
    # either no extractable text or a parsing error.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )

    return result