File size: 3,139 Bytes
ef89ade
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
parser.py — Resume file parsing module.
 
Handles text extraction from PDF and DOCX files.
Uses PyMuPDF for PDFs and python-docx for Word documents.
"""
 
import io
import fitz  # PyMuPDF
from docx import Document
 
 
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Pages are read in document order, their plain text is joined with
    newlines, and the combined result is stripped of leading and trailing
    whitespace.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text as a single string, or empty string on failure.
        Failures are printed with a "[parser]" prefix instead of being raised.
    """
    try:
        # The context manager closes the document even when extracting a
        # page raises, so a failing page cannot leave the document open.
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            page_texts = [page.get_text("text") for page in pdf_doc]
    except Exception as e:
        # Deliberate best-effort boundary: callers treat "" as "no text".
        print(f"[parser] PDF extraction error: {e}")
        return ""
    return "\n".join(page_texts).strip()
 
 
def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Extract all text from a DOCX file given its raw bytes.

    Non-blank body paragraphs come first, followed by the stripped text of
    every non-blank table cell. All pieces are joined with newlines.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single string, or empty string on failure.
        Failures are printed with a "[parser]" prefix instead of being raised.
    """
    try:
        doc = Document(io.BytesIO(file_bytes))
        chunks = [para.text for para in doc.paragraphs if para.text.strip()]
        # python-docx reports a merged cell once for every grid position it
        # spans; remember the underlying XML element so each physical cell
        # contributes its text only once.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_element = cell._tc
                    if cell_element in seen_cells:
                        continue
                    seen_cells.add(cell_element)
                    cell_text = cell.text.strip()
                    if cell_text:
                        chunks.append(cell_text)
        return "\n".join(chunks).strip()
    except Exception as e:
        # Deliberate best-effort boundary: callers treat "" as "no text".
        print(f"[parser] DOCX extraction error: {e}")
        return ""
 
 
def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects file type from the filename extension (case-insensitive) and
    routes to the correct extractor.

    Args:
        uploaded_file: Streamlit UploadedFile object. Only its ``name``
            attribute and ``read()`` method are required; ``seek()`` is
            used to rewind the stream when it is available.

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # Rewind before reading: a stream that was already consumed elsewhere
    # would otherwise yield b"" and be misreported as an empty upload.
    rewind = getattr(uploaded_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # Stream cannot be repositioned; read from where it currently is.
            pass

    file_bytes = uploaded_file.read()

    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()

    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # Both extractors return "" on failure, so an empty result is reported
    # to the caller as an extraction error.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )

    return result