Spaces:

chirag1121
/

Resume_Screening_Model

Sleeping

App Files Files Community

chirag1121 committed on Apr 16

Commit

ef89ade

verified ·

1 Parent(s): 3b1a71d

Update utils/parser.py

Browse files

Files changed (1) hide show

utils/parser.py +108 -0

utils/parser.py CHANGED Viewed

	@@ -0,0 +1,108 @@

+"""
+parser.py — Resume file parsing module.
+Handles text extraction from PDF and DOCX files.
+Uses PyMuPDF for PDFs and python-docx for Word documents.
+"""
+import io
+import fitz  # PyMuPDF
+from docx import Document
+def extract_text_from_pdf(file_bytes: bytes) -> str:
+    """
+    Extract all text from a PDF file given its raw bytes.
+    Pages are read in document order and joined with newlines. Any error
+    raised while opening or reading the document is printed to stdout and
+    turned into an empty-string result instead of propagating.
+    Args:
+        file_bytes: Raw bytes of the PDF file.
+    Returns:
+        Extracted text as a single string, or empty string on failure.
+    """
+    try:
+        # The context manager closes the document even when a page fails to
+        # extract; a trailing close() call would be skipped on that path.
+        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
+            text_parts = [page.get_text("text") for page in pdf_doc]
+    except Exception as e:
+        # Deliberately broad: callers rely on "" to signal any failure.
+        print(f"[parser] PDF extraction error: {e}")
+        return ""
+    return "\n".join(text_parts).strip()
+def extract_text_from_docx(file_bytes: bytes) -> str:
+    """
+    Extract all text from a DOCX file given its raw bytes.
+    Non-blank body paragraphs come first, followed by the non-blank text of
+    every table cell (tables are appended after the paragraphs, not
+    interleaved at their position in the document). Any error is printed to
+    stdout and turned into an empty-string result instead of propagating.
+    Args:
+        file_bytes: Raw bytes of the DOCX file.
+    Returns:
+        Extracted text as a single string, or empty string on failure.
+    """
+    try:
+        doc = Document(io.BytesIO(file_bytes))
+        lines = [para.text for para in doc.paragraphs if para.text.strip()]
+        # python-docx reports a merged cell once per grid position it spans,
+        # so remember the underlying cell elements already emitted and skip
+        # repeats; otherwise merged-cell text is duplicated in the output.
+        # Keeping the elements in the set also keeps them alive, which makes
+        # the identity-based membership test reliable.
+        seen_cells = set()
+        for table in doc.tables:
+            for row in table.rows:
+                for cell in row.cells:
+                    if cell._tc in seen_cells:
+                        continue
+                    seen_cells.add(cell._tc)
+                    cell_text = cell.text.strip()
+                    if cell_text:
+                        lines.append(cell_text)
+        return "\n".join(lines).strip()
+    except Exception as e:
+        # Deliberately broad: callers rely on "" to signal any failure.
+        print(f"[parser] DOCX extraction error: {e}")
+        return ""
+def parse_resume(uploaded_file) -> dict:
+    """
+    Main entry point: parse an uploaded Streamlit file object.
+    Reads the whole upload, detects the file type from the file name's
+    extension (case-insensitive) and routes to the correct extractor.
+    Args:
+        uploaded_file: Streamlit UploadedFile object. Only its ``name``
+            attribute and ``read()`` method are required; ``seek()`` is
+            used when available.
+    Returns:
+        dict with keys:
+            - 'text'     : extracted resume text (str)
+            - 'filename' : original file name (str)
+            - 'file_type': 'pdf' | 'docx' | 'unknown'
+            - 'error'    : error message if extraction failed (str | None)
+    """
+    result = {
+        "text": "",
+        "filename": uploaded_file.name,
+        "file_type": "unknown",
+        "error": None,
+    }
+    # Rewind before reading: if the stream was already consumed elsewhere,
+    # read() would return b"" and a valid upload would be reported as empty.
+    try:
+        uploaded_file.seek(0)
+    except (AttributeError, OSError, ValueError):
+        # No seek() or not seekable: read from the current position.
+        pass
+    file_bytes = uploaded_file.read()
+    if not file_bytes:
+        result["error"] = "Uploaded file is empty."
+        return result
+    filename_lower = uploaded_file.name.lower()
+    if filename_lower.endswith(".pdf"):
+        result["file_type"] = "pdf"
+        result["text"] = extract_text_from_pdf(file_bytes)
+    elif filename_lower.endswith(".docx"):
+        result["file_type"] = "docx"
+        result["text"] = extract_text_from_docx(file_bytes)
+    else:
+        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
+        return result
+    # The extractors signal every failure with an empty string, so an empty
+    # result here covers both unreadable files and files without a text layer.
+    if not result["text"]:
+        result["error"] = (
+            "Could not extract text from the file. "
+            "The file may be image-based or corrupted."
+        )
+    return result