chirag1121 committed on
Commit
ef89ade
·
verified ·
1 Parent(s): 3b1a71d

Update utils/parser.py

Browse files
Files changed (1) hide show
  1. utils/parser.py +108 -0
utils/parser.py CHANGED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ parser.py — Resume file parsing module.
3
+
4
+ Handles text extraction from PDF and DOCX files.
5
+ Uses PyMuPDF for PDFs and python-docx for Word documents.
6
+ """
7
+
8
+ import io
9
+ import fitz # PyMuPDF
10
+ from docx import Document
11
+
12
+
13
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract all text from a PDF file given its raw bytes.

    Pages are read in document order and their plain text is joined with
    newlines; surrounding whitespace of the combined result is stripped.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        # Open via a context manager so the document is closed even when a
        # page raises part-way through extraction (an explicit close() after
        # the loop would be skipped in that case).
        with fitz.open(stream=file_bytes, filetype="pdf") as pdf_doc:
            text_parts = [page.get_text("text") for page in pdf_doc]
        return "\n".join(text_parts).strip()
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported on stdout
        # and surfaced to the caller as an empty string.
        print(f"[parser] PDF extraction error: {e}")
        return ""
34
+
35
+
36
def extract_text_from_docx(file_bytes: bytes) -> str:
    """
    Extract all text from a DOCX file given its raw bytes.

    Non-blank body paragraphs come first, followed by the non-blank text of
    every table cell (stripped), all joined with newlines. Each distinct
    table cell contributes its text at most once.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        Extracted text as a single string, or empty string on failure.
    """
    try:
        doc = Document(io.BytesIO(file_bytes))
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

        # python-docx yields a merged cell once for every grid position it
        # spans, which would repeat the same text several times. Track the
        # underlying cell element so each real cell is emitted only once.
        # The dict values keep the elements alive so their id() stays unique.
        seen_cells = {}
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if id(element) in seen_cells:
                        continue
                    seen_cells[id(element)] = element
                    cell_text = cell.text.strip()
                    if cell_text:
                        paragraphs.append(cell_text)
        return "\n".join(paragraphs).strip()
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported on stdout
        # and surfaced to the caller as an empty string.
        print(f"[parser] DOCX extraction error: {e}")
        return ""
59
+
60
+
61
def parse_resume(uploaded_file) -> dict:
    """
    Main entry point: parse an uploaded Streamlit file object.

    Detects the file type from the file name's extension (case-insensitive)
    and routes to the matching extractor.

    Args:
        uploaded_file: Streamlit UploadedFile object (any object exposing
            ``name`` and ``read()`` works; ``seek()`` is used when present).

    Returns:
        dict with keys:
            - 'text'     : extracted resume text (str)
            - 'filename' : original file name (str)
            - 'file_type': 'pdf' | 'docx' | 'unknown'
            - 'error'    : error message if extraction failed (str | None)
    """
    result = {
        "text": "",
        "filename": uploaded_file.name,
        "file_type": "unknown",
        "error": None,
    }

    # Rewind before reading: if the stream was already consumed by an earlier
    # read, read() would return b"" and a valid upload would be misreported
    # as empty. Objects without a usable seek() are read from their current
    # position, as before.
    try:
        uploaded_file.seek(0)
    except (AttributeError, OSError, ValueError):
        pass

    file_bytes = uploaded_file.read()

    if not file_bytes:
        result["error"] = "Uploaded file is empty."
        return result

    filename_lower = uploaded_file.name.lower()

    if filename_lower.endswith(".pdf"):
        result["file_type"] = "pdf"
        result["text"] = extract_text_from_pdf(file_bytes)
    elif filename_lower.endswith(".docx"):
        result["file_type"] = "docx"
        result["text"] = extract_text_from_docx(file_bytes)
    else:
        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
        return result

    # The extractors signal failure by returning an empty string.
    if not result["text"]:
        result["error"] = (
            "Could not extract text from the file. "
            "The file may be image-based or corrupted."
        )

    return result