Alpha108 committed on
Commit
be56e96
·
verified ·
1 Parent(s): 4dbd292

Create resume_parser.py

Browse files
Files changed (1) hide show
  1. backend/agents/resume_parser.py +71 -0
backend/agents/resume_parser.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import docx
3
+ import io
4
+
5
def parse_pdf(file_stream):
    """Extract the text of every page from a PDF file stream.

    Args:
        file_stream: A file-like object (e.g., from st.file_uploader).

    Returns:
        str: The text of all pages concatenated in page order, with no
            separator inserted between pages. A page whose extraction
            yields a falsy result contributes an empty string.

    Raises:
        ValueError: If the PDF cannot be opened or its text cannot be
            extracted. The underlying error is attached as ``__cause__``.
    """
    try:
        reader = PyPDF2.PdfReader(file_stream)
        # `or ""` guards against a falsy extraction result so the join
        # below only ever sees strings.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Any failure here is translated to ValueError for callers, but the
        # original exception is chained so the root cause is not lost.
        print(f"Error reading PDF: {e}")
        raise ValueError("Could not parse the PDF file. It might be corrupted or image-based.") from e
24
+
25
def parse_docx(file_stream):
    """Extract the paragraph text from a DOCX file stream.

    Args:
        file_stream: A file-like object.

    Returns:
        str: The text of every paragraph in ``doc.paragraphs``, each
            followed by a newline.

    Raises:
        ValueError: If the DOCX file cannot be opened or read. The
            underlying error is attached as ``__cause__``.
    """
    try:
        doc = docx.Document(file_stream)
        # Only doc.paragraphs is read here; nothing else in the document is
        # visited by this function.
        return "".join(f"{para.text}\n" for para in doc.paragraphs)
    except Exception as e:
        # Translate any failure to ValueError for callers while chaining the
        # original exception so the root cause stays visible.
        print(f"Error reading DOCX: {e}")
        raise ValueError("Could not parse the DOCX file.") from e
44
+
45
def parse_resume(uploaded_file):
    """Parse an uploaded resume file (PDF or DOCX) and return its text.

    Args:
        uploaded_file: The file object from Streamlit's file_uploader. This
            function reads its ``name`` attribute and calls ``getvalue()``
            to obtain the raw bytes.

    Returns:
        str: The text content of the resume.

    Raises:
        ValueError: If no file is given, the file type is not supported, or
            parsing fails.
    """
    if uploaded_file is None:
        raise ValueError("No file uploaded.")

    # rpartition leaves the separator empty when the name contains no dot,
    # so a bare name such as "resume" is reported as having no extension
    # instead of being mistaken for one.
    _, dot, suffix = uploaded_file.name.rpartition('.')
    file_extension = suffix.lower() if dot else ""

    # Reject unsupported types before copying the upload into memory.
    if file_extension not in ('pdf', 'docx'):
        raise ValueError(f"Unsupported file type: '{file_extension}'. Please upload a PDF or DOCX file.")

    # We use BytesIO to handle the file in memory
    file_stream = io.BytesIO(uploaded_file.getvalue())

    if file_extension == 'pdf':
        return parse_pdf(file_stream)
    return parse_docx(file_stream)