MatchHive-ai / backend /agents /resume_parser.py
Alpha108's picture
Create resume_parser.py
be56e96 verified
import io
import logging

import docx
import PyPDF2
def parse_pdf(file_stream):
    """Extract the text of every page in a PDF file stream.

    Args:
        file_stream: A binary file-like object holding the PDF
            (e.g., from st.file_uploader).

    Returns:
        str: The text of all pages that yielded any text, joined with
            newlines. Pages whose extraction result is empty are skipped,
            so the result is an empty string when no page yields text.

    Raises:
        ValueError: If the PDF cannot be opened or a page cannot be read.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        reader = PyPDF2.PdfReader(file_stream)
        # A page's extraction result may be falsy; normalise it to "".
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Record the full traceback, then surface one stable error type
        # while keeping the root cause chained for debugging.
        logging.getLogger(__name__).exception("Error reading PDF")
        raise ValueError(
            "Could not parse the PDF file. It might be corrupted or image-based."
        ) from e
    # Separate pages with a newline so the last word of one page is not
    # fused with the first word of the next; empty pages add no separator.
    return "\n".join(chunk for chunk in page_texts if chunk)
def parse_docx(file_stream):
    """Extract the paragraph text of a DOCX file stream.

    Args:
        file_stream: A binary file-like object holding the DOCX document.

    Returns:
        str: The text of each paragraph in document order, every paragraph
            followed by a newline. An empty string if there are no
            paragraphs.

    Raises:
        ValueError: If the document cannot be opened or read. The
            underlying exception is attached as ``__cause__``.
    """
    # NOTE(review): only ``Document.paragraphs`` is read; text held in
    # tables or headers is presumably not included -- confirm whether
    # resumes laid out with tables need extra handling.
    try:
        document = docx.Document(file_stream)
        lines = [f"{para.text}\n" for para in document.paragraphs]
    except Exception as e:
        # Record the full traceback, then surface one stable error type
        # while keeping the root cause chained for debugging.
        logging.getLogger(__name__).exception("Error reading DOCX")
        raise ValueError("Could not parse the DOCX file.") from e
    return "".join(lines)
def parse_resume(uploaded_file):
    """Parse an uploaded resume (PDF or DOCX) and return its text content.

    The file type is decided from the extension of ``uploaded_file.name``
    (case-insensitive). A name without any dot has no extension and is
    rejected, so a file literally named ``pdf`` is not mistaken for a PDF.

    Args:
        uploaded_file: The file object from Streamlit's file_uploader. It
            must expose ``name`` (the file name) and ``getvalue()`` (the
            raw bytes).

    Returns:
        str: The text content of the resume.

    Raises:
        ValueError: If no file is given, the file type is not supported,
            or parsing fails.
    """
    if uploaded_file is None:
        raise ValueError("No file uploaded.")

    # rpartition yields an empty separator when the name contains no dot;
    # in that case the whole name must not be treated as the extension.
    _, dot, suffix = uploaded_file.name.rpartition('.')
    file_extension = suffix.lower() if dot else ""

    # Validate the type before copying the upload's bytes.
    if file_extension not in ('pdf', 'docx'):
        raise ValueError(f"Unsupported file type: '{file_extension}'. Please upload a PDF or DOCX file.")

    # We use BytesIO to handle the file in memory, giving the parser a
    # fresh stream positioned at the start.
    file_stream = io.BytesIO(uploaded_file.getvalue())
    if file_extension == 'pdf':
        return parse_pdf(file_stream)
    return parse_docx(file_stream)