resume-analyzer / utils /resume_parser.py
devmalik-official's picture
Deploy Gradio app
1aea493
import PyPDF2
from docx import Document
from pathlib import Path
import io
def extract_text_from_pdf(file_content):
    """Extract text from a PDF supplied as raw bytes.

    Args:
        file_content: Binary content of the PDF file.

    Returns:
        The text of all pages joined by newlines, with leading and
        trailing whitespace stripped.

    Raises:
        Exception: If the PDF cannot be opened or read. The message is
            prefixed with "Error reading PDF: " and the original error is
            attached as the cause.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # A page without a text layer (e.g. a scanned image) can produce an
        # empty result -- and, depending on the PyPDF2 release, possibly
        # None. Treat any falsy result as "" so one such page does not
        # abort extraction of the whole document.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "\n".join(page_texts).strip()
    except Exception as e:
        raise Exception(f"Error reading PDF: {str(e)}") from e
def extract_text_from_docx(file_content):
    """Extract text from a DOCX document supplied as raw bytes.

    Paragraph text is emitted first (one line per paragraph), followed by
    table content: every cell's text is followed by a space and every
    table row is terminated by a newline. The combined text is stripped
    of leading and trailing whitespace.

    Args:
        file_content: Binary content of the DOCX file.

    Returns:
        The extracted text.

    Raises:
        Exception: If the document cannot be read; the message is
            prefixed with "Error reading DOCX: ".
    """
    try:
        document = Document(io.BytesIO(file_content))

        # Collect the fragments in order and join once at the end.
        fragments = [paragraph.text + "\n" for paragraph in document.paragraphs]
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                fragments.extend(tbl_cell.text + " " for tbl_cell in tbl_row.cells)
                fragments.append("\n")

        return "".join(fragments).strip()
    except Exception as err:
        raise Exception(f"Error reading DOCX: {err!s}")
def extract_text_from_txt(file_content):
    """Extract text from a plain-text file supplied as raw bytes.

    The bytes are decoded by trying, in order:

    1. UTF-16, only when the data starts with a UTF-16 byte-order mark.
    2. UTF-8, silently dropping a leading UTF-8 byte-order mark.
    3. Windows-1252.
    4. Latin-1, which maps every byte and therefore always succeeds.

    Args:
        file_content: Binary content of the text file.

    Returns:
        The decoded text with leading and trailing whitespace stripped.

    Raises:
        Exception: If ``file_content`` cannot be processed (for example it
            is not a bytes-like object). The message is prefixed with
            "Error reading TXT: " and the original error is attached as
            the cause.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 too, but also strips a BOM, which
        # str.strip() would otherwise leave in the result as "\ufeff".
        encodings = ["utf-8-sig", "cp1252", "latin-1"]
        # 0xFF/0xFE never occur in valid UTF-8, so checking the UTF-16 BOM
        # first cannot misclassify UTF-8 input.
        if file_content.startswith((b"\xff\xfe", b"\xfe\xff")):
            encodings.insert(0, "utf-16")

        for encoding in encodings:
            try:
                return file_content.decode(encoding).strip()
            except UnicodeDecodeError:
                # Not this encoding; fall through to the next candidate.
                continue
    except Exception as e:
        raise Exception(f"Error reading TXT: {str(e)}") from e
def parse_resume(file_content, file_extension):
    """Parse a resume by dispatching on its file extension.

    The extension is matched case-insensitively; surrounding whitespace is
    ignored and a missing leading dot is added, so ".PDF", "pdf" and
    " .pdf " all select the PDF parser.

    Args:
        file_content: Binary file content.
        file_extension: File extension (.pdf, .docx, .doc, .txt).

    Returns:
        Extracted text from the resume.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    file_extension = file_extension.strip().lower()
    if file_extension and not file_extension.startswith("."):
        file_extension = f".{file_extension}"

    if file_extension == ".pdf":
        return extract_text_from_pdf(file_content)
    # NOTE(review): ".doc" is handed to the DOCX reader. Legacy binary
    # Word files are presumably not readable by it and would surface as an
    # "Error reading DOCX" failure -- confirm whether ".doc" should stay.
    if file_extension in (".docx", ".doc"):
        return extract_text_from_docx(file_content)
    if file_extension == ".txt":
        return extract_text_from_txt(file_content)
    raise ValueError(f"Unsupported file format: {file_extension}")
def extract_from_uploaded_file(uploaded_file):
    """Extract text from an uploaded resume.

    Args:
        uploaded_file: Either a file-like upload object exposing ``name``
            and ``read()`` (returning bytes), or a filesystem path given
            as ``str`` or ``pathlib.Path``.

    Returns:
        Extracted text.

    Raises:
        ValueError: If the file extension is not supported by
            ``parse_resume``.
    """
    # Some UI frameworks hand over the path of a temporary file instead of
    # an open stream; read it directly in that case.
    if isinstance(uploaded_file, (str, Path)):
        path = Path(uploaded_file)
        return parse_resume(path.read_bytes(), path.suffix.lower())

    file_extension = Path(uploaded_file.name).suffix.lower()

    # If the stream was already consumed by an earlier read, read() would
    # return b"". Rewind first when the object supports it.
    seekable = getattr(uploaded_file, "seekable", None)
    if callable(seekable) and seekable():
        uploaded_file.seek(0)

    file_content = uploaded_file.read()
    return parse_resume(file_content, file_extension)