# SMART_AI_RESUME/utils/resume_parser.py
# parthib07's picture
# Upload 531 files
# d7d3dff verified
import pypdf
import docx
import re
from io import BytesIO
class ResumeParser:
    """Extract plain text from PDF/DOCX resumes and detect skill keywords.

    Extraction is best-effort: any failure while reading a document is
    printed and reported to the caller as an empty string rather than raised.
    """

    # Lower-case keywords looked for by ``extract_skills``; results keep
    # this order.
    SKILL_KEYWORDS = (
        'python', 'java', 'javascript', 'html', 'css', 'sql', 'react', 'angular', 'vue',
        'node', 'express', 'django', 'flask', 'spring', 'docker', 'kubernetes', 'aws',
        'azure', 'git', 'jenkins', 'jira',
    )

    # One compiled pattern per keyword. A keyword only counts when it is not
    # glued to other ASCII letters, so "javascript" no longer reports "java"
    # and "digital" no longer reports "git". Digits are allowed to follow
    # ("html5", "python3"), and an optional "js" suffix keeps "reactjs" /
    # "nodejs" matching. Compounds such as "mysql" or "github" are
    # deliberately NOT counted as "sql" / "git".
    _SKILL_PATTERNS = tuple(
        (keyword, re.compile(r'(?<![a-z])' + re.escape(keyword) + r'(?:js)?(?![a-z])'))
        for keyword in SKILL_KEYWORDS
    )

    def __init__(self):
        """Create a parser; no per-instance state is kept."""

    @staticmethod
    def _read_bytes(source):
        """Return the raw content of *source*.

        File-like objects (anything with ``read``) are read from their
        current position and then rewound so the caller can read them again;
        any other value is assumed to already be the raw bytes and is
        returned unchanged.
        """
        if not hasattr(source, 'read'):
            return source
        data = source.read()
        if hasattr(source, 'seek'):
            source.seek(0)  # Reset file pointer for later readers
        return data

    def extract_text_from_pdf(self, pdf_file):
        """Return the text of every page of a PDF, one page per line block.

        Args:
            pdf_file: A binary file-like object or the raw PDF bytes.

        Returns:
            The stripped text of all pages joined by newlines, or ``""`` when
            the document cannot be read (the error is printed).
        """
        try:
            reader = pypdf.PdfReader(BytesIO(self._read_bytes(pdf_file)))
            # Pages without extractable text contribute an empty line.
            pages = [page.extract_text() or "" for page in reader.pages]
            return "\n".join(pages).strip()
        except Exception as e:
            # Best-effort boundary: report the problem and fall back to "".
            print(f"Error extracting text from PDF: {e}")
            return ""

    def extract_text_from_docx(self, docx_file):
        """Return the paragraph text of a DOCX document.

        Args:
            docx_file: A binary file-like object or the raw DOCX bytes.

        Returns:
            The stripped paragraph texts joined by newlines, or ``""`` when
            the document cannot be read (the error is printed).
        """
        try:
            document = docx.Document(BytesIO(self._read_bytes(docx_file)))
            return "\n".join(p.text for p in document.paragraphs).strip()
        except Exception as e:
            # Best-effort boundary: report the problem and fall back to "".
            print(f"Error extracting text from DOCX: {e}")
            return ""

    def extract_text(self, file):
        """Extract text from an uploaded file, dispatching on its extension.

        Args:
            file: A file-like object with a ``name`` attribute ending in
                ``.pdf`` or ``.docx`` (compared case-insensitively).

        Returns:
            The extracted text, or ``""`` for unsupported or unnamed inputs.
        """
        # Reset file pointer to beginning when the object supports it.
        if hasattr(file, 'seek'):
            file.seek(0)
        name = str(getattr(file, 'name', '') or '').lower()
        if name.endswith('.pdf'):
            return self.extract_text_from_pdf(file)
        if name.endswith('.docx'):
            return self.extract_text_from_docx(file)
        return ""

    def extract_skills(self, text):
        """Return the known skill keywords mentioned in *text*.

        Matching is case-insensitive and whole-word (see ``_SKILL_PATTERNS``).

        Args:
            text: Arbitrary resume text; ``None`` or ``""`` yields ``[]``.

        Returns:
            A list of lower-case keywords in ``SKILL_KEYWORDS`` order, each
            appearing at most once.
        """
        if not text:
            return []
        lowered = text.lower()
        return [keyword for keyword, pattern in self._SKILL_PATTERNS
                if pattern.search(lowered)]

    def parse(self, file):
        """Parse a resume file into a simple keyword-based summary.

        Args:
            file: The uploaded file, as accepted by ``extract_text``.

        Returns:
            A dict with ``"skills"`` (detected keywords), ``"experience"``
            and ``"education"`` (currently always empty lists), and
            ``"raw_text"`` (the extracted text).
        """
        text = self.extract_text(file)
        return {
            "skills": self.extract_skills(text),
            "experience": [],
            "education": [],
            "raw_text": text
        }