TalentScout-Ai / backend /pdf_parser.py
GitHub Actions
Deploy FastAPI backend (backend.main:app) via GitHub Actions
31e79c4
import pymupdf as fitz
import re
from difflib import get_close_matches
def extract_text_from_pdf(file_path: str) -> str:
doc = fitz.open(file_path)
text = ""
for page in doc:
text += page.get_text()
doc.close()
return text
def parse_resume_text(text: str) -> dict:
"""Enhanced resume parsing with skill validation"""
lines = [line.strip() for line in text.split('\n') if line.strip()]
text_lower = text.lower()
extracted = {
"name": "",
"email": "",
"phone": "",
"skills": [],
"experience": ""
}
# Valid skills database for matching
valid_skills = [
'FastAPI', 'React', 'Next.js', 'Flask', 'MongoDB', 'Tailwind CSS',
'Machine Learning', 'Python', 'JavaScript', 'HTML', 'CSS', 'Node.js',
'Docker', 'Kubernetes', 'AWS', 'Git', 'GitHub', 'TensorFlow', 'PyTorch',
'Streamlit', 'Qdrant', 'LangChain', 'Gemini API', 'OpenAI', 'Gradio',
'Pandas', 'NumPy', 'Scikit-learn', 'OpenCV', 'Django', 'Vue.js',
'Angular', 'TypeScript', 'PostgreSQL', 'MySQL', 'Redis', 'GraphQL',
'RESTful API', 'Microservices', 'CI/CD', 'Linux', 'Ubuntu', 'Nginx',
'Apache', 'Jenkins', 'Terraform', 'Ansible', 'Elasticsearch'
]
# Extract Email using regex
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email_match = re.search(email_pattern, text)
if email_match:
extracted["email"] = email_match.group()
# Extract Phone using regex
phone_pattern = r'\b(?:\+91|91)?[6-9]\d{9}\b'
phone_match = re.search(phone_pattern, text)
if phone_match:
extracted["phone"] = phone_match.group()
# Extract Name
for i, line in enumerate(lines[:10]):
skip_keywords = ['course', 'email', 'mobile', 'cgpa', 'academic', 'details']
if any(keyword in line.lower() for keyword in skip_keywords):
continue
if re.match(r'^[A-Z][A-Z\s]+$', line) and len(line.split()) >= 2:
extracted["name"] = line.title()
break
# Extract and clean skills
raw_skills = []
# Look for explicit skill mentions
for skill in valid_skills:
if skill.lower() in text_lower:
raw_skills.append(skill)
# Extract from common skill patterns
skill_patterns = [
r'built with (.*?)(?:\.|,|;|\n)',
r'using (.*?)(?:\.|,|;|\n)',
r'technologies?:?\s*(.*?)(?:\.|,|;|\n)',
r'skills?:?\s*(.*?)(?:\.|,|;|\n)',
r'stack:?\s*(.*?)(?:\.|,|;|\n)'
]
for pattern in skill_patterns:
matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
for match in matches:
# Split by common delimiters
words = re.split(r'[,\.\sand\s&\s]+', match.strip())
for word in words:
word = word.strip()
if len(word) > 2:
# Try to match with valid skills using fuzzy matching
close_matches = get_close_matches(word, valid_skills, n=1, cutoff=0.7)
if close_matches:
raw_skills.append(close_matches[0])
# Remove duplicates and limit
extracted["skills"] = list(set(raw_skills))[:12]
# Extract Experience
exp_patterns = [
r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
r'experience\s*:?\s*(\d+)\+?\s*years?'
]
for pattern in exp_patterns:
match = re.search(pattern, text_lower)
if match:
extracted["experience"] = f"{match.group(1)} years"
break
if not extracted["experience"]:
if 'intern' in text_lower and 'b.tech' in text_lower:
extracted["experience"] = "0-1 years (Student/Intern)"
else:
extracted["experience"] = "Fresher"
return extracted