"""
Enhanced Resume Parser v2.0
Provides structured extraction of skills, experience, projects, and education
with proper normalization and context understanding.
"""
|
|
import json
import re
from dataclasses import MISSING, asdict, dataclass, field, fields
from datetime import date
from typing import Any, Dict, List, Optional

import tiktoken

from llm_client import LLMClient
from metrics import log_metric
|
|
@dataclass
class Experience:
    """One position from the candidate's work history.

    ``title``, ``company`` and ``duration`` are required. Every other
    attribute defaults to an empty string, an empty list or ``False``.
    """

    title: str
    company: str
    duration: str
    location: str = ""
    responsibilities: List[str] = field(default_factory=list)
    achievements: List[str] = field(default_factory=list)
    technologies: List[str] = field(default_factory=list)
    start_date: str = ""
    end_date: str = ""
    is_current: bool = False

    def __post_init__(self):
        """Replace an explicitly supplied ``None`` list with a new empty list."""
        for list_attr in ("responsibilities", "achievements", "technologies"):
            if getattr(self, list_attr) is None:
                setattr(self, list_attr, [])
|
|
| @dataclass |
| class Project: |
| name: str |
| description: str |
| technologies: List[str] = field(default_factory=list) |
| github_url: str = "" |
| demo_url: str = "" |
| duration: str = "" |
| key_features: List[str] = field(default_factory=list) |
| |
| def __post_init__(self): |
| if self.technologies is None: |
| self.technologies = [] |
| if self.key_features is None: |
| self.key_features = [] |
|
|
| @dataclass |
| class Education: |
| degree: str |
| field: str |
| school: str |
| graduation_year: str = "" |
| gpa: str = "" |
| relevant_courses: List[str] = None |
| honors: List[str] = None |
| |
| def __post_init__(self): |
| if self.relevant_courses is None: |
| self.relevant_courses = [] |
| if self.honors is None: |
| self.honors = [] |
|
|
| @dataclass |
| class Skills: |
| technical: List[str] = None |
| programming_languages: List[str] = None |
| frameworks: List[str] = None |
| tools: List[str] = None |
| databases: List[str] = None |
| cloud_platforms: List[str] = None |
| methodologies: List[str] = None |
| soft_skills: List[str] = None |
| |
| def __post_init__(self): |
| for field in ['technical', 'programming_languages', 'frameworks', 'tools', |
| 'databases', 'cloud_platforms', 'methodologies', 'soft_skills']: |
| if getattr(self, field) is None: |
| setattr(self, field, []) |
|
|
| @dataclass |
| class ResumeData: |
| personal_info: Dict[str, str] |
| summary: str |
| skills: Skills |
| experience: List[Experience] |
| education: List[Education] |
| projects: List[Project] |
| certifications: List[Dict[str, str]] |
| languages: List[str] |
| years_of_experience: int = 0 |
| |
| def __post_init__(self): |
| if not self.certifications: |
| self.certifications = [] |
| if not self.languages: |
| self.languages = [] |
|
|
class SkillsNormalizer:
    """Normalizes and categorizes skills with synonym detection.

    ``skill_synonyms`` maps a canonical skill name to the lower-case
    spellings that should collapse into it. ``skill_categories`` maps a
    category name to the canonical skills that belong to it. Canonical skills
    listed in no category are reported under ``"technical"``.
    """

    def __init__(self):
        self.skill_synonyms = {
            # Programming languages
            "python": ["python", "python3", "python 3", "py"],
            "javascript": ["javascript", "js", "node.js", "nodejs", "node js"],
            "typescript": ["typescript", "ts"],
            "java": ["java", "java 8", "java 11", "java 17"],
            "csharp": ["c#", "csharp", "c sharp", ".net", "dotnet"],
            "cpp": ["c++", "cpp", "c plus plus"],
            "go": ["go", "golang"],
            "rust": ["rust", "rust-lang"],
            "swift": ["swift", "ios development"],
            "kotlin": ["kotlin", "android development"],
            "r": ["r", "r programming"],
            "scala": ["scala"],
            "php": ["php", "php 7", "php 8"],
            # "ruby on rails" / "ror" live only under "rails" below. Listing
            # them here too made the first match win, so the Rails framework
            # was reported as the Ruby language.
            "ruby": ["ruby"],

            # Frameworks
            "react": ["react", "reactjs", "react.js", "react js"],
            "angular": ["angular", "angularjs", "angular 2+"],
            "vue": ["vue", "vue.js", "vuejs", "vue js"],
            "svelte": ["svelte", "sveltekit"],
            "django": ["django", "django rest framework", "drf"],
            "flask": ["flask", "flask-restful"],
            "fastapi": ["fastapi", "fast api"],
            "express": ["express", "express.js", "expressjs"],
            "spring": ["spring", "spring boot", "spring framework"],
            "laravel": ["laravel"],
            "rails": ["rails", "ruby on rails", "ror"],

            # Databases
            "postgresql": ["postgresql", "postgres", "pg", "psql"],
            "mysql": ["mysql", "my sql"],
            "mongodb": ["mongodb", "mongo", "mongo db"],
            "redis": ["redis"],
            "elasticsearch": ["elasticsearch", "elastic search"],
            "cassandra": ["cassandra", "apache cassandra"],
            "dynamodb": ["dynamodb", "dynamo db"],
            "sqlite": ["sqlite", "sqlite3"],

            # Cloud platforms
            "aws": ["aws", "amazon web services", "amazon aws"],
            "azure": ["azure", "microsoft azure"],
            "gcp": ["gcp", "google cloud", "google cloud platform"],
            "heroku": ["heroku"],
            "digitalocean": ["digitalocean", "digital ocean"],
            "vercel": ["vercel"],
            "netlify": ["netlify"],

            # DevOps and tooling
            "docker": ["docker", "containerization", "containers"],
            "kubernetes": ["kubernetes", "k8s", "container orchestration"],
            "jenkins": ["jenkins", "ci/cd"],
            "github actions": ["github actions", "gh actions"],
            "terraform": ["terraform", "infrastructure as code", "iac"],
            "ansible": ["ansible"],
            "git": ["git", "version control", "source control"],
            "linux": ["linux", "unix", "ubuntu", "centos"],

            # Data science and machine learning
            "machine learning": ["machine learning", "ml", "artificial intelligence", "ai"],
            "deep learning": ["deep learning", "neural networks"],
            "tensorflow": ["tensorflow", "tf"],
            "pytorch": ["pytorch", "torch"],
            "scikit-learn": ["scikit-learn", "sklearn", "scikit learn"],
            "pandas": ["pandas", "data manipulation"],
            "numpy": ["numpy", "numerical computing"],
            "matplotlib": ["matplotlib", "data visualization"],
            "seaborn": ["seaborn"],
            "jupyter": ["jupyter", "jupyter notebooks"],

            # Testing
            "pytest": ["pytest", "python testing"],
            "jest": ["jest", "javascript testing"],
            "selenium": ["selenium", "web automation"],
            "cypress": ["cypress", "e2e testing"],

            # Methodologies and architecture
            "agile": ["agile", "scrum", "kanban"],
            "devops": ["devops", "dev ops"],
            "microservices": ["microservices", "micro services"],
            "rest api": ["rest", "rest api", "restful", "api development"],
            "graphql": ["graphql", "graph ql"],
        }

        self.skill_categories = {
            "programming_languages": ["python", "javascript", "typescript", "java", "csharp", "cpp", "go", "rust", "swift", "kotlin", "r", "scala", "php", "ruby"],
            "frameworks": ["react", "angular", "vue", "svelte", "django", "flask", "fastapi", "express", "spring", "laravel", "rails"],
            "databases": ["postgresql", "mysql", "mongodb", "redis", "elasticsearch", "cassandra", "dynamodb", "sqlite"],
            "cloud_platforms": ["aws", "azure", "gcp", "heroku", "digitalocean", "vercel", "netlify"],
            "tools": ["docker", "kubernetes", "jenkins", "github actions", "terraform", "ansible", "git", "linux", "pytest", "jest", "selenium", "cypress"],
            "methodologies": ["agile", "devops", "microservices", "rest api", "graphql"]
        }

    def normalize_skill(self, skill: str) -> Optional[str]:
        """Normalize a skill to its canonical form.

        Args:
            skill: Raw skill text; case and surrounding whitespace are ignored.

        Returns:
            The canonical name when ``skill`` is a known synonym. Otherwise
            the lower-cased, stripped text, or ``None`` when that text is
            shorter than two characters or ``skill`` is not a string.
        """
        if not isinstance(skill, str):
            # Upstream JSON may contain nulls or numbers in skill lists.
            return None

        skill_lower = skill.lower().strip()

        for canonical, synonyms in self.skill_synonyms.items():
            if skill_lower in synonyms:
                return canonical

        # Unknown one-character tokens are treated as noise.
        return skill_lower if len(skill_lower) > 1 else None

    def categorize_skill(self, normalized_skill: str) -> str:
        """Return the category of a canonical skill, or ``"technical"`` if unlisted."""
        for category, skills in self.skill_categories.items():
            if normalized_skill in skills:
                return category
        return "technical"

    def normalize_skill_list(self, skills: List[str]) -> Dict[str, List[str]]:
        """Normalize and categorize a list of skills.

        Args:
            skills: Raw skill strings, possibly with duplicates and synonyms.

        Returns:
            A dict keyed by category. Each value holds the canonical skills
            in first-seen order without duplicates. Entries that normalize
            to ``None`` are dropped.
        """
        categorized: Dict[str, List[str]] = {
            "programming_languages": [],
            "frameworks": [],
            "databases": [],
            "cloud_platforms": [],
            "tools": [],
            "methodologies": [],
            "technical": []
        }

        for skill in skills:
            normalized = self.normalize_skill(skill)
            if not normalized:
                continue
            # setdefault tolerates a category that exists in
            # skill_categories but not in the literal above.
            bucket = categorized.setdefault(self.categorize_skill(normalized), [])
            if normalized not in bucket:
                bucket.append(normalized)

        return categorized
|
|
class EnhancedResumeParser:
    """Enhanced resume parser with structured extraction and normalization.

    ``run`` is the entry point. It first tries a single-prompt LLM
    extraction and, when that produces nothing, falls back to a regex-only
    extraction of contact details and skills.
    """

    # Resumes above this token count skip the single-prompt LLM path.
    MAX_SINGLE_PROMPT_TOKENS = 15000
    # Completion budget handed to the LLM client.
    LLM_MAX_OUTPUT_TOKENS = 4000

    # Heading phrases (matched against a whole, punctuation-trimmed line)
    # and the section key each one selects.
    _SECTION_PATTERNS = (
        (re.compile(r'(experience|work experience|employment|professional experience)'), 'experience'),
        (re.compile(r'(education|academic background)'), 'education'),
        (re.compile(r'(skills|technical skills|core competencies)'), 'skills'),
        (re.compile(r'(projects|personal projects|side projects)'), 'projects'),
        (re.compile(r'(summary|objective|profile)'), 'summary'),
        (re.compile(r'(certifications|licenses)'), 'certifications'),
    )
    # Strips decoration such as "==", "#", "-" or "*" around a heading.
    _HEADING_TRIM_RE = re.compile(r'^[^a-z0-9]+|[^a-z0-9]+$')

    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    _PHONE_RE = re.compile(r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})')
    _LINKEDIN_RE = re.compile(r'linkedin\.com/in/[\w-]+', re.IGNORECASE)
    _GITHUB_RE = re.compile(r'github\.com/[\w-]+', re.IGNORECASE)

    # Keywords scanned for by the regex-only skills fallback.
    _TECH_KEYWORDS = (
        'Python', 'JavaScript', 'Java', 'C++', 'React', 'Node.js', 'SQL',
        'AWS', 'Docker', 'Git', 'Machine Learning', 'Data Science',
        'TensorFlow', 'PyTorch', 'Pandas', 'NumPy', 'Django', 'Flask',
        'PostgreSQL', 'MongoDB', 'Redis', 'Kubernetes', 'Jenkins',
    )

    _SKILL_FIELDS = (
        'technical', 'programming_languages', 'frameworks', 'tools',
        'databases', 'cloud_platforms', 'methodologies', 'soft_skills',
    )

    # A four-digit year optionally followed by a month ("2020-01", "2020/1").
    _YEAR_MONTH_RE = re.compile(r'(\d{4})(?:[-/.](\d{1,2}))?')
    _OPEN_ENDED_DATES = frozenset({"present", "current", "now", "ongoing"})

    def __init__(self):
        self.skills_normalizer = SkillsNormalizer()
        self.llm_client = LLMClient()

    async def run(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Parse ``data["resume_text"]`` and attach the result to a copy of ``data``.

        Args:
            data: Input mapping; only the ``"resume_text"`` key is read.

        Returns:
            A new dict with all of ``data`` plus ``"resume_data_enhanced"``.
            That key holds the ``ResumeData`` as a plain dict on success, or
            ``{"error": ...}`` when the text is missing or parsing raised.
        """
        resume_text = data.get("resume_text", "")

        if not resume_text:
            return {**data, "resume_data_enhanced": {"error": "No resume content provided"}}

        try:
            resume_data = await self._extract_resume_data_structured(resume_text)

            log_metric("resume_parse_enhanced_success", {
                "skills_count": len(self._get_all_skills(resume_data.skills)),
                "experience_count": len(resume_data.experience),
                "projects_count": len(resume_data.projects),
                "years_experience": resume_data.years_of_experience
            })

            return {**data, "resume_data_enhanced": asdict(resume_data)}

        except Exception as e:
            # Top-level boundary: report the failure in-band instead of raising.
            log_metric("resume_parse_enhanced_error", {"error": str(e)})
            return {**data, "resume_data_enhanced": {"error": f"Enhanced resume parsing failed: {e}"}}

    async def _extract_resume_data_structured(self, resume_text: str) -> ResumeData:
        """Try LLM extraction first, then fall back to regex-only extraction."""
        try:
            structured_data = await self._llm_extract_structured(resume_text)
            if structured_data:
                return structured_data
        except Exception as e:
            log_metric("resume_llm_extraction_error", {"error": str(e)})

        return await self._section_based_extraction(resume_text)

    async def _llm_extract_structured(self, resume_text: str) -> Optional[ResumeData]:
        """Use the LLM to extract structured resume data.

        Returns:
            The parsed ``ResumeData``, or ``None`` when the LLM call fails or
            its reply is not a JSON object. Resumes longer than
            ``MAX_SINGLE_PROMPT_TOKENS`` are delegated to
            ``_chunked_extraction`` instead of being sent in one prompt.
        """
        if self._count_tokens(resume_text) > self.MAX_SINGLE_PROMPT_TOKENS:
            return await self._chunked_extraction(resume_text)

        prompt = self._build_extraction_prompt(resume_text)
        response = ""

        try:
            # NOTE(review): call_llm is invoked without await, so it is
            # presumably synchronous and blocks the event loop. Confirm
            # against LLMClient.
            response = self.llm_client.call_llm(
                prompt, temperature=0, max_tokens=self.LLM_MAX_OUTPUT_TOKENS
            )

            payload = json.loads(self._extract_json_payload(response))
            if not isinstance(payload, dict):
                log_metric("resume_json_parse_error", {
                    "error": "LLM response is not a JSON object",
                    "response": response[:500],
                })
                return None

            return self._convert_to_resume_data(payload)

        except json.JSONDecodeError as e:
            log_metric("resume_json_parse_error", {"error": str(e), "response": response[:500]})
            return None
        except Exception as e:
            log_metric("resume_llm_error", {"error": str(e)})
            return None

    @staticmethod
    def _extract_json_payload(response: str) -> str:
        """Return the text from the first ``{`` to the last ``}``.

        Falls back to the full response when no such span exists.
        """
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start != -1 and json_end > json_start:
            return response[json_start:json_end]
        return response

    @staticmethod
    def _build_extraction_prompt(resume_text: str) -> str:
        """Build the single-prompt extraction request for ``resume_text``."""
        return f"""
        Extract comprehensive structured data from this resume. Return ONLY valid JSON with this exact structure:

        {{
            "personal_info": {{
                "name": "Full Name",
                "email": "email@domain.com",
                "phone": "+1234567890",
                "location": "City, State",
                "linkedin": "linkedin.com/in/username",
                "github": "github.com/username",
                "website": "personal-website.com"
            }},
            "summary": "Professional summary or objective statement",
            "skills": {{
                "technical": ["skill1", "skill2"],
                "programming_languages": ["Python", "JavaScript"],
                "frameworks": ["React", "Django"],
                "tools": ["Git", "Docker"],
                "databases": ["PostgreSQL", "MongoDB"],
                "cloud_platforms": ["AWS", "Azure"],
                "methodologies": ["Agile", "DevOps"],
                "soft_skills": ["Leadership", "Communication"]
            }},
            "experience": [
                {{
                    "title": "Job Title",
                    "company": "Company Name",
                    "duration": "Jan 2020 - Present",
                    "location": "City, State",
                    "start_date": "2020-01",
                    "end_date": "Present",
                    "is_current": true,
                    "responsibilities": ["responsibility 1", "responsibility 2"],
                    "achievements": ["achievement 1", "achievement 2"],
                    "technologies": ["tech1", "tech2"]
                }}
            ],
            "education": [
                {{
                    "degree": "Bachelor of Science",
                    "field": "Computer Science",
                    "school": "University Name",
                    "graduation_year": "2020",
                    "gpa": "3.8",
                    "relevant_courses": ["Data Structures", "Algorithms"],
                    "honors": ["Dean's List", "Magna Cum Laude"]
                }}
            ],
            "projects": [
                {{
                    "name": "Project Name",
                    "description": "Brief description of the project",
                    "technologies": ["tech1", "tech2"],
                    "github_url": "github.com/user/repo",
                    "demo_url": "live-demo-url.com",
                    "duration": "3 months",
                    "key_features": ["feature1", "feature2"]
                }}
            ],
            "certifications": [
                {{
                    "name": "Certification Name",
                    "issuer": "Organization",
                    "date": "2023",
                    "credential_id": "123456"
                }}
            ],
            "languages": ["English (Native)", "Spanish (Conversational)"],
            "years_of_experience": 5
        }}

        Important guidelines:
        1. Extract ALL skills mentioned, including those in job descriptions and projects
        2. Normalize technology names (e.g., "React.js" → "React", "ML" → "Machine Learning")
        3. Calculate years_of_experience from work history
        4. Parse dates in YYYY-MM format when possible
        5. Group similar skills appropriately
        6. Extract quantifiable achievements when possible
        7. If information is missing, omit the field or use empty array/string

        Resume text:
        {resume_text}
        """

    async def _chunked_extraction(self, resume_text: str) -> ResumeData:
        """Extract data from long resumes by processing one section at a time."""
        sections = self._split_resume_sections(resume_text)

        personal_info = await self._extract_personal_info(sections.get("header", ""))
        summary = await self._extract_summary(sections.get("summary", ""))
        skills = await self._extract_skills(sections.get("skills", ""))
        experience = await self._extract_experience(sections.get("experience", ""))
        education = await self._extract_education(sections.get("education", ""))
        projects = await self._extract_projects(sections.get("projects", ""))
        certifications = await self._extract_certifications(sections.get("certifications", ""))

        years_exp = self._calculate_years_experience(experience)

        return ResumeData(
            personal_info=personal_info,
            summary=summary,
            skills=skills,
            experience=experience,
            education=education,
            projects=projects,
            certifications=certifications,
            languages=[],
            years_of_experience=years_exp
        )

    async def _section_based_extraction(self, resume_text: str) -> ResumeData:
        """Fallback extraction using regex: contact details and skills only."""
        personal_info = self._extract_personal_info_regex(resume_text)
        skills = self._extract_skills_regex(resume_text)

        return ResumeData(
            personal_info=personal_info,
            summary="",
            skills=skills,
            experience=[],
            education=[],
            projects=[],
            certifications=[],
            languages=[],
            years_of_experience=0
        )

    def _convert_to_resume_data(self, data: Dict[str, Any]) -> ResumeData:
        """Convert the LLM's parsed JSON object into a ``ResumeData``.

        The prompt allows the model to omit missing fields, and the model may
        also add keys of its own. Records are therefore built only from the
        keys each dataclass declares, with omitted required fields defaulting
        to ``""``, so one imperfect entry does not discard the whole
        extraction.
        """
        skills_data = data.get("skills") or {}
        if not isinstance(skills_data, dict):
            skills_data = {}

        # Soft skills have no entry in the normalizer's tables. Sending them
        # through it would file them all under "technical".
        hard_skills: List[str] = []
        for category, names in skills_data.items():
            if category == "soft_skills" or not isinstance(names, list):
                continue
            hard_skills.extend(name for name in names if isinstance(name, str))

        soft_skills: List[str] = []
        raw_soft = skills_data.get("soft_skills")
        if isinstance(raw_soft, list):
            for name in raw_soft:
                cleaned = name.strip() if isinstance(name, str) else ""
                if cleaned and cleaned not in soft_skills:
                    soft_skills.append(cleaned)

        normalized_skills = self.skills_normalizer.normalize_skill_list(hard_skills)
        skills = Skills(**normalized_skills, soft_skills=soft_skills)

        return ResumeData(
            personal_info=data.get("personal_info") or {},
            summary=data.get("summary") or "",
            skills=skills,
            experience=self._build_records(Experience, data.get("experience")),
            education=self._build_records(Education, data.get("education")),
            projects=self._build_records(Project, data.get("projects")),
            certifications=data.get("certifications") or [],
            languages=data.get("languages") or [],
            years_of_experience=self._coerce_years(data.get("years_of_experience", 0))
        )

    @staticmethod
    def _build_records(record_cls, raw_entries: Any) -> list:
        """Instantiate ``record_cls`` once per dict found in ``raw_entries``.

        Unknown keys are ignored and omitted required fields default to
        ``""``. Anything that is not a list of dicts contributes nothing.
        """
        if not isinstance(raw_entries, list):
            return []

        records = []
        for raw in raw_entries:
            if not isinstance(raw, dict):
                continue
            kwargs = {}
            for spec in fields(record_cls):
                if spec.name in raw:
                    kwargs[spec.name] = raw[spec.name]
                elif spec.default is MISSING and spec.default_factory is MISSING:
                    # Required field the model left out.
                    kwargs[spec.name] = ""
            records.append(record_cls(**kwargs))
        return records

    @staticmethod
    def _coerce_years(value: Any) -> int:
        """Return ``value`` as a non-negative int, or 0 when it is not numeric."""
        try:
            return max(0, int(float(value)))
        except (TypeError, ValueError, OverflowError):
            return 0

    def _get_all_skills(self, skills: Skills) -> List[str]:
        """Get all skills, soft skills included, as a flat list."""
        all_skills: List[str] = []
        for category in self._SKILL_FIELDS:
            all_skills.extend(getattr(skills, category, None) or [])
        return all_skills

    def _split_resume_sections(self, resume_text: str) -> Dict[str, str]:
        """Split resume text into sections keyed by a normalized section name.

        A line is a heading only when the whole line (or the part before its
        first colon) is one of the known heading phrases once surrounding
        punctuation is trimmed. Sentences that merely contain a word such as
        "experience" stay in the section body. Text after ``Heading:`` on the
        same line is kept as the first line of that section, and a heading
        that appears more than once accumulates into the same key. Lines
        before the first heading are stored under ``"header"``.
        """
        collected: Dict[str, List[str]] = {}
        current_section = "header"

        for line in resume_text.split('\n'):
            heading = self._parse_section_heading(line)
            if heading is None:
                collected.setdefault(current_section, []).append(line)
                continue

            current_section, remainder = heading
            bucket = collected.setdefault(current_section, [])
            if remainder:
                bucket.append(remainder)

        return {name: '\n'.join(lines) for name, lines in collected.items() if lines}

    @classmethod
    def _parse_section_heading(cls, line: str):
        """Return ``(section_name, inline_text)`` for a heading line, else ``None``."""
        label, colon, inline_text = line.partition(':')
        label = cls._HEADING_TRIM_RE.sub('', label.strip().lower())
        if not label:
            return None

        for pattern, section_name in cls._SECTION_PATTERNS:
            if pattern.fullmatch(label):
                return section_name, (inline_text.strip() if colon else "")
        return None

    def _extract_personal_info_regex(self, text: str) -> Dict[str, str]:
        """Extract email, phone, LinkedIn and GitHub links using regex.

        Returns:
            A dict with the keys ``email``, ``phone``, ``linkedin`` and
            ``github``. Each value is the first match found, or ``""``.
        """
        email_match = self._EMAIL_RE.search(text)
        phone_match = self._PHONE_RE.search(text)
        linkedin_match = self._LINKEDIN_RE.search(text)
        github_match = self._GITHUB_RE.search(text)

        return {
            "email": email_match.group() if email_match else "",
            "phone": phone_match.group() if phone_match else "",
            "linkedin": f"https://{linkedin_match.group()}" if linkedin_match else "",
            "github": f"https://{github_match.group()}" if github_match else ""
        }

    def _extract_skills_regex(self, text: str) -> Skills:
        """Extract skills by scanning ``text`` for a fixed keyword list."""
        # Lookarounds replace \b because \b cannot match after a trailing
        # non-word character, so "C++" was never detected.
        found_skills = [
            keyword for keyword in self._TECH_KEYWORDS
            if re.search(rf'(?<!\w){re.escape(keyword)}(?!\w)', text, re.IGNORECASE)
        ]

        normalized = self.skills_normalizer.normalize_skill_list(found_skills)

        return Skills(**normalized)

    def _calculate_years_experience(self, experience: List[Experience]) -> int:
        """Calculate total years of experience from position dates.

        Each position contributes the months between ``start_date`` and
        ``end_date``, with "Present"-style or ``is_current`` positions ending
        today. Overlapping positions are counted once. When no position has a
        usable date range the result falls back to one year per position,
        which is the estimate this method previously always returned.
        """
        if not experience:
            return 0

        today = date.today()
        this_month = today.year * 12 + (today.month - 1)

        spans = []
        for job in experience:
            start = self._month_index(job.start_date)
            if start is None:
                continue

            end_text = job.end_date.strip().lower() if isinstance(job.end_date, str) else ""
            if job.is_current or end_text in self._OPEN_ENDED_DATES:
                end = this_month
            else:
                end = self._month_index(job.end_date)

            if end is not None and end >= start:
                spans.append((start, end))

        if not spans:
            return len(experience)

        # Merge overlapping ranges so concurrent jobs are not double counted.
        spans.sort()
        total_months = 0
        span_start, span_end = spans[0]
        for start, end in spans[1:]:
            if start <= span_end:
                span_end = max(span_end, end)
            else:
                total_months += span_end - span_start
                span_start, span_end = start, end
        total_months += span_end - span_start

        return total_months // 12

    @classmethod
    def _month_index(cls, value: Any) -> Optional[int]:
        """Return ``year * 12 + month - 1`` for the first date in ``value``.

        Returns ``None`` when ``value`` is not a string or holds no
        four-digit year. A missing or out-of-range month counts as January.
        """
        if not isinstance(value, str):
            return None
        match = cls._YEAR_MONTH_RE.search(value)
        if match is None:
            return None

        month = int(match.group(2)) if match.group(2) else 1
        if not 1 <= month <= 12:
            month = 1
        return int(match.group(1)) * 12 + (month - 1)

    def _count_tokens(self, text: str) -> int:
        """Count tokens in ``text``, estimating when the tokenizer fails."""
        try:
            encoding = tiktoken.encoding_for_model("gpt-4o-mini")
            return len(encoding.encode(text))
        except Exception:
            # Best-effort estimate of four characters per token.
            return len(text) // 4

    # Per-section extractors used by _chunked_extraction. The experience,
    # education, projects and certifications extractors are placeholders
    # that currently return nothing.

    async def _extract_personal_info(self, text: str) -> Dict[str, str]:
        """Extract contact details from the header section via regex."""
        return self._extract_personal_info_regex(text)

    async def _extract_summary(self, text: str) -> str:
        """Return the summary section with surrounding whitespace removed."""
        return text.strip()

    async def _extract_skills(self, text: str) -> Skills:
        """Extract skills from the skills section via keyword regex."""
        return self._extract_skills_regex(text)

    async def _extract_experience(self, text: str) -> List[Experience]:
        """Placeholder: always returns an empty list."""
        return []

    async def _extract_education(self, text: str) -> List[Education]:
        """Placeholder: always returns an empty list."""
        return []

    async def _extract_projects(self, text: str) -> List[Project]:
        """Placeholder: always returns an empty list."""
        return []

    async def _extract_certifications(self, text: str) -> List[Dict[str, str]]:
        """Placeholder: always returns an empty list."""
        return []