# NOTE: removed page-scrape artifacts ("Spaces:" / "Runtime error") that were
# captured from the hosting page and are not part of this source file.
| """ | |
| ATS Resume Optimizer - Powered by Claude 3.5 Sonnet (Anthropic SOTA) | |
| Optimizes CV/Resume for ATS platforms and generates professional PDF | |
| """ | |
| import gradio as gr | |
| import re | |
| import os | |
| import json | |
| from typing import Dict, List, Tuple | |
| from collections import Counter | |
| import tempfile | |
# API keys are read from the environment ONLY. SECURITY FIX: a previous
# revision committed a live Anthropic key as the fallback default here —
# that key is leaked and must be rotated; never hard-code credentials.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# OpenAI API key (optional fallback provider).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
| # ============== RESUME PARSER (GENERALIZED) ============== | |
def parse_resume(file) -> str:
    """Extract raw text from an uploaded resume file.

    Dispatches on the file extension (pdf / docx / doc / txt). Errors are
    reported as strings rather than raised, so the UI can display them.
    Returns "" when no file was provided.
    """
    if file is None:
        return ""
    # Gradio uploads expose a .name attribute; plain path strings do not.
    path = file.name if hasattr(file, 'name') else str(file)
    ext = path.rsplit('.', 1)[-1].lower()
    try:
        if ext == 'pdf':
            return _parse_pdf(path)
        if ext in ('docx', 'doc'):
            return _parse_docx(path)
        if ext == 'txt':
            with open(path, 'r', encoding='utf-8') as handle:
                return handle.read()
        return f"Unsupported format: {ext}"
    except Exception as exc:
        return f"Error parsing file: {str(exc)}"
| def _parse_pdf(file_path: str) -> str: | |
| """Parse PDF with proper line preservation.""" | |
| text = "" | |
| # Try PyMuPDF first (best line preservation) | |
| try: | |
| import fitz # PyMuPDF | |
| doc = fitz.open(file_path) | |
| for page in doc: | |
| page_text = page.get_text() | |
| if page_text: | |
| text += page_text + "\n" | |
| doc.close() | |
| if text.strip(): | |
| return _clean_resume_text(text) | |
| except: | |
| pass | |
| # Fallback to pdfplumber | |
| try: | |
| import pdfplumber | |
| with pdfplumber.open(file_path) as pdf: | |
| for page in pdf.pages: | |
| page_text = page.extract_text() | |
| if page_text: | |
| text += page_text + "\n" | |
| if text.strip(): | |
| return _clean_resume_text(text) | |
| except: | |
| pass | |
| # Final fallback to PyPDF2 | |
| try: | |
| import PyPDF2 | |
| with open(file_path, 'rb') as f: | |
| reader = PyPDF2.PdfReader(f) | |
| for page in reader.pages: | |
| page_text = page.extract_text() | |
| if page_text: | |
| text += page_text + "\n" | |
| return _clean_resume_text(text) | |
| except Exception as e: | |
| return f"Error: {str(e)}" | |
| def _parse_docx(file_path: str) -> str: | |
| """Parse DOCX file.""" | |
| try: | |
| from docx import Document | |
| doc = Document(file_path) | |
| text = "\n".join([p.text for p in doc.paragraphs]) | |
| return _clean_resume_text(text) | |
| except Exception as e: | |
| return f"Error: {str(e)}" | |
def _clean_resume_text(text: str) -> str:
    """Clean and merge fragmented PDF text while preserving name on first line.

    PDF extractors often split one logical line across several physical lines
    and emit private-use glyphs for bullets. This pass (1) maps common bullet
    glyphs and `(cid:NNN)` escapes to 'β’', then (2) re-merges continuation
    lines into the bullet/heading they belong to, keeping section headers,
    bullet lines, company lines, and the first ~4 lines (name/title/contact)
    on their own lines.
    """
    # Fix common PDF encoding issues FIRST
    text = text.replace('(cid:127)', 'β’')
    text = text.replace('(cid:128)', 'β’')
    text = text.replace('β', 'β’')
    text = text.replace('β', 'β’')
    text = text.replace('βͺ', 'β’')
    text = text.replace('β ', 'β’')
    text = text.replace('β¦', 'β’')
    text = text.replace('\uf0b7', 'β’')  # Unicode bullet (Symbol-font private-use glyph)
    text = text.replace('\u2022', 'β’')  # Unicode bullet  NOTE(review): '\u2022' IS 'β’', so this is a no-op
    # NOTE(review): the glyphs tested here were already replaced above, and the
    # '+' quantifier matches multi-char runs that can never equal a single
    # glyph from the list — this sub appears to be a no-op; confirm before removing.
    text = re.sub(r'[^\x00-\x7F]+', lambda m: 'β’' if m.group() in ['β', 'β', 'βͺ', 'β '] else m.group(), text)
    lines = text.split('\n')
    # Uppercase prefixes that mark the start of a new resume section.
    section_headers = [
        'PROFESSIONAL SUMMARY', 'SUMMARY', 'OBJECTIVE', 'PROFILE', 'ABOUT',
        'PROFESSIONAL EXPERIENCE', 'EXPERIENCE', 'EMPLOYMENT', 'WORK HISTORY', 'CAREER',
        'EDUCATION', 'ACADEMIC', 'QUALIFICATIONS',
        'SKILLS', 'TECHNICAL SKILLS', 'CORE COMPETENCIES', 'COMPETENCIES', 'EXPERTISE', 'TECHNOLOGIES',
        'CERTIFICATIONS', 'CERTIFICATES', 'LICENSES',
        'PROJECTS', 'PORTFOLIO', 'ACHIEVEMENTS', 'AWARDS',
        'PUBLICATIONS', 'RESEARCH', 'VOLUNTEER', 'INTERESTS', 'LEADERSHIP', 'COMMUNITY', 'COMPETITIVE'
    ]
    merged_lines = []
    current_line = ""
    line_count = 0  # Track which line we're on (non-empty lines only)
    for line in lines:
        # Collapse internal whitespace runs to single spaces.
        line = re.sub(r'\s+', ' ', line).strip()
        # Clean any remaining cid patterns
        line = re.sub(r'\(cid:\d+\)', 'β’', line)
        if not line:
            continue
        line_count += 1
        line_upper = line.upper().strip()
        is_header = any(line_upper.startswith(h) or line_upper == h for h in section_headers)
        is_bullet = line.startswith('β’') or line.startswith('*') or line.startswith('-')
        # Heuristic for "COMPANY NAME | City, ST" / "COMPANY ... 2020" lines.
        is_company = bool(re.match(r'^[A-Z][A-Z\s&\.,]+(\s*[\|β-]\s*|\s+)(.*\d{4}|[A-Z][a-z]+,?\s+[A-Z]{2})', line))
        # First few lines (1-4) are typically: Name, Title, Contact, Links - keep them separate
        is_header_line = line_count <= 4 and len(line) < 100
        starts_new = is_header or is_bullet or is_company or is_header_line
        if starts_new:
            # Flush the previous logical line before starting a new one;
            # normalize '*' bullets to 'β’ ' at the same time.
            if current_line:
                merged_lines.append(current_line)
            current_line = line if not line.startswith('*') else 'β’ ' + line[1:].strip()
        elif current_line:
            # Continuation fragment: glue onto the current logical line.
            current_line += ' ' + line
        else:
            current_line = line
    if current_line:
        merged_lines.append(current_line)
    return '\n'.join(merged_lines)
| # ============== RESUME POST-PROCESSOR ============== | |
| def post_process_resume_format(resume_text: str) -> str: | |
| """Clean up and enforce consistent professional formatting on resume output.""" | |
| lines = resume_text.split('\n') | |
| processed_lines = [] | |
| # Section headers that should be standardized | |
| section_keywords = { | |
| 'professional summary': 'PROFESSIONAL SUMMARY', | |
| 'summary': 'PROFESSIONAL SUMMARY', | |
| 'profile': 'PROFESSIONAL SUMMARY', | |
| 'objective': 'PROFESSIONAL SUMMARY', | |
| 'professional experience': 'PROFESSIONAL EXPERIENCE', | |
| 'experience': 'PROFESSIONAL EXPERIENCE', | |
| 'work experience': 'PROFESSIONAL EXPERIENCE', | |
| 'employment': 'PROFESSIONAL EXPERIENCE', | |
| 'work history': 'PROFESSIONAL EXPERIENCE', | |
| 'education': 'EDUCATION', | |
| 'academic background': 'EDUCATION', | |
| 'skills': 'SKILLS', | |
| 'technical skills': 'TECHNICAL SKILLS', | |
| 'core competencies': 'CORE COMPETENCIES', | |
| 'key skills': 'KEY SKILLS', | |
| 'certifications': 'CERTIFICATIONS', | |
| 'certificates': 'CERTIFICATIONS', | |
| 'projects': 'PROJECTS', | |
| 'achievements': 'ACHIEVEMENTS', | |
| 'awards': 'AWARDS & RECOGNITION', | |
| 'publications': 'PUBLICATIONS', | |
| 'languages': 'LANGUAGES', | |
| } | |
| for i, line in enumerate(lines): | |
| stripped = line.strip() | |
| # Skip empty lines but preserve them for spacing | |
| if not stripped: | |
| processed_lines.append('') | |
| continue | |
| # Skip metadata lines | |
| if stripped.startswith('__CANDIDATE_NAME__'): | |
| processed_lines.append(stripped) | |
| continue | |
| # Standardize section headers | |
| stripped_lower = stripped.lower().replace('=', '').replace('-', '').strip() | |
| if stripped_lower in section_keywords: | |
| # Add separator before section header | |
| if processed_lines and processed_lines[-1] != '': | |
| processed_lines.append('') | |
| processed_lines.append('=' * 80) | |
| processed_lines.append(section_keywords[stripped_lower]) | |
| processed_lines.append('=' * 80) | |
| continue | |
| # Check for === section dividers (keep them clean) | |
| if stripped.startswith('=') and stripped.endswith('=') and len(stripped) > 10: | |
| processed_lines.append('β' * 80) | |
| continue | |
| # Check for βββ Unicode dividers (keep them) | |
| if stripped.startswith('β') and len(stripped) > 10: | |
| processed_lines.append('β' * 80) | |
| continue | |
| # Check for --- dividers | |
| if stripped.startswith('-') and len(stripped) > 10 and stripped.replace('-', '') == '': | |
| processed_lines.append('β' * 80) | |
| continue | |
| # Standardize bullet points | |
| if stripped.startswith(('-', '*', '>', 'β', 'β')) and len(stripped) > 2: | |
| bullet_text = stripped.lstrip('-*>ββ ') | |
| processed_lines.append(f'β’ {bullet_text}') | |
| continue | |
| # Ensure bullet points have proper spacing | |
| if stripped.startswith('β’'): | |
| if not stripped.startswith('β’ '): | |
| stripped = 'β’ ' + stripped[1:].lstrip() | |
| processed_lines.append(stripped) | |
| continue | |
| # Handle company/date lines - ensure proper formatting | |
| if '|' in stripped and any(year in stripped for year in ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025', 'Present']): | |
| processed_lines.append(stripped) | |
| continue | |
| # Pass through other lines | |
| processed_lines.append(stripped) | |
| result = '\n'.join(processed_lines) | |
| # Clean up excessive blank lines | |
| result = re.sub(r'\n{4,}', '\n\n\n', result) | |
| return result | |
| # ============== CLAUDE 3.5 SONNET OPTIMIZER ============== | |
def optimize_with_llm(resume_text: str, job_description: str) -> Tuple[str, List[str]]:
    """Optimize a resume against a job description with Claude, with guardrails.

    Returns ``(optimized_resume, user_facing_messages)``. Guardrails:
    - rejects resumes/job descriptions that are too short (optimize, never fabricate);
    - falls back to `enhanced_optimize` (deterministic keyword/format pass)
      whenever the model output is truncated, drops original dates/metrics,
      or the API call fails.
    """
    # VALIDATION: Reject empty or minimal resumes - we optimize, not fabricate
    resume_stripped = resume_text.strip()
    word_count = len(resume_stripped.split())
    if len(resume_stripped) < 100 or word_count < 20:
        return resume_text, [
            "β ERROR: Resume is too short to optimize.",
            f" Your resume has only {len(resume_stripped)} characters and {word_count} words.",
            " Please provide a complete resume with at least:",
            " - Contact information",
            " - Work experience (with dates)",
            " - Skills section",
            " - Education",
            " Minimum required: 100+ characters, 20+ words"
        ]
    if not job_description.strip() or len(job_description.strip()) < 30:
        return resume_text, [
            "β ERROR: Job description is too short.",
            " Please provide a job description with at least 30 characters."
        ]
    try:
        import anthropic  # third-party; failure is caught below and triggers the fallback
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        # Pre-extract the highest-signal JD keywords (frequency >= 2) for the prompt.
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'have', 'we', 'you', 'they', 'this', 'that', 'will', 'would', 'should', 'can', 'may', 'our', 'your', 'their', 'work', 'working', 'role', 'job', 'candidate', 'looking', 'experience', 'years', 'ability', 'team', 'including', 'across', 'within'}
        jd_words = re.findall(r'\b[a-zA-Z]{3,}\b', job_description.lower())
        keyword_counts = Counter([w for w in jd_words if w not in stop_words])
        top_jd_keywords = [w for w, c in keyword_counts.most_common(25) if c >= 2]
        divider = 'β' * 80  # section rule used throughout the template
        prompt = f"""Transform this resume into a clean, professionally formatted document optimized for ATS systems.
## STRICT OUTPUT TEMPLATE (follow EXACTLY)
```
JOHN SMITH
Senior Software Engineer
john.smith@email.com | (555) 123-4567 | New York, NY | linkedin.com/in/johnsmith
{divider}
PROFESSIONAL SUMMARY
{divider}
Results-driven professional with X+ years of experience in [field]. Proven track record of [key achievement]. Expert in [key skills from job description].
{divider}
PROFESSIONAL EXPERIENCE
{divider}
COMPANY NAME | City, State
Job Title | Jan 2020 β Present
β’ Spearheaded [initiative] resulting in [quantified outcome]
β’ Developed [solution] that improved [metric] by X%
β’ Led cross-functional team of X to deliver [project]
PREVIOUS COMPANY | City, State
Previous Title | Jan 2018 β Dec 2019
β’ Managed [responsibility] serving X+ clients
β’ Achieved [result] through [action]
{divider}
EDUCATION
{divider}
Master of Science in Computer Science | Stanford University | 2018
Bachelor of Science in Computer Science | UC Berkeley | 2016
{divider}
TECHNICAL SKILLS
{divider}
Languages: Python, JavaScript, Java, SQL
Frameworks: React, Node.js, Django, FastAPI
Cloud/DevOps: AWS, Docker, Kubernetes, CI/CD
Tools: Git, Jira, Tableau, Terraform
{divider}
CERTIFICATIONS
{divider}
β’ AWS Solutions Architect Professional β Amazon Web Services | 2023
β’ PMP Project Management Professional β PMI | 2022
```
## RULES (CRITICAL):
1. Use EXACTLY the formatting above with βββ horizontal lines between sections
2. Name on line 1 (FIRST LAST only, no titles), professional title on line 2
3. Contact info on line 3 with | separators
4. Every bullet starts with β’ and a STRONG ACTION VERB (Spearheaded, Architected, Delivered, etc.)
5. PRESERVE ALL original facts (dates, numbers, companies, degrees) EXACTLY
6. DO NOT fabricate achievements, certifications, or experiences
7. Integrate these keywords naturally: {', '.join(top_jd_keywords[:15])}
## JOB DESCRIPTION:
{job_description[:2000]}
## ORIGINAL RESUME TO TRANSFORM:
{resume_text}
## OUTPUT (JSON only):
Return ONLY valid JSON with no markdown code blocks:
{{"candidate_name": "THE PERSON'S FULL NAME (e.g. John Smith, NOT a company name)", "optimized_resume": "THE COMPLETE FORMATTED RESUME", "changes_made": ["change1", "change2"], "keywords_added": ["kw1", "kw2"]}}
IMPORTANT: candidate_name must be the PERSON's name (like 'Salim Shaikh'), NOT a company name (like 'JP Morgan')."""
        response = client.messages.create(
            # NOTE(review): comments/docstrings elsewhere say "Claude 3.5
            # Sonnet" but the model pinned here is Haiku — confirm intent.
            model="claude-3-5-haiku-latest",
            max_tokens=8192,
            temperature=0.1,
            messages=[
                {"role": "user", "content": prompt}
            ],
            system="You are an expert executive resume writer. Transform resumes into beautifully formatted, ATS-optimized documents. Output ONLY valid JSON - no markdown code blocks. Follow the template EXACTLY with βββ dividers. Every bullet must start with β’ and an action verb. Preserve all facts."
        )
        result_text = response.content[0].text
        # Strip markdown code fences if the model added them despite instructions.
        if "```json" in result_text:
            result_text = result_text.split("```json")[1].split("```")[0]
        elif "```" in result_text:
            result_text = result_text.split("```")[1].split("```")[0]
        # If there is leading chatter, isolate the outermost {...} object.
        if not result_text.strip().startswith('{'):
            json_match = re.search(r'\{[\s\S]*\}', result_text)
            if json_match:
                result_text = json_match.group()
        result = json.loads(result_text.strip())
        optimized = result.get("optimized_resume", resume_text)
        candidate_name_from_ai = result.get("candidate_name", "")
        changes = result.get("changes_made", [])
        keywords = result.get("keywords_added", [])
        # Store candidate name for PDF generation (consumed downstream).
        if candidate_name_from_ai:
            optimized = f"__CANDIDATE_NAME__:{candidate_name_from_ai}\n" + optimized
        # POST-PROCESS: Clean up and enforce consistent formatting
        optimized = post_process_resume_format(optimized)
        # VALIDATION: If optimized is significantly shorter, AI truncated it - use fallback
        # Lower threshold to 0.6 since AI can legitimately condense verbose resumes
        if len(optimized) < len(resume_text) * 0.6:
            print(f"Warning: AI truncated resume ({len(optimized)} < {len(resume_text) * 0.6}). Using enhanced fallback.")
            optimized = enhanced_optimize(resume_text, job_description)
            changes = ["β οΈ AI response was truncated. Applied keyword-based optimization instead."]
            keywords = []
        # HALLUCINATION CHECK: verify original years survived.
        # BUGFIX: the previous pattern used a capturing group `(19|20)`, so
        # findall() returned only '19'/'20' and the subset check was a no-op;
        # the non-capturing `(?:...)` makes it compare full 4-digit years.
        original_dates = set(re.findall(r'\b(?:19|20)\d{2}\b', resume_text))
        optimized_dates = set(re.findall(r'\b(?:19|20)\d{2}\b', optimized))
        if not original_dates.issubset(optimized_dates):
            missing_dates = original_dates - optimized_dates
            print(f"Warning: AI removed dates: {missing_dates}. Using fallback.")
            optimized = enhanced_optimize(resume_text, job_description)
            changes = ["β οΈ AI modified dates. Applied safe keyword optimization instead."]
        # Check that metrics (percentages, dollar amounts, 'N+') are preserved.
        original_metrics = set(re.findall(r'\d+(?:\.\d+)?%|\$[\d,]+|\d+\+', resume_text))
        optimized_metrics = set(re.findall(r'\d+(?:\.\d+)?%|\$[\d,]+|\d+\+', optimized))
        if len(original_metrics) > 0 and not original_metrics.issubset(optimized_metrics):
            missing_metrics = original_metrics - optimized_metrics
            if len(missing_metrics) > 2:  # Allow minor losses
                print(f"Warning: AI removed metrics: {missing_metrics}. Using fallback.")
                optimized = enhanced_optimize(resume_text, job_description)
                changes = ["β οΈ AI modified metrics. Applied safe keyword optimization instead."]
        suggestions = [f"β {change}" for change in changes[:5]]
        if keywords:
            suggestions.append(f"π Keywords added: {', '.join(keywords[:10])}")
        return optimized, suggestions
    except Exception as e:
        print(f"Claude API Error: {str(e)}")
        return enhanced_optimize(resume_text, job_description), [f"β οΈ AI unavailable: {str(e)[:50]}. Using keyword optimization."]
def enhanced_optimize(resume_text: str, job_description: str) -> str:
    """Deterministic fallback optimizer: action verbs + formatting, no LLM.

    Rewrites weak phrasing to strong action verbs, normalizes bullet markers,
    uppercases known section headers, enforces a blank line before sections,
    and abbreviates month names. JD keywords are extracted for reference but
    deliberately NEVER injected as skills the candidate didn't list.
    """
    optimized = resume_text
    # Extract frequently-repeated (>= 2) keywords from the job description.
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'have', 'has', 'we', 'you', 'they', 'this', 'that', 'will', 'would', 'could', 'should', 'work', 'working', 'role', 'job', 'candidate', 'looking', 'experience', 'years', 'ability', 'team', 'etc', 'such', 'including'}
    jd_words = re.findall(r'\b[a-zA-Z]{4,}\b', job_description.lower())
    # Uses the module-level Counter import; the redundant local
    # `from collections import Counter` was removed.
    keyword_counts = Counter([w for w in jd_words if w not in stop_words])
    # Currently informational only (see note at the bottom of the function).
    top_keywords = [w for w, c in keyword_counts.most_common(15) if c >= 2]
    # Weak phrase -> strong action verb replacements (case-insensitive).
    replacements = {
        r'\bworked on\b': 'developed',
        r'\bhelped\b': 'contributed to',
        r'\bwas responsible for\b': 'managed',
        r'\bhandled\b': 'orchestrated',
        r'\bused\b': 'leveraged',
        r'\bworked with\b': 'collaborated with',
        r'\bmade\b': 'engineered',
        r'\bdid\b': 'executed',
        r'\bran\b': 'spearheaded',
        r'\bbuilt\b': 'architected and built',
    }
    for pattern, replacement in replacements.items():
        optimized = re.sub(pattern, replacement, optimized, flags=re.IGNORECASE)
    # ===== PROFESSIONAL FORMATTING =====
    # Standardize bullets to professional format.
    optimized = re.sub(r'^\s*[\*\-\>]\s*', 'β’ ', optimized, flags=re.MULTILINE)
    # Standardize section headers to uppercase.
    section_headers = ['experience', 'education', 'skills', 'summary', 'objective',
                       'certifications', 'projects', 'awards', 'publications',
                       'professional experience', 'work experience', 'technical skills',
                       'core competencies', 'professional summary', 'career objective']
    for header in section_headers:
        # Match header at start of line with optional trailing colon.
        pattern = rf'^({header})\s*:?\s*$'
        optimized = re.sub(pattern, header.upper(), optimized, flags=re.IGNORECASE | re.MULTILINE)
    # Ensure a blank line precedes each section header.
    for header in ['EXPERIENCE', 'EDUCATION', 'SKILLS', 'CERTIFICATIONS', 'PROJECTS',
                   'AWARDS', 'PUBLICATIONS', 'PROFESSIONAL EXPERIENCE', 'WORK EXPERIENCE',
                   'TECHNICAL SKILLS', 'CORE COMPETENCIES', 'PROFESSIONAL SUMMARY']:
        optimized = re.sub(rf'([^\n])\n({header})', r'\1\n\n\2', optimized)
    # Clean up multiple blank lines to max 2.
    optimized = re.sub(r'\n{3,}', '\n\n', optimized)
    # Standardize date formats (Mon YYYY or YYYY).
    month_map = {
        'january': 'Jan', 'february': 'Feb', 'march': 'Mar', 'april': 'Apr',
        'may': 'May', 'june': 'Jun', 'july': 'Jul', 'august': 'Aug',
        'september': 'Sep', 'october': 'Oct', 'november': 'Nov', 'december': 'Dec'
    }
    for full, abbr in month_map.items():
        optimized = re.sub(rf'\b{full}\b', abbr, optimized, flags=re.IGNORECASE)
    # NOTE: a previous revision scanned the SKILLS section for missing JD
    # keywords but never used the result; that dead code was removed. We do
    # not inject skills the candidate didn't claim (no fabrication).
    return optimized
def basic_optimize(resume_text: str, job_description: str) -> str:
    """Fallback basic optimization without LLM.

    Thin alias kept for backward compatibility with older call sites;
    delegates directly to enhanced_optimize().
    """
    return enhanced_optimize(resume_text, job_description)
| # ============== ATS COMPATIBILITY ANALYZER (ADVANCED) ============== | |
| class ATSCompatibilityAnalyzer: | |
| """ | |
| Advanced ATS scoring using multiple techniques: | |
| - TF-IDF weighted keyword matching | |
| - Stemming for word variations | |
| - Fuzzy matching for abbreviations | |
| - Experience years parsing | |
| - Skills taxonomy mapping | |
| - Education & job title matching | |
| """ | |
| def __init__(self): | |
| # Weights based on Jobscan research: | |
| # - 76.4% recruiters filter by SKILLS (most important) | |
| # - 59.7% filter by EDUCATION | |
| # - 55.3% filter by JOB TITLE | |
| # - 50.6% filter by CERTIFICATIONS | |
| # - 44% filter by YEARS OF EXPERIENCE | |
| self.weights = { | |
| 'keyword_match': 0.30, # Primary: keyword matching (highest priority) | |
| 'skills_match': 0.25, # Skills matching (76.4% of recruiters) | |
| 'semantic_match': 0.15, # Job title/role matching (55.3%) | |
| 'experience_match': 0.12, # Experience years (44%) | |
| 'format_score': 0.08, # ATS-friendly formatting | |
| 'section_score': 0.05, # Standard sections present | |
| 'action_verbs': 0.03, # Impact-oriented language | |
| 'quantification': 0.02 # Measurable achievements | |
| } | |
| self.action_verbs = [ | |
| 'achieved', 'administered', 'analyzed', 'architected', 'automated', | |
| 'built', 'collaborated', 'conducted', 'created', 'delivered', 'designed', | |
| 'developed', 'directed', 'drove', 'engineered', 'established', | |
| 'executed', 'generated', 'implemented', 'improved', 'increased', | |
| 'launched', 'led', 'managed', 'optimized', 'orchestrated', | |
| 'reduced', 'resolved', 'spearheaded', 'streamlined', 'transformed', | |
| 'accelerated', 'consolidated', 'converted', 'customized', 'decreased', | |
| 'enhanced', 'exceeded', 'expanded', 'facilitated', | |
| 'formulated', 'founded', 'identified', 'initiated', 'innovated', | |
| 'integrated', 'leveraged', 'maximized', 'mentored', 'modernized', | |
| 'negotiated', 'outperformed', 'pioneered', 'produced', 'programmed', | |
| 'proposed', 'redesigned', 'revamped', 'scaled', 'standardized', | |
| # Additional common verbs | |
| 'supported', 'trained', 'utilized', 'validated', 'verified', | |
| 'wrote', 'maintained', 'monitored', 'performed', 'presented', | |
| 'processed', 'provided', 'published', 'recommended', 'researched', | |
| 'reviewed', 'supervised', 'tested', 'tracked', 'updated', | |
| 'coordinated', 'defined', 'demonstrated', 'documented', 'ensured', | |
| 'evaluated', 'examined', 'extracted', 'gathered', 'guided', | |
| 'handled', 'influenced', 'instructed', 'interpreted', 'investigated', | |
| 'modeled', 'organized', 'oversaw', 'prepared', 'prioritized', | |
| # More verbs found missing in tests | |
| 'advised', 'allocated', 'appointed', 'approved', 'assigned', | |
| 'assisted', 'attained', 'authored', 'calculated', 'captured', | |
| 'chaired', 'clarified', 'coached', 'collected', 'communicated', | |
| 'compiled', 'completed', 'composed', 'computed', 'conceptualized', | |
| 'conserved', 'constructed', 'consulted', 'contracted', 'controlled', | |
| 'convinced', 'cultivated', 'delegated', 'deployed', 'devised', | |
| 'diagnosed', 'discovered', 'dispatched', 'earned', 'edited', | |
| 'educated', 'enabled', 'encouraged', 'enforced', 'enlisted', | |
| 'equipped', 'estimated', 'examined', 'expedited', 'fabricated', | |
| 'finalized', 'forecasted', 'fulfilled', 'gained', 'hired', | |
| 'hosted', 'illustrated', 'improved', 'incorporated', 'increased', | |
| 'inspected', 'installed', 'instituted', 'introduced', 'invented', | |
| 'issued', 'lectured', 'licensed', 'logged', 'marketed', | |
| 'mediated', 'merged', 'motivated', 'navigated', 'obtained', | |
| 'operated', 'ordered', 'originated', 'partnered', 'planned', | |
| 'predicted', 'prescribed', 'prevented', 'promoted', 'protected', | |
| 'purchased', 'qualified', 'raised', 'ranked', 'rated', | |
| 'realized', 'received', 'recognized', 'recruited', 'rectified', | |
| 'referred', 'regulated', 'rehabilitated', 'reinforced', 'rendered', | |
| 'reorganized', 'repaired', 'replaced', 'reported', 'represented', | |
| 'rescued', 'restored', 'restructured', 'retrieved', 'safeguarded', | |
| 'saved', 'screened', 'secured', 'selected', 'served', | |
| 'shaped', 'simplified', 'solved', 'sorted', 'specified', | |
| 'sponsored', 'staffed', 'steered', 'strengthened', 'structured', | |
| 'studied', 'submitted', 'succeeded', 'summarized', 'superseded', | |
| 'supervised', 'surpassed', 'surveyed', 'sustained', 'targeted', | |
| 'taught', 'terminated', 'traded', 'transcribed', 'transferred', | |
| 'translated', 'tripled', 'troubleshot', 'tutored', 'uncovered', | |
| 'unified', 'upgraded', 'validated', 'valued', 'visualized', | |
| 'widened', 'won', 'worked', 'wrote', | |
| # Additional common verbs from test failures | |
| 'closed', 'grew', 'covered', 'published', 'filled', 'supported', | |
| 'provided', 'trained', 'responded', 'triaged', 'maintained', | |
| 'advised', 'drafted', 'reviewed', 'researched', 'processed', | |
| 'migrated', 'architected', 'scaled', 'resolved', 'tested', | |
| ] | |
| # Skills taxonomy - maps related terms (COMPREHENSIVE FOR ALL DOMAINS) | |
| self.skills_taxonomy = { | |
| # ============== TECHNOLOGY / SOFTWARE ============== | |
| 'python': ['python', 'py', 'python3', 'python2'], | |
| 'java': ['java', 'java8', 'java11', 'jvm', 'spring boot', 'spring'], | |
| 'javascript': ['javascript', 'js', 'node.js', 'nodejs', 'react', 'angular', 'vue', 'typescript'], | |
| 'sql': ['sql', 'mysql', 'postgresql', 'postgres', 'sql server', 'tsql', 'plsql', 'oracle'], | |
| 'api': ['api', 'apis', 'rest api', 'restful', 'rest', 'graphql', 'soap'], | |
| 'agile': ['agile', 'scrum', 'kanban', 'sprint', 'jira', 'waterfall'], | |
| 'ci/cd': ['ci/cd', 'cicd', 'ci cd', 'continuous integration', 'continuous deployment', 'jenkins', 'github actions'], | |
| 'git': ['git', 'github', 'gitlab', 'bitbucket', 'version control', 'svn'], | |
| 'cloud': ['cloud', 'cloud computing', 'cloud services', 'saas', 'paas', 'iaas'], | |
| 'aws': ['aws', 'amazon web services', 'amazon cloud', 'ec2', 's3', 'lambda'], | |
| 'azure': ['azure', 'microsoft azure', 'azure ml', 'azure cloud'], | |
| 'gcp': ['gcp', 'google cloud', 'google cloud platform', 'bigquery'], | |
| 'docker': ['docker', 'containerization', 'containers', 'dockerfile'], | |
| 'kubernetes': ['kubernetes', 'k8s', 'kube', 'container orchestration'], | |
| 'devops': ['devops', 'dev ops', 'sre', 'site reliability'], | |
| 'linux': ['linux', 'unix', 'ubuntu', 'centos', 'redhat', 'bash', 'shell'], | |
| 'networking': ['networking', 'tcp/ip', 'dns', 'vpn', 'firewall', 'load balancer'], | |
| 'security': ['security', 'cybersecurity', 'infosec', 'penetration testing', 'vulnerability'], | |
| # ============== AI / ML / DATA SCIENCE ============== | |
| 'machine learning': ['machine learning', 'ml', 'machine-learning', 'machinelearning'], | |
| 'deep learning': ['deep learning', 'dl', 'deep-learning', 'neural networks', 'neural nets'], | |
| 'artificial intelligence': ['artificial intelligence', 'ai', 'a.i.', 'a.i'], | |
| 'natural language processing': ['natural language processing', 'nlp', 'text mining', 'text analytics'], | |
| 'data science': ['data science', 'data scientist', 'datascience', 'ds'], | |
| 'tensorflow': ['tensorflow', 'tf', 'tensor flow', 'keras'], | |
| 'pytorch': ['pytorch', 'torch', 'py torch'], | |
| 'llm': ['llm', 'large language model', 'large language models', 'llms', 'chatgpt', 'gpt'], | |
| 'generative ai': ['generative ai', 'genai', 'gen ai', 'gen-ai'], | |
| 'computer vision': ['computer vision', 'cv', 'image recognition', 'object detection'], | |
| 'langchain': ['langchain', 'lang chain', 'langgraph'], | |
| 'rag': ['rag', 'retrieval augmented generation', 'similarity search'], | |
| 'embeddings': ['embedding', 'embeddings', 'vector embeddings', 'word embeddings'], | |
| 'mlops': ['mlops', 'ml ops', 'machine learning operations'], | |
| 'spark': ['spark', 'pyspark', 'apache spark', 'spark sql'], | |
| 'hadoop': ['hadoop', 'hdfs', 'mapreduce', 'hive'], | |
| # ============== DATA / ANALYTICS ============== | |
| 'tableau': ['tableau', 'tableau desktop', 'tableau server'], | |
| 'power bi': ['power bi', 'powerbi', 'power-bi', 'pbi'], | |
| 'excel': ['excel', 'ms excel', 'microsoft excel', 'spreadsheet', 'vlookup', 'pivot table'], | |
| 'data analysis': ['data analysis', 'data analytics', 'analytics', 'analytical'], | |
| 'statistics': ['statistics', 'statistical', 'statistical analysis', 'regression', 'hypothesis'], | |
| 'visualization': ['visualization', 'data visualization', 'dashboards', 'reporting'], | |
| 'etl': ['etl', 'extract transform load', 'data pipeline', 'data integration'], | |
| 'business intelligence': ['business intelligence', 'bi', 'reporting', 'insights'], | |
| 'forecasting': ['forecasting', 'prediction', 'predictive', 'time series'], | |
| # ============== FINANCE / ACCOUNTING ============== | |
| 'financial analysis': ['financial analysis', 'financial modeling', 'financial planning', 'fp&a'], | |
| 'accounting': ['accounting', 'accountant', 'bookkeeping', 'ledger'], | |
| 'gaap': ['gaap', 'generally accepted accounting principles', 'ifrs'], | |
| 'budgeting': ['budgeting', 'budget', 'forecasting', 'variance analysis'], | |
| 'auditing': ['auditing', 'audit', 'internal audit', 'external audit', 'sox'], | |
| 'tax': ['tax', 'taxation', 'tax planning', 'tax compliance', 'tax return'], | |
| 'cpa': ['cpa', 'certified public accountant', 'cma', 'cfa'], | |
| 'investment': ['investment', 'investing', 'portfolio', 'asset management'], | |
| 'banking': ['banking', 'bank', 'commercial banking', 'retail banking'], | |
| 'risk management': ['risk management', 'risk', 'risk assessment', 'credit risk', 'market risk'], | |
| 'compliance': ['compliance', 'regulatory', 'regulations', 'regulatory compliance'], | |
| 'valuation': ['valuation', 'dcf', 'discounted cash flow', 'comparable analysis'], | |
| 'mergers': ['mergers', 'm&a', 'acquisitions', 'merger', 'due diligence'], | |
| 'bloomberg': ['bloomberg', 'bloomberg terminal', 'reuters', 'factset'], | |
| 'quickbooks': ['quickbooks', 'quick books', 'sage', 'xero', 'netsuite'], | |
| # ============== MARKETING / SALES ============== | |
| 'marketing': ['marketing', 'digital marketing', 'marketing strategy', 'brand'], | |
| 'seo': ['seo', 'search engine optimization', 'organic search', 'keywords'], | |
| 'sem': ['sem', 'search engine marketing', 'ppc', 'pay per click', 'google ads'], | |
| 'social media': ['social media', 'social media marketing', 'facebook', 'instagram', 'linkedin', 'twitter'], | |
| 'content marketing': ['content marketing', 'content strategy', 'content creation', 'copywriting'], | |
| 'email marketing': ['email marketing', 'email campaigns', 'mailchimp', 'hubspot email'], | |
| 'crm': ['crm', 'customer relationship management', 'salesforce', 'hubspot', 'zoho'], | |
| 'salesforce': ['salesforce', 'sfdc', 'salesforce crm', 'salesforce admin'], | |
| 'hubspot': ['hubspot', 'hub spot', 'hubspot crm', 'hubspot marketing'], | |
| 'lead generation': ['lead generation', 'leads', 'prospecting', 'pipeline'], | |
| 'sales': ['sales', 'selling', 'revenue', 'quota', 'target'], | |
| 'b2b': ['b2b', 'business to business', 'enterprise sales', 'corporate sales'], | |
| 'b2c': ['b2c', 'business to consumer', 'retail', 'consumer'], | |
| 'account management': ['account management', 'account manager', 'client management', 'customer success'], | |
| 'market research': ['market research', 'competitive analysis', 'market analysis'], | |
| 'branding': ['branding', 'brand management', 'brand strategy', 'brand identity'], | |
| 'advertising': ['advertising', 'ads', 'ad campaigns', 'media buying'], | |
| 'google analytics': ['google analytics', 'ga', 'analytics', 'web analytics'], | |
| 'conversion': ['conversion', 'conversion rate', 'cro', 'conversion optimization'], | |
| # ============== HEALTHCARE / MEDICAL ============== | |
| 'healthcare': ['healthcare', 'health care', 'medical', 'clinical'], | |
| 'hipaa': ['hipaa', 'hipaa compliance', 'patient privacy', 'phi'], | |
| 'emr': ['emr', 'ehr', 'electronic medical records', 'electronic health records', 'epic', 'cerner'], | |
| 'patient care': ['patient care', 'patient', 'patients', 'bedside', 'clinical care'], | |
| 'nursing': ['nursing', 'nurse', 'rn', 'lpn', 'np', 'nurse practitioner'], | |
| 'medical coding': ['medical coding', 'icd-10', 'cpt', 'medical billing', 'coding'], | |
| 'pharmacy': ['pharmacy', 'pharmacist', 'pharmaceutical', 'medications', 'drugs'], | |
| 'clinical trials': ['clinical trials', 'clinical research', 'research', 'fda'], | |
| 'diagnosis': ['diagnosis', 'diagnostic', 'treatment', 'prognosis'], | |
| 'public health': ['public health', 'epidemiology', 'population health'], | |
| 'mental health': ['mental health', 'behavioral health', 'psychology', 'psychiatry'], | |
| 'telehealth': ['telehealth', 'telemedicine', 'virtual care', 'remote care'], | |
| # ============== HUMAN RESOURCES ============== | |
| 'recruitment': ['recruitment', 'recruiting', 'talent acquisition', 'hiring', 'sourcing'], | |
| 'onboarding': ['onboarding', 'orientation', 'new hire', 'induction'], | |
| 'hris': ['hris', 'hcm', 'workday', 'adp', 'peoplesoft', 'successfactors'], | |
| 'payroll': ['payroll', 'compensation', 'benefits', 'salary'], | |
| 'performance management': ['performance management', 'performance review', 'appraisal', 'feedback'], | |
| 'employee relations': ['employee relations', 'labor relations', 'er', 'workplace'], | |
| 'training': ['training', 'learning and development', 'l&d', 'development'], | |
| 'benefits administration': ['benefits administration', 'benefits', 'health insurance', '401k'], | |
| 'hr compliance': ['hr compliance', 'labor law', 'employment law', 'eeoc', 'fmla'], | |
| 'shrm': ['shrm', 'phr', 'sphr', 'hr certification'], | |
| 'employee engagement': ['employee engagement', 'engagement', 'culture', 'retention'], | |
| 'diversity': ['diversity', 'dei', 'inclusion', 'equity', 'd&i'], | |
| # ============== LEGAL ============== | |
| 'legal': ['legal', 'law', 'attorney', 'lawyer', 'counsel'], | |
| 'contracts': ['contracts', 'contract', 'agreement', 'negotiation', 'drafting'], | |
| 'litigation': ['litigation', 'court', 'trial', 'dispute', 'lawsuit'], | |
| 'corporate law': ['corporate law', 'corporate', 'governance', 'bylaws'], | |
| 'intellectual property': ['intellectual property', 'ip', 'patent', 'trademark', 'copyright'], | |
| 'legal research': ['legal research', 'westlaw', 'lexisnexis', 'case law'], | |
| 'regulatory': ['regulatory', 'regulations', 'compliance', 'policy'], | |
| 'paralegal': ['paralegal', 'legal assistant', 'legal support'], | |
| # ============== OPERATIONS / SUPPLY CHAIN ============== | |
| 'operations': ['operations', 'ops', 'operational', 'operating'], | |
| 'supply chain': ['supply chain', 'scm', 'logistics', 'procurement', 'sourcing'], | |
| 'inventory': ['inventory', 'inventory management', 'stock', 'warehouse'], | |
| 'manufacturing': ['manufacturing', 'production', 'assembly', 'factory'], | |
| 'quality': ['quality', 'quality control', 'qc', 'quality assurance', 'qa'], | |
| 'lean': ['lean', 'lean manufacturing', 'lean six sigma', 'continuous improvement'], | |
| 'six sigma': ['six sigma', '6 sigma', 'dmaic', 'green belt', 'black belt'], | |
| 'process improvement': ['process improvement', 'optimization', 'efficiency', 'streamline'], | |
| 'vendor management': ['vendor management', 'vendor', 'supplier', 'supplier management'], | |
| 'erp': ['erp', 'sap', 'oracle erp', 'enterprise resource planning', 'netsuite'], | |
| 'logistics': ['logistics', 'transportation', 'shipping', 'freight', 'distribution'], | |
| 'project management': ['project management', 'pm', 'pmp', 'project manager'], | |
| 'program management': ['program management', 'program manager', 'portfolio'], | |
| 'change management': ['change management', 'change', 'transformation'], | |
| # ============== ENGINEERING (NON-SOFTWARE) ============== | |
| 'mechanical engineering': ['mechanical engineering', 'mechanical', 'cad', 'solidworks', 'autocad'], | |
| 'electrical engineering': ['electrical engineering', 'electrical', 'circuits', 'pcb'], | |
| 'civil engineering': ['civil engineering', 'civil', 'structural', 'construction'], | |
| 'chemical engineering': ['chemical engineering', 'chemical', 'process engineering'], | |
| 'engineering design': ['engineering design', 'design', 'prototyping', 'testing'], | |
| 'cad': ['cad', 'autocad', 'solidworks', 'catia', 'inventor'], | |
| 'simulation': ['simulation', 'modeling', 'fea', 'cfd', 'ansys'], | |
| # ============== EDUCATION ============== | |
| 'teaching': ['teaching', 'teacher', 'instructor', 'educator', 'professor'], | |
| 'curriculum': ['curriculum', 'curriculum development', 'lesson plans', 'syllabus'], | |
| 'classroom': ['classroom', 'classroom management', 'instruction', 'students'], | |
| 'assessment': ['assessment', 'grading', 'evaluation', 'testing'], | |
| 'e-learning': ['e-learning', 'online learning', 'lms', 'canvas', 'blackboard'], | |
| 'tutoring': ['tutoring', 'tutor', 'mentoring', 'coaching'], | |
| # ============== CUSTOMER SERVICE ============== | |
| 'customer service': ['customer service', 'customer support', 'support', 'helpdesk'], | |
| 'customer experience': ['customer experience', 'cx', 'customer satisfaction', 'csat'], | |
| 'call center': ['call center', 'contact center', 'phone support', 'tickets'], | |
| 'troubleshooting': ['troubleshooting', 'problem solving', 'issue resolution'], | |
| 'zendesk': ['zendesk', 'freshdesk', 'intercom', 'ticketing'], | |
| # ============== CREATIVE / DESIGN ============== | |
| 'graphic design': ['graphic design', 'graphics', 'visual design', 'designer'], | |
| 'adobe': ['adobe', 'photoshop', 'illustrator', 'indesign', 'creative suite'], | |
| 'figma': ['figma', 'sketch', 'invision', 'xd', 'adobe xd'], | |
| 'ui/ux': ['ui/ux', 'ui', 'ux', 'user interface', 'user experience'], | |
| 'web design': ['web design', 'website design', 'responsive design'], | |
| 'video editing': ['video editing', 'premiere', 'final cut', 'after effects'], | |
| 'photography': ['photography', 'photo editing', 'lightroom'], | |
| 'branding design': ['branding design', 'brand design', 'logo design', 'identity'], | |
| # ============== GENERAL PROFESSIONAL SKILLS ============== | |
| 'communication': ['communication', 'communications', 'verbal', 'written', 'presentation'], | |
| 'leadership': ['leadership', 'leader', 'leading', 'management', 'managing'], | |
| 'teamwork': ['teamwork', 'team', 'collaboration', 'collaborative', 'cross-functional'], | |
| 'problem solving': ['problem solving', 'problem-solving', 'analytical', 'critical thinking'], | |
| 'time management': ['time management', 'prioritization', 'multitasking', 'deadline'], | |
| 'organization': ['organization', 'organizational', 'organized', 'detail-oriented'], | |
| 'strategic': ['strategic', 'strategy', 'strategic planning', 'strategic thinking'], | |
| 'stakeholder': ['stakeholder', 'stakeholders', 'stakeholder management'], | |
| 'presentation': ['presentation', 'presentations', 'powerpoint', 'public speaking'], | |
| 'negotiation': ['negotiation', 'negotiate', 'negotiating', 'deal'], | |
| 'decision making': ['decision making', 'decision-making', 'judgment'], | |
| 'mentoring': ['mentoring', 'mentor', 'coaching', 'developing others'], | |
| # Additional terms found missing in tests | |
| 'version control': ['version control', 'git', 'github', 'gitlab', 'bitbucket', 'svn'], | |
| 'analytical': ['analytical', 'analysis', 'analyze', 'analytics', 'analyzing'], | |
| 'verbal': ['verbal', 'verbal communication', 'speaking', 'spoken'], | |
| 'written': ['written', 'written communication', 'writing', 'documentation'], | |
| 'portfolio': ['portfolio', 'portfolios', 'work samples', 'projects'], | |
| 'coaching': ['coaching', 'coach', 'mentoring', 'training', 'developing'], | |
| 'client relationship': ['client relationship', 'client relations', 'customer relationship', 'account management'], | |
| 'care coordination': ['care coordination', 'care management', 'patient coordination', 'case management'], | |
| 'patient safety': ['patient safety', 'safety', 'safe care', 'patient care'], | |
| 'quality improvement': ['quality improvement', 'qi', 'continuous improvement', 'process improvement'], | |
| 'electronic health records': ['electronic health records', 'ehr', 'emr', 'electronic medical records', 'epic', 'cerner'], | |
| 'due diligence': ['due diligence', 'diligence', 'research', 'investigation'], | |
| 'oracle': ['oracle', 'oracle database', 'oracle erp', 'oracle cloud'], | |
| 'testing': ['testing', 'test', 'tests', 'a/b testing', 'quality assurance'], | |
| 'systems': ['systems', 'system', 'information systems', 'it systems'], | |
| 'equity': ['equity', 'dei', 'diversity equity inclusion', 'fairness'], | |
| 'process improvement': ['process improvement', 'process optimization', 'continuous improvement', 'lean'], | |
| # ============== NEW DOMAINS FOR EXTENDED TEST COVERAGE ============== | |
| # Hospitality | |
| 'hospitality': ['hospitality', 'guest services', 'hotel', 'resort', 'lodging'], | |
| 'food service': ['food service', 'f&b', 'food and beverage', 'restaurant', 'dining', 'catering'], | |
| 'culinary': ['culinary', 'chef', 'cooking', 'kitchen', 'cuisine', 'menu'], | |
| 'guest experience': ['guest experience', 'guest satisfaction', 'customer experience', 'service excellence'], | |
| 'reservation': ['reservation', 'booking', 'front desk', 'check-in', 'concierge'], | |
| # Retail | |
| 'retail': ['retail', 'store', 'shop', 'merchandise', 'consumer'], | |
| 'merchandising': ['merchandising', 'merchandise', 'product display', 'visual merchandising', 'planogram'], | |
| 'inventory management': ['inventory management', 'stock management', 'inventory control', 'stockroom'], | |
| 'point of sale': ['point of sale', 'pos', 'cash register', 'checkout', 'transactions'], | |
| 'loss prevention': ['loss prevention', 'asset protection', 'shrinkage', 'theft prevention'], | |
| # Government / Public Sector | |
| 'policy': ['policy', 'public policy', 'policy analysis', 'policy development', 'legislation'], | |
| 'grants': ['grants', 'grant writing', 'grant management', 'federal grants', 'funding'], | |
| 'government': ['government', 'public sector', 'federal', 'state', 'municipal', 'public administration'], | |
| 'regulations': ['regulations', 'regulatory affairs', 'compliance', 'policy compliance'], | |
| 'constituent': ['constituent', 'citizen', 'public', 'stakeholder', 'community'], | |
| # Nonprofit | |
| 'nonprofit': ['nonprofit', 'non-profit', 'ngo', 'charity', 'foundation'], | |
| 'fundraising': ['fundraising', 'development', 'donor relations', 'major gifts', 'annual fund'], | |
| 'volunteer': ['volunteer', 'volunteer management', 'community outreach', 'volunteer coordination'], | |
| 'mission': ['mission', 'mission-driven', 'impact', 'social impact', 'cause'], | |
| 'program management': ['program management', 'program development', 'program evaluation', 'grants management'], | |
| # Insurance | |
| 'insurance': ['insurance', 'underwriting', 'claims', 'policy', 'coverage'], | |
| 'underwriting': ['underwriting', 'risk assessment', 'policy writing', 'premium'], | |
| 'claims processing': ['claims processing', 'claims adjustment', 'claims investigation', 'claim settlement'], | |
| 'actuarial': ['actuarial', 'actuary', 'actuarial analysis', 'risk modeling', 'pricing'], | |
| 'reinsurance': ['reinsurance', 'risk transfer', 'ceding', 'treaty'], | |
| # Trades / Construction | |
| 'construction': ['construction', 'building', 'contractor', 'general contractor', 'renovation'], | |
| 'electrical': ['electrical', 'electrician', 'wiring', 'circuits', 'electrical systems'], | |
| 'plumbing': ['plumbing', 'plumber', 'pipes', 'fixtures', 'water systems'], | |
| 'hvac': ['hvac', 'heating', 'ventilation', 'air conditioning', 'climate control'], | |
| 'carpentry': ['carpentry', 'carpenter', 'woodworking', 'framing', 'finish work'], | |
| 'welding': ['welding', 'welder', 'fabrication', 'metal work', 'steel'], | |
| 'blueprint': ['blueprint', 'schematic', 'technical drawing', 'construction drawings'], | |
| # Real Estate | |
| 'real estate': ['real estate', 'property', 'realty', 'residential', 'commercial'], | |
| 'leasing': ['leasing', 'tenant', 'lease agreement', 'property management', 'rental'], | |
| 'appraisal': ['appraisal', 'valuation', 'property assessment', 'market value'], | |
| 'escrow': ['escrow', 'title', 'closing', 'settlement', 'transaction'], | |
| 'mls': ['mls', 'multiple listing', 'listing', 'property listing'], | |
| # Media / Journalism | |
| 'journalism': ['journalism', 'reporter', 'news', 'press', 'media'], | |
| 'editorial': ['editorial', 'editor', 'editing', 'copy editing', 'proofreading'], | |
| 'broadcast': ['broadcast', 'broadcasting', 'tv', 'radio', 'on-air'], | |
| 'podcast': ['podcast', 'audio', 'podcasting', 'audio production'], | |
| 'publishing': ['publishing', 'publication', 'press', 'print', 'digital publishing'], | |
| # Science / Research | |
| 'research': ['research', 'scientific research', 'laboratory', 'lab work', 'experiments'], | |
| 'biology': ['biology', 'biological', 'life sciences', 'molecular biology', 'microbiology'], | |
| 'chemistry': ['chemistry', 'chemical', 'analytical chemistry', 'organic chemistry'], | |
| 'environmental': ['environmental', 'environment', 'sustainability', 'ecology', 'conservation'], | |
| 'laboratory': ['laboratory', 'lab', 'bench work', 'lab techniques', 'specimen'], | |
| 'scientific method': ['scientific method', 'hypothesis', 'experiments', 'data collection'], | |
| # Consulting | |
| 'consulting': ['consulting', 'consultant', 'advisory', 'advisory services'], | |
| 'strategy consulting': ['strategy consulting', 'strategic consulting', 'management consulting'], | |
| 'implementation': ['implementation', 'deployment', 'rollout', 'go-live', 'execution'], | |
| 'business transformation': ['business transformation', 'transformation', 'change management', 'reorganization'], | |
| 'client engagement': ['client engagement', 'client management', 'engagement', 'delivery'], | |
| # Additional soft skills | |
| 'adaptability': ['adaptability', 'flexible', 'adaptable', 'versatile', 'agility'], | |
| 'attention to detail': ['attention to detail', 'detail-oriented', 'detail oriented', 'meticulous', 'thorough'], | |
| 'creativity': ['creativity', 'creative', 'innovative', 'creative thinking', 'ideation'], | |
| 'initiative': ['initiative', 'self-starter', 'proactive', 'self-motivated'], | |
| 'interpersonal': ['interpersonal', 'interpersonal skills', 'relationship building', 'people skills'], | |
| 'multitasking': ['multitasking', 'multi-tasking', 'juggling priorities', 'handling multiple tasks'], | |
| 'resourcefulness': ['resourcefulness', 'resourceful', 'problem solver', 'solution-oriented'], | |
| } | |
| # Common abbreviation mappings (COMPREHENSIVE FOR ALL DOMAINS) | |
| self.abbreviations = { | |
| # Technology | |
| 'ml': 'machine learning', 'ai': 'artificial intelligence', 'dl': 'deep learning', | |
| 'nlp': 'natural language processing', 'cv': 'computer vision', 'llm': 'large language model', | |
| 'genai': 'generative ai', 'ds': 'data science', 'de': 'data engineering', | |
| 'swe': 'software engineer', 'sde': 'software development engineer', | |
| 'qa': 'quality assurance', 'qc': 'quality control', | |
| 'devops': 'development operations', 'mlops': 'machine learning operations', | |
| 'etl': 'extract transform load', 'eda': 'exploratory data analysis', | |
| 'api': 'application programming interface', 'ui': 'user interface', 'ux': 'user experience', | |
| 'sql': 'structured query language', 'aws': 'amazon web services', 'gcp': 'google cloud platform', | |
| 'saas': 'software as a service', 'paas': 'platform as a service', 'iaas': 'infrastructure as a service', | |
| # Business / Finance | |
| 'kpi': 'key performance indicator', 'roi': 'return on investment', 'yoe': 'years of experience', | |
| 'pm': 'project manager', 'ba': 'business analyst', 'cfo': 'chief financial officer', | |
| 'cto': 'chief technology officer', 'ceo': 'chief executive officer', 'coo': 'chief operating officer', | |
| 'vp': 'vice president', 'svp': 'senior vice president', 'evp': 'executive vice president', | |
| 'p&l': 'profit and loss', 'gaap': 'generally accepted accounting principles', | |
| 'ifrs': 'international financial reporting standards', 'sox': 'sarbanes oxley', | |
| 'cpa': 'certified public accountant', 'cfa': 'chartered financial analyst', | |
| 'cma': 'certified management accountant', 'dcf': 'discounted cash flow', | |
| 'm&a': 'mergers and acquisitions', 'ipo': 'initial public offering', | |
| 'ebitda': 'earnings before interest taxes depreciation amortization', | |
| 'ytd': 'year to date', 'mtd': 'month to date', 'yoy': 'year over year', | |
| 'b2b': 'business to business', 'b2c': 'business to consumer', | |
| # Marketing / Sales | |
| 'seo': 'search engine optimization', 'sem': 'search engine marketing', | |
| 'ppc': 'pay per click', 'cpc': 'cost per click', 'cpm': 'cost per mille', | |
| 'crm': 'customer relationship management', 'cro': 'conversion rate optimization', | |
| 'ctr': 'click through rate', 'cac': 'customer acquisition cost', 'ltv': 'lifetime value', | |
| 'nps': 'net promoter score', 'csat': 'customer satisfaction', | |
| # HR | |
| 'hr': 'human resources', 'hris': 'human resources information system', | |
| 'hcm': 'human capital management', 'phr': 'professional human resources', | |
| 'sphr': 'senior professional human resources', 'shrm': 'society human resources management', | |
| 'dei': 'diversity equity inclusion', 'l&d': 'learning and development', | |
| 'eeoc': 'equal employment opportunity commission', 'fmla': 'family medical leave act', | |
| # Healthcare | |
| 'rn': 'registered nurse', 'lpn': 'licensed practical nurse', 'np': 'nurse practitioner', | |
| 'md': 'medical doctor', 'do': 'doctor of osteopathic medicine', | |
| 'emr': 'electronic medical records', 'ehr': 'electronic health records', | |
| 'hipaa': 'health insurance portability accountability act', 'phi': 'protected health information', | |
| 'icd': 'international classification of diseases', 'cpt': 'current procedural terminology', | |
| # Operations | |
| 'scm': 'supply chain management', 'erp': 'enterprise resource planning', | |
| 'pmp': 'project management professional', 'six sigma': 'six sigma', | |
| 'tqm': 'total quality management', 'jit': 'just in time', | |
| # Legal | |
| 'jd': 'juris doctor', 'llm': 'master of laws', 'ip': 'intellectual property', | |
| 'nda': 'non disclosure agreement', 'sla': 'service level agreement', | |
| } | |
| self.stop_words = { | |
| 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', | |
| 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', | |
| 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', | |
| 'may', 'might', 'must', 'shall', 'can', 'we', 'you', 'they', 'he', 'she', | |
| 'it', 'i', 'me', 'my', 'your', 'our', 'their', 'his', 'her', 'its', | |
| 'this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'how', | |
| 'all', 'each', 'both', 'few', 'more', 'most', 'other', 'some', 'such', | |
| 'no', 'not', 'only', 'same', 'so', 'than', 'too', 'very', 'just', 'also', | |
| 'now', 'as', 'from', 'about', 'into', 'through', 'during', 'before', | |
| 'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then', | |
| 'once', 'here', 'there', 'when', 'where', 'why', 'while', 'any', 'every', | |
| # Generic words that appear in JDs but aren't real keywords | |
| 'strong', 'related', 'junior', 'senior', 'mid', 'level', 'new', 'within', | |
| 'using', 'use', 'used', 'able', 'ability', 'include', 'including', 'includes', | |
| 'need', 'needed', 'needs', 'well', 'good', 'great', 'excellent', 'required', | |
| 'requirements', 'qualification', 'qualifications', 'preferred', 'desired', | |
| 'role', 'position', 'job', 'candidate', 'looking', 'seek', 'seeking', | |
| 'year', 'years', 'month', 'months', 'day', 'days', 'time', 'times', | |
| 'work', 'working', 'worker', 'works', 'join', 'joining', 'team', 'teams', | |
| 'based', 'base', 'company', 'organization', 'organizations', 'firm', | |
| 'make', 'making', 'made', 'get', 'getting', 'got', 'take', 'taking', 'took', | |
| # More generic filler words | |
| 'solid', 'background', 'capabilities', 'capability', 'knowledge', | |
| 'proficiency', 'proficient', 'expertise', 'expert', 'familiar', | |
| 'understanding', 'hands', 'hand', 'proven', 'track', 'record' | |
| } | |
| def _stem_word(self, word: str) -> str: | |
| """Simple Porter-like stemming for common suffixes.""" | |
| word = word.lower().strip() | |
| # Common suffix patterns | |
| suffixes = [ | |
| ('ational', 'ate'), ('tional', 'tion'), ('enci', 'ence'), ('anci', 'ance'), | |
| ('izer', 'ize'), ('isation', 'ize'), ('ization', 'ize'), ('ation', 'ate'), | |
| ('ator', 'ate'), ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'), | |
| ('ousness', 'ous'), ('aliti', 'al'), ('iviti', 'ive'), ('biliti', 'ble'), | |
| ('ling', 'l'), ('ment', ''), ('ness', ''), ('ity', ''), ('ies', 'y'), | |
| ('ing', ''), ('ed', ''), ('er', ''), ('ly', ''), ('es', ''), ('s', '') | |
| ] | |
| for suffix, replacement in suffixes: | |
| if word.endswith(suffix) and len(word) > len(suffix) + 2: | |
| return word[:-len(suffix)] + replacement | |
| return word | |
| def _fuzzy_match(self, word1: str, word2: str, threshold: float = 0.70) -> bool: | |
| """Check if two words are similar using sequence matching.""" | |
| from difflib import SequenceMatcher | |
| word1, word2 = word1.lower(), word2.lower() | |
| if word1 == word2: | |
| return True | |
| # Also check if one contains the other (e.g., 'collaborate' in 'collaboration') | |
| if word1 in word2 or word2 in word1: | |
| return True | |
| # Check stem match | |
| if self._stem_word(word1) == self._stem_word(word2): | |
| return True | |
| ratio = SequenceMatcher(None, word1, word2).ratio() | |
| return ratio >= threshold | |
| def _expand_with_taxonomy(self, words: List[str]) -> set: | |
| """Expand words using skills taxonomy - STRICT matching only. | |
| Only adds related terms for EXACT skill matches to prevent false positives. | |
| """ | |
| expanded = set(words) | |
| for word in words: | |
| word_lower = word.lower() | |
| # STRICT: Only expand if word is an EXACT match for a taxonomy key | |
| if word_lower in self.skills_taxonomy: | |
| # Add variations but NOT other unrelated skills | |
| expanded.update(self.skills_taxonomy[word_lower]) | |
| # Check abbreviation expansions - EXACT match only | |
| if word_lower in self.abbreviations: | |
| expanded.add(self.abbreviations[word_lower]) | |
| # Reverse: if full form EXACTLY matches, add abbreviation | |
| for abbr, full in self.abbreviations.items(): | |
| if word_lower == full.lower(): | |
| expanded.add(abbr) | |
| return expanded | |
| def _extract_years_experience(self, text: str) -> List[int]: | |
| """Extract years of experience mentions from text.""" | |
| patterns = [ | |
| r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+(?:experience|exp)', | |
| r'(?:experience|exp)(?:\s+of)?\s*:?\s*(\d+)\+?\s*(?:years?|yrs?)', | |
| r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:in|of|working)', | |
| r'over\s+(\d+)\s+(?:years?|yrs?)', | |
| r'(\d+)\+?\s*yoe', | |
| ] | |
| years = [] | |
| for pattern in patterns: | |
| matches = re.findall(pattern, text.lower()) | |
| years.extend([int(m) for m in matches if m.isdigit()]) | |
| return years | |
    def _calculate_tfidf_score(self, resume: str, job_desc: str) -> float:
        """Calculate TF-IDF weighted keyword match score (0-100) - REALISTIC SCORING.

        Based on real ATS systems:
        - No artificial baselines
        - Score = (weighted matched keywords / total keyword weight) x 100
        - Empty resumes score ~0-10%, not 60%+

        Args:
            resume: Full resume text.
            job_desc: Full job-description text.

        Returns:
            A float in [0, 100].
        """
        import math
        resume_lower = resume.lower()
        jd_lower = job_desc.lower()
        # INPUT VALIDATION: under 50 stripped chars is effectively an empty
        # resume. (len // 10 < 5 here, so this branch always returns 5.)
        if len(resume.strip()) < 50:
            return max(5, len(resume.strip()) // 10)  # Very short = very low score
        # Tokenize: alphabetic runs of 2+ chars only (digits/punct dropped).
        resume_words = re.findall(r'\b[a-zA-Z]{2,}\b', resume_lower)
        jd_words = re.findall(r'\b[a-zA-Z]{2,}\b', jd_lower)
        # Filter stop words, then stem both sides identically so they compare.
        resume_words = [self._stem_word(w) for w in resume_words if w not in self.stop_words]
        jd_words = [self._stem_word(w) for w in jd_words if w not in self.stop_words]
        if not jd_words:
            return 50  # No JD keywords to match - neutral score
        if len(resume_words) < 10:
            return 10  # Very sparse resume
        # Term frequency over the job description.
        jd_tf = Counter(jd_words)
        # IDF-like weights: the most frequent JD term gets weight 1.0;
        # rarer terms get 1 + log(max/count), so they count for more.
        max_count = max(jd_tf.values()) if jd_tf else 1
        jd_weights = {word: 1 + math.log(max_count / count) for word, count in jd_tf.items()}
        # Expand resume words with taxonomy variations/abbreviations, then
        # stem the expanded set for matching against JD stems.
        resume_expanded = self._expand_with_taxonomy(resume_words)
        resume_stems = {self._stem_word(w) for w in resume_expanded}
        # NOTE(review): resume_raw is never read below - dead variable.
        resume_raw = set(resume_lower.split())
        # Weighted match accumulation - STRICT matching to avoid false positives.
        weighted_matches = 0
        total_weight = 0
        for word, weight in jd_weights.items():
            total_weight += weight
            # Direct stem match (highest confidence): full credit.
            if word in resume_stems:
                weighted_matches += weight
            # Exact substring of the raw resume text: full credit.
            elif word in resume_lower:
                weighted_matches += weight
            # Stem containment (e.g. 'develop' in 'developer'); restricted
            # to 5+ char words on both sides to limit false positives.
            elif len(word) >= 5 and any(word in rw for rw in resume_stems if len(rw) >= 5):
                weighted_matches += weight * 0.9
            # Fuzzy match at a STRICT 0.80 threshold: 80% credit.
            elif len(word) >= 5 and any(self._fuzzy_match(word, rw, 0.80) for rw in resume_stems if len(rw) >= 5):
                weighted_matches += weight * 0.8
            # Deliberately NO 3-char or 4-char prefix matching - causes too
            # many false positives.
        if total_weight == 0:
            return 15  # No weighted keywords found
        # REALISTIC SCORING: direct percentage of matched weight, clamped.
        raw_score = (weighted_matches / total_weight) * 100
        return min(100, max(0, raw_score))
| def _skills_match_score(self, resume: str, job_desc: str) -> float: | |
| """Score based on technical skills matching with taxonomy - REALISTIC. | |
| Based on Jobscan research: 76.4% of recruiters filter by skills | |
| Score = (matched skills / required skills) Γ 100 | |
| """ | |
| resume_lower = resume.lower() | |
| jd_lower = job_desc.lower() | |
| # INPUT VALIDATION | |
| if len(resume.strip()) < 50: | |
| return 5 # Empty/minimal resume | |
| # Extract skills from JD using taxonomy | |
| jd_skills = set() | |
| for skill_name, variations in self.skills_taxonomy.items(): | |
| for var in variations: | |
| if var in jd_lower: | |
| jd_skills.add(skill_name) | |
| break | |
| # Also extract raw important words from JD as potential skills | |
| jd_words = set(re.findall(r'\b[a-zA-Z]{4,}\b', jd_lower)) - self.stop_words | |
| if not jd_skills: | |
| # No taxonomy matches - fall back to direct word matching | |
| if jd_words: | |
| matched = sum(1 for w in list(jd_words)[:20] if w in resume_lower or w[:4] in resume_lower) | |
| # Direct percentage: matched out of checked words | |
| return min(100, max(0, (matched / min(20, len(jd_words))) * 100)) | |
| return 40 # No skills detected in JD - neutral | |
| # Check which skills are in resume - STRICT matching only | |
| matched_skills = 0 | |
| for skill_name in jd_skills: | |
| variations = self.skills_taxonomy.get(skill_name, [skill_name]) | |
| # Check direct match only (full word or variation) | |
| if any(var in resume_lower for var in variations): | |
| matched_skills += 1 | |
| # Check stem variations - EXACT stem match only | |
| elif any(self._stem_word(var) in resume_lower for var in variations): | |
| matched_skills += 0.9 | |
| # NO prefix matching - causes false positives | |
| # REALISTIC: Direct percentage scoring - no baseline | |
| match_ratio = matched_skills / len(jd_skills) | |
| # Score = match_ratio * 100 (0% matched = 0%, 100% matched = 100%) | |
| return min(100, max(0, match_ratio * 100)) | |
| def _experience_match_score(self, resume: str, job_desc: str) -> float: | |
| """Score based on years of experience matching - REALISTIC SCORING.""" | |
| # INPUT VALIDATION | |
| if len(resume.strip()) < 50: | |
| return 5 # Empty/minimal resume | |
| jd_years = self._extract_years_experience(job_desc) | |
| resume_years = self._extract_years_experience(resume) | |
| # Also calculate from date ranges in resume | |
| calculated_years = self._calculate_years_from_dates(resume) | |
| if not jd_years: | |
| # No requirement specified - give moderate score if has any dates | |
| if calculated_years > 0 or resume_years: | |
| return 70 # Has experience, no requirement - good match | |
| return 40 # No experience detected, no requirement | |
| required_years = max(jd_years) # Take the highest requirement | |
| # Use the best available years data | |
| if resume_years: | |
| candidate_years = max(resume_years) | |
| elif calculated_years > 0: | |
| candidate_years = calculated_years | |
| else: | |
| return 20 # Can't detect any experience - low score | |
| if candidate_years >= required_years: | |
| return 100 | |
| elif candidate_years >= required_years * 0.8: | |
| return 90 | |
| elif candidate_years >= required_years * 0.6: | |
| return 75 | |
| else: | |
| return max(50, 100 - (required_years - candidate_years) * 8) | |
| def _calculate_years_from_dates(self, resume: str) -> int: | |
| """Calculate total years of experience from date ranges in resume.""" | |
| import datetime | |
| current_year = datetime.datetime.now().year | |
| # Pattern: 2018 - 2023 or 2018 - Present | |
| date_pattern = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?\s*(20\d{2}|19\d{2})\s*[-βto]+\s*(20\d{2}|19\d{2}|[Pp]resent|[Cc]urrent)' | |
| matches = re.findall(date_pattern, resume) | |
| total_years = 0 | |
| for start, end in matches: | |
| try: | |
| start_year = int(start) | |
| if end.lower() in ['present', 'current']: | |
| end_year = current_year | |
| else: | |
| end_year = int(end) | |
| total_years += max(0, end_year - start_year) | |
| except: | |
| continue | |
| return total_years | |
| # ================== FRAUD/GAMING DETECTION FUNCTIONS ================== | |
| def _detect_jd_copy(self, resume: str, job_desc: str) -> Dict: | |
| """Detect if resume is copied from job description (gaming attempt).""" | |
| resume_words = set(resume.lower().split()) | |
| jd_words = set(job_desc.lower().split()) | |
| if len(jd_words) < 10: | |
| return {'is_copy': False, 'similarity': 0, 'penalty': 0} | |
| # Calculate word overlap | |
| overlap = len(resume_words & jd_words) | |
| similarity = overlap / len(jd_words) if jd_words else 0 | |
| # Check for phrase copying (more damning) | |
| resume_lower = resume.lower() | |
| jd_sentences = [s.strip() for s in job_desc.split('.') if len(s.strip()) > 30] | |
| copied_phrases = sum(1 for s in jd_sentences if s.lower() in resume_lower) | |
| phrase_copy_ratio = copied_phrases / len(jd_sentences) if jd_sentences else 0 | |
| # High similarity or phrase copying = gaming | |
| is_copy = similarity > 0.75 or phrase_copy_ratio > 0.3 | |
| penalty = 60 if phrase_copy_ratio > 0.5 else (50 if similarity > 0.85 else (40 if similarity > 0.75 else 0)) | |
| return {'is_copy': is_copy, 'similarity': similarity, 'phrase_copy': phrase_copy_ratio, 'penalty': penalty} | |
| def _detect_skills_without_experience(self, resume: str) -> Dict: | |
| """Detect if resume lists skills without work context.""" | |
| resume_lower = resume.lower() | |
| # Check for experience section markers | |
| experience_markers = ['experience', 'employment', 'work history', 'professional background', | |
| 'career history', 'positions held', 'job history'] | |
| has_experience_section = any(marker in resume_lower for marker in experience_markers) | |
| # Check for work context indicators | |
| work_context = ['worked', 'managed', 'developed', 'implemented', 'created', 'led', 'designed', | |
| 'built', 'achieved', 'delivered', 'company', 'organization', 'team', 'project', | |
| 'responsible for', 'collaborated', 'years', 'months', 'position', 'role'] | |
| work_context_count = sum(1 for w in work_context if w in resume_lower) | |
| # Check for date patterns (employment dates) | |
| date_pattern = re.compile(r'\b(19|20)\d{2}\b|\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s*\d{4}', re.IGNORECASE) | |
| has_dates = bool(date_pattern.search(resume)) | |
| # Skills-only resume lacks context | |
| is_skills_only = not has_experience_section and work_context_count < 5 and not has_dates | |
| penalty = 35 if is_skills_only else 0 | |
| return {'is_skills_only': is_skills_only, 'has_experience_section': has_experience_section, | |
| 'work_context_count': work_context_count, 'penalty': penalty} | |
def _detect_industry_mismatch(self, resume: str, job_desc: str) -> Dict:
    """Detect complete industry/profession mismatch (e.g., plumber applying for surgeon).

    Strategy: assign the JD and the resume each to a specialized "industry
    cluster" by counting keyword hits (substring matches on lowercased
    text); if both land strongly (>= 2 hits) in DIFFERENT clusters, flag a
    mismatch.  Academic positions and investment-vs-retail banking get
    dedicated extra checks.

    Returns:
        Dict with 'is_mismatch' (bool) and 'penalty' (0-55); on a mismatch
        also 'resume_industry' / 'job_industry' labels.
    """
    # Define industry clusters - roles that should NOT cross
    industry_clusters = {
        'healthcare_clinical': ['doctor', 'physician', 'surgeon', 'nurse', 'rn', 'lpn', 'np', 'pa', 'medical',
                                'patient care', 'diagnosis', 'treatment', 'clinical', 'hospital', 'healthcare provider',
                                'anesthesiologist', 'cardiologist', 'pediatrician', 'oncologist', 'radiologist'],
        'trades': ['plumber', 'electrician', 'carpenter', 'mechanic', 'hvac', 'welder', 'mason',
                   'pipefitter', 'roofer', 'contractor', 'construction worker', 'handyman'],
        'legal': ['lawyer', 'attorney', 'paralegal', 'legal counsel', 'solicitor', 'barrister',
                  'judge', 'litigation', 'legal assistant', 'law clerk', 'jd', 'bar admission'],
        'aviation': ['pilot', 'flight', 'aviation', 'aircraft', 'airline', 'cockpit', 'atc',
                     'air traffic', 'faa', 'flight hours', 'aircraft type rating'],
        'culinary': ['chef', 'cook', 'culinary', 'kitchen', 'restaurant', 'sous chef', 'pastry',
                     'food prep', 'catering', 'menu', 'cuisine'],
        'education_teaching': ['teacher', 'professor', 'educator', 'instructor', 'teaching', 'classroom',
                               'curriculum', 'lesson plan', 'students', 'pedagogy', 'education degree'],
        'law_enforcement': ['police', 'officer', 'detective', 'law enforcement', 'sheriff', 'trooper',
                            'patrol', 'investigation', 'criminal justice', 'peace officer'],
        'finance_licensed': ['cpa', 'cfa', 'cfp', 'series 7', 'series 66', 'finra', 'broker',
                             'financial advisor', 'investment advisor', 'registered representative'],
        'quantitative_finance': ['quant', 'quantitative', 'algorithmic', 'derivatives', 'risk model',
                                 'stochastic', 'monte carlo', 'var', 'hedge fund', 'trading desk',
                                 'fixed income', 'structured products', 'credit derivatives'],
        'wellness_fitness': ['yoga', 'meditation', 'wellness', 'fitness', 'personal trainer',
                             'pilates', 'mindfulness', 'holistic', 'spa', 'massage'],
        'academia': ['phd', 'professor', 'tenure', 'publish', 'research grant', 'dissertation',
                     'peer review', 'journal', 'academic', 'postdoc', 'faculty'],
        'investment_banking': ['investment bank', 'ib analyst', 'm&a', 'merger', 'acquisition',
                               'dcf', 'lbo', 'pitch book', 'bulge bracket', 'deal flow'],
        'retail_banking': ['retail bank', 'branch', 'teller', 'customer accounts', 'deposit',
                           'consumer banking', 'branch manager', 'retail lending'],
    }
    resume_lower = resume.lower()
    jd_lower = job_desc.lower()
    # Special fast-path for PhD-required academic positions
    # NOTE(review): this duplicates (with slightly narrower regexes) the
    # academia cluster check further below; the fast-path runs even when
    # the JD would not accumulate 2+ academia keyword hits.
    if ('phd' in jd_lower and 'required' in jd_lower) or 'assistant professor' in jd_lower or 'associate professor' in jd_lower:
        # This is an academic position - check if resume has academic credentials
        # Negative lookahead avoids counting "PhD required/preferred" text
        # that was pasted from a JD, or "PhD student".
        has_phd = bool(re.search(r'\b(phd|ph\.?d\.?|doctorate)\b(?!\s*(?:required|needed|preferred|student))', resume_lower))
        if 'no phd' in resume_lower or 'phd: none' in resume_lower:
            has_phd = False
        has_publications = bool(re.search(r'\b(publication|published|journal|conference paper|peer.?review)\b', resume_lower))
        # "Publications: none/n/a/0" explicitly negates the signal.
        if re.search(r'publication[s]?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
            has_publications = False
        has_teaching = bool(re.search(r'\b(taught|professor|lecturer|instructor)\b', resume_lower))
        if re.search(r'teaching\s*(?:experience)?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
            has_teaching = False
        if not has_phd and not has_publications and not has_teaching:
            return {'is_mismatch': True, 'resume_industry': 'industry/non-academic',
                    'job_industry': 'academia', 'penalty': 50}
    # Find which clusters the job belongs to (can match multiple)
    job_clusters = {}
    for cluster, keywords in industry_clusters.items():
        matches = sum(1 for kw in keywords if kw in jd_lower)
        if matches >= 2:
            job_clusters[cluster] = matches
    # Get the primary job cluster
    job_cluster = max(job_clusters, key=job_clusters.get) if job_clusters else None
    job_cluster_matches = job_clusters.get(job_cluster, 0) if job_cluster else 0
    # If job is in a specialized cluster, check if resume is from a different specialized cluster
    # (job_cluster_matches >= 2 holds by construction whenever job_cluster is set)
    if job_cluster and job_cluster_matches >= 2:
        # Find the resume's single strongest cluster.
        resume_cluster = None
        resume_cluster_matches = 0
        for cluster, keywords in industry_clusters.items():
            matches = sum(1 for kw in keywords if kw in resume_lower)
            if matches > resume_cluster_matches:
                resume_cluster_matches = matches
                resume_cluster = cluster
        # Mismatch: resume is strongly in a DIFFERENT specialized cluster
        if resume_cluster and resume_cluster != job_cluster and resume_cluster_matches >= 2:
            return {'is_mismatch': True, 'resume_industry': resume_cluster,
                    'job_industry': job_cluster, 'penalty': 55}
        # Special case: academia jobs require academic background
        if job_cluster == 'academia':
            # Check for PhD (but not "PhD required" from JD copy or "No PhD")
            has_phd = bool(re.search(r'\b(phd|ph\.?d\.?|doctorate)\b(?!\s*(?:required|needed|preferred|student))', resume_lower))
            # Filter out "no phd" or "none"
            if 'no phd' in resume_lower or 'phd: none' in resume_lower:
                has_phd = False
            # Check for publications (not "none" or "n/a")
            has_publications = bool(re.search(r'\b(publication|published|journal|conference paper|peer.?review)\b', resume_lower))
            if 'publication' in resume_lower:
                # Check if it's followed by "none", "n/a", "0", etc.
                if re.search(r'publication[s]?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
                    has_publications = False
            # Check for teaching (not "none" or "n/a")
            # NOTE: broader term list here than in the fast-path above.
            has_teaching = bool(re.search(r'\b(taught|teaching|professor|lecturer|instructor|course|students)\b', resume_lower))
            if 'teaching' in resume_lower:
                # Check if it's followed by "none", "n/a", "0", etc.
                if re.search(r'teaching\s*(?:experience)?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
                    has_teaching = False
            if not has_phd and not has_publications and not has_teaching:
                return {'is_mismatch': True, 'resume_industry': 'industry/non-academic',
                        'job_industry': 'academia', 'penalty': 50}
        # Special case: investment banking vs retail banking
        if job_cluster == 'investment_banking' and resume_cluster == 'retail_banking':
            return {'is_mismatch': True, 'resume_industry': 'retail_banking',
                    'job_industry': 'investment_banking', 'penalty': 45}
    return {'is_mismatch': False, 'penalty': 0}
| def _detect_suspicious_dates(self, resume: str) -> Dict: | |
| """Detect future dates, impossible timelines, and concurrent role issues.""" | |
| import datetime | |
| current_year = datetime.datetime.now().year | |
| issues = [] | |
| penalty = 0 | |
| # Find all years mentioned | |
| years = list(set(int(m) for m in re.findall(r'\b(19\d{2}|20\d{2})\b', resume))) | |
| # Check for future years (but allow current year + 1 for expected graduations) | |
| future_years = [y for y in years if y > current_year + 1] | |
| if future_years: | |
| issues.append(f"Future dates: {future_years}") | |
| penalty += 35 | |
| # Check for impossibly old dates (before 1960 in work history) | |
| old_years = [y for y in years if y < 1960] | |
| if old_years and 'education' not in resume.lower()[:500]: | |
| issues.append(f"Suspicious old dates: {old_years}") | |
| penalty += 15 | |
| # Check for impossibly long tenure (>40 years at one company) | |
| year_ranges = re.findall(r'(19\d{2}|20\d{2})\s*[-βto]+\s*(19\d{2}|20\d{2}|present|current)', resume, re.IGNORECASE) | |
| for start, end in year_ranges: | |
| try: | |
| start_yr = int(start) | |
| end_yr = current_year if end.lower() in ['present', 'current'] else int(end) | |
| if end_yr - start_yr > 40: | |
| issues.append(f"Impossible tenure: {start}-{end}") | |
| penalty += 20 | |
| except: | |
| pass | |
| # Detect EXCESSIVE concurrent roles - 4+ simultaneous "present" jobs is suspicious | |
| # (but 2-3 is common: full-time job + freelance/consulting + board seat) | |
| present_mentions = len(re.findall(r'\b(present|current)\b', resume, re.IGNORECASE)) | |
| if present_mentions >= 5: # Raised threshold from 3 to 5 | |
| issues.append(f"Too many concurrent roles ({present_mentions} 'present' positions)") | |
| penalty += 20 # Reduced from 25 | |
| # Check for too many roles in too short a time (rapid job hopping or fabrication) | |
| role_indicators = re.findall(r'\b(manager|director|lead|head|chief|vp|president|ceo|cto|cfo)\b', resume, re.IGNORECASE) | |
| if len(role_indicators) >= 8: # Only flag extreme cases | |
| if years: | |
| year_span = max(years) - min(years) if len(years) > 1 else 1 | |
| if year_span <= 2 and len(role_indicators) >= 8: | |
| issues.append(f"Impossibly rapid advancement: {len(role_indicators)} senior roles in {year_span} years") | |
| penalty += 30 | |
| return {'has_issues': len(issues) > 0, 'issues': issues, 'penalty': min(penalty, 50)} | |
| def _detect_negative_sentiment(self, resume: str) -> Dict: | |
| """Detect negative language that shouldn't appear in resumes.""" | |
| negative_words = [ | |
| 'fired', 'terminated', 'dismissed', 'let go', 'laid off', # Job loss | |
| 'failed', 'failure', 'unsuccessful', 'unable to', # Failure | |
| 'conflict', 'disagreement', 'dispute', 'argument', # Interpersonal issues | |
| 'lawsuit', 'sued', 'legal action', 'harassment', # Legal issues | |
| 'mistake', 'error', 'wrong', 'poor performance', # Performance issues | |
| 'hate', 'hated', 'terrible', 'awful', 'worst', # Emotional language | |
| 'struggled', 'struggling', 'difficult time', # Weakness indicators | |
| 'not skilled', 'lack of', 'weakness', 'weaknesses', # Self-deprecation | |
| 'unemployed', 'gap in employment', 'taking time off', # Employment gaps | |
| 'criminal', 'arrest', 'conviction', 'probation', # Legal history | |
| ] | |
| resume_lower = resume.lower() | |
| found_negatives = [word for word in negative_words if word in resume_lower] | |
| penalty = min(len(found_negatives) * 15, 45) # Up to 45% penalty | |
| return {'has_negatives': len(found_negatives) > 0, 'found': found_negatives, 'penalty': penalty} | |
| def _detect_missing_required_credentials(self, resume: str, job_desc: str) -> Dict: | |
| """Detect if job requires specific credentials that are missing from resume.""" | |
| # Credentials that are REQUIRED for certain roles | |
| required_credentials = { | |
| # Healthcare | |
| ('rn', 'registered nurse', 'nursing'): ['rn', 'registered nurse', 'nursing license', 'nclex', 'bsn', 'nursing degree'], | |
| ('md', 'physician', 'doctor', 'medical doctor'): ['md', 'm.d.', 'medical degree', 'residency', 'board certified', 'medical license'], | |
| ('np', 'nurse practitioner'): ['np', 'nurse practitioner', 'aprn', 'dnp', 'msn'], | |
| ('pa', 'physician assistant'): ['pa-c', 'physician assistant', 'pa license'], | |
| ('pharmacist',): ['pharmd', 'rph', 'pharmacy license', 'pharmacy degree'], | |
| ('dentist',): ['dds', 'dmd', 'dental license', 'dental degree'], | |
| # Legal | |
| ('attorney', 'lawyer', 'legal counsel'): ['jd', 'j.d.', 'bar admission', 'bar license', 'law degree', 'esquire', 'esq'], | |
| # Accounting | |
| ('cpa required', 'certified public accountant'): ['cpa', 'certified public accountant'], | |
| # Finance | |
| ('cfa required',): ['cfa', 'chartered financial analyst'], | |
| ('cfp required', 'certified financial planner'): ['cfp', 'certified financial planner'], | |
| ('series 7', 'registered representative'): ['series 7', 'finra', 'securities license'], | |
| # Engineering | |
| ('pe required', 'professional engineer'): ['pe', 'p.e.', 'professional engineer', 'engineering license'], | |
| # Aviation | |
| ('pilot', 'captain', 'first officer'): ['atp', 'cpl', 'pilot license', 'flight hours', 'type rating', 'faa'], | |
| # Real Estate | |
| ('real estate agent', 'realtor'): ['real estate license', 'realtor license', 'licensed agent'], | |
| # Insurance | |
| ('insurance agent', 'insurance broker'): ['insurance license', 'licensed agent', 'p&c license', 'life license'], | |
| } | |
| jd_lower = job_desc.lower() | |
| resume_lower = resume.lower() | |
| for job_keywords, required_creds in required_credentials.items(): | |
| # Check if job requires this credential | |
| if any(kw in jd_lower for kw in job_keywords): | |
| # Check if resume has any of the required credentials | |
| has_credential = any(cred in resume_lower for cred in required_creds) | |
| if not has_credential: | |
| return {'missing_credential': True, 'required_for': job_keywords[0], | |
| 'needed': required_creds[:3], 'penalty': 40} | |
| return {'missing_credential': False, 'penalty': 0} | |
| def _detect_impossible_metrics(self, resume: str) -> Dict: | |
| """Detect impossible or exaggerated claims.""" | |
| issues = [] | |
| # Check for impossibly high percentages | |
| percentages = re.findall(r'(\d+)%', resume) | |
| for pct in percentages: | |
| pct_val = int(pct) | |
| if pct_val > 1000: # >1000% improvement claims | |
| issues.append(f"Impossible percentage: {pct_val}%") | |
| elif pct_val > 500 and pct_val != 100: # Suspicious large percentages | |
| issues.append(f"Suspicious percentage: {pct_val}%") | |
| # Check for impossibly large numbers in context | |
| money_patterns = re.findall(r'\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)\s*(million|billion|trillion|m|b|k)?', resume, re.IGNORECASE) | |
| for amount, unit in money_patterns: | |
| amount_num = float(amount.replace(',', '')) | |
| if unit and unit.lower() in ['trillion', 't']: | |
| issues.append(f"Unlikely amount: ${amount} {unit}") | |
| elif unit and unit.lower() in ['billion', 'b'] and amount_num > 100: | |
| issues.append(f"Suspicious large amount: ${amount} {unit}") | |
| # Check for impossibly high team sizes for individual contributors | |
| team_patterns = re.findall(r'(?:led|managed|supervised)\s+(?:a\s+)?(?:team\s+of\s+)?(\d+)\+?\s*(?:people|employees|staff|team members|engineers)', resume, re.IGNORECASE) | |
| for size in team_patterns: | |
| if int(size) > 1000: | |
| issues.append(f"Unlikely team size: {size}") | |
| penalty = min(len(issues) * 15, 40) | |
| return {'has_issues': len(issues) > 0, 'issues': issues, 'penalty': penalty} | |
| def _detect_format_issues(self, resume: str) -> Dict: | |
| """Detect format issues like bullet-only or no structure.""" | |
| issues = [] | |
| penalty = 0 | |
| lines = resume.strip().split('\n') | |
| non_empty_lines = [l for l in lines if l.strip()] | |
| if len(non_empty_lines) < 5: | |
| issues.append("Too few content lines") | |
| penalty += 30 # Increased | |
| # Check for bullet-only format (no sentences, just bullets) | |
| bullet_chars = ['β’', '-', '*', 'β', '>', 'Β·'] | |
| bullet_lines = sum(1 for l in non_empty_lines if any(l.strip().startswith(b) for b in bullet_chars)) | |
| bullet_ratio = bullet_lines / len(non_empty_lines) if non_empty_lines else 0 | |
| if bullet_ratio > 0.60 and len(non_empty_lines) <= 10: # Short bullet-only | |
| issues.append("Minimal bullet-only format") | |
| penalty += 35 | |
| elif bullet_ratio > 0.80 and len(non_empty_lines) > 5: | |
| issues.append("Bullet-only format (no context)") | |
| penalty += 30 | |
| # Check for code block format | |
| code_block_count = resume.count('```') | |
| if code_block_count >= 2: | |
| issues.append("Resume contains code blocks") | |
| penalty += 25 | |
| # Check for lack of any sections/headers | |
| section_patterns = ['experience', 'education', 'skills', 'summary', 'objective', | |
| 'work history', 'employment', 'qualifications', 'professional'] | |
| has_sections = any(pat in resume.lower() for pat in section_patterns) | |
| if not has_sections and len(resume) > 200: | |
| issues.append("No recognizable sections") | |
| penalty += 15 | |
| # Check for all-caps (often indicates bad parsing or shouting) | |
| caps_ratio = sum(1 for c in resume if c.isupper()) / len(resume) if resume else 0 | |
| if caps_ratio > 0.6: | |
| issues.append("Excessive capitalization") | |
| penalty += 10 | |
| # Check for lack of work verbs/action | |
| action_words = ['managed', 'developed', 'created', 'led', 'implemented', 'designed', | |
| 'built', 'achieved', 'improved', 'increased', 'delivered', 'established'] | |
| has_action = any(word in resume.lower() for word in action_words) | |
| if not has_action and len(resume) > 300: | |
| issues.append("No action verbs (passive resume)") | |
| penalty += 10 | |
| return {'has_issues': len(issues) > 0, 'issues': issues, 'penalty': min(penalty, 60)} | |
| def _detect_experience_level_mismatch(self, resume: str, job_desc: str) -> Dict: | |
| """Detect if experience level in resume doesn't match job requirements.""" | |
| resume_lower = resume.lower() | |
| jd_lower = job_desc.lower() | |
| # Job level indicators | |
| senior_indicators = ['senior', 'sr.', 'lead', 'principal', 'staff', 'architect', 'director', | |
| 'manager', 'head of', 'vp', 'chief', '10+ years', '8+ years', '7+ years'] | |
| entry_indicators = ['entry level', 'junior', 'associate', 'intern', 'graduate', 'new grad', | |
| '0-2 years', '1-2 years', 'no experience required', 'entry-level'] | |
| # Check job level | |
| job_is_senior = any(ind in jd_lower for ind in senior_indicators) | |
| job_is_entry = any(ind in jd_lower for ind in entry_indicators) | |
| # Check resume experience level | |
| # Count years of experience mentioned | |
| years_mentioned = re.findall(r'(\d+)\+?\s*years?\s*(?:of\s+)?(?:experience)?', resume_lower) | |
| max_years = max([int(y) for y in years_mentioned], default=0) | |
| # Check for entry-level resume indicators | |
| resume_is_entry = ( | |
| 'recent graduate' in resume_lower or | |
| 'new graduate' in resume_lower or | |
| 'entry level' in resume_lower or | |
| ('intern' in resume_lower and 'senior' not in resume_lower) or | |
| max_years <= 2 | |
| ) | |
| # Check for senior/overqualified resume indicators | |
| has_phd = 'phd' in resume_lower or 'ph.d' in resume_lower or 'doctorate' in resume_lower | |
| has_executive_titles = any(title in resume_lower for title in ['director', 'vp', 'vice president', 'chief', 'head of', 'principal', 'ceo', 'cto', 'cfo']) | |
| resume_is_senior = has_executive_titles or max_years >= 8 or has_phd | |
| # Mismatch detection | |
| if job_is_senior and resume_is_entry: | |
| return {'is_mismatch': True, 'job_level': 'senior', 'resume_level': 'entry', | |
| 'detail': 'Entry-level applying for senior role', 'penalty': 35} | |
| # NEW: Overqualified detection - PhD/executive for junior role | |
| if job_is_entry and resume_is_senior: | |
| overqualified_penalty = 0 | |
| reasons = [] | |
| if has_phd and 'phd' not in jd_lower and 'research' not in jd_lower: | |
| reasons.append('PhD for non-research entry role') | |
| overqualified_penalty += 20 | |
| if has_executive_titles: | |
| reasons.append('Executive experience for entry role') | |
| overqualified_penalty += 15 | |
| if max_years >= 15: | |
| reasons.append(f'{max_years}+ years experience for entry role') | |
| overqualified_penalty += 10 | |
| if overqualified_penalty > 0: | |
| return {'is_mismatch': True, 'job_level': 'entry', 'resume_level': 'senior/overqualified', | |
| 'detail': f"Overqualified: {', '.join(reasons)}", 'penalty': min(overqualified_penalty, 35)} | |
| return {'is_mismatch': False, 'penalty': 0} | |
def analyze(self, resume: str, job_desc: str) -> Dict:
    """Calculate comprehensive ATS compatibility score with fraud detection.

    Pipeline:
      1. Compute the eight weighted component scores.
      2. Run every fraud/gaming detector.
      3. Subtract a capped penalty, but only for SEVERE findings
         (JD copying, strong industry mismatch, impossible metrics,
         repeated negative language, significant date problems).
      4. Surface every finding — severe or not — as a warning string.

    Returns:
        Dict with 'total_score' (clamped 1-99), 'base_score',
        'penalty_applied', per-component 'breakdown', 'keyword_match_pct',
        raw 'fraud_checks' results, and human-readable 'warnings'.
    """
    # First calculate base scores
    scores = {
        'keyword_match': self._calculate_tfidf_score(resume, job_desc),
        'semantic_match': self._semantic_section_match(resume, job_desc),
        'experience_match': self._experience_match_score(resume, job_desc),
        'skills_match': self._skills_match_score(resume, job_desc),
        'format_score': self._format_score(resume),
        'section_score': self._section_score(resume),
        'action_verbs': self._action_verb_score(resume),
        'quantification': self._quantification_score(resume)
    }
    # Run fraud/gaming detection - ONLY apply penalties for SEVERE issues
    # Light warnings are informational, not score-reducing
    fraud_checks = {
        'jd_copy': self._detect_jd_copy(resume, job_desc),
        'skills_only': self._detect_skills_without_experience(resume),
        'industry_mismatch': self._detect_industry_mismatch(resume, job_desc),
        'date_issues': self._detect_suspicious_dates(resume),
        'negative_sentiment': self._detect_negative_sentiment(resume),
        'missing_credentials': self._detect_missing_required_credentials(resume, job_desc),
        'impossible_metrics': self._detect_impossible_metrics(resume),
        'format_issues': self._detect_format_issues(resume),
        'experience_mismatch': self._detect_experience_level_mismatch(resume, job_desc),
    }
    # ONLY apply penalties for truly severe issues (not minor flags)
    # Severe = JD copy, industry mismatch, impossible metrics, negative sentiment
    # Minor = format, experience level (informational warnings only)
    severe_penalty = 0
    if fraud_checks['jd_copy'].get('is_copy'):
        severe_penalty += fraud_checks['jd_copy'].get('penalty', 0)
    # Industry mismatch only counts when the detector's own penalty is high
    # (>= 45, i.e. a strong cross-cluster conflict).
    if fraud_checks['industry_mismatch'].get('is_mismatch') and fraud_checks['industry_mismatch'].get('penalty', 0) >= 45:
        severe_penalty += fraud_checks['industry_mismatch'].get('penalty', 0)
    if fraud_checks['impossible_metrics'].get('has_issues'):
        severe_penalty += fraud_checks['impossible_metrics'].get('penalty', 0)
    # A single negative word is forgiven; two or more cost up to 20 points.
    if fraud_checks['negative_sentiment'].get('has_negatives') and len(fraud_checks['negative_sentiment'].get('found', [])) >= 2:
        severe_penalty += min(fraud_checks['negative_sentiment'].get('penalty', 0), 20)
    if fraud_checks['date_issues'].get('has_issues'):
        # Only penalize truly impossible dates (future, pre-1960)
        date_penalty = fraud_checks['date_issues'].get('penalty', 0)
        if date_penalty >= 30:  # Significant date issue
            severe_penalty += min(date_penalty, 35)
    # Cap total penalty at 40% - allow legitimate resumes to score 60%+
    total_penalty = min(severe_penalty, 40)
    # Calculate base score
    base_total = sum(scores[k] * self.weights[k] for k in scores)
    # Apply penalty
    final_total = max(5, base_total - total_penalty)
    # Collect warnings for feedback
    warnings = []
    if fraud_checks['jd_copy']['is_copy']:
        warnings.append("β οΈ Resume appears to be copied from job description")
    if fraud_checks['skills_only']['is_skills_only']:
        warnings.append("β οΈ Skills listed without work experience context")
    if fraud_checks['industry_mismatch']['is_mismatch']:
        warnings.append(f"β οΈ Industry mismatch: Your background ({fraud_checks['industry_mismatch'].get('resume_industry', 'unknown')}) doesn't match job ({fraud_checks['industry_mismatch'].get('job_industry', 'unknown')})")
    if fraud_checks['date_issues']['has_issues']:
        warnings.append(f"β οΈ Date issues detected: {', '.join(fraud_checks['date_issues']['issues'])}")
    if fraud_checks['negative_sentiment']['has_negatives']:
        warnings.append(f"β οΈ Negative language detected: {', '.join(fraud_checks['negative_sentiment']['found'][:3])}")
    if fraud_checks['missing_credentials']['missing_credential']:
        warnings.append(f"β οΈ Missing required credential for {fraud_checks['missing_credentials']['required_for']}")
    if fraud_checks['impossible_metrics']['has_issues']:
        warnings.append(f"β οΈ Suspicious claims: {', '.join(fraud_checks['impossible_metrics']['issues'][:2])}")
    if fraud_checks['experience_mismatch']['is_mismatch']:
        warnings.append(f"β οΈ Experience level mismatch: {fraud_checks['experience_mismatch']['detail']}")
    return {
        'total_score': min(99, max(1, int(final_total))),
        'base_score': int(base_total),
        'penalty_applied': int(total_penalty),
        'breakdown': scores,
        'keyword_match_pct': scores['keyword_match'],
        'fraud_checks': fraud_checks,
        'warnings': warnings
    }
| def _semantic_section_match(self, resume: str, job_desc: str) -> float: | |
| """Match job title/role semantically - COMPREHENSIVE FOR ALL 120+ DOMAINS.""" | |
| # Input validation - empty/minimal resumes get minimal scores | |
| if len(resume.strip()) < 50: | |
| return 5 | |
| # Common role patterns across all industries - MASSIVELY EXPANDED | |
| role_patterns = { | |
| # Technology - Software Engineering | |
| 'software engineer': ['software engineer', 'software developer', 'swe', 'developer', 'programmer', 'sde', 'full stack', 'backend', 'frontend', 'web developer', 'application developer'], | |
| 'frontend': ['frontend', 'front-end', 'front end', 'ui developer', 'react developer', 'angular developer', 'vue developer', 'web developer'], | |
| 'backend': ['backend', 'back-end', 'back end', 'server-side', 'api developer', 'node developer', 'python developer', 'java developer'], | |
| 'mobile': ['mobile developer', 'ios developer', 'android developer', 'mobile engineer', 'app developer', 'react native', 'flutter'], | |
| 'devops': ['devops', 'sre', 'site reliability', 'platform engineer', 'infrastructure engineer', 'cloud engineer', 'systems engineer'], | |
| 'security': ['security engineer', 'cybersecurity', 'information security', 'security analyst', 'infosec', 'penetration tester', 'soc analyst'], | |
| 'qa engineer': ['qa engineer', 'quality assurance', 'test engineer', 'sdet', 'automation engineer', 'qa analyst', 'quality engineer'], | |
| 'database': ['database administrator', 'dba', 'database engineer', 'data architect', 'sql developer'], | |
| 'network': ['network engineer', 'network administrator', 'network architect', 'systems administrator', 'it administrator'], | |
| # Technology - Data & AI | |
| 'data scientist': ['data scientist', 'data science', 'ml engineer', 'machine learning engineer', 'ai engineer', 'research scientist', 'applied scientist'], | |
| 'data analyst': ['data analyst', 'business analyst', 'analytics', 'bi analyst', 'reporting analyst', 'data analytics', 'analytics analyst'], | |
| 'data engineer': ['data engineer', 'etl developer', 'data pipeline', 'de', 'big data engineer', 'analytics engineer', 'data architect'], | |
| 'bi analyst': ['bi analyst', 'business intelligence', 'tableau developer', 'power bi developer', 'reporting analyst'], | |
| 'quantitative': ['quantitative analyst', 'quant', 'quantitative researcher', 'quantitative developer', 'algo trader'], | |
| # Management / Leadership - ALL LEVELS | |
| 'product manager': ['product manager', 'pm', 'product owner', 'po', 'product lead', 'product director', 'product management'], | |
| 'engineering manager': ['engineering manager', 'em', 'tech lead', 'technical lead', 'team lead', 'development manager', 'software manager'], | |
| 'project manager': ['project manager', 'program manager', 'pmp', 'scrum master', 'agile coach', 'delivery manager'], | |
| 'director': ['director', 'senior director', 'managing director', 'head of', 'department head'], | |
| 'vp': ['vice president', 'vp', 'avp', 'assistant vice president', 'svp', 'evp'], | |
| 'c-level': ['ceo', 'cto', 'cfo', 'coo', 'cmo', 'cio', 'chief', 'president', 'founder'], | |
| 'operations manager': ['operations manager', 'ops manager', 'operations director', 'operations lead', 'operations supervisor'], | |
| # Finance / Accounting - EXPANDED | |
| 'accountant': ['accountant', 'accounting', 'cpa', 'staff accountant', 'senior accountant', 'controller', 'accounting manager'], | |
| 'financial analyst': ['financial analyst', 'finance analyst', 'fp&a', 'investment analyst', 'equity analyst', 'research analyst'], | |
| 'auditor': ['auditor', 'internal auditor', 'external auditor', 'audit manager', 'audit associate', 'sox auditor'], | |
| 'banker': ['banker', 'investment banker', 'relationship manager', 'commercial banker', 'private banker'], | |
| 'tax': ['tax accountant', 'tax analyst', 'tax manager', 'tax specialist', 'tax preparer', 'tax advisor'], | |
| 'credit': ['credit analyst', 'credit manager', 'credit officer', 'underwriter', 'loan officer', 'credit risk'], | |
| 'portfolio': ['portfolio manager', 'asset manager', 'fund manager', 'investment manager', 'wealth manager'], | |
| 'bookkeeper': ['bookkeeper', 'bookkeeping', 'accounts clerk', 'accounting clerk', 'payroll clerk'], | |
| 'payroll': ['payroll specialist', 'payroll manager', 'payroll administrator', 'payroll coordinator'], | |
| 'controller': ['controller', 'financial controller', 'assistant controller', 'corporate controller'], | |
| 'cfo': ['cfo', 'chief financial officer', 'finance director', 'vp finance'], | |
| # Marketing - EXPANDED | |
| 'marketing manager': ['marketing manager', 'marketing director', 'brand manager', 'marketing lead', 'head of marketing'], | |
| 'digital marketing': ['digital marketing', 'seo specialist', 'sem specialist', 'performance marketing', 'growth marketing', 'ppc specialist'], | |
| 'content': ['content manager', 'content strategist', 'content writer', 'copywriter', 'content marketing', 'copy editor'], | |
| 'brand': ['brand manager', 'brand strategist', 'brand marketing', 'brand director'], | |
| 'product marketing': ['product marketing manager', 'pmm', 'product marketer', 'go-to-market'], | |
| 'email marketing': ['email marketing', 'email specialist', 'email marketing manager', 'crm specialist'], | |
| 'pr': ['public relations', 'pr specialist', 'pr manager', 'communications manager', 'media relations'], | |
| 'event': ['event manager', 'event coordinator', 'event planner', 'conference manager'], | |
| 'seo': ['seo specialist', 'seo manager', 'seo analyst', 'search specialist'], | |
| # Sales - EXPANDED | |
| 'sales': ['sales representative', 'sales manager', 'account executive', 'sales director', 'business development', 'sales associate'], | |
| 'sdr': ['sdr', 'sales development representative', 'bdr', 'business development representative', 'lead generation'], | |
| 'account executive': ['account executive', 'ae', 'enterprise ae', 'strategic ae', 'senior ae'], | |
| 'sales engineer': ['sales engineer', 'solutions engineer', 'presales', 'technical sales', 'se'], | |
| 'channel': ['channel manager', 'channel sales', 'partner manager', 'alliance manager', 'partner sales'], | |
| 'vp sales': ['vp sales', 'sales director', 'chief revenue officer', 'cro', 'head of sales'], | |
| 'account manager': ['account manager', 'customer success', 'client manager', 'relationship manager', 'key account manager'], | |
| # HR - EXPANDED | |
| 'recruiter': ['recruiter', 'talent acquisition', 'sourcer', 'recruiting manager', 'hr recruiter', 'technical recruiter'], | |
| 'hr manager': ['hr manager', 'hr director', 'hr business partner', 'hrbp', 'people manager', 'people ops'], | |
| 'hr generalist': ['hr generalist', 'hr coordinator', 'hr specialist', 'hr administrator', 'hr associate'], | |
| 'compensation': ['compensation analyst', 'compensation manager', 'total rewards', 'comp and benefits'], | |
| 'learning': ['learning and development', 'l&d', 'training manager', 'training specialist', 'instructional designer'], | |
| 'hris': ['hris analyst', 'hris manager', 'hr systems', 'workday analyst', 'peoplesoft'], | |
| 'benefits': ['benefits manager', 'benefits specialist', 'benefits administrator', 'benefits analyst'], | |
| # Healthcare - EXPANDED | |
| 'nurse': ['nurse', 'rn', 'registered nurse', 'lpn', 'nurse practitioner', 'np', 'clinical nurse', 'charge nurse', 'nurse manager'], | |
| 'physician': ['physician', 'doctor', 'md', 'do', 'attending physician', 'resident', 'hospitalist', 'specialist'], | |
| 'pharmacist': ['pharmacist', 'pharmacy', 'clinical pharmacist', 'pharmacy manager', 'pharmd'], | |
| 'physical therapist': ['physical therapist', 'pt', 'physiotherapist', 'rehabilitation', 'physical therapy'], | |
| 'medical coder': ['medical coder', 'medical billing', 'coding specialist', 'hcpcs', 'cpc', 'icd-10'], | |
| 'clinical research': ['clinical research', 'cra', 'clinical research associate', 'clinical trial', 'crc'], | |
| 'hospital admin': ['hospital administrator', 'healthcare administrator', 'medical director', 'clinic manager'], | |
| 'dental': ['dentist', 'dental hygienist', 'dental assistant', 'orthodontist'], | |
| 'occupational therapist': ['occupational therapist', 'ot', 'occupational therapy'], | |
| 'medical assistant': ['medical assistant', 'clinical assistant', 'patient care technician'], | |
| # Legal - EXPANDED | |
| 'attorney': ['attorney', 'lawyer', 'counsel', 'legal counsel', 'associate attorney', 'staff attorney'], | |
| 'paralegal': ['paralegal', 'legal assistant', 'legal secretary', 'litigation paralegal'], | |
| 'litigation': ['litigation attorney', 'litigator', 'trial attorney', 'trial lawyer'], | |
| 'ip': ['ip attorney', 'patent attorney', 'intellectual property', 'trademark attorney'], | |
| 'compliance': ['compliance officer', 'compliance manager', 'compliance analyst', 'regulatory compliance'], | |
| 'legal ops': ['legal operations', 'legal ops manager', 'legal project manager'], | |
| 'contract': ['contract manager', 'contracts administrator', 'contract specialist'], | |
| # Operations / Supply Chain - EXPANDED | |
| 'supply chain': ['supply chain manager', 'logistics manager', 'procurement manager', 'sourcing manager', 'supply chain analyst'], | |
| 'warehouse': ['warehouse manager', 'warehouse supervisor', 'inventory manager', 'distribution manager', 'warehouse associate'], | |
| 'production': ['production manager', 'manufacturing manager', 'plant manager', 'production supervisor', 'operations manager'], | |
| 'quality': ['quality manager', 'quality engineer', 'qa manager', 'quality control', 'quality assurance manager'], | |
| 'procurement': ['procurement manager', 'buyer', 'purchasing manager', 'procurement specialist', 'strategic sourcing'], | |
| 'facilities': ['facilities manager', 'facilities coordinator', 'building manager', 'maintenance manager'], | |
| # Education - EXPANDED | |
| 'teacher': ['teacher', 'instructor', 'professor', 'educator', 'lecturer', 'tutor', 'faculty'], | |
| 'principal': ['principal', 'assistant principal', 'school administrator', 'dean', 'headmaster'], | |
| 'professor': ['professor', 'associate professor', 'assistant professor', 'lecturer', 'adjunct'], | |
| 'instructional designer': ['instructional designer', 'curriculum developer', 'learning designer', 'course developer'], | |
| 'academic advisor': ['academic advisor', 'counselor', 'student advisor', 'guidance counselor'], | |
| 'curriculum': ['curriculum specialist', 'curriculum coordinator', 'curriculum manager'], | |
| # Creative / Design - EXPANDED | |
| 'designer': ['designer', 'graphic designer', 'ui designer', 'ux designer', 'product designer', 'visual designer', 'web designer'], | |
| 'creative': ['creative director', 'art director', 'creative lead', 'design director'], | |
| 'art director': ['art director', 'ad', 'creative director', 'design lead'], | |
| 'copywriter': ['copywriter', 'copy editor', 'content writer', 'creative writer'], | |
| 'video': ['video producer', 'videographer', 'video editor', 'multimedia producer', 'motion designer'], | |
| 'photographer': ['photographer', 'photo editor', 'photography', 'photojournalist'], | |
| '3d artist': ['3d artist', '3d modeler', 'cgi artist', 'visual effects', 'animator'], | |
| # Hospitality - NEW | |
| 'hotel manager': ['hotel manager', 'general manager', 'front desk manager', 'hospitality manager', 'resort manager'], | |
| 'restaurant manager': ['restaurant manager', 'food service manager', 'f&b manager', 'dining manager'], | |
| 'chef': ['chef', 'executive chef', 'sous chef', 'head chef', 'culinary', 'cook'], | |
| 'event coordinator': ['event coordinator', 'banquet manager', 'catering manager', 'conference coordinator'], | |
| 'concierge': ['concierge', 'guest services', 'guest relations', 'hospitality'], | |
| # Retail - NEW | |
| 'store manager': ['store manager', 'retail manager', 'assistant manager', 'shop manager'], | |
| 'buyer': ['buyer', 'merchandise buyer', 'retail buyer', 'category manager'], | |
| 'visual merchandiser': ['visual merchandiser', 'merchandising', 'display coordinator'], | |
| 'loss prevention': ['loss prevention', 'asset protection', 'security manager', 'lp manager'], | |
| # Government / Public Sector - NEW | |
| 'policy analyst': ['policy analyst', 'policy advisor', 'policy specialist', 'legislative analyst'], | |
| 'city planner': ['city planner', 'urban planner', 'regional planner', 'planning director'], | |
| 'grant writer': ['grant writer', 'grants manager', 'proposal writer', 'development writer'], | |
| 'public affairs': ['public affairs', 'government relations', 'public policy', 'lobbyist'], | |
| # Nonprofit - NEW | |
| 'program director': ['program director', 'program manager', 'program coordinator', 'program officer'], | |
| 'fundraiser': ['fundraiser', 'development director', 'major gifts', 'annual fund', 'donor relations'], | |
| 'volunteer coordinator': ['volunteer coordinator', 'volunteer manager', 'community outreach'], | |
| 'executive director': ['executive director', 'ed', 'nonprofit director', 'ceo'], | |
| # Insurance - NEW | |
| 'underwriter': ['underwriter', 'underwriting', 'underwriting analyst', 'risk underwriter'], | |
| 'claims': ['claims adjuster', 'claims analyst', 'claims examiner', 'claims representative'], | |
| 'actuary': ['actuary', 'actuarial analyst', 'actuarial consultant', 'pricing actuary'], | |
| 'insurance agent': ['insurance agent', 'insurance broker', 'insurance producer', 'insurance advisor'], | |
| # Engineering (Non-Software) - EXPANDED | |
| 'mechanical engineer': ['mechanical engineer', 'mechanical designer', 'cad engineer', 'product engineer'], | |
| 'electrical engineer': ['electrical engineer', 'electronics engineer', 'hardware engineer', 'ee'], | |
| 'civil engineer': ['civil engineer', 'structural engineer', 'construction engineer', 'project engineer'], | |
| 'chemical engineer': ['chemical engineer', 'process engineer', 'manufacturing engineer'], | |
| 'aerospace': ['aerospace engineer', 'aeronautical engineer', 'flight engineer', 'propulsion'], | |
| 'industrial': ['industrial engineer', 'manufacturing engineer', 'process engineer', 'ie'], | |
| # Science / Research - NEW | |
| 'biologist': ['biologist', 'research scientist', 'lab scientist', 'microbiologist', 'molecular biologist'], | |
| 'chemist': ['chemist', 'analytical chemist', 'research chemist', 'quality chemist'], | |
| 'environmental': ['environmental scientist', 'environmental engineer', 'environmental consultant'], | |
| 'lab technician': ['lab technician', 'laboratory technician', 'research technician', 'lab assistant'], | |
| # Media / Journalism - NEW | |
| 'journalist': ['journalist', 'reporter', 'correspondent', 'news writer', 'staff writer'], | |
| 'editor': ['editor', 'managing editor', 'copy editor', 'content editor', 'senior editor'], | |
| 'podcast': ['podcast producer', 'audio producer', 'podcast host', 'audio engineer'], | |
| 'social media': ['social media manager', 'social media specialist', 'community manager', 'social strategist'], | |
| # Real Estate - NEW | |
| 'real estate agent': ['real estate agent', 'realtor', 'real estate broker', 'listing agent'], | |
| 'property manager': ['property manager', 'building manager', 'leasing manager', 'asset manager'], | |
| 'appraiser': ['appraiser', 'real estate appraiser', 'property appraiser', 'valuation analyst'], | |
| # Consulting - NEW | |
| 'consultant': ['consultant', 'management consultant', 'strategy consultant', 'business consultant'], | |
| 'it consultant': ['it consultant', 'technology consultant', 'systems consultant', 'sap consultant'], | |
| 'strategy': ['strategy consultant', 'strategic advisor', 'strategy analyst', 'corporate strategy'], | |
| # Customer Service - EXPANDED | |
| 'customer service': ['customer service', 'customer support', 'support specialist', 'helpdesk', 'service rep', 'csr'], | |
| 'customer success': ['customer success manager', 'csm', 'customer success', 'client success'], | |
| 'support manager': ['support manager', 'customer support manager', 'service manager'], | |
| 'technical support': ['technical support', 'tech support', 'it support', 'it helpdesk', 'desktop support'], | |
| # Trades - NEW | |
| 'electrician': ['electrician', 'electrical technician', 'journeyman electrician', 'master electrician'], | |
| 'plumber': ['plumber', 'plumbing technician', 'pipefitter', 'journeyman plumber'], | |
| 'hvac': ['hvac technician', 'hvac installer', 'hvac mechanic', 'heating and cooling'], | |
| 'carpenter': ['carpenter', 'woodworker', 'cabinet maker', 'finish carpenter'], | |
| } | |
| resume_lower = resume.lower() | |
| jd_lower = job_desc.lower() | |
| # Find role in JD - check all patterns | |
| jd_role = None | |
| max_matches = 0 | |
| for role, variations in role_patterns.items(): | |
| matches = sum(1 for var in variations if var in jd_lower) | |
| if matches > max_matches: | |
| max_matches = matches | |
| jd_role = role | |
| if not jd_role: | |
| # Fallback: check for any professional words in RESUME | |
| professional_indicators = ['manager', 'engineer', 'analyst', 'specialist', 'coordinator', 'director', | |
| 'consultant', 'developer', 'designer', 'administrator', 'supervisor', | |
| 'technician', 'associate', 'representative', 'officer', 'executive'] | |
| # Check if resume has professional role indicators | |
| resume_has_professional = any(ind in resume_lower for ind in professional_indicators) | |
| jd_has_professional = any(ind in jd_lower for ind in professional_indicators) | |
| if resume_has_professional and jd_has_professional: | |
| return 50 # Both have some professional content, but can't match specifically | |
| elif resume_has_professional or jd_has_professional: | |
| return 30 # Only one side has professional content | |
| return 20 # Can't determine role - low score | |
| # Check if resume has matching role | |
| role_variations = role_patterns.get(jd_role, [jd_role]) | |
| if any(var in resume_lower for var in role_variations): | |
| return 100 | |
| # Check for related roles with fuzzy matching | |
| for var in role_variations: | |
| # Check first 500 chars (title area) - high priority | |
| if var in resume_lower[:500]: | |
| return 95 | |
| # Check substring match (5+ char prefix) | |
| if any(var[:5] in word for word in resume_lower.split() if len(var) >= 5): | |
| return 80 | |
| # Check 4-char prefix match | |
| if len(var) >= 4 and any(var[:4] in word for word in resume_lower.split()): | |
| return 65 | |
| # Check for generic professional overlap - different roles | |
| resume_has_roles = any(any(v in resume_lower for v in vars) for vars in role_patterns.values()) | |
| if resume_has_roles: | |
| return 40 # Has A role, but not the RIGHT role - significant penalty | |
| return 15 # No professional role detected in resume | |
| def _format_score(self, resume: str) -> float: | |
| """Score based on ATS-friendly formatting - REALISTIC SCORING.""" | |
| # Input validation - empty/minimal resumes get minimal scores | |
| if len(resume.strip()) < 50: | |
| return 5 | |
| score = 0 | |
| elements_found = 0 | |
| total_elements = 7 # Number of formatting elements we check | |
| # Email present (essential for contact) | |
| if re.search(r'[\w\.-]+@[\w\.-]+\.\w+', resume): | |
| score += 20 | |
| elements_found += 1 | |
| # Phone present (essential for contact) | |
| if re.search(r'\+?[\d\s\-\(\)]{10,}', resume): | |
| score += 20 | |
| elements_found += 1 | |
| # Bullet points (proper formatting) - more patterns | |
| if re.search(r'β’|\-\s|\*\s|^\s*\d+\.|^\s*[a-z]\)', resume, re.MULTILINE): | |
| score += 15 | |
| elements_found += 1 | |
| # LinkedIn/GitHub (professional presence) | |
| if re.search(r'linkedin|github', resume.lower()): | |
| score += 10 | |
| elements_found += 1 | |
| # Has dates (shows proper experience formatting) | |
| if re.search(r'\d{4}|present|current', resume.lower()): | |
| score += 15 | |
| elements_found += 1 | |
| # Has location/address | |
| if re.search(r'\b[A-Z][a-z]+,?\s+[A-Z]{2}\b|\bcity\b|\bstate\b', resume): | |
| score += 10 | |
| elements_found += 1 | |
| # Has name (capitalized words at start) | |
| if re.search(r'^[A-Z][a-z]+\s+[A-Z][a-z]+', resume.strip()): | |
| score += 10 | |
| elements_found += 1 | |
| # If no formatting elements found, return very low score | |
| if elements_found == 0: | |
| return 10 | |
| return min(100, score) | |
| def _section_score(self, resume: str) -> float: | |
| """Score based on standard section presence - REALISTIC SCORING.""" | |
| # Input validation - empty/minimal resumes get minimal scores | |
| if len(resume.strip()) < 50: | |
| return 5 | |
| resume_lower = resume.lower() | |
| # Core sections that most resumes should have | |
| core_sections = { | |
| 'experience': ['experience', 'employment', 'work history', 'professional experience', | |
| 'career', 'work experience', 'professional background', 'employment history', | |
| 'positions held', 'career history', 'professional history'], | |
| 'skills': ['skills', 'technical skills', 'competencies', 'technologies', 'expertise', | |
| 'proficiencies', 'core competencies', 'areas of expertise', 'technical expertise', | |
| 'key skills', 'professional skills', 'skill set'], | |
| } | |
| # Optional sections that add value | |
| optional_sections = { | |
| 'summary': ['summary', 'objective', 'profile', 'about', 'introduction', 'overview', | |
| 'professional summary', 'career objective', 'executive summary', 'highlights'], | |
| 'education': ['education', 'academic', 'qualification', 'degree', 'university', | |
| 'college', 'training', 'academic background', 'educational background', | |
| 'school', 'bachelor', 'master', 'phd', 'mba', 'certification'], | |
| 'certifications': ['certification', 'certificate', 'credentials', 'licensed', 'certif', | |
| 'accreditation', 'licenses', 'professional development', 'training'], | |
| 'achievements': ['achievement', 'accomplishment', 'award', 'honor', 'recognition', 'highlights'], | |
| 'projects': ['project', 'portfolio', 'case stud', 'initiatives'], | |
| } | |
| # Check for implicit experience (job titles, dates indicate experience section) | |
| job_titles = ['manager', 'engineer', 'analyst', 'developer', 'director', 'specialist', | |
| 'coordinator', 'consultant', 'lead', 'senior', 'junior', 'associate', | |
| 'supervisor', 'administrator', 'officer', 'technician', 'representative', | |
| 'executive', 'accountant', 'nurse', 'teacher', 'designer', 'writer'] | |
| has_job_indicators = bool(re.search(r'\d{4}\s*[-β]\s*(?:\d{4}|present|current)', resume_lower)) | |
| has_job_titles = any(title in resume_lower for title in job_titles) | |
| core_found = sum(1 for keywords in core_sections.values() if any(kw in resume_lower for kw in keywords)) | |
| optional_found = sum(1 for keywords in optional_sections.values() if any(kw in resume_lower for kw in keywords)) | |
| # If resume has job indicators, give credit for implicit experience section | |
| if (has_job_indicators or has_job_titles) and core_found == 0: | |
| core_found = 1 | |
| # REALISTIC SCORING: Start at 0, build up based on sections found | |
| # Core sections: 25 points each (max 50 for both) | |
| # Optional sections: 10 points each (max 50 for 5 sections) | |
| core_score = core_found * 25 | |
| optional_score = optional_found * 10 | |
| # If no sections found at all, very low score | |
| if core_found == 0 and optional_found == 0: | |
| return 15 | |
| return min(100, core_score + optional_score) | |
| def _action_verb_score(self, resume: str) -> float: | |
| """Score based on strong action verb usage - REALISTIC SCORING.""" | |
| # Input validation - empty/minimal resumes get minimal scores | |
| if len(resume.strip()) < 50: | |
| return 5 | |
| resume_lower = resume.lower() | |
| found = sum(1 for v in self.action_verbs if re.search(rf'\b{v}', resume_lower)) | |
| # REALISTIC SCORING: Start at 0, each action verb adds points | |
| # 0 verbs = 10%, 1-2 verbs = 30-40%, 3-5 verbs = 50-70%, 6+ verbs = 80-100% | |
| if found == 0: | |
| return 10 | |
| elif found <= 2: | |
| return 20 + (found * 10) | |
| elif found <= 5: | |
| return 40 + ((found - 2) * 10) | |
| elif found <= 10: | |
| return 70 + ((found - 5) * 6) | |
| else: | |
| return min(100, 90 + (found - 10) * 2) | |
| def _quantification_score(self, resume: str) -> float: | |
| """Score based on quantified achievements - REALISTIC SCORING.""" | |
| # Input validation - empty/minimal resumes get minimal scores | |
| if len(resume.strip()) < 50: | |
| return 5 | |
| patterns = [ | |
| r'\d+%', # Percentages | |
| r'\$[\d,\.]+[MKB]?', # Dollar amounts | |
| r'\d+\+?\s*(?:years?|months?)', # Time periods | |
| r'\d+[MKB]\+?', # Large numbers with suffix (1M, 5K) | |
| r'#\d+', # Rankings (#1, top #10) | |
| r'\d+\+?\s*(?:customers?|users?|clients?|employees?|team\s*members?|staff|people|patients?|students?|members?|associates?|reps?|agents?|nurses?|engineers?|developers?)', # People counts | |
| r'\d+x', # Multipliers (3x, 10x) | |
| r'top\s*\d+%?', # Top rankings | |
| r'\d+\s*(?:projects?|deals?|accounts?|transactions?|contracts?|cases?|clients?|positions?|requisitions?|hires?)', # Work counts | |
| r'\d+\s*(?:million|billion|thousand)', # Large numbers written | |
| r'\d{1,3}(?:,\d{3})+', # Numbers with commas (1,000,000) | |
| r'\d+\s*(?:per\s*(?:day|week|month|year|hour|shift))', # Rate metrics | |
| r'\d+\s*(?:daily|weekly|monthly|annually|yearly)', # Frequency | |
| r'\d+\s*(?:hours?|days?|weeks?|minutes?)', # Time | |
| r'\d+\s*(?:interviews?|reviews?|audits?|reports?|presentations?|meetings?|calls?)', # Work output | |
| r'(?:reduced|increased|improved|grew|saved|generated|delivered|managed|led|oversaw|handled|closed|achieved|exceeded|surpassed|maintained|built|developed|created|launched|completed)\s*(?:by\s*)?\d+', # Action + number | |
| r'\d+\s*(?:teams?|departments?|offices?|locations?|sites?|branches?|units?|facilities?|stores?)', # Organizational scale | |
| r'\d+\s*(?:products?|features?|releases?|launches?|applications?|systems?|tools?|platforms?)', # Product metrics | |
| r'\d+\s*(?:campaigns?|initiatives?|programs?|events?|workshops?|trainings?|courses?)', # Program metrics | |
| r'(?:over|more than|approximately|about|nearly|almost|up to|exceeding)\s*\d+', # Approximations | |
| r'\d+\s*(?:countries|regions|states|markets|territories|cities)', # Geographic scope | |
| r'\d+-(?:bed|person|member|seat)', # Capacity descriptions (40-bed unit) | |
| r'\d+\s*(?:vendors?|suppliers?|partners?|contractors?|agencies?)', # Business relationships | |
| r'\d+\s*(?:downloads?|installs?|views?|clicks?|impressions?|conversions?|leads?)', # Digital metrics | |
| r'\d+\s*(?:articles?|papers?|publications?|patents?|blogs?|posts?)', # Content metrics | |
| r'\d+\s*(?:beds?|rooms?|units?|seats?|pods?)', # Facility metrics | |
| r'\d+\s*(?:tickets?|issues?|requests?|inquiries?)', # Support metrics | |
| ] | |
| total_quantifications = 0 | |
| for pattern in patterns: | |
| matches = re.findall(pattern, resume, re.IGNORECASE) | |
| total_quantifications += len(matches) | |
| # Also count standalone significant numbers (likely metrics) | |
| # Numbers like 500, 1000, 50000 that aren't part of dates | |
| standalone_numbers = re.findall(r'(?<!\d)\d{2,}(?:,\d{3})*(?!\d)', resume) | |
| # Filter out years (1990-2030) | |
| standalone_numbers = [n for n in standalone_numbers if not (1980 <= int(n.replace(',', '')[:4]) <= 2030 and len(n.replace(',', '')) == 4)] | |
| total_quantifications += len(standalone_numbers) // 2 # Partial credit for standalone numbers | |
| # REALISTIC SCORING: Start at 0, each quantification adds points | |
| # 0 quants = 10%, 1-2 = 30-40%, 3-5 = 50-70%, 6+ = 80-100% | |
| if total_quantifications == 0: | |
| # Check if resume is too short | |
| if len(resume.strip()) < 50: | |
| return 5 | |
| return 15 # No quantifications in a real resume | |
| elif total_quantifications <= 2: | |
| return 25 + (total_quantifications * 10) | |
| elif total_quantifications <= 5: | |
| return 45 + ((total_quantifications - 2) * 10) | |
| elif total_quantifications <= 10: | |
| return 75 + ((total_quantifications - 5) * 5) | |
| else: | |
| return min(100, 95 + (total_quantifications - 10)) | |
| def get_keyword_analysis(self, resume: str, job_desc: str) -> Tuple[List[str], List[str]]: | |
| """Get detailed keyword analysis with taxonomy expansion and fuzzy matching.""" | |
| jd_lower = job_desc.lower() | |
| resume_lower = resume.lower() | |
| # Multi-word terms across ALL DOMAINS | |
| multi_word_terms = [ | |
| # Technology / AI / ML | |
| 'machine learning', 'deep learning', 'natural language processing', | |
| 'large language model', 'generative ai', 'prompt engineering', | |
| 'feature store', 'data pipeline', 'reinforcement learning', | |
| 'computer vision', 'neural network', 'transfer learning', | |
| 'agentic workflow', 'similarity search', 'gpu optimization', | |
| 'fine tuning', 'model inference', 'cloud native', | |
| 'recommender system', 'embedding model', 'vector database', | |
| 'software development', 'full stack', 'front end', 'back end', | |
| 'continuous integration', 'continuous deployment', 'version control', | |
| 'agile methodology', 'scrum master', 'sprint planning', | |
| # Finance / Accounting | |
| 'financial analysis', 'financial modeling', 'financial planning', | |
| 'budget management', 'variance analysis', 'cash flow', | |
| 'accounts payable', 'accounts receivable', 'general ledger', | |
| 'risk management', 'credit risk', 'market risk', 'operational risk', | |
| 'due diligence', 'internal audit', 'external audit', | |
| 'tax planning', 'tax compliance', 'financial reporting', | |
| 'investment banking', 'private equity', 'venture capital', | |
| 'portfolio management', 'asset management', 'wealth management', | |
| # Marketing / Sales | |
| 'digital marketing', 'content marketing', 'email marketing', | |
| 'search engine optimization', 'pay per click', 'social media marketing', | |
| 'lead generation', 'sales pipeline', 'customer acquisition', | |
| 'brand management', 'market research', 'competitive analysis', | |
| 'customer relationship management', 'account management', | |
| 'business development', 'revenue growth', 'quota attainment', | |
| # HR | |
| 'talent acquisition', 'performance management', 'employee engagement', | |
| 'learning and development', 'succession planning', 'workforce planning', | |
| 'compensation and benefits', 'employee relations', 'labor relations', | |
| 'diversity and inclusion', 'organizational development', | |
| # Healthcare | |
| 'patient care', 'clinical research', 'clinical trials', | |
| 'electronic health records', 'medical records', 'healthcare management', | |
| 'quality improvement', 'patient safety', 'care coordination', | |
| # Operations / Supply Chain | |
| 'supply chain management', 'inventory management', 'warehouse management', | |
| 'process improvement', 'lean manufacturing', 'six sigma', | |
| 'quality control', 'quality assurance', 'vendor management', | |
| 'project management', 'program management', 'change management', | |
| # Legal | |
| 'contract negotiation', 'legal research', 'intellectual property', | |
| 'regulatory compliance', 'corporate governance', 'risk assessment', | |
| # General Professional | |
| 'cross functional', 'stakeholder management', 'strategic planning', | |
| 'team leadership', 'client relationship', 'problem solving', | |
| ] | |
| # Check which multi-word terms are in JD but missing from resume | |
| important_multiword = [] | |
| for term in multi_word_terms: | |
| if term in jd_lower: | |
| important_multiword.append(term) | |
| # Single word extraction | |
| jd_words = re.findall(r'\b[a-zA-Z]{3,}\b', jd_lower) | |
| jd_words = [w for w in jd_words if w not in self.stop_words] | |
| keyword_counts = Counter(jd_words) | |
| # Get important keywords (appearing multiple times or in taxonomy) | |
| important_keywords = [] | |
| for word, count in keyword_counts.most_common(50): | |
| # Check if it's a technical term | |
| is_technical = any(word in variations for variations in self.skills_taxonomy.values()) | |
| if count >= 2 or is_technical: | |
| important_keywords.append(word) | |
| # Expand with taxonomy | |
| resume_lower = resume.lower() | |
| resume_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', resume_lower)) | |
| resume_stems = {self._stem_word(w) for w in resume_words} | |
| matched = [] | |
| missing = [] | |
| for kw in important_keywords[:30]: | |
| found = False | |
| # Check 1: Direct match | |
| if kw in resume_lower: | |
| found = True | |
| # Check 2: Stem match (collaborate = collaborated = collaborating) | |
| if not found: | |
| kw_stem = self._stem_word(kw) | |
| if kw_stem in resume_stems or any(kw_stem in stem for stem in resume_stems): | |
| found = True | |
| # Check 3: Containment (support in supported, supporting) | |
| if not found: | |
| if any(kw in word or word in kw for word in resume_words if len(word) > 3): | |
| found = True | |
| # Check 4: Taxonomy expansion | |
| if not found: | |
| kw_expanded = self._expand_with_taxonomy([kw]) | |
| if any(exp in resume_lower for exp in kw_expanded): | |
| found = True | |
| # Check 5: Fuzzy match against resume words | |
| if not found: | |
| for resume_word in resume_words: | |
| if self._fuzzy_match(kw, resume_word): | |
| found = True | |
| break | |
| if found: | |
| matched.append(kw) | |
| else: | |
| missing.append(kw) | |
| # Also check multi-word terms | |
| for term in important_multiword: | |
| # Check if any variation of this term exists in resume | |
| # Try both with space and underscore as key | |
| term_variations = self.skills_taxonomy.get(term, | |
| self.skills_taxonomy.get(term.replace(' ', '_'), [term])) | |
| term_found = any(var in resume_lower for var in term_variations) or term in resume_lower | |
| # Also check fuzzy match for variations | |
| if not term_found: | |
| for var in term_variations: | |
| if var in resume_lower or any(self._fuzzy_match(var, rw) for rw in resume_words): | |
| term_found = True | |
| break | |
| if not term_found: | |
| # Check if component words exist | |
| term_words = term.split() | |
| if not all(any(tw in rw for rw in resume_words) for tw in term_words): | |
| if term not in missing: | |
| missing.insert(0, term) # Add at beginning (more important) | |
| else: | |
| if term not in matched: | |
| matched.insert(0, term) | |
| return matched[:20], missing[:15] | |
| # ============== PDF GENERATOR ============== | |
def extract_candidate_name(resume_content: str) -> str:
    """Extract the candidate's name from raw resume text.

    Strategy:
      1. Run spaCy NER over the first few lines (where a name normally
         appears) and take the first PERSON entity that passes validation.
      2. Fall back to rule-based scanning of the first 15 lines, skipping
         section headers, contact info, bullets and company/title lines.
      3. Return the literal string "Candidate" when nothing plausible is found.

    Args:
        resume_content: Plain resume text, one line per physical line.

    Returns:
        The candidate's name (title-cased when the source was ALL CAPS),
        or "Candidate" as a last resort.
    """
    lines = resume_content.strip().split('\n')
    # A name almost always appears in the first few lines; cap at 500 chars
    # so the (optional) NER pass stays cheap.
    first_lines = '\n'.join(lines[:5])[:500]

    # Company / non-name indicators. Single tokens are matched per *word*
    # (substring matching wrongly rejected names such as "Marco" because it
    # contains "co"); multi-word phrases are matched as substrings.
    single_word_indicators = {
        'inc', 'corp', 'corporation', 'llc', 'ltd', 'limited', 'company', 'co',
        'bank', 'chase', 'citi', 'citibank', 'jpmorgan', 'goldman',
        'google', 'meta', 'amazon', 'microsoft', 'apple', 'netflix', 'tesla',
        'technologies', 'solutions', 'consulting', 'services', 'partners', 'group',
        'capital', 'investments', 'financial', 'advisors', 'associates',
        'hdfc', 'icici', 'vodafone', 'infosys', 'wipro', 'tcs',
        'accenture', 'deloitte', 'kpmg', 'pwc', 'mckinsey', 'bcg', 'bain',
        'bayesian', 'markov', 'gaussian',  # technical terms often misidentified
    }
    phrase_indicators = {'morgan stanley', 'jp morgan', 'monte carlo'}

    def mentions_company(text):
        """True when *text* looks like a company/technical line, not a person."""
        text_lower = text.lower()
        if any(phrase in text_lower for phrase in phrase_indicators):
            return True
        words = set(re.findall(r'[a-z]+', text_lower))
        return bool(words & single_word_indicators)

    def is_valid_person_name(name):
        """Check if name looks like a real person's name."""
        if not name or len(name) < 4:
            return False
        if mentions_company(name):
            return False
        words = name.split()
        if len(words) < 2 or len(words) > 5:
            return False
        # Must be mostly letters (allows a few dots/hyphens/initials).
        letter_ratio = sum(1 for c in name if c.isalpha() or c.isspace()) / len(name)
        return letter_ratio > 0.85

    # METHOD 1: spaCy NER on the FIRST FEW LINES only.
    try:
        import spacy
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            # Model not installed: download with the *current* interpreter
            # (sys.executable), not whatever "python" happens to be on PATH.
            import subprocess
            import sys
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           check=True)
            nlp = spacy.load("en_core_web_sm")
        doc = nlp(first_lines)
        # Take the earliest PERSON entity that passes validation.
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                name = ent.text.strip()
                name = re.sub(r'[,;:\-β|]+$', '', name).strip()
                if is_valid_person_name(name):
                    name = ' '.join(name.split()[:4])
                    return name.title() if name.isupper() else name
    except Exception:
        # spaCy missing or model download failed: fall through to rules.
        pass

    # METHOD 2: rule-based fallback over the first 15 lines.
    skip_patterns = ['resume', 'curriculum vitae', 'cv', 'contact', 'personal', 'profile',
                     'phone:', 'email:', 'address:', 'summary', 'objective', 'experience',
                     'education', 'skills', 'professional']
    # Job-title words: a name candidate is truncated at the first title word.
    title_words = {
        'vice', 'president', 'vp', 'director', 'manager', 'engineer', 'analyst',
        'developer', 'consultant', 'specialist', 'lead', 'senior', 'junior',
        'associate', 'executive', 'coordinator', 'administrator', 'officer',
        'ceo', 'cto', 'cfo', 'coo', 'chief', 'head', 'applied', 'data',
        'software', 'product', 'project', 'avp', 'svp', 'evp', 'md'
    }
    for line in lines[:15]:
        line = line.strip()
        if not line or len(line) < 3:
            continue
        line_lower = line.lower()
        if any(pat in line_lower for pat in skip_patterns):
            continue
        if mentions_company(line):
            continue
        # Skip contact lines (email or phone number).
        if '@' in line or re.search(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}', line):
            continue
        if 'linkedin' in line_lower or 'github' in line_lower or 'kaggle' in line_lower:
            continue
        if line.startswith('β’') or line.startswith('-') or line.startswith('*'):
            continue
        # Strip a trailing "NAME | TITLE" style suffix.
        name = line
        for sep in ['|', ' - ', ' β ']:
            if sep in name:
                name = name.split(sep)[0].strip()
        # Keep leading words up to the first job-title word.
        name_words = []
        for word in name.split():
            if re.sub(r'[^\w]', '', word).lower() in title_words:
                break
            name_words.append(word)
        if len(name_words) >= 2:
            name = ' '.join(name_words[:4])
            name = re.sub(r'[,;:\-β|]+$', '', name).strip()
            if is_valid_person_name(name):
                return name.title() if name.isupper() else name
    return "Candidate"
def generate_pdf(resume_content: str, color_scheme: str, candidate_name: str = None) -> str:
    """Render the optimized resume text as a professionally styled PDF.

    The plain text is scanned line by line; heuristics classify each line as
    name, contact info, section header, company/role line, bullet, or body
    text, and each class gets its own ReportLab paragraph style. Branch order
    inside the loop matters: each classification ends with `continue`, so the
    first matching rule wins.

    Args:
        resume_content: Plain-text resume, newline-separated.
        color_scheme: Key into SCHEMES below; unknown values fall back to
            'Navy Blue'.
        candidate_name: Used only to build the output filename; may be None.

    Returns:
        Path of the generated PDF in the temp directory, or None on failure
        (the exception is printed, not re-raised, so the caller can degrade
        gracefully in the UI).
    """
    try:
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import ParagraphStyle
        from reportlab.lib.units import inch
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable, ListFlowable, ListItem
        from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
        # Color palettes: primary = headings/rules, accent = sub-headings,
        # text = body copy.
        SCHEMES = {
            'Navy Blue': {'primary': '#1a365d', 'accent': '#2c5282', 'text': '#2d3748'},
            'Forest Green': {'primary': '#1c4532', 'accent': '#276749', 'text': '#2d3748'},
            'Burgundy': {'primary': '#742a2a', 'accent': '#9b2c2c', 'text': '#2d3748'},
            'Charcoal': {'primary': '#1a202c', 'accent': '#4a5568', 'text': '#2d3748'},
            'Royal Purple': {'primary': '#44337a', 'accent': '#6b46c1', 'text': '#2d3748'}
        }
        scheme = SCHEMES.get(color_scheme, SCHEMES['Navy Blue'])
        primary = colors.HexColor(scheme['primary'])
        accent = colors.HexColor(scheme['accent'])
        text_color = colors.HexColor(scheme['text'])
        light_gray = colors.HexColor('#e2e8f0')

        def escape(t):
            """Sanitize a line for ReportLab's mini-HTML Paragraph markup."""
            if not t: return ""
            # Clean up ALL PDF encoding artifacts
            t = str(t)
            t = re.sub(r'\(cid:\d+\)', '', t)  # Remove all cid patterns
            # Replace various bullet characters with a simple dash (safe for PDF)
            t = t.replace('β', '-').replace('β', '-').replace('βͺ', '-').replace('β ', '-')
            t = t.replace('β’', '-').replace('βΊ', '-').replace('βΈ', '-').replace('β', '-')
            t = t.replace('β', '-').replace('β', '-')  # Replace divider chars
            # Escape XML-special chars last ('&' first so it isn't re-escaped).
            t = t.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            return t.strip()

        # Generate filename based on candidate name
        if candidate_name:
            safe_name = re.sub(r'[^\w\s\-]', '', candidate_name).replace(' ', '_')
        else:
            safe_name = "Resume"
        temp_path = os.path.join(tempfile.gettempdir(), f"{safe_name}_ATS_Optimized.pdf")
        doc = SimpleDocTemplate(temp_path, pagesize=letter,
                                rightMargin=0.5*inch, leftMargin=0.5*inch,
                                topMargin=0.4*inch, bottomMargin=0.4*inch)
        story = []
        # Define styles (sizes in points; leading = line height).
        name_style = ParagraphStyle('Name', fontSize=20, textColor=primary,
                                    fontName='Helvetica-Bold', alignment=TA_CENTER,
                                    spaceAfter=2, leading=24)
        title_style = ParagraphStyle('Title', fontSize=11, textColor=accent,
                                     fontName='Helvetica-Oblique', alignment=TA_CENTER,
                                     spaceAfter=4, leading=14)
        contact_style = ParagraphStyle('Contact', fontSize=9, textColor=text_color,
                                       fontName='Helvetica', alignment=TA_CENTER,
                                       spaceAfter=8, leading=12)
        section_style = ParagraphStyle('Section', fontSize=11, textColor=primary,
                                       fontName='Helvetica-Bold', spaceBefore=14,
                                       spaceAfter=4, leading=14)
        company_style = ParagraphStyle('Company', fontSize=10, textColor=text_color,
                                       fontName='Helvetica-Bold', spaceBefore=8,
                                       spaceAfter=2, leading=13)
        role_style = ParagraphStyle('Role', fontSize=9, textColor=accent,
                                    fontName='Helvetica-Oblique', spaceAfter=4, leading=12)
        body_style = ParagraphStyle('Body', fontSize=9.5, textColor=text_color,
                                    fontName='Helvetica', spaceAfter=3, leading=12,
                                    alignment=TA_JUSTIFY)
        # Hanging indent so wrapped bullet lines align under the text.
        bullet_style = ParagraphStyle('Bullet', fontSize=9.5, textColor=text_color,
                                      fontName='Helvetica', leftIndent=15,
                                      firstLineIndent=-15, spaceAfter=4, leading=13,
                                      bulletIndent=0)
        # Section headers to detect (substring match against the lowercased line)
        section_keywords = ['professional summary', 'summary', 'objective', 'profile',
                            'professional experience', 'experience', 'employment', 'work history',
                            'education', 'academic', 'technical skills', 'skills', 'competencies',
                            'certifications', 'certificates', 'projects', 'achievements',
                            'awards', 'publications', 'leadership', 'community', 'competitive']
        lines = resume_content.split('\n')
        # Process resume line by line with smart detection
        i = 0
        name_found = False
        while i < len(lines):
            line = lines[i].strip()
            if not line:
                i += 1
                continue
            line_lower = line.lower()
            # First non-empty line is the name
            if not name_found:
                # Check if it contains job title (split name and title)
                if '|' in line or ' - ' in line:
                    parts = re.split(r'\s*[\|β-]\s*', line, maxsplit=1)
                    story.append(Paragraph(escape(parts[0].strip()), name_style))
                    if len(parts) > 1:
                        story.append(Paragraph(escape(parts[1].strip()), title_style))
                else:
                    story.append(Paragraph(escape(line), name_style))
                name_found = True
                i += 1
                continue
            # Contact info (early lines with email, phone, LinkedIn)
            if i <= 5 and ('@' in line or '+91' in line or '+1' in line or 'linkedin' in line_lower or 'github' in line_lower):
                story.append(Paragraph(escape(line), contact_style))
                i += 1
                continue
            # Section headers: keyword hit or a short ALL-CAPS line.
            is_section = any(kw in line_lower for kw in section_keywords) and len(line) < 60
            is_all_caps = line.isupper() and len(line) < 50 and len(line) > 3
            if is_section or is_all_caps:
                story.append(Spacer(1, 6))
                story.append(Paragraph(escape(line.upper()), section_style))
                # Horizontal rule under each section heading.
                story.append(HRFlowable(width="100%", thickness=1, color=primary, spaceAfter=6))
                i += 1
                continue
            # Company/Role lines (COMPANY NAME | Location | Date pattern)
            company_pattern = re.match(r'^([A-Z][A-Za-z\s&\.,]+(?:LTD|LIMITED|INC|CO|CORP|BANK|CHASE)?\.?)\s*[\|β-]\s*(.+)$', line)
            if company_pattern or (line.isupper() and ('|' in line or any(m in line for m in ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']))):
                story.append(Paragraph(escape(line), company_style))
                i += 1
                # Check if next line is role/title
                if i < len(lines):
                    next_line = lines[i].strip()
                    if next_line and not next_line.startswith('β’') and len(next_line) < 100:
                        # Likely a role line
                        if 'vice president' in next_line.lower() or 'manager' in next_line.lower() or 'lead' in next_line.lower() or 'team' in next_line.lower():
                            story.append(Paragraph(escape(next_line), role_style))
                            i += 1
                continue
            # Bullet points
            if line.startswith('β’') or line.startswith('-') or line.startswith('*') or line.startswith('β'):
                bullet_text = line.lstrip('β’-*ββΊ ')
                # Use simple dash for PDF compatibility (Unicode bullets cause cid encoding issues)
                story.append(Paragraph(f"- {escape(bullet_text)}", bullet_style))
                i += 1
                continue
            # Regular body text
            story.append(Paragraph(escape(line), body_style))
            i += 1
        doc.build(story)
        if os.path.exists(temp_path):
            return temp_path
        return None
    except Exception as e:
        # Best-effort: log and return None so the UI shows "no PDF" instead
        # of crashing the whole analysis.
        print(f"PDF Error: {e}")
        import traceback
        traceback.print_exc()
        return None
| # ============== MAIN FUNCTION ============== | |
def analyze_and_optimize(resume_file, job_description, industry, experience_level, color_scheme):
    """Main pipeline: parse the resume, score it, optimize with the LLM,
    re-score, and build the PDF + HTML preview.

    Always returns a 7-tuple matching the Gradio outputs:
    (scores_md, keywords_md, suggestions_md, original_text, optimized_text,
    pdf_path_or_None, preview_html). Error cases return a message in the
    first slot and empty values elsewhere.

    NOTE(review): `industry` and `experience_level` are accepted from the UI
    but not used in this function — presumably consumed elsewhere or reserved;
    confirm before removing.
    """
    if resume_file is None:
        return "β Please upload your resume", "", "", "", "", None, ""
    if not job_description or len(job_description.strip()) < 50:
        return "β Please paste a complete job description (at least 50 characters)", "", "", "", "", None, ""
    try:
        original_resume = parse_resume(resume_file)
        # parse_resume reports failures as strings prefixed "Error"/"Unsupported".
        if not original_resume or original_resume.startswith("Error") or original_resume.startswith("Unsupported"):
            return f"β {original_resume}", "", "", "", "", None, ""
        analyzer = ATSCompatibilityAnalyzer()
        before_analysis = analyzer.analyze(original_resume, job_description)
        optimized_resume, ai_suggestions = optimize_with_llm(original_resume, job_description)
        # Extract candidate name - ALWAYS extract from ORIGINAL resume first (most reliable)
        # The original resume has the actual person's name, not LLM hallucinations
        candidate_name = extract_candidate_name(original_resume)
        # Validate the extracted name looks like a real person's name
        def is_valid_person_name(name):
            """Check if name looks like a real person's name, not garbage."""
            if not name or name == "Candidate" or len(name) < 4:
                return False
            # Must have at least 2 words
            words = name.split()
            if len(words) < 2:
                return False
            # Must be mostly letters
            letter_ratio = sum(1 for c in name if c.isalpha() or c.isspace()) / len(name)
            if letter_ratio < 0.85:
                return False
            # Reject obvious non-names (job keywords, etc.)
            bad_words = {'reduced', 'feature', 'applied', 'senior', 'junior', 'manager',
                         'engineer', 'developer', 'analyst', 'consultant', 'specialist',
                         'experience', 'skills', 'summary', 'professional', 'objective'}
            name_words = set(w.lower() for w in words)
            if name_words & bad_words:
                return False
            return True
        # If original extraction failed or looks invalid, try AI-extracted name as backup
        # (the LLM may emit a "__CANDIDATE_NAME__: <name>" metadata line).
        if not is_valid_person_name(candidate_name):
            if '__CANDIDATE_NAME__:' in optimized_resume:
                for line in optimized_resume.split('\n'):
                    if line.startswith('__CANDIDATE_NAME__:'):
                        potential = line.replace('__CANDIDATE_NAME__:', '').strip()
                        if is_valid_person_name(potential):
                            candidate_name = potential
                            break
        # Final fallback - if still invalid, use "Candidate"
        if not is_valid_person_name(candidate_name):
            candidate_name = "Candidate"
        after_analysis = analyzer.analyze(optimized_resume, job_description)
        before_score = before_analysis['total_score']
        after_score = after_analysis['total_score']
        improvement = after_score - before_score
        # Calculate individual metric improvements
        kw_improvement = after_analysis['breakdown']['keyword_match'] - before_analysis['breakdown']['keyword_match']
        # Markdown comparison table shown in the UI (before/after per metric).
        scores_display = f"""## π Advanced ATS Compatibility Score
| Metric | Before | After | Ξ |
|--------|--------|-------|---|
| **π― Overall Score** | **{before_score}%** | **{after_score}%** | **{'+' if improvement >= 0 else ''}{improvement}%** |
| TF-IDF Keyword Match | {before_analysis['breakdown']['keyword_match']:.0f}% | {after_analysis['breakdown']['keyword_match']:.0f}% | {'+' if kw_improvement >= 0 else ''}{kw_improvement:.0f}% |
| Semantic Role Match | {before_analysis['breakdown']['semantic_match']:.0f}% | {after_analysis['breakdown']['semantic_match']:.0f}% | |
| Experience Match | {before_analysis['breakdown']['experience_match']:.0f}% | {after_analysis['breakdown']['experience_match']:.0f}% | |
| Skills Taxonomy | {before_analysis['breakdown']['skills_match']:.0f}% | {after_analysis['breakdown']['skills_match']:.0f}% | |
| Format Compliance | {before_analysis['breakdown']['format_score']:.0f}% | {after_analysis['breakdown']['format_score']:.0f}% | |
| Section Structure | {before_analysis['breakdown']['section_score']:.0f}% | {after_analysis['breakdown']['section_score']:.0f}% | |
| Action Verbs | {before_analysis['breakdown']['action_verbs']:.0f}% | {after_analysis['breakdown']['action_verbs']:.0f}% | |
| Quantification | {before_analysis['breakdown']['quantification']:.0f}% | {after_analysis['breakdown']['quantification']:.0f}% | |
### π¬ Scoring Methodology (Mimics Real ATS)
- **TF-IDF Keyword Match**: Weighted matching - rare/important terms score higher
- **Semantic Role Match**: "Data Scientist" β "ML Engineer" β "AI Engineer"
- **Experience Match**: Parses "5+ years" & calculates from date ranges
- **Skills Taxonomy**: ML=Machine Learning, NLP=Natural Language Processing, etc.
β οΈ *Using TF-IDF, stemming, fuzzy matching & skills taxonomy - similar to Workday, Taleo, Greenhouse algorithms*
"""
        # Enhanced keyword analysis (before vs after the LLM pass).
        matched, missing = analyzer.get_keyword_analysis(optimized_resume, job_description)
        before_matched, before_missing = analyzer.get_keyword_analysis(original_resume, job_description)
        new_keywords_matched = [kw for kw in matched if kw not in before_matched]
        keywords_display = f"""## π Detailed Keyword Analysis
### β Keywords Matched ({len(matched)})
`{' | '.join(matched) if matched else 'None detected'}`
### π New Keywords Added by AI ({len(new_keywords_matched)})
`{' | '.join(new_keywords_matched) if new_keywords_matched else 'No new keywords added'}`
### β Still Missing ({len(missing)}) - Consider adding manually:
`{' | '.join(missing) if missing else 'All major keywords present! π'}`
### π‘ Suggestions for Missing Keywords:
{chr(10).join([f"- **{kw}**: Add to Skills section or work into experience bullets" for kw in missing[:5]]) if missing else "- All important keywords are covered!"}
"""
        suggestions_display = "## π€ AI Optimization Changes\n\n"
        suggestions_display += "\n".join(ai_suggestions) if ai_suggestions else "No changes made."
        suggestions_display += f"\n\n### π Improvement Summary\n"
        suggestions_display += f"- **Overall Score**: {before_score}% β {after_score}% ({'+' if improvement >= 0 else ''}{improvement}%)\n"
        suggestions_display += f"- **Keyword Match**: {before_analysis['breakdown']['keyword_match']:.0f}% β {after_analysis['breakdown']['keyword_match']:.0f}%\n"
        suggestions_display += f"- **New Keywords Injected**: {len(new_keywords_matched)}\n"
        pdf_path = generate_pdf(optimized_resume, color_scheme, candidate_name)
        # Create HTML preview of the optimized resume
        preview_html = create_resume_preview(optimized_resume, candidate_name, color_scheme)
        return scores_display, keywords_display, suggestions_display, original_resume, optimized_resume, pdf_path, preview_html
    except Exception as e:
        # Surface unexpected failures in the UI rather than crashing Gradio.
        import traceback
        traceback.print_exc()
        return f"β Error: {str(e)}", "", "", "", "", None, ""
def create_resume_preview(resume_content: str, candidate_name: str, color_scheme: str) -> str:
    """Create a professional HTML preview of the optimized resume.

    All resume-derived text is passed through ``html.escape`` before being
    interpolated into the markup, so characters such as '<', '>' and '&' in
    the resume cannot break the preview layout or inject markup/script into
    the page (the previous version interpolated raw text).

    Args:
        resume_content: Plain-text optimized resume.
        candidate_name: Currently unused; kept for interface compatibility.
        color_scheme: Key into SCHEMES; unknown values fall back to 'Navy Blue'.

    Returns:
        A self-contained HTML fragment (a styled <div>) previewing the resume.
    """
    import html as _html  # stdlib; escapes untrusted resume text for HTML

    def esc(text):
        """HTML-escape one line of resume text."""
        return _html.escape(text)

    SCHEMES = {
        'Navy Blue': {'primary': '#1a365d', 'accent': '#2c5282', 'text': '#2d3748', 'bg': '#f7fafc', 'light': '#edf2f7'},
        'Forest Green': {'primary': '#1c4532', 'accent': '#276749', 'text': '#2d3748', 'bg': '#f0fff4', 'light': '#c6f6d5'},
        'Burgundy': {'primary': '#742a2a', 'accent': '#9b2c2c', 'text': '#2d3748', 'bg': '#fff5f5', 'light': '#fed7d7'},
        'Charcoal': {'primary': '#1a202c', 'accent': '#4a5568', 'text': '#2d3748', 'bg': '#f7fafc', 'light': '#e2e8f0'},
        'Royal Purple': {'primary': '#44337a', 'accent': '#6b46c1', 'text': '#2d3748', 'bg': '#faf5ff', 'light': '#e9d8fd'}
    }
    scheme = SCHEMES.get(color_scheme, SCHEMES['Navy Blue'])
    # Clean content - remove any metadata lines emitted by the LLM.
    lines = resume_content.split('\n')
    cleaned_lines = [l for l in lines if not l.startswith('__CANDIDATE_NAME__:')]
    # Section detection keywords (matched against the uppercased line).
    section_headers = ['PROFESSIONAL SUMMARY', 'SUMMARY', 'OBJECTIVE', 'PROFILE',
                       'EXPERIENCE', 'PROFESSIONAL EXPERIENCE', 'WORK EXPERIENCE', 'EMPLOYMENT',
                       'EDUCATION', 'ACADEMIC BACKGROUND', 'ACADEMIC',
                       'SKILLS', 'TECHNICAL SKILLS', 'CORE COMPETENCIES', 'KEY SKILLS',
                       'CERTIFICATIONS', 'CERTIFICATES', 'LICENSES',
                       'PROJECTS', 'KEY PROJECTS', 'NOTABLE PROJECTS',
                       'AWARDS', 'ACHIEVEMENTS', 'HONORS', 'RECOGNITION',
                       'PUBLICATIONS', 'RESEARCH', 'PATENTS',
                       'LEADERSHIP', 'VOLUNTEER', 'EXTRACURRICULAR',
                       'LANGUAGES', 'INTERESTS']
    html_parts = []
    name_rendered = False
    contact_rendered = False
    i = 0
    while i < len(cleaned_lines):
        line = cleaned_lines[i].strip()
        if not line:
            html_parts.append('<div style="height: 10px;"></div>')
            i += 1
            continue
        line_upper = line.upper()
        # Skip __CANDIDATE_NAME__ metadata
        if '__CANDIDATE_NAME__' in line:
            i += 1
            continue
        # Render name (first substantial text line, not a header)
        if not name_rendered and len(line) > 2:
            is_header = any(h in line_upper for h in section_headers)
            is_contact = '@' in line or re.search(r'\d{3}[-.\s]?\d{3}', line) or 'linkedin' in line.lower()
            if not is_header and not is_contact:
                # Check for title separator (NAME | TITLE or NAME - TITLE)
                if '|' in line or ' - ' in line or ' β ' in line:
                    parts = re.split(r'\s*[\|β-]\s*', line, maxsplit=1)
                    name_part = parts[0].strip()
                    title_part = parts[1].strip() if len(parts) > 1 else ''
                    html_parts.append(f'''
                    <div style="text-align: center; margin-bottom: 5px;">
                        <h1 style="color: {scheme['primary']}; margin: 0; font-size: 28px; font-weight: 700; letter-spacing: 1px;">
                            {esc(name_part.upper() if name_part.islower() else name_part)}
                        </h1>
                        {f'<div style="color: {scheme["accent"]}; font-size: 14px; font-style: italic; margin-top: 5px;">{esc(title_part)}</div>' if title_part else ''}
                    </div>
                    ''')
                else:
                    html_parts.append(f'''
                    <div style="text-align: center; margin-bottom: 5px;">
                        <h1 style="color: {scheme['primary']}; margin: 0; font-size: 28px; font-weight: 700; letter-spacing: 1px;">
                            {esc(line.upper() if line.islower() else line)}
                        </h1>
                    </div>
                    ''')
                name_rendered = True
                i += 1
                continue
        # Render contact info (lines with email, phone, LinkedIn early in doc)
        if not contact_rendered and i <= 8:
            if '@' in line or re.search(r'[\+]?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}', line) or 'linkedin' in line.lower() or 'github' in line.lower():
                html_parts.append(f'''
                <div style="text-align: center; color: {scheme['text']}; font-size: 11px; margin-bottom: 3px;">
                    {esc(line)}
                </div>
                ''')
                i += 1
                # Check if next lines are also contact info
                while i < len(cleaned_lines) and i <= 8:
                    next_line = cleaned_lines[i].strip()
                    if '@' in next_line or 'linkedin' in next_line.lower() or 'github' in next_line.lower() or re.search(r'[\+]?\d', next_line):
                        html_parts.append(f'''
                        <div style="text-align: center; color: {scheme['text']}; font-size: 11px; margin-bottom: 3px;">
                            {esc(next_line)}
                        </div>
                        ''')
                        i += 1
                    else:
                        break
                contact_rendered = True
                continue
        # Skip ===== divider lines (they're formatting aids, not content)
        if line.startswith('=') and line.endswith('=') and len(line) > 10:
            i += 1
            continue
        # Skip βββ Unicode divider lines
        if line.startswith('β') and len(line) > 10:
            i += 1
            continue
        # Skip --- divider lines
        if line.startswith('-') and len(line) > 5 and line.replace('-', '') == '':
            i += 1
            continue
        # Section headers
        if any(h in line_upper for h in section_headers) or (line.isupper() and len(line) > 3 and len(line) < 50):
            html_parts.append(f'''
            <div style="margin-top: 20px; margin-bottom: 10px;">
                <h2 style="color: {scheme['primary']}; font-size: 13px; font-weight: 700;
                           text-transform: uppercase; letter-spacing: 2px; margin: 0 0 5px 0;
                           border-bottom: 2px solid {scheme['accent']}; padding-bottom: 5px;">
                    {esc(line_upper)}
                </h2>
            </div>
            ''')
            i += 1
            continue
        # Company/Job header lines (contain dates, pipes, or are styled like headers)
        date_pattern = re.search(r'\b(20\d{2}|19\d{2}|Present|Current)\b', line, re.IGNORECASE)
        has_separator = '|' in line or ' β ' in line
        if date_pattern or (has_separator and len(line) < 120):
            html_parts.append(f'''
            <div style="display: flex; justify-content: space-between; align-items: baseline;
                        margin-top: 12px; margin-bottom: 4px;">
                <span style="color: {scheme['primary']}; font-weight: 600; font-size: 12px;">{esc(line)}</span>
            </div>
            ''')
            i += 1
            continue
        # Bullet points
        if line.startswith('β’') or line.startswith('-') or line.startswith('*') or line.startswith('βΊ') or line.startswith('βͺ'):
            bullet_text = line.lstrip('β’-*βΊβͺ ')
            html_parts.append(f'''
            <div style="margin-left: 20px; margin-bottom: 5px; color: {scheme['text']}; font-size: 11px; line-height: 1.5;">
                <span style="color: {scheme['accent']}; margin-right: 8px;">β’</span>{esc(bullet_text)}
            </div>
            ''')
            i += 1
            continue
        # Regular paragraph text
        html_parts.append(f'''
        <div style="color: {scheme['text']}; font-size: 11px; margin-bottom: 5px; line-height: 1.5; text-align: justify;">
            {esc(line)}
        </div>
        ''')
        i += 1
    # Wrap in styled container
    html_content = f'''
    <div style="
        font-family: 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
        background: white;
        padding: 40px 50px;
        border-radius: 8px;
        box-shadow: 0 4px 20px rgba(0,0,0,0.1);
        max-width: 850px;
        margin: 20px auto;
        border-top: 4px solid {scheme['primary']};
    ">
        {''.join(html_parts)}
    </div>
    '''
    return html_content
# ============== GRADIO UI ==============
# Layout: inputs (file + dropdowns | job description), one action button,
# then score/keyword/suggestion panels, before/after comparison, HTML
# preview, and the downloadable PDF.
with gr.Blocks(title="ATS Resume Optimizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# π ATS Resume Optimizer Pro
### Powered by Claude 3.5 Sonnet (Anthropic's SOTA Model)
Upload your resume and paste a job description. Our AI will:
- **Analyze** keyword matching and ATS compatibility using TF-IDF & semantic algorithms
- **Optimize** wording with professionally formatted output (without adding fake info)
- **Generate** a polished, ATS-friendly PDF with proper formatting
β‘ *Using Anthropic's most advanced model for premium-quality resume optimization*
---
""")
    with gr.Row():
        with gr.Column(scale=1):
            resume_file = gr.File(label="π€ Upload Resume (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".doc", ".txt"])
            industry = gr.Dropdown(
                choices=["Technology/IT", "Finance/Banking", "Healthcare", "Marketing/Sales",
                         "Engineering", "Consulting", "Legal", "Education", "Other"],
                value="Technology/IT", label="Industry"
            )
            experience_level = gr.Dropdown(
                choices=["Entry Level (0-2 years)", "Mid Level (3-5 years)",
                         "Senior (6-10 years)", "Executive (10+ years)"],
                value="Mid Level (3-5 years)", label="Experience Level"
            )
            color_scheme = gr.Dropdown(
                choices=["Navy Blue", "Forest Green", "Burgundy", "Charcoal", "Royal Purple"],
                value="Navy Blue", label="PDF Color Scheme"
            )
        with gr.Column(scale=1):
            job_description = gr.Textbox(
                label="π Paste Job Description",
                placeholder="Paste the complete job description here...",
                lines=12
            )
    analyze_btn = gr.Button("π Analyze & Optimize with AI", variant="primary", size="lg")
    gr.Markdown("---")
    scores_output = gr.Markdown()
    with gr.Row():
        keywords_output = gr.Markdown()
        suggestions_output = gr.Markdown()
    gr.Markdown("### π Resume Comparison")
    with gr.Row():
        original_resume = gr.Textbox(label="Original Resume", lines=15, interactive=False)
        optimized_resume = gr.Textbox(label="AI-Optimized Resume", lines=15, interactive=False)
    # FIX: the following two headings contained U+FFFD replacement characters
    # (mojibake from a lost emoji); restored readable emoji.
    gr.Markdown("### πΌοΈ Resume Preview")
    gr.Markdown("*Preview how your optimized resume will look in the PDF*")
    preview_html = gr.HTML(label="Resume Preview")
    gr.Markdown("### π₯ Download Optimized PDF")
    pdf_output = gr.File(label="Download PDF")
    gr.Markdown("""
---
### βΉοΈ How It Works
1. **AI Analysis**: Claude 3.5 Sonnet analyzes your resume against the job description
2. **Professional Formatting**: Your resume is reformatted to ATS-optimized industry standards
3. **Smart Optimization**: Keywords are naturally integrated into your existing content
**What we DO:** β Professional reformatting | β Add relevant keywords naturally | β Strengthen action verbs | β ATS-optimized structure
**What we DON'T do:** β Add fake experiences | β Fabricate achievements | β Misrepresent your background
""")
    # Wire the button to the pipeline; output order must match the 7-tuple
    # returned by analyze_and_optimize.
    analyze_btn.click(
        fn=analyze_and_optimize,
        inputs=[resume_file, job_description, industry, experience_level, color_scheme],
        outputs=[scores_output, keywords_output, suggestions_output, original_resume, optimized_resume, pdf_output, preview_html]
    )

if __name__ == "__main__":
    # Bind to 0.0.0.0 so the app is reachable inside the Spaces/Docker container.
    demo.launch(server_name="0.0.0.0", server_port=7860)