# NOTE: stray commit metadata (author, message, hash) pasted in from a Git/GitHub
# view — commented out so the module remains valid Python.
# Salim Shaikh
# Fix: max_tokens=8192 (Haiku limit)
# 34c82cf
"""
ATS Resume Optimizer - Powered by Claude 3.5 Sonnet (Anthropic SOTA)
Optimizes CV/Resume for ATS platforms and generates professional PDF
"""
import gradio as gr
import re
import os
import json
from typing import Dict, List, Tuple
from collections import Counter
import tempfile
# API keys are read from environment variables ONLY — never hardcode secrets.
# SECURITY FIX: a live Anthropic API key was previously committed here as the
# fallback default; that key must be considered compromised and rotated/revoked.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# OpenAI API Key (fallback)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# ============== RESUME PARSER (GENERALIZED) ==============
def parse_resume(file) -> str:
    """Extract plain text from an uploaded resume file.

    Accepts a Gradio upload object (with a ``.name`` path) or a plain path.
    Dispatches on the file extension to the PDF / DOCX / TXT reader and
    returns either the extracted text or a human-readable error string.
    """
    if file is None:
        return ""
    # Gradio uploads expose the temp-file path via .name; fall back to str()
    file_path = getattr(file, 'name', str(file))
    file_ext = file_path.lower().split('.')[-1]
    try:
        if file_ext == 'pdf':
            return _parse_pdf(file_path)
        if file_ext in ('docx', 'doc'):
            return _parse_docx(file_path)
        if file_ext == 'txt':
            with open(file_path, 'r', encoding='utf-8') as handle:
                return handle.read()
        return f"Unsupported format: {file_ext}"
    except Exception as e:
        return f"Error parsing file: {str(e)}"
def _parse_pdf(file_path: str) -> str:
    """Parse PDF with proper line preservation.

    Tries extraction backends in quality order — PyMuPDF, then pdfplumber,
    then PyPDF2 — and returns the first non-empty result after cleaning.
    On total failure returns an "Error: ..." string instead of raising,
    matching the error-string convention used by parse_resume().
    """
    # Try PyMuPDF first (best line preservation)
    text = ""
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(file_path)
        try:
            for page in doc:
                page_text = page.get_text()
                if page_text:
                    text += page_text + "\n"
        finally:
            doc.close()  # release the file handle even if extraction fails mid-page
        if text.strip():
            return _clean_resume_text(text)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate
        pass
    # Fallback to pdfplumber; reset the buffer so a partial PyMuPDF read
    # cannot leak duplicated text into this backend's output
    text = ""
    try:
        import pdfplumber
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return _clean_resume_text(text)
    except Exception:
        pass
    # Final fallback to PyPDF2 (returns cleaned text even if empty, as before)
    text = ""
    try:
        import PyPDF2
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return _clean_resume_text(text)
    except Exception as e:
        return f"Error: {str(e)}"
def _parse_docx(file_path: str) -> str:
    """Extract text from a DOCX file, one output line per paragraph."""
    try:
        from docx import Document
        paragraphs = Document(file_path).paragraphs
        raw_text = "\n".join(p.text for p in paragraphs)
        return _clean_resume_text(raw_text)
    except Exception as e:
        return f"Error: {str(e)}"
def _clean_resume_text(text: str) -> str:
    """Clean and merge fragmented PDF text while preserving name on first line.

    PDF extractors often split one logical resume line across several physical
    lines. This pass:
      1. normalizes bullet glyphs and ``(cid:N)`` font artifacts to 'β€’';
      2. re-merges fragments: a new output line starts only at a recognized
         section header, a bullet, a company/date-looking line, or one of the
         first four short non-empty lines (typically name / title / contact /
         links); any other line is appended to the current output line.
    """
    # Fix common PDF encoding issues FIRST
    text = text.replace('(cid:127)', 'β€’')
    text = text.replace('(cid:128)', 'β€’')
    text = text.replace('●', 'β€’')
    text = text.replace('β—‹', 'β€’')
    text = text.replace('β–ͺ', 'β€’')
    text = text.replace('β– ', 'β€’')
    text = text.replace('β—¦', 'β€’')
    text = text.replace('\uf0b7', 'β€’')  # Unicode bullet
    text = text.replace('\u2022', 'β€’')  # Unicode bullet
    # NOTE(review): likely a no-op — the listed glyphs were already replaced
    # above, so the lambda always returns the match unchanged; kept for safety.
    text = re.sub(r'[^\x00-\x7F]+', lambda m: 'β€’' if m.group() in ['●', 'β—‹', 'β–ͺ', 'β– '] else m.group(), text)
    lines = text.split('\n')
    # Uppercase prefixes that mark the start of a resume section
    section_headers = [
        'PROFESSIONAL SUMMARY', 'SUMMARY', 'OBJECTIVE', 'PROFILE', 'ABOUT',
        'PROFESSIONAL EXPERIENCE', 'EXPERIENCE', 'EMPLOYMENT', 'WORK HISTORY', 'CAREER',
        'EDUCATION', 'ACADEMIC', 'QUALIFICATIONS',
        'SKILLS', 'TECHNICAL SKILLS', 'CORE COMPETENCIES', 'COMPETENCIES', 'EXPERTISE', 'TECHNOLOGIES',
        'CERTIFICATIONS', 'CERTIFICATES', 'LICENSES',
        'PROJECTS', 'PORTFOLIO', 'ACHIEVEMENTS', 'AWARDS',
        'PUBLICATIONS', 'RESEARCH', 'VOLUNTEER', 'INTERESTS', 'LEADERSHIP', 'COMMUNITY', 'COMPETITIVE'
    ]
    merged_lines = []
    current_line = ""
    line_count = 0  # Counts non-empty lines only; used for the name/contact heuristic
    for line in lines:
        # Collapse internal whitespace before classifying the line
        line = re.sub(r'\s+', ' ', line).strip()
        # Clean any remaining cid patterns
        line = re.sub(r'\(cid:\d+\)', 'β€’', line)
        if not line:
            continue
        line_count += 1
        line_upper = line.upper().strip()
        is_header = any(line_upper.startswith(h) or line_upper == h for h in section_headers)
        is_bullet = line.startswith('β€’') or line.startswith('*') or line.startswith('-')
        # Heuristic: ALL-CAPS run followed by a separator and a year or "City, ST"
        is_company = bool(re.match(r'^[A-Z][A-Z\s&\.,]+(\s*[\|–-]\s*|\s+)(.*\d{4}|[A-Z][a-z]+,?\s+[A-Z]{2})', line))
        # First few lines (1-4) are typically: Name, Title, Contact, Links - keep them separate
        is_header_line = line_count <= 4 and len(line) < 100
        starts_new = is_header or is_bullet or is_company or is_header_line
        if starts_new:
            if current_line:
                merged_lines.append(current_line)
            # '*' bullets are rewritten to 'β€’ ' here; '-' bullets pass through as-is
            current_line = line if not line.startswith('*') else 'β€’ ' + line[1:].strip()
        elif current_line:
            # Continuation fragment: glue onto the line being built
            current_line += ' ' + line
        else:
            current_line = line
    if current_line:
        merged_lines.append(current_line)
    return '\n'.join(merged_lines)
# ============== RESUME POST-PROCESSOR ==============
def post_process_resume_format(resume_text: str) -> str:
"""Clean up and enforce consistent professional formatting on resume output."""
lines = resume_text.split('\n')
processed_lines = []
# Section headers that should be standardized
section_keywords = {
'professional summary': 'PROFESSIONAL SUMMARY',
'summary': 'PROFESSIONAL SUMMARY',
'profile': 'PROFESSIONAL SUMMARY',
'objective': 'PROFESSIONAL SUMMARY',
'professional experience': 'PROFESSIONAL EXPERIENCE',
'experience': 'PROFESSIONAL EXPERIENCE',
'work experience': 'PROFESSIONAL EXPERIENCE',
'employment': 'PROFESSIONAL EXPERIENCE',
'work history': 'PROFESSIONAL EXPERIENCE',
'education': 'EDUCATION',
'academic background': 'EDUCATION',
'skills': 'SKILLS',
'technical skills': 'TECHNICAL SKILLS',
'core competencies': 'CORE COMPETENCIES',
'key skills': 'KEY SKILLS',
'certifications': 'CERTIFICATIONS',
'certificates': 'CERTIFICATIONS',
'projects': 'PROJECTS',
'achievements': 'ACHIEVEMENTS',
'awards': 'AWARDS & RECOGNITION',
'publications': 'PUBLICATIONS',
'languages': 'LANGUAGES',
}
for i, line in enumerate(lines):
stripped = line.strip()
# Skip empty lines but preserve them for spacing
if not stripped:
processed_lines.append('')
continue
# Skip metadata lines
if stripped.startswith('__CANDIDATE_NAME__'):
processed_lines.append(stripped)
continue
# Standardize section headers
stripped_lower = stripped.lower().replace('=', '').replace('-', '').strip()
if stripped_lower in section_keywords:
# Add separator before section header
if processed_lines and processed_lines[-1] != '':
processed_lines.append('')
processed_lines.append('=' * 80)
processed_lines.append(section_keywords[stripped_lower])
processed_lines.append('=' * 80)
continue
# Check for === section dividers (keep them clean)
if stripped.startswith('=') and stripped.endswith('=') and len(stripped) > 10:
processed_lines.append('━' * 80)
continue
# Check for ━━━ Unicode dividers (keep them)
if stripped.startswith('━') and len(stripped) > 10:
processed_lines.append('━' * 80)
continue
# Check for --- dividers
if stripped.startswith('-') and len(stripped) > 10 and stripped.replace('-', '') == '':
processed_lines.append('━' * 80)
continue
# Standardize bullet points
if stripped.startswith(('-', '*', '>', '–', 'β€”')) and len(stripped) > 2:
bullet_text = stripped.lstrip('-*>–— ')
processed_lines.append(f'β€’ {bullet_text}')
continue
# Ensure bullet points have proper spacing
if stripped.startswith('β€’'):
if not stripped.startswith('β€’ '):
stripped = 'β€’ ' + stripped[1:].lstrip()
processed_lines.append(stripped)
continue
# Handle company/date lines - ensure proper formatting
if '|' in stripped and any(year in stripped for year in ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025', 'Present']):
processed_lines.append(stripped)
continue
# Pass through other lines
processed_lines.append(stripped)
result = '\n'.join(processed_lines)
# Clean up excessive blank lines
result = re.sub(r'\n{4,}', '\n\n\n', result)
return result
# ============== CLAUDE 3.5 SONNET OPTIMIZER ==============
def optimize_with_llm(resume_text: str, job_description: str) -> Tuple[str, List[str]]:
    """Use Claude (Anthropic) to intelligently optimize and format a resume.

    Returns a tuple ``(optimized_resume, suggestions)``.

    Safety rails:
      - rejects inputs too small to optimize (we rewrite, never fabricate)
      - falls back to enhanced_optimize() if the model truncates the resume
        or drops original dates/metrics (hallucination guard)
      - any API / JSON-parsing failure also degrades to enhanced_optimize()
    """
    # VALIDATION: Reject empty or minimal resumes - we optimize, not fabricate
    resume_stripped = resume_text.strip()
    word_count = len(resume_stripped.split())
    if len(resume_stripped) < 100 or word_count < 20:
        return resume_text, [
            "❌ ERROR: Resume is too short to optimize.",
            f" Your resume has only {len(resume_stripped)} characters and {word_count} words.",
            " Please provide a complete resume with at least:",
            " - Contact information",
            " - Work experience (with dates)",
            " - Skills section",
            " - Education",
            " Minimum required: 100+ characters, 20+ words"
        ]
    if not job_description.strip() or len(job_description.strip()) < 30:
        return resume_text, [
            "❌ ERROR: Job description is too short.",
            " Please provide a job description with at least 30 characters."
        ]
    try:
        import anthropic
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        # Pre-extract important keywords from JD for the AI
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'have', 'we', 'you', 'they', 'this', 'that', 'will', 'would', 'should', 'can', 'may', 'our', 'your', 'their', 'work', 'working', 'role', 'job', 'candidate', 'looking', 'experience', 'years', 'ability', 'team', 'including', 'across', 'within'}
        jd_words = re.findall(r'\b[a-zA-Z]{3,}\b', job_description.lower())
        keyword_counts = Counter([w for w in jd_words if w not in stop_words])
        top_jd_keywords = [w for w, c in keyword_counts.most_common(25) if c >= 2]
        prompt = f"""Transform this resume into a clean, professionally formatted document optimized for ATS systems.
## STRICT OUTPUT TEMPLATE (follow EXACTLY)
```
JOHN SMITH
Senior Software Engineer
john.smith@email.com | (555) 123-4567 | New York, NY | linkedin.com/in/johnsmith
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
PROFESSIONAL SUMMARY
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Results-driven professional with X+ years of experience in [field]. Proven track record of [key achievement]. Expert in [key skills from job description].
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
PROFESSIONAL EXPERIENCE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
COMPANY NAME | City, State
Job Title | Jan 2020 – Present
β€’ Spearheaded [initiative] resulting in [quantified outcome]
β€’ Developed [solution] that improved [metric] by X%
β€’ Led cross-functional team of X to deliver [project]
PREVIOUS COMPANY | City, State
Previous Title | Jan 2018 – Dec 2019
β€’ Managed [responsibility] serving X+ clients
β€’ Achieved [result] through [action]
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EDUCATION
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Master of Science in Computer Science | Stanford University | 2018
Bachelor of Science in Computer Science | UC Berkeley | 2016
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
TECHNICAL SKILLS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Languages: Python, JavaScript, Java, SQL
Frameworks: React, Node.js, Django, FastAPI
Cloud/DevOps: AWS, Docker, Kubernetes, CI/CD
Tools: Git, Jira, Tableau, Terraform
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
CERTIFICATIONS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
β€’ AWS Solutions Architect Professional – Amazon Web Services | 2023
β€’ PMP Project Management Professional – PMI | 2022
```
## RULES (CRITICAL):
1. Use EXACTLY the formatting above with ━━━ horizontal lines between sections
2. Name on line 1 (FIRST LAST only, no titles), professional title on line 2
3. Contact info on line 3 with | separators
4. Every bullet starts with β€’ and a STRONG ACTION VERB (Spearheaded, Architected, Delivered, etc.)
5. PRESERVE ALL original facts (dates, numbers, companies, degrees) EXACTLY
6. DO NOT fabricate achievements, certifications, or experiences
7. Integrate these keywords naturally: {', '.join(top_jd_keywords[:15])}
## JOB DESCRIPTION:
{job_description[:2000]}
## ORIGINAL RESUME TO TRANSFORM:
{resume_text}
## OUTPUT (JSON only):
Return ONLY valid JSON with no markdown code blocks:
{{"candidate_name": "THE PERSON'S FULL NAME (e.g. John Smith, NOT a company name)", "optimized_resume": "THE COMPLETE FORMATTED RESUME", "changes_made": ["change1", "change2"], "keywords_added": ["kw1", "kw2"]}}
IMPORTANT: candidate_name must be the PERSON's name (like 'Salim Shaikh'), NOT a company name (like 'JP Morgan')."""
        response = client.messages.create(
            model="claude-3-5-haiku-latest",
            max_tokens=8192,  # Haiku's output limit
            temperature=0.1,
            messages=[
                {"role": "user", "content": prompt}
            ],
            system="You are an expert executive resume writer. Transform resumes into beautifully formatted, ATS-optimized documents. Output ONLY valid JSON - no markdown code blocks. Follow the template EXACTLY with ━━━ dividers. Every bullet must start with β€’ and an action verb. Preserve all facts."
        )
        result_text = response.content[0].text
        # Strip any markdown fencing the model added despite instructions
        if "```json" in result_text:
            result_text = result_text.split("```json")[1].split("```")[0]
        elif "```" in result_text:
            result_text = result_text.split("```")[1].split("```")[0]
        # If the reply still isn't bare JSON, pull out the first {...} object
        if not result_text.strip().startswith('{'):
            json_match = re.search(r'\{[\s\S]*\}', result_text)
            if json_match:
                result_text = json_match.group()
        result = json.loads(result_text.strip())
        optimized = result.get("optimized_resume", resume_text)
        candidate_name_from_ai = result.get("candidate_name", "")
        changes = result.get("changes_made", [])
        keywords = result.get("keywords_added", [])
        # Store candidate name for PDF generation (consumed downstream)
        if candidate_name_from_ai:
            optimized = f"__CANDIDATE_NAME__:{candidate_name_from_ai}\n" + optimized
        # POST-PROCESS: Clean up and enforce consistent formatting
        optimized = post_process_resume_format(optimized)
        # VALIDATION: If optimized is significantly shorter, AI truncated it - use fallback
        # Threshold 0.6 since AI can legitimately condense verbose resumes
        if len(optimized) < len(resume_text) * 0.6:
            print(f"Warning: AI truncated resume ({len(optimized)} < {len(resume_text) * 0.6}). Using enhanced fallback.")
            optimized = enhanced_optimize(resume_text, job_description)
            changes = ["⚠️ AI response was truncated. Applied keyword-based optimization instead."]
            keywords = []
        # HALLUCINATION CHECK: verify original dates survived.
        # BUGFIX: must be a NON-capturing group — with r'(19|20)\d{2}',
        # re.findall returns only the '19'/'20' century prefixes, so the
        # subset check compared centuries instead of full years.
        original_dates = set(re.findall(r'\b(?:19|20)\d{2}\b', resume_text))
        optimized_dates = set(re.findall(r'\b(?:19|20)\d{2}\b', optimized))
        if not original_dates.issubset(optimized_dates):
            missing_dates = original_dates - optimized_dates
            print(f"Warning: AI removed dates: {missing_dates}. Using fallback.")
            optimized = enhanced_optimize(resume_text, job_description)
            changes = ["⚠️ AI modified dates. Applied safe keyword optimization instead."]
            keywords = []  # discard AI keyword claims along with its output
        # Check if original numbers/percentages are preserved
        original_metrics = set(re.findall(r'\d+(?:\.\d+)?%|\$[\d,]+|\d+\+', resume_text))
        optimized_metrics = set(re.findall(r'\d+(?:\.\d+)?%|\$[\d,]+|\d+\+', optimized))
        if original_metrics and not original_metrics.issubset(optimized_metrics):
            missing_metrics = original_metrics - optimized_metrics
            if len(missing_metrics) > 2:  # Allow minor losses
                print(f"Warning: AI removed metrics: {missing_metrics}. Using fallback.")
                optimized = enhanced_optimize(resume_text, job_description)
                changes = ["⚠️ AI modified metrics. Applied safe keyword optimization instead."]
                keywords = []  # discard AI keyword claims along with its output
        suggestions = [f"βœ… {change}" for change in changes[:5]]
        if keywords:
            suggestions.append(f"πŸ”‘ Keywords added: {', '.join(keywords[:10])}")
        return optimized, suggestions
    except Exception as e:
        print(f"Claude API Error: {str(e)}")
        return enhanced_optimize(resume_text, job_description), [f"⚠️ AI unavailable: {str(e)[:50]}. Using keyword optimization."]
def enhanced_optimize(resume_text: str, job_description: str) -> str:
    """Deterministic fallback optimization: formats the resume professionally.

    LLM-free path used when the Claude call fails or its output fails
    validation. It never fabricates content: it only strengthens weak verbs,
    standardizes bullets / section headers / month names, and tidies spacing.

    ``job_description`` is accepted for interface compatibility with
    optimize_with_llm(); no JD keywords are injected (a previous version
    extracted JD keywords and scanned the skills section but never used the
    results — that dead code has been removed; we deliberately refuse to add
    skills the resume does not already contain).
    """
    optimized = resume_text
    # Strong action verb replacements (case-insensitive, whole-word)
    replacements = {
        r'\bworked on\b': 'developed',
        r'\bhelped\b': 'contributed to',
        r'\bwas responsible for\b': 'managed',
        r'\bhandled\b': 'orchestrated',
        r'\bused\b': 'leveraged',
        r'\bworked with\b': 'collaborated with',
        r'\bmade\b': 'engineered',
        r'\bdid\b': 'executed',
        r'\bran\b': 'spearheaded',
        r'\bbuilt\b': 'architected and built',
    }
    for pattern, replacement in replacements.items():
        optimized = re.sub(pattern, replacement, optimized, flags=re.IGNORECASE)
    # ===== PROFESSIONAL FORMATTING =====
    # Standardize bullets to professional format
    optimized = re.sub(r'^\s*[\*\-\>]\s*', 'β€’ ', optimized, flags=re.MULTILINE)
    # Standardize section headers to uppercase (full-line matches only, so
    # the word 'experience' inside a sentence is left alone)
    section_headers = ['experience', 'education', 'skills', 'summary', 'objective',
                       'certifications', 'projects', 'awards', 'publications',
                       'professional experience', 'work experience', 'technical skills',
                       'core competencies', 'professional summary', 'career objective']
    for header in section_headers:
        # Match header at start of line with optional colon
        pattern = rf'^({header})\s*:?\s*$'
        optimized = re.sub(pattern, header.upper(), optimized, flags=re.IGNORECASE | re.MULTILINE)
    # Ensure proper spacing between sections (add blank line before headers)
    for header in ['EXPERIENCE', 'EDUCATION', 'SKILLS', 'CERTIFICATIONS', 'PROJECTS',
                   'AWARDS', 'PUBLICATIONS', 'PROFESSIONAL EXPERIENCE', 'WORK EXPERIENCE',
                   'TECHNICAL SKILLS', 'CORE COMPETENCIES', 'PROFESSIONAL SUMMARY']:
        optimized = re.sub(rf'([^\n])\n({header})', r'\1\n\n\2', optimized)
    # Clean up multiple blank lines to max 2
    optimized = re.sub(r'\n{3,}', '\n\n', optimized)
    # Standardize date formats (Mon YYYY or YYYY)
    # NOTE(review): case-insensitive whole-word replacement also rewrites the
    # ordinary words "may"/"march"/"august" (e.g. "may lead" -> "May lead") —
    # confirm this is acceptable before tightening the pattern.
    month_map = {
        'january': 'Jan', 'february': 'Feb', 'march': 'Mar', 'april': 'Apr',
        'may': 'May', 'june': 'Jun', 'july': 'Jul', 'august': 'Aug',
        'september': 'Sep', 'october': 'Oct', 'november': 'Nov', 'december': 'Dec'
    }
    for full, abbr in month_map.items():
        optimized = re.sub(rf'\b{full}\b', abbr, optimized, flags=re.IGNORECASE)
    return optimized
def basic_optimize(resume_text: str, job_description: str) -> str:
    """Fallback basic optimization without LLM.

    Thin backward-compatibility alias: delegates directly to
    enhanced_optimize(), which performs the keyword/formatting-based rewrite.
    """
    return enhanced_optimize(resume_text, job_description)
# ============== ATS COMPATIBILITY ANALYZER (ADVANCED) ==============
class ATSCompatibilityAnalyzer:
"""
Advanced ATS scoring using multiple techniques:
- TF-IDF weighted keyword matching
- Stemming for word variations
- Fuzzy matching for abbreviations
- Experience years parsing
- Skills taxonomy mapping
- Education & job title matching
"""
def __init__(self):
# Weights based on Jobscan research:
# - 76.4% recruiters filter by SKILLS (most important)
# - 59.7% filter by EDUCATION
# - 55.3% filter by JOB TITLE
# - 50.6% filter by CERTIFICATIONS
# - 44% filter by YEARS OF EXPERIENCE
self.weights = {
'keyword_match': 0.30, # Primary: keyword matching (highest priority)
'skills_match': 0.25, # Skills matching (76.4% of recruiters)
'semantic_match': 0.15, # Job title/role matching (55.3%)
'experience_match': 0.12, # Experience years (44%)
'format_score': 0.08, # ATS-friendly formatting
'section_score': 0.05, # Standard sections present
'action_verbs': 0.03, # Impact-oriented language
'quantification': 0.02 # Measurable achievements
}
self.action_verbs = [
'achieved', 'administered', 'analyzed', 'architected', 'automated',
'built', 'collaborated', 'conducted', 'created', 'delivered', 'designed',
'developed', 'directed', 'drove', 'engineered', 'established',
'executed', 'generated', 'implemented', 'improved', 'increased',
'launched', 'led', 'managed', 'optimized', 'orchestrated',
'reduced', 'resolved', 'spearheaded', 'streamlined', 'transformed',
'accelerated', 'consolidated', 'converted', 'customized', 'decreased',
'enhanced', 'exceeded', 'expanded', 'facilitated',
'formulated', 'founded', 'identified', 'initiated', 'innovated',
'integrated', 'leveraged', 'maximized', 'mentored', 'modernized',
'negotiated', 'outperformed', 'pioneered', 'produced', 'programmed',
'proposed', 'redesigned', 'revamped', 'scaled', 'standardized',
# Additional common verbs
'supported', 'trained', 'utilized', 'validated', 'verified',
'wrote', 'maintained', 'monitored', 'performed', 'presented',
'processed', 'provided', 'published', 'recommended', 'researched',
'reviewed', 'supervised', 'tested', 'tracked', 'updated',
'coordinated', 'defined', 'demonstrated', 'documented', 'ensured',
'evaluated', 'examined', 'extracted', 'gathered', 'guided',
'handled', 'influenced', 'instructed', 'interpreted', 'investigated',
'modeled', 'organized', 'oversaw', 'prepared', 'prioritized',
# More verbs found missing in tests
'advised', 'allocated', 'appointed', 'approved', 'assigned',
'assisted', 'attained', 'authored', 'calculated', 'captured',
'chaired', 'clarified', 'coached', 'collected', 'communicated',
'compiled', 'completed', 'composed', 'computed', 'conceptualized',
'conserved', 'constructed', 'consulted', 'contracted', 'controlled',
'convinced', 'cultivated', 'delegated', 'deployed', 'devised',
'diagnosed', 'discovered', 'dispatched', 'earned', 'edited',
'educated', 'enabled', 'encouraged', 'enforced', 'enlisted',
'equipped', 'estimated', 'examined', 'expedited', 'fabricated',
'finalized', 'forecasted', 'fulfilled', 'gained', 'hired',
'hosted', 'illustrated', 'improved', 'incorporated', 'increased',
'inspected', 'installed', 'instituted', 'introduced', 'invented',
'issued', 'lectured', 'licensed', 'logged', 'marketed',
'mediated', 'merged', 'motivated', 'navigated', 'obtained',
'operated', 'ordered', 'originated', 'partnered', 'planned',
'predicted', 'prescribed', 'prevented', 'promoted', 'protected',
'purchased', 'qualified', 'raised', 'ranked', 'rated',
'realized', 'received', 'recognized', 'recruited', 'rectified',
'referred', 'regulated', 'rehabilitated', 'reinforced', 'rendered',
'reorganized', 'repaired', 'replaced', 'reported', 'represented',
'rescued', 'restored', 'restructured', 'retrieved', 'safeguarded',
'saved', 'screened', 'secured', 'selected', 'served',
'shaped', 'simplified', 'solved', 'sorted', 'specified',
'sponsored', 'staffed', 'steered', 'strengthened', 'structured',
'studied', 'submitted', 'succeeded', 'summarized', 'superseded',
'supervised', 'surpassed', 'surveyed', 'sustained', 'targeted',
'taught', 'terminated', 'traded', 'transcribed', 'transferred',
'translated', 'tripled', 'troubleshot', 'tutored', 'uncovered',
'unified', 'upgraded', 'validated', 'valued', 'visualized',
'widened', 'won', 'worked', 'wrote',
# Additional common verbs from test failures
'closed', 'grew', 'covered', 'published', 'filled', 'supported',
'provided', 'trained', 'responded', 'triaged', 'maintained',
'advised', 'drafted', 'reviewed', 'researched', 'processed',
'migrated', 'architected', 'scaled', 'resolved', 'tested',
]
# Skills taxonomy - maps related terms (COMPREHENSIVE FOR ALL DOMAINS)
self.skills_taxonomy = {
# ============== TECHNOLOGY / SOFTWARE ==============
'python': ['python', 'py', 'python3', 'python2'],
'java': ['java', 'java8', 'java11', 'jvm', 'spring boot', 'spring'],
'javascript': ['javascript', 'js', 'node.js', 'nodejs', 'react', 'angular', 'vue', 'typescript'],
'sql': ['sql', 'mysql', 'postgresql', 'postgres', 'sql server', 'tsql', 'plsql', 'oracle'],
'api': ['api', 'apis', 'rest api', 'restful', 'rest', 'graphql', 'soap'],
'agile': ['agile', 'scrum', 'kanban', 'sprint', 'jira', 'waterfall'],
'ci/cd': ['ci/cd', 'cicd', 'ci cd', 'continuous integration', 'continuous deployment', 'jenkins', 'github actions'],
'git': ['git', 'github', 'gitlab', 'bitbucket', 'version control', 'svn'],
'cloud': ['cloud', 'cloud computing', 'cloud services', 'saas', 'paas', 'iaas'],
'aws': ['aws', 'amazon web services', 'amazon cloud', 'ec2', 's3', 'lambda'],
'azure': ['azure', 'microsoft azure', 'azure ml', 'azure cloud'],
'gcp': ['gcp', 'google cloud', 'google cloud platform', 'bigquery'],
'docker': ['docker', 'containerization', 'containers', 'dockerfile'],
'kubernetes': ['kubernetes', 'k8s', 'kube', 'container orchestration'],
'devops': ['devops', 'dev ops', 'sre', 'site reliability'],
'linux': ['linux', 'unix', 'ubuntu', 'centos', 'redhat', 'bash', 'shell'],
'networking': ['networking', 'tcp/ip', 'dns', 'vpn', 'firewall', 'load balancer'],
'security': ['security', 'cybersecurity', 'infosec', 'penetration testing', 'vulnerability'],
# ============== AI / ML / DATA SCIENCE ==============
'machine learning': ['machine learning', 'ml', 'machine-learning', 'machinelearning'],
'deep learning': ['deep learning', 'dl', 'deep-learning', 'neural networks', 'neural nets'],
'artificial intelligence': ['artificial intelligence', 'ai', 'a.i.', 'a.i'],
'natural language processing': ['natural language processing', 'nlp', 'text mining', 'text analytics'],
'data science': ['data science', 'data scientist', 'datascience', 'ds'],
'tensorflow': ['tensorflow', 'tf', 'tensor flow', 'keras'],
'pytorch': ['pytorch', 'torch', 'py torch'],
'llm': ['llm', 'large language model', 'large language models', 'llms', 'chatgpt', 'gpt'],
'generative ai': ['generative ai', 'genai', 'gen ai', 'gen-ai'],
'computer vision': ['computer vision', 'cv', 'image recognition', 'object detection'],
'langchain': ['langchain', 'lang chain', 'langgraph'],
'rag': ['rag', 'retrieval augmented generation', 'similarity search'],
'embeddings': ['embedding', 'embeddings', 'vector embeddings', 'word embeddings'],
'mlops': ['mlops', 'ml ops', 'machine learning operations'],
'spark': ['spark', 'pyspark', 'apache spark', 'spark sql'],
'hadoop': ['hadoop', 'hdfs', 'mapreduce', 'hive'],
# ============== DATA / ANALYTICS ==============
'tableau': ['tableau', 'tableau desktop', 'tableau server'],
'power bi': ['power bi', 'powerbi', 'power-bi', 'pbi'],
'excel': ['excel', 'ms excel', 'microsoft excel', 'spreadsheet', 'vlookup', 'pivot table'],
'data analysis': ['data analysis', 'data analytics', 'analytics', 'analytical'],
'statistics': ['statistics', 'statistical', 'statistical analysis', 'regression', 'hypothesis'],
'visualization': ['visualization', 'data visualization', 'dashboards', 'reporting'],
'etl': ['etl', 'extract transform load', 'data pipeline', 'data integration'],
'business intelligence': ['business intelligence', 'bi', 'reporting', 'insights'],
'forecasting': ['forecasting', 'prediction', 'predictive', 'time series'],
# ============== FINANCE / ACCOUNTING ==============
'financial analysis': ['financial analysis', 'financial modeling', 'financial planning', 'fp&a'],
'accounting': ['accounting', 'accountant', 'bookkeeping', 'ledger'],
'gaap': ['gaap', 'generally accepted accounting principles', 'ifrs'],
'budgeting': ['budgeting', 'budget', 'forecasting', 'variance analysis'],
'auditing': ['auditing', 'audit', 'internal audit', 'external audit', 'sox'],
'tax': ['tax', 'taxation', 'tax planning', 'tax compliance', 'tax return'],
'cpa': ['cpa', 'certified public accountant', 'cma', 'cfa'],
'investment': ['investment', 'investing', 'portfolio', 'asset management'],
'banking': ['banking', 'bank', 'commercial banking', 'retail banking'],
'risk management': ['risk management', 'risk', 'risk assessment', 'credit risk', 'market risk'],
'compliance': ['compliance', 'regulatory', 'regulations', 'regulatory compliance'],
'valuation': ['valuation', 'dcf', 'discounted cash flow', 'comparable analysis'],
'mergers': ['mergers', 'm&a', 'acquisitions', 'merger', 'due diligence'],
'bloomberg': ['bloomberg', 'bloomberg terminal', 'reuters', 'factset'],
'quickbooks': ['quickbooks', 'quick books', 'sage', 'xero', 'netsuite'],
# ============== MARKETING / SALES ==============
'marketing': ['marketing', 'digital marketing', 'marketing strategy', 'brand'],
'seo': ['seo', 'search engine optimization', 'organic search', 'keywords'],
'sem': ['sem', 'search engine marketing', 'ppc', 'pay per click', 'google ads'],
'social media': ['social media', 'social media marketing', 'facebook', 'instagram', 'linkedin', 'twitter'],
'content marketing': ['content marketing', 'content strategy', 'content creation', 'copywriting'],
'email marketing': ['email marketing', 'email campaigns', 'mailchimp', 'hubspot email'],
'crm': ['crm', 'customer relationship management', 'salesforce', 'hubspot', 'zoho'],
'salesforce': ['salesforce', 'sfdc', 'salesforce crm', 'salesforce admin'],
'hubspot': ['hubspot', 'hub spot', 'hubspot crm', 'hubspot marketing'],
'lead generation': ['lead generation', 'leads', 'prospecting', 'pipeline'],
'sales': ['sales', 'selling', 'revenue', 'quota', 'target'],
'b2b': ['b2b', 'business to business', 'enterprise sales', 'corporate sales'],
'b2c': ['b2c', 'business to consumer', 'retail', 'consumer'],
'account management': ['account management', 'account manager', 'client management', 'customer success'],
'market research': ['market research', 'competitive analysis', 'market analysis'],
'branding': ['branding', 'brand management', 'brand strategy', 'brand identity'],
'advertising': ['advertising', 'ads', 'ad campaigns', 'media buying'],
'google analytics': ['google analytics', 'ga', 'analytics', 'web analytics'],
'conversion': ['conversion', 'conversion rate', 'cro', 'conversion optimization'],
# ============== HEALTHCARE / MEDICAL ==============
'healthcare': ['healthcare', 'health care', 'medical', 'clinical'],
'hipaa': ['hipaa', 'hipaa compliance', 'patient privacy', 'phi'],
'emr': ['emr', 'ehr', 'electronic medical records', 'electronic health records', 'epic', 'cerner'],
'patient care': ['patient care', 'patient', 'patients', 'bedside', 'clinical care'],
'nursing': ['nursing', 'nurse', 'rn', 'lpn', 'np', 'nurse practitioner'],
'medical coding': ['medical coding', 'icd-10', 'cpt', 'medical billing', 'coding'],
'pharmacy': ['pharmacy', 'pharmacist', 'pharmaceutical', 'medications', 'drugs'],
'clinical trials': ['clinical trials', 'clinical research', 'research', 'fda'],
'diagnosis': ['diagnosis', 'diagnostic', 'treatment', 'prognosis'],
'public health': ['public health', 'epidemiology', 'population health'],
'mental health': ['mental health', 'behavioral health', 'psychology', 'psychiatry'],
'telehealth': ['telehealth', 'telemedicine', 'virtual care', 'remote care'],
# ============== HUMAN RESOURCES ==============
'recruitment': ['recruitment', 'recruiting', 'talent acquisition', 'hiring', 'sourcing'],
'onboarding': ['onboarding', 'orientation', 'new hire', 'induction'],
'hris': ['hris', 'hcm', 'workday', 'adp', 'peoplesoft', 'successfactors'],
'payroll': ['payroll', 'compensation', 'benefits', 'salary'],
'performance management': ['performance management', 'performance review', 'appraisal', 'feedback'],
'employee relations': ['employee relations', 'labor relations', 'er', 'workplace'],
'training': ['training', 'learning and development', 'l&d', 'development'],
'benefits administration': ['benefits administration', 'benefits', 'health insurance', '401k'],
'hr compliance': ['hr compliance', 'labor law', 'employment law', 'eeoc', 'fmla'],
'shrm': ['shrm', 'phr', 'sphr', 'hr certification'],
'employee engagement': ['employee engagement', 'engagement', 'culture', 'retention'],
'diversity': ['diversity', 'dei', 'inclusion', 'equity', 'd&i'],
# ============== LEGAL ==============
'legal': ['legal', 'law', 'attorney', 'lawyer', 'counsel'],
'contracts': ['contracts', 'contract', 'agreement', 'negotiation', 'drafting'],
'litigation': ['litigation', 'court', 'trial', 'dispute', 'lawsuit'],
'corporate law': ['corporate law', 'corporate', 'governance', 'bylaws'],
'intellectual property': ['intellectual property', 'ip', 'patent', 'trademark', 'copyright'],
'legal research': ['legal research', 'westlaw', 'lexisnexis', 'case law'],
'regulatory': ['regulatory', 'regulations', 'compliance', 'policy'],
'paralegal': ['paralegal', 'legal assistant', 'legal support'],
# ============== OPERATIONS / SUPPLY CHAIN ==============
'operations': ['operations', 'ops', 'operational', 'operating'],
'supply chain': ['supply chain', 'scm', 'logistics', 'procurement', 'sourcing'],
'inventory': ['inventory', 'inventory management', 'stock', 'warehouse'],
'manufacturing': ['manufacturing', 'production', 'assembly', 'factory'],
'quality': ['quality', 'quality control', 'qc', 'quality assurance', 'qa'],
'lean': ['lean', 'lean manufacturing', 'lean six sigma', 'continuous improvement'],
'six sigma': ['six sigma', '6 sigma', 'dmaic', 'green belt', 'black belt'],
'process improvement': ['process improvement', 'optimization', 'efficiency', 'streamline'],
'vendor management': ['vendor management', 'vendor', 'supplier', 'supplier management'],
'erp': ['erp', 'sap', 'oracle erp', 'enterprise resource planning', 'netsuite'],
'logistics': ['logistics', 'transportation', 'shipping', 'freight', 'distribution'],
'project management': ['project management', 'pm', 'pmp', 'project manager'],
'program management': ['program management', 'program manager', 'portfolio'],
'change management': ['change management', 'change', 'transformation'],
# ============== ENGINEERING (NON-SOFTWARE) ==============
'mechanical engineering': ['mechanical engineering', 'mechanical', 'cad', 'solidworks', 'autocad'],
'electrical engineering': ['electrical engineering', 'electrical', 'circuits', 'pcb'],
'civil engineering': ['civil engineering', 'civil', 'structural', 'construction'],
'chemical engineering': ['chemical engineering', 'chemical', 'process engineering'],
'engineering design': ['engineering design', 'design', 'prototyping', 'testing'],
'cad': ['cad', 'autocad', 'solidworks', 'catia', 'inventor'],
'simulation': ['simulation', 'modeling', 'fea', 'cfd', 'ansys'],
# ============== EDUCATION ==============
'teaching': ['teaching', 'teacher', 'instructor', 'educator', 'professor'],
'curriculum': ['curriculum', 'curriculum development', 'lesson plans', 'syllabus'],
'classroom': ['classroom', 'classroom management', 'instruction', 'students'],
'assessment': ['assessment', 'grading', 'evaluation', 'testing'],
'e-learning': ['e-learning', 'online learning', 'lms', 'canvas', 'blackboard'],
'tutoring': ['tutoring', 'tutor', 'mentoring', 'coaching'],
# ============== CUSTOMER SERVICE ==============
'customer service': ['customer service', 'customer support', 'support', 'helpdesk'],
'customer experience': ['customer experience', 'cx', 'customer satisfaction', 'csat'],
'call center': ['call center', 'contact center', 'phone support', 'tickets'],
'troubleshooting': ['troubleshooting', 'problem solving', 'issue resolution'],
'zendesk': ['zendesk', 'freshdesk', 'intercom', 'ticketing'],
# ============== CREATIVE / DESIGN ==============
'graphic design': ['graphic design', 'graphics', 'visual design', 'designer'],
'adobe': ['adobe', 'photoshop', 'illustrator', 'indesign', 'creative suite'],
'figma': ['figma', 'sketch', 'invision', 'xd', 'adobe xd'],
'ui/ux': ['ui/ux', 'ui', 'ux', 'user interface', 'user experience'],
'web design': ['web design', 'website design', 'responsive design'],
'video editing': ['video editing', 'premiere', 'final cut', 'after effects'],
'photography': ['photography', 'photo editing', 'lightroom'],
'branding design': ['branding design', 'brand design', 'logo design', 'identity'],
# ============== GENERAL PROFESSIONAL SKILLS ==============
'communication': ['communication', 'communications', 'verbal', 'written', 'presentation'],
'leadership': ['leadership', 'leader', 'leading', 'management', 'managing'],
'teamwork': ['teamwork', 'team', 'collaboration', 'collaborative', 'cross-functional'],
'problem solving': ['problem solving', 'problem-solving', 'analytical', 'critical thinking'],
'time management': ['time management', 'prioritization', 'multitasking', 'deadline'],
'organization': ['organization', 'organizational', 'organized', 'detail-oriented'],
'strategic': ['strategic', 'strategy', 'strategic planning', 'strategic thinking'],
'stakeholder': ['stakeholder', 'stakeholders', 'stakeholder management'],
'presentation': ['presentation', 'presentations', 'powerpoint', 'public speaking'],
'negotiation': ['negotiation', 'negotiate', 'negotiating', 'deal'],
'decision making': ['decision making', 'decision-making', 'judgment'],
'mentoring': ['mentoring', 'mentor', 'coaching', 'developing others'],
# Additional terms found missing in tests
'version control': ['version control', 'git', 'github', 'gitlab', 'bitbucket', 'svn'],
'analytical': ['analytical', 'analysis', 'analyze', 'analytics', 'analyzing'],
'verbal': ['verbal', 'verbal communication', 'speaking', 'spoken'],
'written': ['written', 'written communication', 'writing', 'documentation'],
'portfolio': ['portfolio', 'portfolios', 'work samples', 'projects'],
'coaching': ['coaching', 'coach', 'mentoring', 'training', 'developing'],
'client relationship': ['client relationship', 'client relations', 'customer relationship', 'account management'],
'care coordination': ['care coordination', 'care management', 'patient coordination', 'case management'],
'patient safety': ['patient safety', 'safety', 'safe care', 'patient care'],
'quality improvement': ['quality improvement', 'qi', 'continuous improvement', 'process improvement'],
'electronic health records': ['electronic health records', 'ehr', 'emr', 'electronic medical records', 'epic', 'cerner'],
'due diligence': ['due diligence', 'diligence', 'research', 'investigation'],
'oracle': ['oracle', 'oracle database', 'oracle erp', 'oracle cloud'],
'testing': ['testing', 'test', 'tests', 'a/b testing', 'quality assurance'],
'systems': ['systems', 'system', 'information systems', 'it systems'],
'equity': ['equity', 'dei', 'diversity equity inclusion', 'fairness'],
'process improvement': ['process improvement', 'process optimization', 'continuous improvement', 'lean'],
# ============== NEW DOMAINS FOR EXTENDED TEST COVERAGE ==============
# Hospitality
'hospitality': ['hospitality', 'guest services', 'hotel', 'resort', 'lodging'],
'food service': ['food service', 'f&b', 'food and beverage', 'restaurant', 'dining', 'catering'],
'culinary': ['culinary', 'chef', 'cooking', 'kitchen', 'cuisine', 'menu'],
'guest experience': ['guest experience', 'guest satisfaction', 'customer experience', 'service excellence'],
'reservation': ['reservation', 'booking', 'front desk', 'check-in', 'concierge'],
# Retail
'retail': ['retail', 'store', 'shop', 'merchandise', 'consumer'],
'merchandising': ['merchandising', 'merchandise', 'product display', 'visual merchandising', 'planogram'],
'inventory management': ['inventory management', 'stock management', 'inventory control', 'stockroom'],
'point of sale': ['point of sale', 'pos', 'cash register', 'checkout', 'transactions'],
'loss prevention': ['loss prevention', 'asset protection', 'shrinkage', 'theft prevention'],
# Government / Public Sector
'policy': ['policy', 'public policy', 'policy analysis', 'policy development', 'legislation'],
'grants': ['grants', 'grant writing', 'grant management', 'federal grants', 'funding'],
'government': ['government', 'public sector', 'federal', 'state', 'municipal', 'public administration'],
'regulations': ['regulations', 'regulatory affairs', 'compliance', 'policy compliance'],
'constituent': ['constituent', 'citizen', 'public', 'stakeholder', 'community'],
# Nonprofit
'nonprofit': ['nonprofit', 'non-profit', 'ngo', 'charity', 'foundation'],
'fundraising': ['fundraising', 'development', 'donor relations', 'major gifts', 'annual fund'],
'volunteer': ['volunteer', 'volunteer management', 'community outreach', 'volunteer coordination'],
'mission': ['mission', 'mission-driven', 'impact', 'social impact', 'cause'],
'program management': ['program management', 'program development', 'program evaluation', 'grants management'],
# Insurance
'insurance': ['insurance', 'underwriting', 'claims', 'policy', 'coverage'],
'underwriting': ['underwriting', 'risk assessment', 'policy writing', 'premium'],
'claims processing': ['claims processing', 'claims adjustment', 'claims investigation', 'claim settlement'],
'actuarial': ['actuarial', 'actuary', 'actuarial analysis', 'risk modeling', 'pricing'],
'reinsurance': ['reinsurance', 'risk transfer', 'ceding', 'treaty'],
# Trades / Construction
'construction': ['construction', 'building', 'contractor', 'general contractor', 'renovation'],
'electrical': ['electrical', 'electrician', 'wiring', 'circuits', 'electrical systems'],
'plumbing': ['plumbing', 'plumber', 'pipes', 'fixtures', 'water systems'],
'hvac': ['hvac', 'heating', 'ventilation', 'air conditioning', 'climate control'],
'carpentry': ['carpentry', 'carpenter', 'woodworking', 'framing', 'finish work'],
'welding': ['welding', 'welder', 'fabrication', 'metal work', 'steel'],
'blueprint': ['blueprint', 'schematic', 'technical drawing', 'construction drawings'],
# Real Estate
'real estate': ['real estate', 'property', 'realty', 'residential', 'commercial'],
'leasing': ['leasing', 'tenant', 'lease agreement', 'property management', 'rental'],
'appraisal': ['appraisal', 'valuation', 'property assessment', 'market value'],
'escrow': ['escrow', 'title', 'closing', 'settlement', 'transaction'],
'mls': ['mls', 'multiple listing', 'listing', 'property listing'],
# Media / Journalism
'journalism': ['journalism', 'reporter', 'news', 'press', 'media'],
'editorial': ['editorial', 'editor', 'editing', 'copy editing', 'proofreading'],
'broadcast': ['broadcast', 'broadcasting', 'tv', 'radio', 'on-air'],
'podcast': ['podcast', 'audio', 'podcasting', 'audio production'],
'publishing': ['publishing', 'publication', 'press', 'print', 'digital publishing'],
# Science / Research
'research': ['research', 'scientific research', 'laboratory', 'lab work', 'experiments'],
'biology': ['biology', 'biological', 'life sciences', 'molecular biology', 'microbiology'],
'chemistry': ['chemistry', 'chemical', 'analytical chemistry', 'organic chemistry'],
'environmental': ['environmental', 'environment', 'sustainability', 'ecology', 'conservation'],
'laboratory': ['laboratory', 'lab', 'bench work', 'lab techniques', 'specimen'],
'scientific method': ['scientific method', 'hypothesis', 'experiments', 'data collection'],
# Consulting
'consulting': ['consulting', 'consultant', 'advisory', 'advisory services'],
'strategy consulting': ['strategy consulting', 'strategic consulting', 'management consulting'],
'implementation': ['implementation', 'deployment', 'rollout', 'go-live', 'execution'],
'business transformation': ['business transformation', 'transformation', 'change management', 'reorganization'],
'client engagement': ['client engagement', 'client management', 'engagement', 'delivery'],
# Additional soft skills
'adaptability': ['adaptability', 'flexible', 'adaptable', 'versatile', 'agility'],
'attention to detail': ['attention to detail', 'detail-oriented', 'detail oriented', 'meticulous', 'thorough'],
'creativity': ['creativity', 'creative', 'innovative', 'creative thinking', 'ideation'],
'initiative': ['initiative', 'self-starter', 'proactive', 'self-motivated'],
'interpersonal': ['interpersonal', 'interpersonal skills', 'relationship building', 'people skills'],
'multitasking': ['multitasking', 'multi-tasking', 'juggling priorities', 'handling multiple tasks'],
'resourcefulness': ['resourcefulness', 'resourceful', 'problem solver', 'solution-oriented'],
}
# Common abbreviation mappings (COMPREHENSIVE FOR ALL DOMAINS)
# Maps lowercase abbreviation -> canonical full form; used by
# _expand_with_taxonomy for bidirectional abbreviation matching.
# NOTE: keys must be unique — a duplicate key in a dict literal silently
# overwrites the earlier entry.
self.abbreviations = {
    # Technology
    'ml': 'machine learning', 'ai': 'artificial intelligence', 'dl': 'deep learning',
    'nlp': 'natural language processing', 'cv': 'computer vision', 'llm': 'large language model',
    'genai': 'generative ai', 'ds': 'data science', 'de': 'data engineering',
    'swe': 'software engineer', 'sde': 'software development engineer',
    'qa': 'quality assurance', 'qc': 'quality control',
    'devops': 'development operations', 'mlops': 'machine learning operations',
    'etl': 'extract transform load', 'eda': 'exploratory data analysis',
    'api': 'application programming interface', 'ui': 'user interface', 'ux': 'user experience',
    'sql': 'structured query language', 'aws': 'amazon web services', 'gcp': 'google cloud platform',
    'saas': 'software as a service', 'paas': 'platform as a service', 'iaas': 'infrastructure as a service',
    # Business / Finance
    'kpi': 'key performance indicator', 'roi': 'return on investment', 'yoe': 'years of experience',
    'pm': 'project manager', 'ba': 'business analyst', 'cfo': 'chief financial officer',
    'cto': 'chief technology officer', 'ceo': 'chief executive officer', 'coo': 'chief operating officer',
    'vp': 'vice president', 'svp': 'senior vice president', 'evp': 'executive vice president',
    'p&l': 'profit and loss', 'gaap': 'generally accepted accounting principles',
    'ifrs': 'international financial reporting standards', 'sox': 'sarbanes oxley',
    'cpa': 'certified public accountant', 'cfa': 'chartered financial analyst',
    'cma': 'certified management accountant', 'dcf': 'discounted cash flow',
    'm&a': 'mergers and acquisitions', 'ipo': 'initial public offering',
    'ebitda': 'earnings before interest taxes depreciation amortization',
    'ytd': 'year to date', 'mtd': 'month to date', 'yoy': 'year over year',
    'b2b': 'business to business', 'b2c': 'business to consumer',
    # Marketing / Sales
    'seo': 'search engine optimization', 'sem': 'search engine marketing',
    'ppc': 'pay per click', 'cpc': 'cost per click', 'cpm': 'cost per mille',
    'crm': 'customer relationship management', 'cro': 'conversion rate optimization',
    'ctr': 'click through rate', 'cac': 'customer acquisition cost', 'ltv': 'lifetime value',
    'nps': 'net promoter score', 'csat': 'customer satisfaction',
    # HR
    'hr': 'human resources', 'hris': 'human resources information system',
    'hcm': 'human capital management', 'phr': 'professional human resources',
    'sphr': 'senior professional human resources', 'shrm': 'society human resources management',
    'dei': 'diversity equity inclusion', 'l&d': 'learning and development',
    'eeoc': 'equal employment opportunity commission', 'fmla': 'family medical leave act',
    # Healthcare
    'rn': 'registered nurse', 'lpn': 'licensed practical nurse', 'np': 'nurse practitioner',
    'md': 'medical doctor', 'do': 'doctor of osteopathic medicine',
    'emr': 'electronic medical records', 'ehr': 'electronic health records',
    'hipaa': 'health insurance portability accountability act', 'phi': 'protected health information',
    'icd': 'international classification of diseases', 'cpt': 'current procedural terminology',
    # Operations
    'scm': 'supply chain management', 'erp': 'enterprise resource planning',
    'pmp': 'project management professional', 'six sigma': 'six sigma',
    'tqm': 'total quality management', 'jit': 'just in time',
    # Legal
    # BUGFIX: a duplicate 'llm': 'master of laws' entry here used to silently
    # overwrite the Technology mapping 'llm': 'large language model' above.
    # For resume keyword matching the ML sense dominates, so the legal
    # duplicate was removed.
    'jd': 'juris doctor', 'ip': 'intellectual property',
    'nda': 'non disclosure agreement', 'sla': 'service level agreement',
}
# Stop words filtered out before TF-IDF and skills scoring: standard
# English function words plus job-posting boilerplate ("required",
# "candidate", "excellent", ...) that would otherwise inflate match
# scores without carrying any real signal.
self.stop_words = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
'may', 'might', 'must', 'shall', 'can', 'we', 'you', 'they', 'he', 'she',
'it', 'i', 'me', 'my', 'your', 'our', 'their', 'his', 'her', 'its',
'this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'how',
'all', 'each', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
'no', 'not', 'only', 'same', 'so', 'than', 'too', 'very', 'just', 'also',
'now', 'as', 'from', 'about', 'into', 'through', 'during', 'before',
'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then',
'once', 'here', 'there', 'when', 'where', 'why', 'while', 'any', 'every',
# Generic words that appear in JDs but aren't real keywords
'strong', 'related', 'junior', 'senior', 'mid', 'level', 'new', 'within',
'using', 'use', 'used', 'able', 'ability', 'include', 'including', 'includes',
'need', 'needed', 'needs', 'well', 'good', 'great', 'excellent', 'required',
'requirements', 'qualification', 'qualifications', 'preferred', 'desired',
'role', 'position', 'job', 'candidate', 'looking', 'seek', 'seeking',
'year', 'years', 'month', 'months', 'day', 'days', 'time', 'times',
'work', 'working', 'worker', 'works', 'join', 'joining', 'team', 'teams',
'based', 'base', 'company', 'organization', 'organizations', 'firm',
'make', 'making', 'made', 'get', 'getting', 'got', 'take', 'taking', 'took',
# More generic filler words
'solid', 'background', 'capabilities', 'capability', 'knowledge',
'proficiency', 'proficient', 'expertise', 'expert', 'familiar',
'understanding', 'hands', 'hand', 'proven', 'track', 'record'
}
def _stem_word(self, word: str) -> str:
"""Simple Porter-like stemming for common suffixes."""
word = word.lower().strip()
# Common suffix patterns
suffixes = [
('ational', 'ate'), ('tional', 'tion'), ('enci', 'ence'), ('anci', 'ance'),
('izer', 'ize'), ('isation', 'ize'), ('ization', 'ize'), ('ation', 'ate'),
('ator', 'ate'), ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'),
('ousness', 'ous'), ('aliti', 'al'), ('iviti', 'ive'), ('biliti', 'ble'),
('ling', 'l'), ('ment', ''), ('ness', ''), ('ity', ''), ('ies', 'y'),
('ing', ''), ('ed', ''), ('er', ''), ('ly', ''), ('es', ''), ('s', '')
]
for suffix, replacement in suffixes:
if word.endswith(suffix) and len(word) > len(suffix) + 2:
return word[:-len(suffix)] + replacement
return word
def _fuzzy_match(self, word1: str, word2: str, threshold: float = 0.70) -> bool:
"""Check if two words are similar using sequence matching."""
from difflib import SequenceMatcher
word1, word2 = word1.lower(), word2.lower()
if word1 == word2:
return True
# Also check if one contains the other (e.g., 'collaborate' in 'collaboration')
if word1 in word2 or word2 in word1:
return True
# Check stem match
if self._stem_word(word1) == self._stem_word(word2):
return True
ratio = SequenceMatcher(None, word1, word2).ratio()
return ratio >= threshold
def _expand_with_taxonomy(self, words: List[str]) -> set:
"""Expand words using skills taxonomy - STRICT matching only.
Only adds related terms for EXACT skill matches to prevent false positives.
"""
expanded = set(words)
for word in words:
word_lower = word.lower()
# STRICT: Only expand if word is an EXACT match for a taxonomy key
if word_lower in self.skills_taxonomy:
# Add variations but NOT other unrelated skills
expanded.update(self.skills_taxonomy[word_lower])
# Check abbreviation expansions - EXACT match only
if word_lower in self.abbreviations:
expanded.add(self.abbreviations[word_lower])
# Reverse: if full form EXACTLY matches, add abbreviation
for abbr, full in self.abbreviations.items():
if word_lower == full.lower():
expanded.add(abbr)
return expanded
def _extract_years_experience(self, text: str) -> List[int]:
"""Extract years of experience mentions from text."""
patterns = [
r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+(?:experience|exp)',
r'(?:experience|exp)(?:\s+of)?\s*:?\s*(\d+)\+?\s*(?:years?|yrs?)',
r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:in|of|working)',
r'over\s+(\d+)\s+(?:years?|yrs?)',
r'(\d+)\+?\s*yoe',
]
years = []
for pattern in patterns:
matches = re.findall(pattern, text.lower())
years.extend([int(m) for m in matches if m.isdigit()])
return years
def _calculate_tfidf_score(self, resume: str, job_desc: str) -> float:
    """Calculate TF-IDF weighted keyword match score (realistic scoring).

    Mirrors real ATS behaviour: no artificial baseline. The score is the
    weighted fraction of job-description keywords found in the resume,
    so empty resumes land near 0 rather than 60%+.

    Returns a float in [0, 100].
    """
    import math
    resume_lower = resume.lower()
    jd_lower = job_desc.lower()
    # Input validation: near-empty resumes score minimally.
    if len(resume.strip()) < 50:
        return max(5, len(resume.strip()) // 10)  # Very short = very low score
    # Tokenize (alphabetic tokens, 2+ chars), then drop stop words and stem.
    resume_words = re.findall(r'\b[a-zA-Z]{2,}\b', resume_lower)
    jd_words = re.findall(r'\b[a-zA-Z]{2,}\b', jd_lower)
    resume_words = [self._stem_word(w) for w in resume_words if w not in self.stop_words]
    jd_words = [self._stem_word(w) for w in jd_words if w not in self.stop_words]
    if not jd_words:
        return 50  # No JD keywords to match - neutral score
    if len(resume_words) < 10:
        return 10  # Very sparse resume
    # Term frequency over the JD; rarer terms get a higher IDF-like weight.
    jd_tf = Counter(jd_words)
    max_count = max(jd_tf.values()) if jd_tf else 1
    jd_weights = {word: 1 + math.log(max_count / count) for word, count in jd_tf.items()}
    # Expand resume vocabulary with taxonomy/abbreviation variants, then stem.
    resume_expanded = self._expand_with_taxonomy(resume_words)
    resume_stems = {self._stem_word(w) for w in resume_expanded}
    # (Removed an unused `resume_raw = set(resume_lower.split())` local —
    # it was computed but never read.)
    # Weighted matching - strict, to avoid false positives.
    weighted_matches = 0
    total_weight = 0
    for word, weight in jd_weights.items():
        total_weight += weight
        if word in resume_stems:
            # Direct stem match (highest confidence).
            weighted_matches += weight
        elif word in resume_lower:
            # Exact substring of the raw resume text.
            weighted_matches += weight
        elif len(word) >= 5 and any(word in rw for rw in resume_stems if len(rw) >= 5):
            # Stem containment (e.g. 'develop' in 'developer'); 5+ chars only
            # to avoid false positives.
            weighted_matches += weight * 0.9
        elif len(word) >= 5 and any(self._fuzzy_match(word, rw, 0.80) for rw in resume_stems if len(rw) >= 5):
            # Fuzzy match at a strict 0.80 threshold for technical terms.
            weighted_matches += weight * 0.8
        # Deliberately NO 3/4-char prefix matching - too many false positives.
    if total_weight == 0:
        return 15  # No weighted keywords found
    # Direct percentage of matched keyword weight, clamped to [0, 100].
    raw_score = (weighted_matches / total_weight) * 100
    return min(100, max(0, raw_score))
def _skills_match_score(self, resume: str, job_desc: str) -> float:
"""Score based on technical skills matching with taxonomy - REALISTIC.
Based on Jobscan research: 76.4% of recruiters filter by skills
Score = (matched skills / required skills) Γ— 100
"""
resume_lower = resume.lower()
jd_lower = job_desc.lower()
# INPUT VALIDATION
if len(resume.strip()) < 50:
return 5 # Empty/minimal resume
# Extract skills from JD using taxonomy
jd_skills = set()
for skill_name, variations in self.skills_taxonomy.items():
for var in variations:
if var in jd_lower:
jd_skills.add(skill_name)
break
# Also extract raw important words from JD as potential skills
jd_words = set(re.findall(r'\b[a-zA-Z]{4,}\b', jd_lower)) - self.stop_words
if not jd_skills:
# No taxonomy matches - fall back to direct word matching
if jd_words:
matched = sum(1 for w in list(jd_words)[:20] if w in resume_lower or w[:4] in resume_lower)
# Direct percentage: matched out of checked words
return min(100, max(0, (matched / min(20, len(jd_words))) * 100))
return 40 # No skills detected in JD - neutral
# Check which skills are in resume - STRICT matching only
matched_skills = 0
for skill_name in jd_skills:
variations = self.skills_taxonomy.get(skill_name, [skill_name])
# Check direct match only (full word or variation)
if any(var in resume_lower for var in variations):
matched_skills += 1
# Check stem variations - EXACT stem match only
elif any(self._stem_word(var) in resume_lower for var in variations):
matched_skills += 0.9
# NO prefix matching - causes false positives
# REALISTIC: Direct percentage scoring - no baseline
match_ratio = matched_skills / len(jd_skills)
# Score = match_ratio * 100 (0% matched = 0%, 100% matched = 100%)
return min(100, max(0, match_ratio * 100))
def _experience_match_score(self, resume: str, job_desc: str) -> float:
"""Score based on years of experience matching - REALISTIC SCORING."""
# INPUT VALIDATION
if len(resume.strip()) < 50:
return 5 # Empty/minimal resume
jd_years = self._extract_years_experience(job_desc)
resume_years = self._extract_years_experience(resume)
# Also calculate from date ranges in resume
calculated_years = self._calculate_years_from_dates(resume)
if not jd_years:
# No requirement specified - give moderate score if has any dates
if calculated_years > 0 or resume_years:
return 70 # Has experience, no requirement - good match
return 40 # No experience detected, no requirement
required_years = max(jd_years) # Take the highest requirement
# Use the best available years data
if resume_years:
candidate_years = max(resume_years)
elif calculated_years > 0:
candidate_years = calculated_years
else:
return 20 # Can't detect any experience - low score
if candidate_years >= required_years:
return 100
elif candidate_years >= required_years * 0.8:
return 90
elif candidate_years >= required_years * 0.6:
return 75
else:
return max(50, 100 - (required_years - candidate_years) * 8)
def _calculate_years_from_dates(self, resume: str) -> int:
"""Calculate total years of experience from date ranges in resume."""
import datetime
current_year = datetime.datetime.now().year
# Pattern: 2018 - 2023 or 2018 - Present
date_pattern = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?\s*(20\d{2}|19\d{2})\s*[-–to]+\s*(20\d{2}|19\d{2}|[Pp]resent|[Cc]urrent)'
matches = re.findall(date_pattern, resume)
total_years = 0
for start, end in matches:
try:
start_year = int(start)
if end.lower() in ['present', 'current']:
end_year = current_year
else:
end_year = int(end)
total_years += max(0, end_year - start_year)
except:
continue
return total_years
# ================== FRAUD/GAMING DETECTION FUNCTIONS ==================
def _detect_jd_copy(self, resume: str, job_desc: str) -> Dict:
"""Detect if resume is copied from job description (gaming attempt)."""
resume_words = set(resume.lower().split())
jd_words = set(job_desc.lower().split())
if len(jd_words) < 10:
return {'is_copy': False, 'similarity': 0, 'penalty': 0}
# Calculate word overlap
overlap = len(resume_words & jd_words)
similarity = overlap / len(jd_words) if jd_words else 0
# Check for phrase copying (more damning)
resume_lower = resume.lower()
jd_sentences = [s.strip() for s in job_desc.split('.') if len(s.strip()) > 30]
copied_phrases = sum(1 for s in jd_sentences if s.lower() in resume_lower)
phrase_copy_ratio = copied_phrases / len(jd_sentences) if jd_sentences else 0
# High similarity or phrase copying = gaming
is_copy = similarity > 0.75 or phrase_copy_ratio > 0.3
penalty = 60 if phrase_copy_ratio > 0.5 else (50 if similarity > 0.85 else (40 if similarity > 0.75 else 0))
return {'is_copy': is_copy, 'similarity': similarity, 'phrase_copy': phrase_copy_ratio, 'penalty': penalty}
def _detect_skills_without_experience(self, resume: str) -> Dict:
"""Detect if resume lists skills without work context."""
resume_lower = resume.lower()
# Check for experience section markers
experience_markers = ['experience', 'employment', 'work history', 'professional background',
'career history', 'positions held', 'job history']
has_experience_section = any(marker in resume_lower for marker in experience_markers)
# Check for work context indicators
work_context = ['worked', 'managed', 'developed', 'implemented', 'created', 'led', 'designed',
'built', 'achieved', 'delivered', 'company', 'organization', 'team', 'project',
'responsible for', 'collaborated', 'years', 'months', 'position', 'role']
work_context_count = sum(1 for w in work_context if w in resume_lower)
# Check for date patterns (employment dates)
date_pattern = re.compile(r'\b(19|20)\d{2}\b|\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s*\d{4}', re.IGNORECASE)
has_dates = bool(date_pattern.search(resume))
# Skills-only resume lacks context
is_skills_only = not has_experience_section and work_context_count < 5 and not has_dates
penalty = 35 if is_skills_only else 0
return {'is_skills_only': is_skills_only, 'has_experience_section': has_experience_section,
'work_context_count': work_context_count, 'penalty': penalty}
def _detect_industry_mismatch(self, resume: str, job_desc: str) -> Dict:
"""Detect complete industry/profession mismatch (e.g., plumber applying for surgeon)."""
# Define industry clusters - roles that should NOT cross
industry_clusters = {
'healthcare_clinical': ['doctor', 'physician', 'surgeon', 'nurse', 'rn', 'lpn', 'np', 'pa', 'medical',
'patient care', 'diagnosis', 'treatment', 'clinical', 'hospital', 'healthcare provider',
'anesthesiologist', 'cardiologist', 'pediatrician', 'oncologist', 'radiologist'],
'trades': ['plumber', 'electrician', 'carpenter', 'mechanic', 'hvac', 'welder', 'mason',
'pipefitter', 'roofer', 'contractor', 'construction worker', 'handyman'],
'legal': ['lawyer', 'attorney', 'paralegal', 'legal counsel', 'solicitor', 'barrister',
'judge', 'litigation', 'legal assistant', 'law clerk', 'jd', 'bar admission'],
'aviation': ['pilot', 'flight', 'aviation', 'aircraft', 'airline', 'cockpit', 'atc',
'air traffic', 'faa', 'flight hours', 'aircraft type rating'],
'culinary': ['chef', 'cook', 'culinary', 'kitchen', 'restaurant', 'sous chef', 'pastry',
'food prep', 'catering', 'menu', 'cuisine'],
'education_teaching': ['teacher', 'professor', 'educator', 'instructor', 'teaching', 'classroom',
'curriculum', 'lesson plan', 'students', 'pedagogy', 'education degree'],
'law_enforcement': ['police', 'officer', 'detective', 'law enforcement', 'sheriff', 'trooper',
'patrol', 'investigation', 'criminal justice', 'peace officer'],
'finance_licensed': ['cpa', 'cfa', 'cfp', 'series 7', 'series 66', 'finra', 'broker',
'financial advisor', 'investment advisor', 'registered representative'],
'quantitative_finance': ['quant', 'quantitative', 'algorithmic', 'derivatives', 'risk model',
'stochastic', 'monte carlo', 'var', 'hedge fund', 'trading desk',
'fixed income', 'structured products', 'credit derivatives'],
'wellness_fitness': ['yoga', 'meditation', 'wellness', 'fitness', 'personal trainer',
'pilates', 'mindfulness', 'holistic', 'spa', 'massage'],
'academia': ['phd', 'professor', 'tenure', 'publish', 'research grant', 'dissertation',
'peer review', 'journal', 'academic', 'postdoc', 'faculty'],
'investment_banking': ['investment bank', 'ib analyst', 'm&a', 'merger', 'acquisition',
'dcf', 'lbo', 'pitch book', 'bulge bracket', 'deal flow'],
'retail_banking': ['retail bank', 'branch', 'teller', 'customer accounts', 'deposit',
'consumer banking', 'branch manager', 'retail lending'],
}
resume_lower = resume.lower()
jd_lower = job_desc.lower()
# Special fast-path for PhD-required academic positions
if ('phd' in jd_lower and 'required' in jd_lower) or 'assistant professor' in jd_lower or 'associate professor' in jd_lower:
# This is an academic position - check if resume has academic credentials
has_phd = bool(re.search(r'\b(phd|ph\.?d\.?|doctorate)\b(?!\s*(?:required|needed|preferred|student))', resume_lower))
if 'no phd' in resume_lower or 'phd: none' in resume_lower:
has_phd = False
has_publications = bool(re.search(r'\b(publication|published|journal|conference paper|peer.?review)\b', resume_lower))
if re.search(r'publication[s]?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
has_publications = False
has_teaching = bool(re.search(r'\b(taught|professor|lecturer|instructor)\b', resume_lower))
if re.search(r'teaching\s*(?:experience)?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
has_teaching = False
if not has_phd and not has_publications and not has_teaching:
return {'is_mismatch': True, 'resume_industry': 'industry/non-academic',
'job_industry': 'academia', 'penalty': 50}
# Find which clusters the job belongs to (can match multiple)
job_clusters = {}
for cluster, keywords in industry_clusters.items():
matches = sum(1 for kw in keywords if kw in jd_lower)
if matches >= 2:
job_clusters[cluster] = matches
# Get the primary job cluster
job_cluster = max(job_clusters, key=job_clusters.get) if job_clusters else None
job_cluster_matches = job_clusters.get(job_cluster, 0) if job_cluster else 0
# If job is in a specialized cluster, check if resume is from a different specialized cluster
if job_cluster and job_cluster_matches >= 2:
resume_cluster = None
resume_cluster_matches = 0
for cluster, keywords in industry_clusters.items():
matches = sum(1 for kw in keywords if kw in resume_lower)
if matches > resume_cluster_matches:
resume_cluster_matches = matches
resume_cluster = cluster
# Mismatch: resume is strongly in a DIFFERENT specialized cluster
if resume_cluster and resume_cluster != job_cluster and resume_cluster_matches >= 2:
return {'is_mismatch': True, 'resume_industry': resume_cluster,
'job_industry': job_cluster, 'penalty': 55}
# Special case: academia jobs require academic background
if job_cluster == 'academia':
# Check for PhD (but not "PhD required" from JD copy or "No PhD")
has_phd = bool(re.search(r'\b(phd|ph\.?d\.?|doctorate)\b(?!\s*(?:required|needed|preferred|student))', resume_lower))
# Filter out "no phd" or "none"
if 'no phd' in resume_lower or 'phd: none' in resume_lower:
has_phd = False
# Check for publications (not "none" or "n/a")
has_publications = bool(re.search(r'\b(publication|published|journal|conference paper|peer.?review)\b', resume_lower))
if 'publication' in resume_lower:
# Check if it's followed by "none", "n/a", "0", etc.
if re.search(r'publication[s]?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
has_publications = False
# Check for teaching (not "none" or "n/a")
has_teaching = bool(re.search(r'\b(taught|teaching|professor|lecturer|instructor|course|students)\b', resume_lower))
if 'teaching' in resume_lower:
# Check if it's followed by "none", "n/a", "0", etc.
if re.search(r'teaching\s*(?:experience)?\s*[:=]?\s*(none|n/a|0|no)\b', resume_lower):
has_teaching = False
if not has_phd and not has_publications and not has_teaching:
return {'is_mismatch': True, 'resume_industry': 'industry/non-academic',
'job_industry': 'academia', 'penalty': 50}
# Special case: investment banking vs retail banking
if job_cluster == 'investment_banking' and resume_cluster == 'retail_banking':
return {'is_mismatch': True, 'resume_industry': 'retail_banking',
'job_industry': 'investment_banking', 'penalty': 45}
return {'is_mismatch': False, 'penalty': 0}
def _detect_suspicious_dates(self, resume: str) -> Dict:
"""Detect future dates, impossible timelines, and concurrent role issues."""
import datetime
current_year = datetime.datetime.now().year
issues = []
penalty = 0
# Find all years mentioned
years = list(set(int(m) for m in re.findall(r'\b(19\d{2}|20\d{2})\b', resume)))
# Check for future years (but allow current year + 1 for expected graduations)
future_years = [y for y in years if y > current_year + 1]
if future_years:
issues.append(f"Future dates: {future_years}")
penalty += 35
# Check for impossibly old dates (before 1960 in work history)
old_years = [y for y in years if y < 1960]
if old_years and 'education' not in resume.lower()[:500]:
issues.append(f"Suspicious old dates: {old_years}")
penalty += 15
# Check for impossibly long tenure (>40 years at one company)
year_ranges = re.findall(r'(19\d{2}|20\d{2})\s*[-–to]+\s*(19\d{2}|20\d{2}|present|current)', resume, re.IGNORECASE)
for start, end in year_ranges:
try:
start_yr = int(start)
end_yr = current_year if end.lower() in ['present', 'current'] else int(end)
if end_yr - start_yr > 40:
issues.append(f"Impossible tenure: {start}-{end}")
penalty += 20
except:
pass
# Detect EXCESSIVE concurrent roles - 4+ simultaneous "present" jobs is suspicious
# (but 2-3 is common: full-time job + freelance/consulting + board seat)
present_mentions = len(re.findall(r'\b(present|current)\b', resume, re.IGNORECASE))
if present_mentions >= 5: # Raised threshold from 3 to 5
issues.append(f"Too many concurrent roles ({present_mentions} 'present' positions)")
penalty += 20 # Reduced from 25
# Check for too many roles in too short a time (rapid job hopping or fabrication)
role_indicators = re.findall(r'\b(manager|director|lead|head|chief|vp|president|ceo|cto|cfo)\b', resume, re.IGNORECASE)
if len(role_indicators) >= 8: # Only flag extreme cases
if years:
year_span = max(years) - min(years) if len(years) > 1 else 1
if year_span <= 2 and len(role_indicators) >= 8:
issues.append(f"Impossibly rapid advancement: {len(role_indicators)} senior roles in {year_span} years")
penalty += 30
return {'has_issues': len(issues) > 0, 'issues': issues, 'penalty': min(penalty, 50)}
def _detect_negative_sentiment(self, resume: str) -> Dict:
"""Detect negative language that shouldn't appear in resumes."""
negative_words = [
'fired', 'terminated', 'dismissed', 'let go', 'laid off', # Job loss
'failed', 'failure', 'unsuccessful', 'unable to', # Failure
'conflict', 'disagreement', 'dispute', 'argument', # Interpersonal issues
'lawsuit', 'sued', 'legal action', 'harassment', # Legal issues
'mistake', 'error', 'wrong', 'poor performance', # Performance issues
'hate', 'hated', 'terrible', 'awful', 'worst', # Emotional language
'struggled', 'struggling', 'difficult time', # Weakness indicators
'not skilled', 'lack of', 'weakness', 'weaknesses', # Self-deprecation
'unemployed', 'gap in employment', 'taking time off', # Employment gaps
'criminal', 'arrest', 'conviction', 'probation', # Legal history
]
resume_lower = resume.lower()
found_negatives = [word for word in negative_words if word in resume_lower]
penalty = min(len(found_negatives) * 15, 45) # Up to 45% penalty
return {'has_negatives': len(found_negatives) > 0, 'found': found_negatives, 'penalty': penalty}
def _detect_missing_required_credentials(self, resume: str, job_desc: str) -> Dict:
"""Detect if job requires specific credentials that are missing from resume."""
# Credentials that are REQUIRED for certain roles
required_credentials = {
# Healthcare
('rn', 'registered nurse', 'nursing'): ['rn', 'registered nurse', 'nursing license', 'nclex', 'bsn', 'nursing degree'],
('md', 'physician', 'doctor', 'medical doctor'): ['md', 'm.d.', 'medical degree', 'residency', 'board certified', 'medical license'],
('np', 'nurse practitioner'): ['np', 'nurse practitioner', 'aprn', 'dnp', 'msn'],
('pa', 'physician assistant'): ['pa-c', 'physician assistant', 'pa license'],
('pharmacist',): ['pharmd', 'rph', 'pharmacy license', 'pharmacy degree'],
('dentist',): ['dds', 'dmd', 'dental license', 'dental degree'],
# Legal
('attorney', 'lawyer', 'legal counsel'): ['jd', 'j.d.', 'bar admission', 'bar license', 'law degree', 'esquire', 'esq'],
# Accounting
('cpa required', 'certified public accountant'): ['cpa', 'certified public accountant'],
# Finance
('cfa required',): ['cfa', 'chartered financial analyst'],
('cfp required', 'certified financial planner'): ['cfp', 'certified financial planner'],
('series 7', 'registered representative'): ['series 7', 'finra', 'securities license'],
# Engineering
('pe required', 'professional engineer'): ['pe', 'p.e.', 'professional engineer', 'engineering license'],
# Aviation
('pilot', 'captain', 'first officer'): ['atp', 'cpl', 'pilot license', 'flight hours', 'type rating', 'faa'],
# Real Estate
('real estate agent', 'realtor'): ['real estate license', 'realtor license', 'licensed agent'],
# Insurance
('insurance agent', 'insurance broker'): ['insurance license', 'licensed agent', 'p&c license', 'life license'],
}
jd_lower = job_desc.lower()
resume_lower = resume.lower()
for job_keywords, required_creds in required_credentials.items():
# Check if job requires this credential
if any(kw in jd_lower for kw in job_keywords):
# Check if resume has any of the required credentials
has_credential = any(cred in resume_lower for cred in required_creds)
if not has_credential:
return {'missing_credential': True, 'required_for': job_keywords[0],
'needed': required_creds[:3], 'penalty': 40}
return {'missing_credential': False, 'penalty': 0}
def _detect_impossible_metrics(self, resume: str) -> Dict:
"""Detect impossible or exaggerated claims."""
issues = []
# Check for impossibly high percentages
percentages = re.findall(r'(\d+)%', resume)
for pct in percentages:
pct_val = int(pct)
if pct_val > 1000: # >1000% improvement claims
issues.append(f"Impossible percentage: {pct_val}%")
elif pct_val > 500 and pct_val != 100: # Suspicious large percentages
issues.append(f"Suspicious percentage: {pct_val}%")
# Check for impossibly large numbers in context
money_patterns = re.findall(r'\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)\s*(million|billion|trillion|m|b|k)?', resume, re.IGNORECASE)
for amount, unit in money_patterns:
amount_num = float(amount.replace(',', ''))
if unit and unit.lower() in ['trillion', 't']:
issues.append(f"Unlikely amount: ${amount} {unit}")
elif unit and unit.lower() in ['billion', 'b'] and amount_num > 100:
issues.append(f"Suspicious large amount: ${amount} {unit}")
# Check for impossibly high team sizes for individual contributors
team_patterns = re.findall(r'(?:led|managed|supervised)\s+(?:a\s+)?(?:team\s+of\s+)?(\d+)\+?\s*(?:people|employees|staff|team members|engineers)', resume, re.IGNORECASE)
for size in team_patterns:
if int(size) > 1000:
issues.append(f"Unlikely team size: {size}")
penalty = min(len(issues) * 15, 40)
return {'has_issues': len(issues) > 0, 'issues': issues, 'penalty': penalty}
def _detect_format_issues(self, resume: str) -> Dict:
"""Detect format issues like bullet-only or no structure."""
issues = []
penalty = 0
lines = resume.strip().split('\n')
non_empty_lines = [l for l in lines if l.strip()]
if len(non_empty_lines) < 5:
issues.append("Too few content lines")
penalty += 30 # Increased
# Check for bullet-only format (no sentences, just bullets)
bullet_chars = ['β€’', '-', '*', 'β†’', '>', 'Β·']
bullet_lines = sum(1 for l in non_empty_lines if any(l.strip().startswith(b) for b in bullet_chars))
bullet_ratio = bullet_lines / len(non_empty_lines) if non_empty_lines else 0
if bullet_ratio > 0.60 and len(non_empty_lines) <= 10: # Short bullet-only
issues.append("Minimal bullet-only format")
penalty += 35
elif bullet_ratio > 0.80 and len(non_empty_lines) > 5:
issues.append("Bullet-only format (no context)")
penalty += 30
# Check for code block format
code_block_count = resume.count('```')
if code_block_count >= 2:
issues.append("Resume contains code blocks")
penalty += 25
# Check for lack of any sections/headers
section_patterns = ['experience', 'education', 'skills', 'summary', 'objective',
'work history', 'employment', 'qualifications', 'professional']
has_sections = any(pat in resume.lower() for pat in section_patterns)
if not has_sections and len(resume) > 200:
issues.append("No recognizable sections")
penalty += 15
# Check for all-caps (often indicates bad parsing or shouting)
caps_ratio = sum(1 for c in resume if c.isupper()) / len(resume) if resume else 0
if caps_ratio > 0.6:
issues.append("Excessive capitalization")
penalty += 10
# Check for lack of work verbs/action
action_words = ['managed', 'developed', 'created', 'led', 'implemented', 'designed',
'built', 'achieved', 'improved', 'increased', 'delivered', 'established']
has_action = any(word in resume.lower() for word in action_words)
if not has_action and len(resume) > 300:
issues.append("No action verbs (passive resume)")
penalty += 10
return {'has_issues': len(issues) > 0, 'issues': issues, 'penalty': min(penalty, 60)}
def _detect_experience_level_mismatch(self, resume: str, job_desc: str) -> Dict:
"""Detect if experience level in resume doesn't match job requirements."""
resume_lower = resume.lower()
jd_lower = job_desc.lower()
# Job level indicators
senior_indicators = ['senior', 'sr.', 'lead', 'principal', 'staff', 'architect', 'director',
'manager', 'head of', 'vp', 'chief', '10+ years', '8+ years', '7+ years']
entry_indicators = ['entry level', 'junior', 'associate', 'intern', 'graduate', 'new grad',
'0-2 years', '1-2 years', 'no experience required', 'entry-level']
# Check job level
job_is_senior = any(ind in jd_lower for ind in senior_indicators)
job_is_entry = any(ind in jd_lower for ind in entry_indicators)
# Check resume experience level
# Count years of experience mentioned
years_mentioned = re.findall(r'(\d+)\+?\s*years?\s*(?:of\s+)?(?:experience)?', resume_lower)
max_years = max([int(y) for y in years_mentioned], default=0)
# Check for entry-level resume indicators
resume_is_entry = (
'recent graduate' in resume_lower or
'new graduate' in resume_lower or
'entry level' in resume_lower or
('intern' in resume_lower and 'senior' not in resume_lower) or
max_years <= 2
)
# Check for senior/overqualified resume indicators
has_phd = 'phd' in resume_lower or 'ph.d' in resume_lower or 'doctorate' in resume_lower
has_executive_titles = any(title in resume_lower for title in ['director', 'vp', 'vice president', 'chief', 'head of', 'principal', 'ceo', 'cto', 'cfo'])
resume_is_senior = has_executive_titles or max_years >= 8 or has_phd
# Mismatch detection
if job_is_senior and resume_is_entry:
return {'is_mismatch': True, 'job_level': 'senior', 'resume_level': 'entry',
'detail': 'Entry-level applying for senior role', 'penalty': 35}
# NEW: Overqualified detection - PhD/executive for junior role
if job_is_entry and resume_is_senior:
overqualified_penalty = 0
reasons = []
if has_phd and 'phd' not in jd_lower and 'research' not in jd_lower:
reasons.append('PhD for non-research entry role')
overqualified_penalty += 20
if has_executive_titles:
reasons.append('Executive experience for entry role')
overqualified_penalty += 15
if max_years >= 15:
reasons.append(f'{max_years}+ years experience for entry role')
overqualified_penalty += 10
if overqualified_penalty > 0:
return {'is_mismatch': True, 'job_level': 'entry', 'resume_level': 'senior/overqualified',
'detail': f"Overqualified: {', '.join(reasons)}", 'penalty': min(overqualified_penalty, 35)}
return {'is_mismatch': False, 'penalty': 0}
def analyze(self, resume: str, job_desc: str) -> Dict:
    """Calculate comprehensive ATS compatibility score with fraud detection.

    Combines weighted component scores with fraud-detector penalties and
    returns the final score (clamped 1..99), the unpenalized base score,
    the per-component breakdown, raw detector outputs, and user warnings.
    """
    # Component scores, each weighted by self.weights when totalled.
    scores = {
        'keyword_match': self._calculate_tfidf_score(resume, job_desc),
        'semantic_match': self._semantic_section_match(resume, job_desc),
        'experience_match': self._experience_match_score(resume, job_desc),
        'skills_match': self._skills_match_score(resume, job_desc),
        'format_score': self._format_score(resume),
        'section_score': self._section_score(resume),
        'action_verbs': self._action_verb_score(resume),
        'quantification': self._quantification_score(resume),
    }

    # Fraud / gaming detectors.  Only SEVERE findings reduce the score;
    # the rest surface as informational warnings below.
    fraud_checks = {
        'jd_copy': self._detect_jd_copy(resume, job_desc),
        'skills_only': self._detect_skills_without_experience(resume),
        'industry_mismatch': self._detect_industry_mismatch(resume, job_desc),
        'date_issues': self._detect_suspicious_dates(resume),
        'negative_sentiment': self._detect_negative_sentiment(resume),
        'missing_credentials': self._detect_missing_required_credentials(resume, job_desc),
        'impossible_metrics': self._detect_impossible_metrics(resume),
        'format_issues': self._detect_format_issues(resume),
        'experience_mismatch': self._detect_experience_level_mismatch(resume, job_desc),
    }

    jd_copy = fraud_checks['jd_copy']
    industry = fraud_checks['industry_mismatch']
    metrics = fraud_checks['impossible_metrics']
    sentiment = fraud_checks['negative_sentiment']
    dates = fraud_checks['date_issues']

    # Severe = JD copy, strong industry mismatch, impossible metrics,
    # repeated negative language, clearly impossible dates.  Format and
    # experience-level findings are informational only.
    severe_penalty = 0
    if jd_copy.get('is_copy'):
        severe_penalty += jd_copy.get('penalty', 0)
    if industry.get('is_mismatch') and industry.get('penalty', 0) >= 45:
        severe_penalty += industry.get('penalty', 0)
    if metrics.get('has_issues'):
        severe_penalty += metrics.get('penalty', 0)
    if sentiment.get('has_negatives') and len(sentiment.get('found', [])) >= 2:
        severe_penalty += min(sentiment.get('penalty', 0), 20)
    if dates.get('has_issues'):
        date_penalty = dates.get('penalty', 0)
        if date_penalty >= 30:  # only truly impossible dates (future, pre-1960)
            severe_penalty += min(date_penalty, 35)

    # Cap at 40% so legitimate resumes can still score 60%+.
    total_penalty = min(severe_penalty, 40)
    base_total = sum(value * self.weights[key] for key, value in scores.items())
    final_total = max(5, base_total - total_penalty)

    # Human-readable warnings for every flagged detector.
    warnings = []
    if jd_copy['is_copy']:
        warnings.append("⚠️ Resume appears to be copied from job description")
    if fraud_checks['skills_only']['is_skills_only']:
        warnings.append("⚠️ Skills listed without work experience context")
    if industry['is_mismatch']:
        warnings.append(f"⚠️ Industry mismatch: Your background ({industry.get('resume_industry', 'unknown')}) doesn't match job ({industry.get('job_industry', 'unknown')})")
    if dates['has_issues']:
        warnings.append(f"⚠️ Date issues detected: {', '.join(dates['issues'])}")
    if sentiment['has_negatives']:
        warnings.append(f"⚠️ Negative language detected: {', '.join(sentiment['found'][:3])}")
    if fraud_checks['missing_credentials']['missing_credential']:
        warnings.append(f"⚠️ Missing required credential for {fraud_checks['missing_credentials']['required_for']}")
    if metrics['has_issues']:
        warnings.append(f"⚠️ Suspicious claims: {', '.join(metrics['issues'][:2])}")
    if fraud_checks['experience_mismatch']['is_mismatch']:
        warnings.append(f"⚠️ Experience level mismatch: {fraud_checks['experience_mismatch']['detail']}")

    return {
        'total_score': min(99, max(1, int(final_total))),
        'base_score': int(base_total),
        'penalty_applied': int(total_penalty),
        'breakdown': scores,
        'keyword_match_pct': scores['keyword_match'],
        'fraud_checks': fraud_checks,
        'warnings': warnings,
    }
def _semantic_section_match(self, resume: str, job_desc: str) -> float:
"""Match job title/role semantically - COMPREHENSIVE FOR ALL 120+ DOMAINS."""
# Input validation - empty/minimal resumes get minimal scores
if len(resume.strip()) < 50:
return 5
# Common role patterns across all industries - MASSIVELY EXPANDED
role_patterns = {
# Technology - Software Engineering
'software engineer': ['software engineer', 'software developer', 'swe', 'developer', 'programmer', 'sde', 'full stack', 'backend', 'frontend', 'web developer', 'application developer'],
'frontend': ['frontend', 'front-end', 'front end', 'ui developer', 'react developer', 'angular developer', 'vue developer', 'web developer'],
'backend': ['backend', 'back-end', 'back end', 'server-side', 'api developer', 'node developer', 'python developer', 'java developer'],
'mobile': ['mobile developer', 'ios developer', 'android developer', 'mobile engineer', 'app developer', 'react native', 'flutter'],
'devops': ['devops', 'sre', 'site reliability', 'platform engineer', 'infrastructure engineer', 'cloud engineer', 'systems engineer'],
'security': ['security engineer', 'cybersecurity', 'information security', 'security analyst', 'infosec', 'penetration tester', 'soc analyst'],
'qa engineer': ['qa engineer', 'quality assurance', 'test engineer', 'sdet', 'automation engineer', 'qa analyst', 'quality engineer'],
'database': ['database administrator', 'dba', 'database engineer', 'data architect', 'sql developer'],
'network': ['network engineer', 'network administrator', 'network architect', 'systems administrator', 'it administrator'],
# Technology - Data & AI
'data scientist': ['data scientist', 'data science', 'ml engineer', 'machine learning engineer', 'ai engineer', 'research scientist', 'applied scientist'],
'data analyst': ['data analyst', 'business analyst', 'analytics', 'bi analyst', 'reporting analyst', 'data analytics', 'analytics analyst'],
'data engineer': ['data engineer', 'etl developer', 'data pipeline', 'de', 'big data engineer', 'analytics engineer', 'data architect'],
'bi analyst': ['bi analyst', 'business intelligence', 'tableau developer', 'power bi developer', 'reporting analyst'],
'quantitative': ['quantitative analyst', 'quant', 'quantitative researcher', 'quantitative developer', 'algo trader'],
# Management / Leadership - ALL LEVELS
'product manager': ['product manager', 'pm', 'product owner', 'po', 'product lead', 'product director', 'product management'],
'engineering manager': ['engineering manager', 'em', 'tech lead', 'technical lead', 'team lead', 'development manager', 'software manager'],
'project manager': ['project manager', 'program manager', 'pmp', 'scrum master', 'agile coach', 'delivery manager'],
'director': ['director', 'senior director', 'managing director', 'head of', 'department head'],
'vp': ['vice president', 'vp', 'avp', 'assistant vice president', 'svp', 'evp'],
'c-level': ['ceo', 'cto', 'cfo', 'coo', 'cmo', 'cio', 'chief', 'president', 'founder'],
'operations manager': ['operations manager', 'ops manager', 'operations director', 'operations lead', 'operations supervisor'],
# Finance / Accounting - EXPANDED
'accountant': ['accountant', 'accounting', 'cpa', 'staff accountant', 'senior accountant', 'controller', 'accounting manager'],
'financial analyst': ['financial analyst', 'finance analyst', 'fp&a', 'investment analyst', 'equity analyst', 'research analyst'],
'auditor': ['auditor', 'internal auditor', 'external auditor', 'audit manager', 'audit associate', 'sox auditor'],
'banker': ['banker', 'investment banker', 'relationship manager', 'commercial banker', 'private banker'],
'tax': ['tax accountant', 'tax analyst', 'tax manager', 'tax specialist', 'tax preparer', 'tax advisor'],
'credit': ['credit analyst', 'credit manager', 'credit officer', 'underwriter', 'loan officer', 'credit risk'],
'portfolio': ['portfolio manager', 'asset manager', 'fund manager', 'investment manager', 'wealth manager'],
'bookkeeper': ['bookkeeper', 'bookkeeping', 'accounts clerk', 'accounting clerk', 'payroll clerk'],
'payroll': ['payroll specialist', 'payroll manager', 'payroll administrator', 'payroll coordinator'],
'controller': ['controller', 'financial controller', 'assistant controller', 'corporate controller'],
'cfo': ['cfo', 'chief financial officer', 'finance director', 'vp finance'],
# Marketing - EXPANDED
'marketing manager': ['marketing manager', 'marketing director', 'brand manager', 'marketing lead', 'head of marketing'],
'digital marketing': ['digital marketing', 'seo specialist', 'sem specialist', 'performance marketing', 'growth marketing', 'ppc specialist'],
'content': ['content manager', 'content strategist', 'content writer', 'copywriter', 'content marketing', 'copy editor'],
'brand': ['brand manager', 'brand strategist', 'brand marketing', 'brand director'],
'product marketing': ['product marketing manager', 'pmm', 'product marketer', 'go-to-market'],
'email marketing': ['email marketing', 'email specialist', 'email marketing manager', 'crm specialist'],
'pr': ['public relations', 'pr specialist', 'pr manager', 'communications manager', 'media relations'],
'event': ['event manager', 'event coordinator', 'event planner', 'conference manager'],
'seo': ['seo specialist', 'seo manager', 'seo analyst', 'search specialist'],
# Sales - EXPANDED
'sales': ['sales representative', 'sales manager', 'account executive', 'sales director', 'business development', 'sales associate'],
'sdr': ['sdr', 'sales development representative', 'bdr', 'business development representative', 'lead generation'],
'account executive': ['account executive', 'ae', 'enterprise ae', 'strategic ae', 'senior ae'],
'sales engineer': ['sales engineer', 'solutions engineer', 'presales', 'technical sales', 'se'],
'channel': ['channel manager', 'channel sales', 'partner manager', 'alliance manager', 'partner sales'],
'vp sales': ['vp sales', 'sales director', 'chief revenue officer', 'cro', 'head of sales'],
'account manager': ['account manager', 'customer success', 'client manager', 'relationship manager', 'key account manager'],
# HR - EXPANDED
'recruiter': ['recruiter', 'talent acquisition', 'sourcer', 'recruiting manager', 'hr recruiter', 'technical recruiter'],
'hr manager': ['hr manager', 'hr director', 'hr business partner', 'hrbp', 'people manager', 'people ops'],
'hr generalist': ['hr generalist', 'hr coordinator', 'hr specialist', 'hr administrator', 'hr associate'],
'compensation': ['compensation analyst', 'compensation manager', 'total rewards', 'comp and benefits'],
'learning': ['learning and development', 'l&d', 'training manager', 'training specialist', 'instructional designer'],
'hris': ['hris analyst', 'hris manager', 'hr systems', 'workday analyst', 'peoplesoft'],
'benefits': ['benefits manager', 'benefits specialist', 'benefits administrator', 'benefits analyst'],
# Healthcare - EXPANDED
'nurse': ['nurse', 'rn', 'registered nurse', 'lpn', 'nurse practitioner', 'np', 'clinical nurse', 'charge nurse', 'nurse manager'],
'physician': ['physician', 'doctor', 'md', 'do', 'attending physician', 'resident', 'hospitalist', 'specialist'],
'pharmacist': ['pharmacist', 'pharmacy', 'clinical pharmacist', 'pharmacy manager', 'pharmd'],
'physical therapist': ['physical therapist', 'pt', 'physiotherapist', 'rehabilitation', 'physical therapy'],
'medical coder': ['medical coder', 'medical billing', 'coding specialist', 'hcpcs', 'cpc', 'icd-10'],
'clinical research': ['clinical research', 'cra', 'clinical research associate', 'clinical trial', 'crc'],
'hospital admin': ['hospital administrator', 'healthcare administrator', 'medical director', 'clinic manager'],
'dental': ['dentist', 'dental hygienist', 'dental assistant', 'orthodontist'],
'occupational therapist': ['occupational therapist', 'ot', 'occupational therapy'],
'medical assistant': ['medical assistant', 'clinical assistant', 'patient care technician'],
# Legal - EXPANDED
'attorney': ['attorney', 'lawyer', 'counsel', 'legal counsel', 'associate attorney', 'staff attorney'],
'paralegal': ['paralegal', 'legal assistant', 'legal secretary', 'litigation paralegal'],
'litigation': ['litigation attorney', 'litigator', 'trial attorney', 'trial lawyer'],
'ip': ['ip attorney', 'patent attorney', 'intellectual property', 'trademark attorney'],
'compliance': ['compliance officer', 'compliance manager', 'compliance analyst', 'regulatory compliance'],
'legal ops': ['legal operations', 'legal ops manager', 'legal project manager'],
'contract': ['contract manager', 'contracts administrator', 'contract specialist'],
# Operations / Supply Chain - EXPANDED
'supply chain': ['supply chain manager', 'logistics manager', 'procurement manager', 'sourcing manager', 'supply chain analyst'],
'warehouse': ['warehouse manager', 'warehouse supervisor', 'inventory manager', 'distribution manager', 'warehouse associate'],
'production': ['production manager', 'manufacturing manager', 'plant manager', 'production supervisor', 'operations manager'],
'quality': ['quality manager', 'quality engineer', 'qa manager', 'quality control', 'quality assurance manager'],
'procurement': ['procurement manager', 'buyer', 'purchasing manager', 'procurement specialist', 'strategic sourcing'],
'facilities': ['facilities manager', 'facilities coordinator', 'building manager', 'maintenance manager'],
# Education - EXPANDED
'teacher': ['teacher', 'instructor', 'professor', 'educator', 'lecturer', 'tutor', 'faculty'],
'principal': ['principal', 'assistant principal', 'school administrator', 'dean', 'headmaster'],
'professor': ['professor', 'associate professor', 'assistant professor', 'lecturer', 'adjunct'],
'instructional designer': ['instructional designer', 'curriculum developer', 'learning designer', 'course developer'],
'academic advisor': ['academic advisor', 'counselor', 'student advisor', 'guidance counselor'],
'curriculum': ['curriculum specialist', 'curriculum coordinator', 'curriculum manager'],
# Creative / Design - EXPANDED
'designer': ['designer', 'graphic designer', 'ui designer', 'ux designer', 'product designer', 'visual designer', 'web designer'],
'creative': ['creative director', 'art director', 'creative lead', 'design director'],
'art director': ['art director', 'ad', 'creative director', 'design lead'],
'copywriter': ['copywriter', 'copy editor', 'content writer', 'creative writer'],
'video': ['video producer', 'videographer', 'video editor', 'multimedia producer', 'motion designer'],
'photographer': ['photographer', 'photo editor', 'photography', 'photojournalist'],
'3d artist': ['3d artist', '3d modeler', 'cgi artist', 'visual effects', 'animator'],
# Hospitality - NEW
'hotel manager': ['hotel manager', 'general manager', 'front desk manager', 'hospitality manager', 'resort manager'],
'restaurant manager': ['restaurant manager', 'food service manager', 'f&b manager', 'dining manager'],
'chef': ['chef', 'executive chef', 'sous chef', 'head chef', 'culinary', 'cook'],
'event coordinator': ['event coordinator', 'banquet manager', 'catering manager', 'conference coordinator'],
'concierge': ['concierge', 'guest services', 'guest relations', 'hospitality'],
# Retail - NEW
'store manager': ['store manager', 'retail manager', 'assistant manager', 'shop manager'],
'buyer': ['buyer', 'merchandise buyer', 'retail buyer', 'category manager'],
'visual merchandiser': ['visual merchandiser', 'merchandising', 'display coordinator'],
'loss prevention': ['loss prevention', 'asset protection', 'security manager', 'lp manager'],
# Government / Public Sector - NEW
'policy analyst': ['policy analyst', 'policy advisor', 'policy specialist', 'legislative analyst'],
'city planner': ['city planner', 'urban planner', 'regional planner', 'planning director'],
'grant writer': ['grant writer', 'grants manager', 'proposal writer', 'development writer'],
'public affairs': ['public affairs', 'government relations', 'public policy', 'lobbyist'],
# Nonprofit - NEW
'program director': ['program director', 'program manager', 'program coordinator', 'program officer'],
'fundraiser': ['fundraiser', 'development director', 'major gifts', 'annual fund', 'donor relations'],
'volunteer coordinator': ['volunteer coordinator', 'volunteer manager', 'community outreach'],
'executive director': ['executive director', 'ed', 'nonprofit director', 'ceo'],
# Insurance - NEW
'underwriter': ['underwriter', 'underwriting', 'underwriting analyst', 'risk underwriter'],
'claims': ['claims adjuster', 'claims analyst', 'claims examiner', 'claims representative'],
'actuary': ['actuary', 'actuarial analyst', 'actuarial consultant', 'pricing actuary'],
'insurance agent': ['insurance agent', 'insurance broker', 'insurance producer', 'insurance advisor'],
# Engineering (Non-Software) - EXPANDED
'mechanical engineer': ['mechanical engineer', 'mechanical designer', 'cad engineer', 'product engineer'],
'electrical engineer': ['electrical engineer', 'electronics engineer', 'hardware engineer', 'ee'],
'civil engineer': ['civil engineer', 'structural engineer', 'construction engineer', 'project engineer'],
'chemical engineer': ['chemical engineer', 'process engineer', 'manufacturing engineer'],
'aerospace': ['aerospace engineer', 'aeronautical engineer', 'flight engineer', 'propulsion'],
'industrial': ['industrial engineer', 'manufacturing engineer', 'process engineer', 'ie'],
# Science / Research - NEW
'biologist': ['biologist', 'research scientist', 'lab scientist', 'microbiologist', 'molecular biologist'],
'chemist': ['chemist', 'analytical chemist', 'research chemist', 'quality chemist'],
'environmental': ['environmental scientist', 'environmental engineer', 'environmental consultant'],
'lab technician': ['lab technician', 'laboratory technician', 'research technician', 'lab assistant'],
# Media / Journalism - NEW
'journalist': ['journalist', 'reporter', 'correspondent', 'news writer', 'staff writer'],
'editor': ['editor', 'managing editor', 'copy editor', 'content editor', 'senior editor'],
'podcast': ['podcast producer', 'audio producer', 'podcast host', 'audio engineer'],
'social media': ['social media manager', 'social media specialist', 'community manager', 'social strategist'],
# Real Estate - NEW
'real estate agent': ['real estate agent', 'realtor', 'real estate broker', 'listing agent'],
'property manager': ['property manager', 'building manager', 'leasing manager', 'asset manager'],
'appraiser': ['appraiser', 'real estate appraiser', 'property appraiser', 'valuation analyst'],
# Consulting - NEW
'consultant': ['consultant', 'management consultant', 'strategy consultant', 'business consultant'],
'it consultant': ['it consultant', 'technology consultant', 'systems consultant', 'sap consultant'],
'strategy': ['strategy consultant', 'strategic advisor', 'strategy analyst', 'corporate strategy'],
# Customer Service - EXPANDED
'customer service': ['customer service', 'customer support', 'support specialist', 'helpdesk', 'service rep', 'csr'],
'customer success': ['customer success manager', 'csm', 'customer success', 'client success'],
'support manager': ['support manager', 'customer support manager', 'service manager'],
'technical support': ['technical support', 'tech support', 'it support', 'it helpdesk', 'desktop support'],
# Trades - NEW
'electrician': ['electrician', 'electrical technician', 'journeyman electrician', 'master electrician'],
'plumber': ['plumber', 'plumbing technician', 'pipefitter', 'journeyman plumber'],
'hvac': ['hvac technician', 'hvac installer', 'hvac mechanic', 'heating and cooling'],
'carpenter': ['carpenter', 'woodworker', 'cabinet maker', 'finish carpenter'],
}
resume_lower = resume.lower()
jd_lower = job_desc.lower()
# Find role in JD - check all patterns
jd_role = None
max_matches = 0
for role, variations in role_patterns.items():
matches = sum(1 for var in variations if var in jd_lower)
if matches > max_matches:
max_matches = matches
jd_role = role
if not jd_role:
# Fallback: check for any professional words in RESUME
professional_indicators = ['manager', 'engineer', 'analyst', 'specialist', 'coordinator', 'director',
'consultant', 'developer', 'designer', 'administrator', 'supervisor',
'technician', 'associate', 'representative', 'officer', 'executive']
# Check if resume has professional role indicators
resume_has_professional = any(ind in resume_lower for ind in professional_indicators)
jd_has_professional = any(ind in jd_lower for ind in professional_indicators)
if resume_has_professional and jd_has_professional:
return 50 # Both have some professional content, but can't match specifically
elif resume_has_professional or jd_has_professional:
return 30 # Only one side has professional content
return 20 # Can't determine role - low score
# Check if resume has matching role
role_variations = role_patterns.get(jd_role, [jd_role])
if any(var in resume_lower for var in role_variations):
return 100
# Check for related roles with fuzzy matching
for var in role_variations:
# Check first 500 chars (title area) - high priority
if var in resume_lower[:500]:
return 95
# Check substring match (5+ char prefix)
if any(var[:5] in word for word in resume_lower.split() if len(var) >= 5):
return 80
# Check 4-char prefix match
if len(var) >= 4 and any(var[:4] in word for word in resume_lower.split()):
return 65
# Check for generic professional overlap - different roles
resume_has_roles = any(any(v in resume_lower for v in vars) for vars in role_patterns.values())
if resume_has_roles:
return 40 # Has A role, but not the RIGHT role - significant penalty
return 15 # No professional role detected in resume
def _format_score(self, resume: str) -> float:
"""Score based on ATS-friendly formatting - REALISTIC SCORING."""
# Input validation - empty/minimal resumes get minimal scores
if len(resume.strip()) < 50:
return 5
score = 0
elements_found = 0
total_elements = 7 # Number of formatting elements we check
# Email present (essential for contact)
if re.search(r'[\w\.-]+@[\w\.-]+\.\w+', resume):
score += 20
elements_found += 1
# Phone present (essential for contact)
if re.search(r'\+?[\d\s\-\(\)]{10,}', resume):
score += 20
elements_found += 1
# Bullet points (proper formatting) - more patterns
if re.search(r'β€’|\-\s|\*\s|^\s*\d+\.|^\s*[a-z]\)', resume, re.MULTILINE):
score += 15
elements_found += 1
# LinkedIn/GitHub (professional presence)
if re.search(r'linkedin|github', resume.lower()):
score += 10
elements_found += 1
# Has dates (shows proper experience formatting)
if re.search(r'\d{4}|present|current', resume.lower()):
score += 15
elements_found += 1
# Has location/address
if re.search(r'\b[A-Z][a-z]+,?\s+[A-Z]{2}\b|\bcity\b|\bstate\b', resume):
score += 10
elements_found += 1
# Has name (capitalized words at start)
if re.search(r'^[A-Z][a-z]+\s+[A-Z][a-z]+', resume.strip()):
score += 10
elements_found += 1
# If no formatting elements found, return very low score
if elements_found == 0:
return 10
return min(100, score)
def _section_score(self, resume: str) -> float:
"""Score based on standard section presence - REALISTIC SCORING."""
# Input validation - empty/minimal resumes get minimal scores
if len(resume.strip()) < 50:
return 5
resume_lower = resume.lower()
# Core sections that most resumes should have
core_sections = {
'experience': ['experience', 'employment', 'work history', 'professional experience',
'career', 'work experience', 'professional background', 'employment history',
'positions held', 'career history', 'professional history'],
'skills': ['skills', 'technical skills', 'competencies', 'technologies', 'expertise',
'proficiencies', 'core competencies', 'areas of expertise', 'technical expertise',
'key skills', 'professional skills', 'skill set'],
}
# Optional sections that add value
optional_sections = {
'summary': ['summary', 'objective', 'profile', 'about', 'introduction', 'overview',
'professional summary', 'career objective', 'executive summary', 'highlights'],
'education': ['education', 'academic', 'qualification', 'degree', 'university',
'college', 'training', 'academic background', 'educational background',
'school', 'bachelor', 'master', 'phd', 'mba', 'certification'],
'certifications': ['certification', 'certificate', 'credentials', 'licensed', 'certif',
'accreditation', 'licenses', 'professional development', 'training'],
'achievements': ['achievement', 'accomplishment', 'award', 'honor', 'recognition', 'highlights'],
'projects': ['project', 'portfolio', 'case stud', 'initiatives'],
}
# Check for implicit experience (job titles, dates indicate experience section)
job_titles = ['manager', 'engineer', 'analyst', 'developer', 'director', 'specialist',
'coordinator', 'consultant', 'lead', 'senior', 'junior', 'associate',
'supervisor', 'administrator', 'officer', 'technician', 'representative',
'executive', 'accountant', 'nurse', 'teacher', 'designer', 'writer']
has_job_indicators = bool(re.search(r'\d{4}\s*[-–]\s*(?:\d{4}|present|current)', resume_lower))
has_job_titles = any(title in resume_lower for title in job_titles)
core_found = sum(1 for keywords in core_sections.values() if any(kw in resume_lower for kw in keywords))
optional_found = sum(1 for keywords in optional_sections.values() if any(kw in resume_lower for kw in keywords))
# If resume has job indicators, give credit for implicit experience section
if (has_job_indicators or has_job_titles) and core_found == 0:
core_found = 1
# REALISTIC SCORING: Start at 0, build up based on sections found
# Core sections: 25 points each (max 50 for both)
# Optional sections: 10 points each (max 50 for 5 sections)
core_score = core_found * 25
optional_score = optional_found * 10
# If no sections found at all, very low score
if core_found == 0 and optional_found == 0:
return 15
return min(100, core_score + optional_score)
def _action_verb_score(self, resume: str) -> float:
"""Score based on strong action verb usage - REALISTIC SCORING."""
# Input validation - empty/minimal resumes get minimal scores
if len(resume.strip()) < 50:
return 5
resume_lower = resume.lower()
found = sum(1 for v in self.action_verbs if re.search(rf'\b{v}', resume_lower))
# REALISTIC SCORING: Start at 0, each action verb adds points
# 0 verbs = 10%, 1-2 verbs = 30-40%, 3-5 verbs = 50-70%, 6+ verbs = 80-100%
if found == 0:
return 10
elif found <= 2:
return 20 + (found * 10)
elif found <= 5:
return 40 + ((found - 2) * 10)
elif found <= 10:
return 70 + ((found - 5) * 6)
else:
return min(100, 90 + (found - 10) * 2)
def _quantification_score(self, resume: str) -> float:
"""Score based on quantified achievements - REALISTIC SCORING."""
# Input validation - empty/minimal resumes get minimal scores
if len(resume.strip()) < 50:
return 5
patterns = [
r'\d+%', # Percentages
r'\$[\d,\.]+[MKB]?', # Dollar amounts
r'\d+\+?\s*(?:years?|months?)', # Time periods
r'\d+[MKB]\+?', # Large numbers with suffix (1M, 5K)
r'#\d+', # Rankings (#1, top #10)
r'\d+\+?\s*(?:customers?|users?|clients?|employees?|team\s*members?|staff|people|patients?|students?|members?|associates?|reps?|agents?|nurses?|engineers?|developers?)', # People counts
r'\d+x', # Multipliers (3x, 10x)
r'top\s*\d+%?', # Top rankings
r'\d+\s*(?:projects?|deals?|accounts?|transactions?|contracts?|cases?|clients?|positions?|requisitions?|hires?)', # Work counts
r'\d+\s*(?:million|billion|thousand)', # Large numbers written
r'\d{1,3}(?:,\d{3})+', # Numbers with commas (1,000,000)
r'\d+\s*(?:per\s*(?:day|week|month|year|hour|shift))', # Rate metrics
r'\d+\s*(?:daily|weekly|monthly|annually|yearly)', # Frequency
r'\d+\s*(?:hours?|days?|weeks?|minutes?)', # Time
r'\d+\s*(?:interviews?|reviews?|audits?|reports?|presentations?|meetings?|calls?)', # Work output
r'(?:reduced|increased|improved|grew|saved|generated|delivered|managed|led|oversaw|handled|closed|achieved|exceeded|surpassed|maintained|built|developed|created|launched|completed)\s*(?:by\s*)?\d+', # Action + number
r'\d+\s*(?:teams?|departments?|offices?|locations?|sites?|branches?|units?|facilities?|stores?)', # Organizational scale
r'\d+\s*(?:products?|features?|releases?|launches?|applications?|systems?|tools?|platforms?)', # Product metrics
r'\d+\s*(?:campaigns?|initiatives?|programs?|events?|workshops?|trainings?|courses?)', # Program metrics
r'(?:over|more than|approximately|about|nearly|almost|up to|exceeding)\s*\d+', # Approximations
r'\d+\s*(?:countries|regions|states|markets|territories|cities)', # Geographic scope
r'\d+-(?:bed|person|member|seat)', # Capacity descriptions (40-bed unit)
r'\d+\s*(?:vendors?|suppliers?|partners?|contractors?|agencies?)', # Business relationships
r'\d+\s*(?:downloads?|installs?|views?|clicks?|impressions?|conversions?|leads?)', # Digital metrics
r'\d+\s*(?:articles?|papers?|publications?|patents?|blogs?|posts?)', # Content metrics
r'\d+\s*(?:beds?|rooms?|units?|seats?|pods?)', # Facility metrics
r'\d+\s*(?:tickets?|issues?|requests?|inquiries?)', # Support metrics
]
total_quantifications = 0
for pattern in patterns:
matches = re.findall(pattern, resume, re.IGNORECASE)
total_quantifications += len(matches)
# Also count standalone significant numbers (likely metrics)
# Numbers like 500, 1000, 50000 that aren't part of dates
standalone_numbers = re.findall(r'(?<!\d)\d{2,}(?:,\d{3})*(?!\d)', resume)
# Filter out years (1990-2030)
standalone_numbers = [n for n in standalone_numbers if not (1980 <= int(n.replace(',', '')[:4]) <= 2030 and len(n.replace(',', '')) == 4)]
total_quantifications += len(standalone_numbers) // 2 # Partial credit for standalone numbers
# REALISTIC SCORING: Start at 0, each quantification adds points
# 0 quants = 10%, 1-2 = 30-40%, 3-5 = 50-70%, 6+ = 80-100%
if total_quantifications == 0:
# Check if resume is too short
if len(resume.strip()) < 50:
return 5
return 15 # No quantifications in a real resume
elif total_quantifications <= 2:
return 25 + (total_quantifications * 10)
elif total_quantifications <= 5:
return 45 + ((total_quantifications - 2) * 10)
elif total_quantifications <= 10:
return 75 + ((total_quantifications - 5) * 5)
else:
return min(100, 95 + (total_quantifications - 10))
def get_keyword_analysis(self, resume: str, job_desc: str) -> Tuple[List[str], List[str]]:
    """Get detailed keyword analysis with taxonomy expansion and fuzzy matching.

    Extracts important keywords from the job description (frequent single
    words plus a curated multi-word term list) and checks each against the
    resume through a five-step cascade: direct substring, stem match,
    containment, taxonomy expansion, fuzzy match.

    Args:
        resume: Raw resume text.
        job_desc: Raw job-description text.

    Returns:
        Tuple of (matched keywords, missing keywords), capped at 20 and 15
        entries respectively; multi-word terms are inserted at the front.

    NOTE(review): relies on self.stop_words, self.skills_taxonomy,
    self._stem_word, self._expand_with_taxonomy and self._fuzzy_match,
    which are defined elsewhere in this class -- their exact semantics
    are assumed, not verified here.
    """
    jd_lower = job_desc.lower()
    resume_lower = resume.lower()
    # Curated multi-word terms across ALL DOMAINS; single-word frequency
    # analysis below would miss these.
    multi_word_terms = [
        # Technology / AI / ML
        'machine learning', 'deep learning', 'natural language processing',
        'large language model', 'generative ai', 'prompt engineering',
        'feature store', 'data pipeline', 'reinforcement learning',
        'computer vision', 'neural network', 'transfer learning',
        'agentic workflow', 'similarity search', 'gpu optimization',
        'fine tuning', 'model inference', 'cloud native',
        'recommender system', 'embedding model', 'vector database',
        'software development', 'full stack', 'front end', 'back end',
        'continuous integration', 'continuous deployment', 'version control',
        'agile methodology', 'scrum master', 'sprint planning',
        # Finance / Accounting
        'financial analysis', 'financial modeling', 'financial planning',
        'budget management', 'variance analysis', 'cash flow',
        'accounts payable', 'accounts receivable', 'general ledger',
        'risk management', 'credit risk', 'market risk', 'operational risk',
        'due diligence', 'internal audit', 'external audit',
        'tax planning', 'tax compliance', 'financial reporting',
        'investment banking', 'private equity', 'venture capital',
        'portfolio management', 'asset management', 'wealth management',
        # Marketing / Sales
        'digital marketing', 'content marketing', 'email marketing',
        'search engine optimization', 'pay per click', 'social media marketing',
        'lead generation', 'sales pipeline', 'customer acquisition',
        'brand management', 'market research', 'competitive analysis',
        'customer relationship management', 'account management',
        'business development', 'revenue growth', 'quota attainment',
        # HR
        'talent acquisition', 'performance management', 'employee engagement',
        'learning and development', 'succession planning', 'workforce planning',
        'compensation and benefits', 'employee relations', 'labor relations',
        'diversity and inclusion', 'organizational development',
        # Healthcare
        'patient care', 'clinical research', 'clinical trials',
        'electronic health records', 'medical records', 'healthcare management',
        'quality improvement', 'patient safety', 'care coordination',
        # Operations / Supply Chain
        'supply chain management', 'inventory management', 'warehouse management',
        'process improvement', 'lean manufacturing', 'six sigma',
        'quality control', 'quality assurance', 'vendor management',
        'project management', 'program management', 'change management',
        # Legal
        'contract negotiation', 'legal research', 'intellectual property',
        'regulatory compliance', 'corporate governance', 'risk assessment',
        # General Professional
        'cross functional', 'stakeholder management', 'strategic planning',
        'team leadership', 'client relationship', 'problem solving',
    ]
    # Check which multi-word terms the JD actually mentions.
    important_multiword = []
    for term in multi_word_terms:
        if term in jd_lower:
            important_multiword.append(term)
    # Single word extraction: 3+ letter words, stop words removed.
    jd_words = re.findall(r'\b[a-zA-Z]{3,}\b', jd_lower)
    jd_words = [w for w in jd_words if w not in self.stop_words]
    keyword_counts = Counter(jd_words)
    # Get important keywords (appearing multiple times or in taxonomy)
    important_keywords = []
    for word, count in keyword_counts.most_common(50):
        # Check if it's a technical term
        is_technical = any(word in variations for variations in self.skills_taxonomy.values())
        if count >= 2 or is_technical:
            important_keywords.append(word)
    # NOTE(review): resume_lower is recomputed here; it is identical to
    # the value computed at the top of this function.
    resume_lower = resume.lower()
    resume_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', resume_lower))
    resume_stems = {self._stem_word(w) for w in resume_words}
    matched = []
    missing = []
    # Five-step matching cascade for the top 30 single-word keywords.
    for kw in important_keywords[:30]:
        found = False
        # Check 1: Direct match
        if kw in resume_lower:
            found = True
        # Check 2: Stem match (collaborate = collaborated = collaborating)
        if not found:
            kw_stem = self._stem_word(kw)
            if kw_stem in resume_stems or any(kw_stem in stem for stem in resume_stems):
                found = True
        # Check 3: Containment (support in supported, supporting)
        if not found:
            if any(kw in word or word in kw for word in resume_words if len(word) > 3):
                found = True
        # Check 4: Taxonomy expansion
        if not found:
            kw_expanded = self._expand_with_taxonomy([kw])
            if any(exp in resume_lower for exp in kw_expanded):
                found = True
        # Check 5: Fuzzy match against resume words
        if not found:
            for resume_word in resume_words:
                if self._fuzzy_match(kw, resume_word):
                    found = True
                    break
        if found:
            matched.append(kw)
        else:
            missing.append(kw)
    # Also check multi-word terms, inserted at the FRONT of the result
    # lists because they are considered more important.
    for term in important_multiword:
        # Check if any variation of this term exists in resume
        # Try both with space and underscore as key
        term_variations = self.skills_taxonomy.get(term,
            self.skills_taxonomy.get(term.replace(' ', '_'), [term]))
        term_found = any(var in resume_lower for var in term_variations) or term in resume_lower
        # Also check fuzzy match for variations
        if not term_found:
            for var in term_variations:
                if var in resume_lower or any(self._fuzzy_match(var, rw) for rw in resume_words):
                    term_found = True
                    break
        if not term_found:
            # Check if component words exist (each term word inside some
            # resume word) before declaring the term missing.
            term_words = term.split()
            if not all(any(tw in rw for rw in resume_words) for tw in term_words):
                if term not in missing:
                    missing.insert(0, term)  # Add at beginning (more important)
        else:
            if term not in matched:
                matched.insert(0, term)
    return matched[:20], missing[:15]
# ============== PDF GENERATOR ==============
def extract_candidate_name(resume_content: str) -> str:
    """Extract the candidate's name from raw resume text.

    Strategy:
      1. spaCy NER (PERSON entities) over the first few lines, when the
         model is available.
      2. Rule-based scan of the first 15 lines, skipping headings, contact
         lines, bullets and company-looking lines.

    Returns "Candidate" when no plausible person name is found.
    """
    # Names almost always appear at the very top; restrict analysis there.
    lines = resume_content.strip().split('\n')
    first_lines = '\n'.join(lines[:5])[:500]

    # Tokens that indicate a company / non-name line.  Single-word entries
    # are matched as WHOLE words: the previous substring matching rejected
    # real names such as "Nicole Cohen" (contains "co") or "Scott" (contains
    # "co" via other entries).  Multi-word entries are matched as substrings.
    single_word_indicators = {
        'inc', 'corp', 'corporation', 'llc', 'ltd', 'limited', 'company', 'co',
        'bank', 'chase', 'citi', 'citibank', 'jpmorgan', 'goldman',
        'google', 'meta', 'amazon', 'microsoft', 'apple', 'netflix', 'tesla',
        'technologies', 'solutions', 'consulting', 'services', 'partners', 'group',
        'capital', 'investments', 'financial', 'advisors', 'associates',
        'hdfc', 'icici', 'vodafone', 'infosys', 'wipro', 'tcs',
        'accenture', 'deloitte', 'kpmg', 'pwc', 'mckinsey', 'bcg', 'bain',
        'bayesian', 'markov', 'gaussian',  # technical terms often misidentified
    }
    multi_word_indicators = ('morgan stanley', 'jp morgan', 'monte carlo')

    def looks_like_company(text_lower: str) -> bool:
        """True when *text_lower* contains a company/non-name indicator."""
        tokens = set(re.findall(r'[a-z]+', text_lower))
        if tokens & single_word_indicators:
            return True
        return any(phrase in text_lower for phrase in multi_word_indicators)

    def is_valid_person_name(name: str) -> bool:
        """Heuristic check that *name* looks like a real person's name."""
        if not name or len(name) < 4:
            return False
        if looks_like_company(name.lower()):
            return False
        words = name.split()
        if len(words) < 2 or len(words) > 5:
            return False
        # Must be mostly letters (allows a few dots/hyphens/initials).
        letter_ratio = sum(1 for c in name if c.isalpha() or c.isspace()) / len(name)
        return letter_ratio > 0.85

    # METHOD 1: spaCy NER on the first few lines only.
    try:
        import spacy
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            import subprocess
            import sys
            # Use the running interpreter, not whatever "python" resolves
            # to on PATH (may be a different/absent installation).
            subprocess.run([sys.executable, "-m", "spacy", "download",
                            "en_core_web_sm"], check=True)
            nlp = spacy.load("en_core_web_sm")
        doc = nlp(first_lines)
        # Take the earliest PERSON entity that survives validation.
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                name = ent.text.strip()
                name = re.sub(r'[,;:\-–|]+$', '', name).strip()
                if is_valid_person_name(name):
                    name = ' '.join(name.split()[:4])
                    return name.title() if name.isupper() else name
    except Exception:
        # spaCy missing, model download failed, etc. — fall through.
        pass

    # METHOD 2: Rule-based fallback over the first 15 lines.
    skip_patterns = ['resume', 'curriculum vitae', 'cv', 'contact', 'personal', 'profile',
                     'phone:', 'email:', 'address:', 'summary', 'objective', 'experience',
                     'education', 'skills', 'professional']
    title_words = {
        'vice', 'president', 'vp', 'director', 'manager', 'engineer', 'analyst',
        'developer', 'consultant', 'specialist', 'lead', 'senior', 'junior',
        'associate', 'executive', 'coordinator', 'administrator', 'officer',
        'ceo', 'cto', 'cfo', 'coo', 'chief', 'head', 'applied', 'data',
        'software', 'product', 'project', 'avp', 'svp', 'evp', 'md'
    }
    for line in lines[:15]:
        line = line.strip()
        if not line or len(line) < 3:
            continue
        line_lower = line.lower()
        # Skip headings, company lines, contact lines, links and bullets.
        if any(pat in line_lower for pat in skip_patterns):
            continue
        if looks_like_company(line_lower):
            continue
        if '@' in line or re.search(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}', line):
            continue
        if 'linkedin' in line_lower or 'github' in line_lower or 'kaggle' in line_lower:
            continue
        if line.startswith('β€’') or line.startswith('-') or line.startswith('*'):
            continue
        # Drop a trailing job title after a separator ("NAME | TITLE").
        name = line
        for sep in ['|', ' - ', ' – ']:
            if sep in name:
                name = name.split(sep)[0].strip()
        # Stop collecting words at the first title-like word.
        name_words = []
        for word in name.split():
            if re.sub(r'[^\w]', '', word).lower() in title_words:
                break
            name_words.append(word)
        if len(name_words) >= 2:
            name = ' '.join(name_words[:4])
            name = re.sub(r'[,;:\-–|]+$', '', name).strip()
            if is_valid_person_name(name):
                return name.title() if name.isupper() else name
    return "Candidate"
def generate_pdf(resume_content: str, color_scheme: str, candidate_name: str = None) -> str:
    """Generate professional, well-formatted PDF resume.

    Classifies each line of the plain-text resume heuristically (name,
    contact, section header, company/role line, bullet, body) and renders
    it with reportlab Platypus flowables.

    Args:
        resume_content: plain-text resume, one logical line per '\n'.
        candidate_name: used only to build the output filename.
    Returns:
        Path to the generated PDF in the system temp dir, or None on any
        failure (reportlab missing, build error, ...).
    """
    try:
        # reportlab is imported lazily so the rest of the app still loads
        # when the dependency is absent.
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import ParagraphStyle
        from reportlab.lib.units import inch
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable, ListFlowable, ListItem
        from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
        # Color palettes keyed by the user-selected scheme name.
        SCHEMES = {
            'Navy Blue': {'primary': '#1a365d', 'accent': '#2c5282', 'text': '#2d3748'},
            'Forest Green': {'primary': '#1c4532', 'accent': '#276749', 'text': '#2d3748'},
            'Burgundy': {'primary': '#742a2a', 'accent': '#9b2c2c', 'text': '#2d3748'},
            'Charcoal': {'primary': '#1a202c', 'accent': '#4a5568', 'text': '#2d3748'},
            'Royal Purple': {'primary': '#44337a', 'accent': '#6b46c1', 'text': '#2d3748'}
        }
        scheme = SCHEMES.get(color_scheme, SCHEMES['Navy Blue'])
        primary = colors.HexColor(scheme['primary'])
        accent = colors.HexColor(scheme['accent'])
        text_color = colors.HexColor(scheme['text'])
        light_gray = colors.HexColor('#e2e8f0')

        def escape(t):
            # Sanitize text for reportlab's Paragraph mini-markup.
            if not t: return ""
            # Clean up ALL PDF encoding artifacts
            t = str(t)
            t = re.sub(r'\(cid:\d+\)', '', t)  # Remove all cid patterns
            # Replace various bullet characters with a simple dash (safe for PDF)
            t = t.replace('●', '-').replace('β—‹', '-').replace('β–ͺ', '-').replace('β– ', '-')
            t = t.replace('β€’', '-').replace('β–Ί', '-').replace('β–Έ', '-').replace('β—†', '-')
            t = t.replace('━', '-').replace('─', '-')  # Replace divider chars
            # XML-escape last, so the replacements above aren't re-escaped.
            t = t.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            return t.strip()

        # Generate filename based on candidate name
        if candidate_name:
            safe_name = re.sub(r'[^\w\s\-]', '', candidate_name).replace(' ', '_')
        else:
            safe_name = "Resume"
        temp_path = os.path.join(tempfile.gettempdir(), f"{safe_name}_ATS_Optimized.pdf")
        doc = SimpleDocTemplate(temp_path, pagesize=letter,
                                rightMargin=0.5*inch, leftMargin=0.5*inch,
                                topMargin=0.4*inch, bottomMargin=0.4*inch)
        story = []
        # Define styles
        name_style = ParagraphStyle('Name', fontSize=20, textColor=primary,
                                    fontName='Helvetica-Bold', alignment=TA_CENTER,
                                    spaceAfter=2, leading=24)
        title_style = ParagraphStyle('Title', fontSize=11, textColor=accent,
                                     fontName='Helvetica-Oblique', alignment=TA_CENTER,
                                     spaceAfter=4, leading=14)
        contact_style = ParagraphStyle('Contact', fontSize=9, textColor=text_color,
                                       fontName='Helvetica', alignment=TA_CENTER,
                                       spaceAfter=8, leading=12)
        section_style = ParagraphStyle('Section', fontSize=11, textColor=primary,
                                       fontName='Helvetica-Bold', spaceBefore=14,
                                       spaceAfter=4, leading=14)
        company_style = ParagraphStyle('Company', fontSize=10, textColor=text_color,
                                       fontName='Helvetica-Bold', spaceBefore=8,
                                       spaceAfter=2, leading=13)
        role_style = ParagraphStyle('Role', fontSize=9, textColor=accent,
                                    fontName='Helvetica-Oblique', spaceAfter=4, leading=12)
        body_style = ParagraphStyle('Body', fontSize=9.5, textColor=text_color,
                                    fontName='Helvetica', spaceAfter=3, leading=12,
                                    alignment=TA_JUSTIFY)
        bullet_style = ParagraphStyle('Bullet', fontSize=9.5, textColor=text_color,
                                      fontName='Helvetica', leftIndent=15,
                                      firstLineIndent=-15, spaceAfter=4, leading=13,
                                      bulletIndent=0)
        # Section headers to detect
        section_keywords = ['professional summary', 'summary', 'objective', 'profile',
                            'professional experience', 'experience', 'employment', 'work history',
                            'education', 'academic', 'technical skills', 'skills', 'competencies',
                            'certifications', 'certificates', 'projects', 'achievements',
                            'awards', 'publications', 'leadership', 'community', 'competitive']
        lines = resume_content.split('\n')
        # Process resume line by line with smart detection.
        # Classification order matters: name -> contact -> section header ->
        # company/role -> bullet -> body text.
        i = 0
        name_found = False
        while i < len(lines):
            line = lines[i].strip()
            if not line:
                i += 1
                continue
            line_lower = line.lower()
            # First non-empty line is the name
            if not name_found:
                # Check if it contains job title (split name and title)
                if '|' in line or ' - ' in line:
                    parts = re.split(r'\s*[\|–-]\s*', line, maxsplit=1)
                    story.append(Paragraph(escape(parts[0].strip()), name_style))
                    if len(parts) > 1:
                        story.append(Paragraph(escape(parts[1].strip()), title_style))
                else:
                    story.append(Paragraph(escape(line), name_style))
                name_found = True
                i += 1
                continue
            # Contact info (early lines with email, phone, LinkedIn)
            if i <= 5 and ('@' in line or '+91' in line or '+1' in line or 'linkedin' in line_lower or 'github' in line_lower):
                story.append(Paragraph(escape(line), contact_style))
                i += 1
                continue
            # Section headers: known keyword (short line) or an ALL-CAPS line.
            is_section = any(kw in line_lower for kw in section_keywords) and len(line) < 60
            is_all_caps = line.isupper() and len(line) < 50 and len(line) > 3
            if is_section or is_all_caps:
                story.append(Spacer(1, 6))
                story.append(Paragraph(escape(line.upper()), section_style))
                story.append(HRFlowable(width="100%", thickness=1, color=primary, spaceAfter=6))
                i += 1
                continue
            # Company/Role lines (COMPANY NAME | Location | Date pattern)
            company_pattern = re.match(r'^([A-Z][A-Za-z\s&\.,]+(?:LTD|LIMITED|INC|CO|CORP|BANK|CHASE)?\.?)\s*[\|–-]\s*(.+)$', line)
            if company_pattern or (line.isupper() and ('|' in line or any(m in line for m in ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']))):
                story.append(Paragraph(escape(line), company_style))
                i += 1
                # Check if next line is role/title
                if i < len(lines):
                    next_line = lines[i].strip()
                    if next_line and not next_line.startswith('β€’') and len(next_line) < 100:
                        # Likely a role line
                        if 'vice president' in next_line.lower() or 'manager' in next_line.lower() or 'lead' in next_line.lower() or 'team' in next_line.lower():
                            story.append(Paragraph(escape(next_line), role_style))
                            i += 1
                continue
            # Bullet points
            if line.startswith('β€’') or line.startswith('-') or line.startswith('*') or line.startswith('━'):
                bullet_text = line.lstrip('β€’-*━► ')
                # Use simple dash for PDF compatibility (Unicode bullets cause cid encoding issues)
                story.append(Paragraph(f"- {escape(bullet_text)}", bullet_style))
                i += 1
                continue
            # Regular body text
            story.append(Paragraph(escape(line), body_style))
            i += 1
        doc.build(story)
        if os.path.exists(temp_path):
            return temp_path
        return None
    except Exception as e:
        # Best-effort: callers treat a None path as "no PDF available".
        print(f"PDF Error: {e}")
        import traceback
        traceback.print_exc()
        return None
# ============== MAIN FUNCTION ==============
def analyze_and_optimize(resume_file, job_description, industry, experience_level, color_scheme):
    """Main function - analyze, optimize with AI, and generate PDF.

    Pipeline: parse upload -> score original -> LLM rewrite -> extract and
    validate candidate name -> re-score -> build markdown reports -> render
    PDF and HTML preview.

    Returns a 7-tuple matching the Gradio outputs:
    (scores_md, keywords_md, suggestions_md, original_resume_text,
     optimized_resume_text, pdf_path, preview_html).  On validation or
    processing errors the first element carries the error message and the
    remaining slots are empty strings / None.

    NOTE(review): `industry` and `experience_level` are accepted but not
    used in this function — presumably reserved for future prompt tuning;
    confirm before removing from the UI wiring.
    """
    if resume_file is None:
        return "❌ Please upload your resume", "", "", "", "", None, ""
    if not job_description or len(job_description.strip()) < 50:
        return "❌ Please paste a complete job description (at least 50 characters)", "", "", "", "", None, ""
    try:
        original_resume = parse_resume(resume_file)
        # parse_resume signals failure via message strings, not exceptions.
        if not original_resume or original_resume.startswith("Error") or original_resume.startswith("Unsupported"):
            return f"❌ {original_resume}", "", "", "", "", None, ""
        analyzer = ATSCompatibilityAnalyzer()
        before_analysis = analyzer.analyze(original_resume, job_description)
        optimized_resume, ai_suggestions = optimize_with_llm(original_resume, job_description)
        # Extract candidate name - ALWAYS extract from ORIGINAL resume first (most reliable)
        # The original resume has the actual person's name, not LLM hallucinations
        candidate_name = extract_candidate_name(original_resume)
        # Validate the extracted name looks like a real person's name
        def is_valid_person_name(name):
            """Check if name looks like a real person's name, not garbage."""
            if not name or name == "Candidate" or len(name) < 4:
                return False
            # Must have at least 2 words
            words = name.split()
            if len(words) < 2:
                return False
            # Must be mostly letters
            letter_ratio = sum(1 for c in name if c.isalpha() or c.isspace()) / len(name)
            if letter_ratio < 0.85:
                return False
            # Reject obvious non-names (job keywords, etc.)
            bad_words = {'reduced', 'feature', 'applied', 'senior', 'junior', 'manager',
                         'engineer', 'developer', 'analyst', 'consultant', 'specialist',
                         'experience', 'skills', 'summary', 'professional', 'objective'}
            name_words = set(w.lower() for w in words)
            if name_words & bad_words:
                return False
            return True
        # If original extraction failed or looks invalid, try AI-extracted name as backup
        if not is_valid_person_name(candidate_name):
            if '__CANDIDATE_NAME__:' in optimized_resume:
                for line in optimized_resume.split('\n'):
                    if line.startswith('__CANDIDATE_NAME__:'):
                        potential = line.replace('__CANDIDATE_NAME__:', '').strip()
                        if is_valid_person_name(potential):
                            candidate_name = potential
                            break
        # Final fallback - if still invalid, use "Candidate"
        if not is_valid_person_name(candidate_name):
            candidate_name = "Candidate"
        after_analysis = analyzer.analyze(optimized_resume, job_description)
        before_score = before_analysis['total_score']
        after_score = after_analysis['total_score']
        improvement = after_score - before_score
        # Calculate individual metric improvements
        kw_improvement = after_analysis['breakdown']['keyword_match'] - before_analysis['breakdown']['keyword_match']
        # Markdown table comparing every breakdown metric before/after.
        scores_display = f"""## πŸ“Š Advanced ATS Compatibility Score
| Metric | Before | After | Ξ” |
|--------|--------|-------|---|
| **🎯 Overall Score** | **{before_score}%** | **{after_score}%** | **{'+' if improvement >= 0 else ''}{improvement}%** |
| TF-IDF Keyword Match | {before_analysis['breakdown']['keyword_match']:.0f}% | {after_analysis['breakdown']['keyword_match']:.0f}% | {'+' if kw_improvement >= 0 else ''}{kw_improvement:.0f}% |
| Semantic Role Match | {before_analysis['breakdown']['semantic_match']:.0f}% | {after_analysis['breakdown']['semantic_match']:.0f}% | |
| Experience Match | {before_analysis['breakdown']['experience_match']:.0f}% | {after_analysis['breakdown']['experience_match']:.0f}% | |
| Skills Taxonomy | {before_analysis['breakdown']['skills_match']:.0f}% | {after_analysis['breakdown']['skills_match']:.0f}% | |
| Format Compliance | {before_analysis['breakdown']['format_score']:.0f}% | {after_analysis['breakdown']['format_score']:.0f}% | |
| Section Structure | {before_analysis['breakdown']['section_score']:.0f}% | {after_analysis['breakdown']['section_score']:.0f}% | |
| Action Verbs | {before_analysis['breakdown']['action_verbs']:.0f}% | {after_analysis['breakdown']['action_verbs']:.0f}% | |
| Quantification | {before_analysis['breakdown']['quantification']:.0f}% | {after_analysis['breakdown']['quantification']:.0f}% | |
### πŸ”¬ Scoring Methodology (Mimics Real ATS)
- **TF-IDF Keyword Match**: Weighted matching - rare/important terms score higher
- **Semantic Role Match**: "Data Scientist" β‰ˆ "ML Engineer" β‰ˆ "AI Engineer"
- **Experience Match**: Parses "5+ years" & calculates from date ranges
- **Skills Taxonomy**: ML=Machine Learning, NLP=Natural Language Processing, etc.
⚠️ *Using TF-IDF, stemming, fuzzy matching & skills taxonomy - similar to Workday, Taleo, Greenhouse algorithms*
"""
        # Enhanced keyword analysis
        matched, missing = analyzer.get_keyword_analysis(optimized_resume, job_description)
        before_matched, before_missing = analyzer.get_keyword_analysis(original_resume, job_description)
        # Keywords present after optimization that the original lacked.
        new_keywords_matched = [kw for kw in matched if kw not in before_matched]
        keywords_display = f"""## πŸ”‘ Detailed Keyword Analysis
### βœ… Keywords Matched ({len(matched)})
`{' | '.join(matched) if matched else 'None detected'}`
### πŸ†• New Keywords Added by AI ({len(new_keywords_matched)})
`{' | '.join(new_keywords_matched) if new_keywords_matched else 'No new keywords added'}`
### ❌ Still Missing ({len(missing)}) - Consider adding manually:
`{' | '.join(missing) if missing else 'All major keywords present! πŸŽ‰'}`
### πŸ’‘ Suggestions for Missing Keywords:
{chr(10).join([f"- **{kw}**: Add to Skills section or work into experience bullets" for kw in missing[:5]]) if missing else "- All important keywords are covered!"}
"""
        suggestions_display = "## πŸ€– AI Optimization Changes\n\n"
        suggestions_display += "\n".join(ai_suggestions) if ai_suggestions else "No changes made."
        suggestions_display += f"\n\n### πŸ“ˆ Improvement Summary\n"
        suggestions_display += f"- **Overall Score**: {before_score}% β†’ {after_score}% ({'+' if improvement >= 0 else ''}{improvement}%)\n"
        suggestions_display += f"- **Keyword Match**: {before_analysis['breakdown']['keyword_match']:.0f}% β†’ {after_analysis['breakdown']['keyword_match']:.0f}%\n"
        suggestions_display += f"- **New Keywords Injected**: {len(new_keywords_matched)}\n"
        pdf_path = generate_pdf(optimized_resume, color_scheme, candidate_name)
        # Create HTML preview of the optimized resume
        preview_html = create_resume_preview(optimized_resume, candidate_name, color_scheme)
        return scores_display, keywords_display, suggestions_display, original_resume, optimized_resume, pdf_path, preview_html
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"❌ Error: {str(e)}", "", "", "", "", None, ""
def create_resume_preview(resume_content: str, candidate_name: str, color_scheme: str) -> str:
    """Create a professional HTML preview of the optimized resume.

    Renders the plain-text resume as styled HTML: centered name and contact
    block, underlined section headers, emphasized job-header lines, and
    indented bullets.

    Fixes vs. the previous version:
    - Bullet lines are classified BEFORE the date/company heuristic, so a
      bullet that mentions a year (e.g. "- Shipped X in 2023") is rendered
      as a bullet, not as a job header.
    - Resume text is HTML-escaped before interpolation (it comes from an
      untrusted upload).

    Args:
        resume_content: plain-text resume; __CANDIDATE_NAME__ metadata
            lines are stripped.
        candidate_name: unused here; kept for interface compatibility.
        color_scheme: one of the SCHEMES keys; falls back to 'Navy Blue'.
    Returns:
        HTML string wrapped in a styled container div.
    """
    import html  # stdlib; local import keeps this edit self-contained

    SCHEMES = {
        'Navy Blue': {'primary': '#1a365d', 'accent': '#2c5282', 'text': '#2d3748', 'bg': '#f7fafc', 'light': '#edf2f7'},
        'Forest Green': {'primary': '#1c4532', 'accent': '#276749', 'text': '#2d3748', 'bg': '#f0fff4', 'light': '#c6f6d5'},
        'Burgundy': {'primary': '#742a2a', 'accent': '#9b2c2c', 'text': '#2d3748', 'bg': '#fff5f5', 'light': '#fed7d7'},
        'Charcoal': {'primary': '#1a202c', 'accent': '#4a5568', 'text': '#2d3748', 'bg': '#f7fafc', 'light': '#e2e8f0'},
        'Royal Purple': {'primary': '#44337a', 'accent': '#6b46c1', 'text': '#2d3748', 'bg': '#faf5ff', 'light': '#e9d8fd'}
    }
    scheme = SCHEMES.get(color_scheme, SCHEMES['Navy Blue'])

    def esc(t: str) -> str:
        # quote=False keeps apostrophes readable; <, >, & are neutralized.
        return html.escape(t, quote=False)

    # Clean content - remove any metadata lines
    lines = resume_content.split('\n')
    cleaned_lines = [l for l in lines if not l.startswith('__CANDIDATE_NAME__:')]
    # Section detection keywords
    section_headers = ['PROFESSIONAL SUMMARY', 'SUMMARY', 'OBJECTIVE', 'PROFILE',
                       'EXPERIENCE', 'PROFESSIONAL EXPERIENCE', 'WORK EXPERIENCE', 'EMPLOYMENT',
                       'EDUCATION', 'ACADEMIC BACKGROUND', 'ACADEMIC',
                       'SKILLS', 'TECHNICAL SKILLS', 'CORE COMPETENCIES', 'KEY SKILLS',
                       'CERTIFICATIONS', 'CERTIFICATES', 'LICENSES',
                       'PROJECTS', 'KEY PROJECTS', 'NOTABLE PROJECTS',
                       'AWARDS', 'ACHIEVEMENTS', 'HONORS', 'RECOGNITION',
                       'PUBLICATIONS', 'RESEARCH', 'PATENTS',
                       'LEADERSHIP', 'VOLUNTEER', 'EXTRACURRICULAR',
                       'LANGUAGES', 'INTERESTS']
    html_parts = []
    name_rendered = False
    contact_rendered = False
    i = 0
    while i < len(cleaned_lines):
        line = cleaned_lines[i].strip()
        if not line:
            html_parts.append('<div style="height: 10px;"></div>')
            i += 1
            continue
        line_upper = line.upper()
        # Skip __CANDIDATE_NAME__ metadata
        if '__CANDIDATE_NAME__' in line:
            i += 1
            continue
        # Render name (first substantial text line, not a header)
        if not name_rendered and len(line) > 2:
            is_header = any(h in line_upper for h in section_headers)
            is_contact = '@' in line or re.search(r'\d{3}[-.\s]?\d{3}', line) or 'linkedin' in line.lower()
            if not is_header and not is_contact:
                # Check for title separator (NAME | TITLE or NAME - TITLE)
                if '|' in line or ' - ' in line or ' – ' in line:
                    parts = re.split(r'\s*[\|–-]\s*', line, maxsplit=1)
                    name_part = esc(parts[0].strip())
                    title_part = esc(parts[1].strip()) if len(parts) > 1 else ''
                    html_parts.append(f'''
                    <div style="text-align: center; margin-bottom: 5px;">
                        <h1 style="color: {scheme['primary']}; margin: 0; font-size: 28px; font-weight: 700; letter-spacing: 1px;">
                            {name_part.upper() if name_part.islower() else name_part}
                        </h1>
                        {f'<div style="color: {scheme["accent"]}; font-size: 14px; font-style: italic; margin-top: 5px;">{title_part}</div>' if title_part else ''}
                    </div>
                    ''')
                else:
                    safe_line = esc(line)
                    html_parts.append(f'''
                    <div style="text-align: center; margin-bottom: 5px;">
                        <h1 style="color: {scheme['primary']}; margin: 0; font-size: 28px; font-weight: 700; letter-spacing: 1px;">
                            {safe_line.upper() if safe_line.islower() else safe_line}
                        </h1>
                    </div>
                    ''')
                name_rendered = True
                i += 1
                continue
        # Render contact info (lines with email, phone, LinkedIn early in doc)
        if not contact_rendered and i <= 8:
            if '@' in line or re.search(r'[\+]?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}', line) or 'linkedin' in line.lower() or 'github' in line.lower():
                html_parts.append(f'''
                <div style="text-align: center; color: {scheme['text']}; font-size: 11px; margin-bottom: 3px;">
                    {esc(line)}
                </div>
                ''')
                i += 1
                # Check if next lines are also contact info
                while i < len(cleaned_lines) and i <= 8:
                    next_line = cleaned_lines[i].strip()
                    if '@' in next_line or 'linkedin' in next_line.lower() or 'github' in next_line.lower() or re.search(r'[\+]?\d', next_line):
                        html_parts.append(f'''
                        <div style="text-align: center; color: {scheme['text']}; font-size: 11px; margin-bottom: 3px;">
                            {esc(next_line)}
                        </div>
                        ''')
                        i += 1
                    else:
                        break
                contact_rendered = True
                continue
        # Skip ===== divider lines (they're formatting aids, not content)
        if line.startswith('=') and line.endswith('=') and len(line) > 10:
            i += 1
            continue
        # Skip ━━━ Unicode divider lines
        if line.startswith('━') and len(line) > 10:
            i += 1
            continue
        # Skip --- divider lines
        if line.startswith('-') and len(line) > 5 and line.replace('-', '') == '':
            i += 1
            continue
        # Section headers
        if any(h in line_upper for h in section_headers) or (line.isupper() and len(line) > 3 and len(line) < 50):
            html_parts.append(f'''
            <div style="margin-top: 20px; margin-bottom: 10px;">
                <h2 style="color: {scheme['primary']}; font-size: 13px; font-weight: 700;
                           text-transform: uppercase; letter-spacing: 2px; margin: 0 0 5px 0;
                           border-bottom: 2px solid {scheme['accent']}; padding-bottom: 5px;">
                    {esc(line_upper)}
                </h2>
            </div>
            ''')
            i += 1
            continue
        # Bullet points — MUST be checked before the date/company heuristic,
        # otherwise any bullet mentioning a year is rendered as a job header.
        if line.startswith('β€’') or line.startswith('-') or line.startswith('*') or line.startswith('β–Ί') or line.startswith('β–ͺ'):
            bullet_text = esc(line.lstrip('β€’-*β–Ίβ–ͺ '))
            html_parts.append(f'''
            <div style="margin-left: 20px; margin-bottom: 5px; color: {scheme['text']}; font-size: 11px; line-height: 1.5;">
                <span style="color: {scheme['accent']}; margin-right: 8px;">β€’</span>{bullet_text}
            </div>
            ''')
            i += 1
            continue
        # Company/Job header lines (contain dates, pipes, or are styled like headers)
        date_pattern = re.search(r'\b(20\d{2}|19\d{2}|Present|Current)\b', line, re.IGNORECASE)
        has_separator = '|' in line or ' – ' in line
        if date_pattern or (has_separator and len(line) < 120):
            html_parts.append(f'''
            <div style="display: flex; justify-content: space-between; align-items: baseline;
                        margin-top: 12px; margin-bottom: 4px;">
                <span style="color: {scheme['primary']}; font-weight: 600; font-size: 12px;">{esc(line)}</span>
            </div>
            ''')
            i += 1
            continue
        # Regular paragraph text
        html_parts.append(f'''
        <div style="color: {scheme['text']}; font-size: 11px; margin-bottom: 5px; line-height: 1.5; text-align: justify;">
            {esc(line)}
        </div>
        ''')
        i += 1
    # Wrap in styled container
    html_content = f'''
    <div style="
        font-family: 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
        background: white;
        padding: 40px 50px;
        border-radius: 8px;
        box-shadow: 0 4px 20px rgba(0,0,0,0.1);
        max-width: 850px;
        margin: 20px auto;
        border-top: 4px solid {scheme['primary']};
    ">
        {''.join(html_parts)}
    </div>
    '''
    return html_content
# ============== GRADIO UI ==============
# Top-level Gradio Blocks app: inputs (file upload + dropdowns) on the left,
# job description on the right; analysis reports, before/after comparison,
# HTML preview, and the downloadable PDF below.
with gr.Blocks(title="ATS Resume Optimizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# πŸ“„ ATS Resume Optimizer Pro
### Powered by Claude 3.5 Sonnet (Anthropic's SOTA Model)
Upload your resume and paste a job description. Our AI will:
- **Analyze** keyword matching and ATS compatibility using TF-IDF & semantic algorithms
- **Optimize** wording with professionally formatted output (without adding fake info)
- **Generate** a polished, ATS-friendly PDF with proper formatting
⚑ *Using Anthropic's most advanced model for premium-quality resume optimization*
---
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Resume upload plus presentation options.
            resume_file = gr.File(label="πŸ“€ Upload Resume (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".doc", ".txt"])
            industry = gr.Dropdown(
                choices=["Technology/IT", "Finance/Banking", "Healthcare", "Marketing/Sales",
                         "Engineering", "Consulting", "Legal", "Education", "Other"],
                value="Technology/IT", label="Industry"
            )
            experience_level = gr.Dropdown(
                choices=["Entry Level (0-2 years)", "Mid Level (3-5 years)",
                         "Senior (6-10 years)", "Executive (10+ years)"],
                value="Mid Level (3-5 years)", label="Experience Level"
            )
            color_scheme = gr.Dropdown(
                choices=["Navy Blue", "Forest Green", "Burgundy", "Charcoal", "Royal Purple"],
                value="Navy Blue", label="PDF Color Scheme"
            )
        with gr.Column(scale=1):
            job_description = gr.Textbox(
                label="πŸ“‹ Paste Job Description",
                placeholder="Paste the complete job description here...",
                lines=12
            )
    analyze_btn = gr.Button("πŸš€ Analyze & Optimize with AI", variant="primary", size="lg")
    gr.Markdown("---")
    # Output areas, filled by analyze_and_optimize's 7-tuple return value.
    scores_output = gr.Markdown()
    with gr.Row():
        keywords_output = gr.Markdown()
        suggestions_output = gr.Markdown()
    gr.Markdown("### πŸ“ Resume Comparison")
    with gr.Row():
        original_resume = gr.Textbox(label="Original Resume", lines=15, interactive=False)
        optimized_resume = gr.Textbox(label="AI-Optimized Resume", lines=15, interactive=False)
    gr.Markdown("### �️ Resume Preview")
    gr.Markdown("*Preview how your optimized resume will look in the PDF*")
    preview_html = gr.HTML(label="Resume Preview")
    gr.Markdown("### οΏ½πŸ“₯ Download Optimized PDF")
    pdf_output = gr.File(label="Download PDF")
    gr.Markdown("""
---
### ℹ️ How It Works
1. **AI Analysis**: Claude 3.5 Sonnet analyzes your resume against the job description
2. **Professional Formatting**: Your resume is reformatted to ATS-optimized industry standards
3. **Smart Optimization**: Keywords are naturally integrated into your existing content
**What we DO:** βœ… Professional reformatting | βœ… Add relevant keywords naturally | βœ… Strengthen action verbs | βœ… ATS-optimized structure
**What we DON'T do:** ❌ Add fake experiences | ❌ Fabricate achievements | ❌ Misrepresent your background
""")
    # Wire the single action button to the end-to-end pipeline.
    analyze_btn.click(
        fn=analyze_and_optimize,
        inputs=[resume_file, job_description, industry, experience_level, color_scheme],
        outputs=[scores_output, keywords_output, suggestions_output, original_resume, optimized_resume, pdf_output, preview_html]
    )
if __name__ == "__main__":
    # Bind to all interfaces (container / Spaces deployment) on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)