# IQKiller/micro/enhanced_resume_parser.py
"""
Enhanced Resume Parser v2.0
Provides structured extraction of skills, experience, projects, and education
with proper normalization and context understanding.
"""
from typing import Any, Dict, List, Optional
import re
import json
from dataclasses import dataclass, asdict, field
from llm_client import LLMClient
from metrics import log_metric
import tiktoken
@dataclass
class Experience:
title: str
company: str
duration: str
location: str = ""
responsibilities: List[str] = field(default_factory=list)
achievements: List[str] = field(default_factory=list)
technologies: List[str] = field(default_factory=list)
start_date: str = ""
end_date: str = ""
is_current: bool = False
def __post_init__(self):
if self.responsibilities is None:
self.responsibilities = []
if self.achievements is None:
self.achievements = []
if self.technologies is None:
self.technologies = []
@dataclass
class Project:
name: str
description: str
technologies: List[str] = field(default_factory=list)
github_url: str = ""
demo_url: str = ""
duration: str = ""
key_features: List[str] = field(default_factory=list)
def __post_init__(self):
if self.technologies is None:
self.technologies = []
if self.key_features is None:
self.key_features = []
@dataclass
class Education:
degree: str
field: str
school: str
graduation_year: str = ""
gpa: str = ""
    relevant_courses: List[str] = field(default_factory=list)
    honors: List[str] = field(default_factory=list)
def __post_init__(self):
if self.relevant_courses is None:
self.relevant_courses = []
if self.honors is None:
self.honors = []
@dataclass
class Skills:
    technical: List[str] = field(default_factory=list)
    programming_languages: List[str] = field(default_factory=list)
    frameworks: List[str] = field(default_factory=list)
    tools: List[str] = field(default_factory=list)
    databases: List[str] = field(default_factory=list)
    cloud_platforms: List[str] = field(default_factory=list)
    methodologies: List[str] = field(default_factory=list)
    soft_skills: List[str] = field(default_factory=list)
    def __post_init__(self):
        # Guard against explicit None values (e.g., from parsed JSON); the loop
        # variable is named to avoid shadowing dataclasses.field
        for name in ['technical', 'programming_languages', 'frameworks', 'tools',
                     'databases', 'cloud_platforms', 'methodologies', 'soft_skills']:
            if getattr(self, name) is None:
                setattr(self, name, [])
@dataclass
class ResumeData:
personal_info: Dict[str, str]
summary: str
skills: Skills
experience: List[Experience]
education: List[Education]
projects: List[Project]
certifications: List[Dict[str, str]]
languages: List[str]
years_of_experience: int = 0
def __post_init__(self):
if not self.certifications:
self.certifications = []
if not self.languages:
self.languages = []
class SkillsNormalizer:
"""Normalizes and categorizes skills with synonym detection"""
def __init__(self):
self.skill_synonyms = {
# Programming Languages
"python": ["python", "python3", "python 3", "py"],
"javascript": ["javascript", "js", "node.js", "nodejs", "node js"],
"typescript": ["typescript", "ts"],
"java": ["java", "java 8", "java 11", "java 17"],
"csharp": ["c#", "csharp", "c sharp", ".net", "dotnet"],
"cpp": ["c++", "cpp", "c plus plus"],
"go": ["go", "golang"],
"rust": ["rust", "rust-lang"],
"swift": ["swift", "ios development"],
"kotlin": ["kotlin", "android development"],
"r": ["r", "r programming"],
"scala": ["scala"],
"php": ["php", "php 7", "php 8"],
"ruby": ["ruby", "ruby on rails", "ror"],
# Web Frameworks
"react": ["react", "reactjs", "react.js", "react js"],
"angular": ["angular", "angularjs", "angular 2+"],
"vue": ["vue", "vue.js", "vuejs", "vue js"],
"svelte": ["svelte", "sveltekit"],
"django": ["django", "django rest framework", "drf"],
"flask": ["flask", "flask-restful"],
"fastapi": ["fastapi", "fast api"],
"express": ["express", "express.js", "expressjs"],
"spring": ["spring", "spring boot", "spring framework"],
"laravel": ["laravel"],
"rails": ["rails", "ruby on rails", "ror"],
# Databases
"postgresql": ["postgresql", "postgres", "pg", "psql"],
"mysql": ["mysql", "my sql"],
"mongodb": ["mongodb", "mongo", "mongo db"],
"redis": ["redis"],
"elasticsearch": ["elasticsearch", "elastic search"],
"cassandra": ["cassandra", "apache cassandra"],
"dynamodb": ["dynamodb", "dynamo db"],
"sqlite": ["sqlite", "sqlite3"],
# Cloud Platforms
"aws": ["aws", "amazon web services", "amazon aws"],
"azure": ["azure", "microsoft azure"],
"gcp": ["gcp", "google cloud", "google cloud platform"],
"heroku": ["heroku"],
"digitalocean": ["digitalocean", "digital ocean"],
"vercel": ["vercel"],
"netlify": ["netlify"],
# DevOps & Tools
"docker": ["docker", "containerization", "containers"],
"kubernetes": ["kubernetes", "k8s", "container orchestration"],
"jenkins": ["jenkins", "ci/cd"],
"github actions": ["github actions", "gh actions"],
"terraform": ["terraform", "infrastructure as code", "iac"],
"ansible": ["ansible"],
"git": ["git", "version control", "source control"],
"linux": ["linux", "unix", "ubuntu", "centos"],
# Data Science & ML
"machine learning": ["machine learning", "ml", "artificial intelligence", "ai"],
"deep learning": ["deep learning", "neural networks"],
"tensorflow": ["tensorflow", "tf"],
"pytorch": ["pytorch", "torch"],
"scikit-learn": ["scikit-learn", "sklearn", "scikit learn"],
"pandas": ["pandas", "data manipulation"],
"numpy": ["numpy", "numerical computing"],
"matplotlib": ["matplotlib", "data visualization"],
"seaborn": ["seaborn"],
"jupyter": ["jupyter", "jupyter notebooks"],
# Testing
"pytest": ["pytest", "python testing"],
"jest": ["jest", "javascript testing"],
"selenium": ["selenium", "web automation"],
"cypress": ["cypress", "e2e testing"],
# Methodologies
"agile": ["agile", "scrum", "kanban"],
"devops": ["devops", "dev ops"],
"microservices": ["microservices", "micro services"],
"rest api": ["rest", "rest api", "restful", "api development"],
"graphql": ["graphql", "graph ql"],
}
self.skill_categories = {
"programming_languages": ["python", "javascript", "typescript", "java", "csharp", "cpp", "go", "rust", "swift", "kotlin", "r", "scala", "php", "ruby"],
"frameworks": ["react", "angular", "vue", "svelte", "django", "flask", "fastapi", "express", "spring", "laravel", "rails"],
"databases": ["postgresql", "mysql", "mongodb", "redis", "elasticsearch", "cassandra", "dynamodb", "sqlite"],
"cloud_platforms": ["aws", "azure", "gcp", "heroku", "digitalocean", "vercel", "netlify"],
"tools": ["docker", "kubernetes", "jenkins", "github actions", "terraform", "ansible", "git", "linux", "pytest", "jest", "selenium", "cypress"],
"methodologies": ["agile", "devops", "microservices", "rest api", "graphql"]
}
def normalize_skill(self, skill: str) -> Optional[str]:
"""Normalize a skill to its canonical form"""
skill_lower = skill.lower().strip()
for canonical, synonyms in self.skill_synonyms.items():
if skill_lower in synonyms:
return canonical
return skill_lower if len(skill_lower) > 1 else None
def categorize_skill(self, normalized_skill: str) -> str:
"""Categorize a normalized skill"""
for category, skills in self.skill_categories.items():
if normalized_skill in skills:
return category
return "technical"
def normalize_skill_list(self, skills: List[str]) -> Dict[str, List[str]]:
"""Normalize and categorize a list of skills"""
categorized = {
"programming_languages": [],
"frameworks": [],
"databases": [],
"cloud_platforms": [],
"tools": [],
"methodologies": [],
"technical": []
}
for skill in skills:
normalized = self.normalize_skill(skill)
if normalized:
category = self.categorize_skill(normalized)
if normalized not in categorized[category]:
categorized[category].append(normalized)
return categorized
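# A quick illustration of the normalizer (hypothetical inputs, kept as a
# comment so the module stays import-safe):
#
#     normalizer = SkillsNormalizer()
#     normalizer.normalize_skill("React.js")   # -> "react"
#     normalizer.normalize_skill_list(["Node JS", "postgres", "K8s"])
#     # -> programming_languages=["javascript"], databases=["postgresql"],
#     #    tools=["kubernetes"], other categories empty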
class EnhancedResumeParser:
"""Enhanced resume parser with structured extraction and normalization"""
def __init__(self):
self.skills_normalizer = SkillsNormalizer()
self.llm_client = LLMClient()
async def run(self, data: Dict[str, Any]) -> Dict[str, Any]:
resume_text = data.get("resume_text", "")
if not resume_text:
return {**data, "resume_data_enhanced": {"error": "No resume content provided"}}
try:
# Extract structured resume data
resume_data = await self._extract_resume_data_structured(resume_text)
log_metric("resume_parse_enhanced_success", {
"skills_count": len(self._get_all_skills(resume_data.skills)),
"experience_count": len(resume_data.experience),
"projects_count": len(resume_data.projects),
"years_experience": resume_data.years_of_experience
})
return {**data, "resume_data_enhanced": asdict(resume_data)}
except Exception as e:
log_metric("resume_parse_enhanced_error", {"error": str(e)})
return {**data, "resume_data_enhanced": {"error": f"Enhanced resume parsing failed: {e}"}}
async def _extract_resume_data_structured(self, resume_text: str) -> ResumeData:
"""Extract structured resume data using multiple approaches"""
# First, try comprehensive LLM extraction
try:
structured_data = await self._llm_extract_structured(resume_text)
if structured_data:
return structured_data
except Exception as e:
log_metric("resume_llm_extraction_error", {"error": str(e)})
# Fallback to section-based extraction
return await self._section_based_extraction(resume_text)
async def _llm_extract_structured(self, resume_text: str) -> Optional[ResumeData]:
"""Use LLM to extract structured resume data"""
# Check token count and chunk if necessary
token_count = self._count_tokens(resume_text)
if token_count > 15000:
# For very long resumes, extract in sections
return await self._chunked_extraction(resume_text)
prompt = f"""
Extract comprehensive structured data from this resume. Return ONLY valid JSON with this exact structure:
{{
"personal_info": {{
"name": "Full Name",
"email": "email@domain.com",
"phone": "+1234567890",
"location": "City, State",
"linkedin": "linkedin.com/in/username",
"github": "github.com/username",
"website": "personal-website.com"
}},
"summary": "Professional summary or objective statement",
"skills": {{
"technical": ["skill1", "skill2"],
"programming_languages": ["Python", "JavaScript"],
"frameworks": ["React", "Django"],
"tools": ["Git", "Docker"],
"databases": ["PostgreSQL", "MongoDB"],
"cloud_platforms": ["AWS", "Azure"],
"methodologies": ["Agile", "DevOps"],
"soft_skills": ["Leadership", "Communication"]
}},
"experience": [
{{
"title": "Job Title",
"company": "Company Name",
"duration": "Jan 2020 - Present",
"location": "City, State",
"start_date": "2020-01",
"end_date": "Present",
"is_current": true,
"responsibilities": ["responsibility 1", "responsibility 2"],
"achievements": ["achievement 1", "achievement 2"],
"technologies": ["tech1", "tech2"]
}}
],
"education": [
{{
"degree": "Bachelor of Science",
"field": "Computer Science",
"school": "University Name",
"graduation_year": "2020",
"gpa": "3.8",
"relevant_courses": ["Data Structures", "Algorithms"],
"honors": ["Dean's List", "Magna Cum Laude"]
}}
],
"projects": [
{{
"name": "Project Name",
"description": "Brief description of the project",
"technologies": ["tech1", "tech2"],
"github_url": "github.com/user/repo",
"demo_url": "live-demo-url.com",
"duration": "3 months",
"key_features": ["feature1", "feature2"]
}}
],
"certifications": [
{{
"name": "Certification Name",
"issuer": "Organization",
"date": "2023",
"credential_id": "123456"
}}
],
"languages": ["English (Native)", "Spanish (Conversational)"],
"years_of_experience": 5
}}
Important guidelines:
1. Extract ALL skills mentioned, including those in job descriptions and projects
2. Normalize technology names (e.g., "React.js" → "React", "ML" → "Machine Learning")
3. Calculate years_of_experience from work history
4. Parse dates in YYYY-MM format when possible
5. Group similar skills appropriately
6. Extract quantifiable achievements when possible
7. If information is missing, omit the field or use empty array/string
Resume text:
{resume_text}
"""
try:
response = self.llm_client.call_llm(prompt, temperature=0, max_tokens=4000)
# Clean response to extract just JSON
json_start = response.find('{')
json_end = response.rfind('}') + 1
if json_start != -1 and json_end > json_start:
json_str = response[json_start:json_end]
else:
json_str = response
data = json.loads(json_str)
# Convert to structured objects
return self._convert_to_resume_data(data)
except json.JSONDecodeError as e:
log_metric("resume_json_parse_error", {"error": str(e), "response": response[:500]})
return None
except Exception as e:
log_metric("resume_llm_error", {"error": str(e)})
return None
async def _chunked_extraction(self, resume_text: str) -> ResumeData:
"""Extract data from long resumes by processing in chunks"""
sections = self._split_resume_sections(resume_text)
# Extract each section separately
personal_info = await self._extract_personal_info(sections.get("header", ""))
summary = await self._extract_summary(sections.get("summary", ""))
skills = await self._extract_skills(sections.get("skills", ""))
experience = await self._extract_experience(sections.get("experience", ""))
education = await self._extract_education(sections.get("education", ""))
projects = await self._extract_projects(sections.get("projects", ""))
certifications = await self._extract_certifications(sections.get("certifications", ""))
# Calculate years of experience
years_exp = self._calculate_years_experience(experience)
return ResumeData(
personal_info=personal_info,
summary=summary,
skills=skills,
experience=experience,
education=education,
projects=projects,
certifications=certifications,
languages=[],
years_of_experience=years_exp
)
async def _section_based_extraction(self, resume_text: str) -> ResumeData:
"""Fallback extraction using regex and basic parsing"""
# Basic regex-based extraction
personal_info = self._extract_personal_info_regex(resume_text)
skills = self._extract_skills_regex(resume_text)
return ResumeData(
personal_info=personal_info,
summary="",
skills=skills,
experience=[],
education=[],
projects=[],
certifications=[],
languages=[],
years_of_experience=0
)
def _convert_to_resume_data(self, data: Dict[str, Any]) -> ResumeData:
"""Convert parsed JSON to ResumeData objects"""
# Parse skills with normalization
skills_data = data.get("skills", {})
all_skills = []
for skill_list in skills_data.values():
if isinstance(skill_list, list):
all_skills.extend(skill_list)
normalized_skills = self.skills_normalizer.normalize_skill_list(all_skills)
skills = Skills(**normalized_skills)
        # Parse experience, education, and projects, dropping any unexpected
        # keys the LLM may have added so dataclass construction cannot fail
        experience = []
        for exp_data in data.get("experience", []):
            allowed = {k: v for k, v in exp_data.items() if k in Experience.__dataclass_fields__}
            experience.append(Experience(**allowed))
        education = []
        for edu_data in data.get("education", []):
            allowed = {k: v for k, v in edu_data.items() if k in Education.__dataclass_fields__}
            education.append(Education(**allowed))
        projects = []
        for proj_data in data.get("projects", []):
            allowed = {k: v for k, v in proj_data.items() if k in Project.__dataclass_fields__}
            projects.append(Project(**allowed))
return ResumeData(
personal_info=data.get("personal_info", {}),
summary=data.get("summary", ""),
skills=skills,
experience=experience,
education=education,
projects=projects,
certifications=data.get("certifications", []),
languages=data.get("languages", []),
years_of_experience=data.get("years_of_experience", 0)
)
    def _get_all_skills(self, skills: Skills) -> List[str]:
        """Flatten all technical skill categories (soft skills excluded) into one list"""
        all_skills = []
        for name in ['technical', 'programming_languages', 'frameworks', 'tools',
                     'databases', 'cloud_platforms', 'methodologies']:
            all_skills.extend(getattr(skills, name, []))
        return all_skills
def _split_resume_sections(self, resume_text: str) -> Dict[str, str]:
"""Split resume into sections using common headers"""
sections = {}
current_section = "header"
current_content = []
# Common section headers
section_patterns = {
r'(experience|work experience|employment|professional experience)': 'experience',
r'(education|academic background)': 'education',
r'(skills|technical skills|core competencies)': 'skills',
r'(projects|personal projects|side projects)': 'projects',
r'(summary|objective|profile)': 'summary',
r'(certifications|licenses)': 'certifications'
}
lines = resume_text.split('\n')
for line in lines:
line_lower = line.lower().strip()
# Check if this line is a section header
section_found = False
for pattern, section_name in section_patterns.items():
                # Anchor the match so body text that merely mentions a keyword
                # (e.g., "5 years of experience in ...") is not treated as a header
                if re.match(pattern + r'\s*:?\s*$', line_lower):
# Save previous section
if current_content:
sections[current_section] = '\n'.join(current_content)
current_section = section_name
current_content = []
section_found = True
break
if not section_found:
current_content.append(line)
# Save final section
if current_content:
sections[current_section] = '\n'.join(current_content)
return sections
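    # Illustration (hypothetical resume): lines reading "Summary", "...", then
    # "Skills", "..." yield {"header": ..., "summary": ..., "skills": ...};
    # anything before the first recognized header is bucketed under "header",
    # which is why _chunked_extraction pulls personal info from sections["header"].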
def _extract_personal_info_regex(self, text: str) -> Dict[str, str]:
"""Extract personal information using regex"""
# Email
        email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
email = email_match.group() if email_match else ""
# Phone
phone_match = re.search(r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})', text)
phone = phone_match.group() if phone_match else ""
# LinkedIn
linkedin_match = re.search(r'linkedin\.com/in/[\w-]+', text, re.IGNORECASE)
linkedin = f"https://{linkedin_match.group()}" if linkedin_match else ""
# GitHub
github_match = re.search(r'github\.com/[\w-]+', text, re.IGNORECASE)
github = f"https://{github_match.group()}" if github_match else ""
return {
"email": email,
"phone": phone,
"linkedin": linkedin,
"github": github
}
def _extract_skills_regex(self, text: str) -> Skills:
"""Extract skills using regex patterns"""
# Common technical skills to look for
tech_keywords = [
'Python', 'JavaScript', 'Java', 'C++', 'React', 'Node.js', 'SQL',
'AWS', 'Docker', 'Git', 'Machine Learning', 'Data Science',
'TensorFlow', 'PyTorch', 'Pandas', 'NumPy', 'Django', 'Flask',
'PostgreSQL', 'MongoDB', 'Redis', 'Kubernetes', 'Jenkins'
]
found_skills = []
for skill in tech_keywords:
if re.search(rf'\b{re.escape(skill)}\b', text, re.IGNORECASE):
found_skills.append(skill)
# Normalize skills
normalized = self.skills_normalizer.normalize_skill_list(found_skills)
return Skills(**normalized)
def _calculate_years_experience(self, experience: List[Experience]) -> int:
"""Calculate total years of experience"""
if not experience:
return 0
# Simple calculation based on number of roles
# In practice, you'd want to parse dates and calculate overlap
return len(experience)
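    # A more accurate alternative, sketched here but not wired into the
    # pipeline: it parses the "YYYY-MM" start/end dates the extraction prompt
    # asks for and measures the span from the earliest start to the latest end.
    # The method name is ours, not part of the original pipeline.
    def _estimate_years_from_dates(self, experience: List[Experience]) -> int:
        """Approximate years of experience from parsed start/end dates (sketch)"""
        from datetime import datetime
        starts, ends = [], []
        for exp in experience:
            m = re.match(r'(\d{4})-(\d{2})', exp.start_date or "")
            if m:
                starts.append(int(m.group(1)) * 12 + int(m.group(2)))
            if exp.is_current or (exp.end_date or "").lower() == "present":
                now = datetime.now()
                ends.append(now.year * 12 + now.month)
            else:
                m = re.match(r'(\d{4})-(\d{2})', exp.end_date or "")
                if m:
                    ends.append(int(m.group(1)) * 12 + int(m.group(2)))
        if not starts or not ends:
            return 0
        return max(0, (max(ends) - min(starts)) // 12)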
def _count_tokens(self, text: str) -> int:
"""Count tokens in text"""
try:
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
return len(encoding.encode(text))
        except Exception:
# Fallback: approximate as 4 chars per token
return len(text) // 4
# Placeholder methods for individual section extraction
# These would be implemented with specific LLM calls for each section
async def _extract_personal_info(self, text: str) -> Dict[str, str]:
return self._extract_personal_info_regex(text)
async def _extract_summary(self, text: str) -> str:
return text.strip()
async def _extract_skills(self, text: str) -> Skills:
return self._extract_skills_regex(text)
async def _extract_experience(self, text: str) -> List[Experience]:
return []
async def _extract_education(self, text: str) -> List[Education]:
return []
async def _extract_projects(self, text: str) -> List[Project]:
return []
async def _extract_certifications(self, text: str) -> List[Dict[str, str]]:
return []
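# Minimal usage sketch (assumes the sibling llm_client and metrics modules are
# importable; the sample resume text is made up):
#
#     import asyncio
#     parser = EnhancedResumeParser()
#     result = asyncio.run(parser.run({"resume_text": "Jane Doe\njane@doe.dev\n..."}))
#     print(result["resume_data_enhanced"])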