# Spaces: Sleeping  (web-page residue captured with this file; not part of the module)
"""
NLP Processor Module

Handles natural language processing tasks including skill extraction,
entity recognition, and text analysis.
"""
import logging
import re
from collections import Counter
from typing import Any, Dict, List, Optional, Set, Tuple

# NOTE: this module-level import makes spaCy a hard dependency at import time,
# even though NLPProcessor.__init__ also imports it lazily with a fallback.
import spacy
# Module-level logger, named after this module so applications can configure
# its verbosity independently of other loggers.
logger = logging.getLogger(__name__)
class NLPProcessor:
    """
    Resume-oriented NLP helper built on regular expressions and, optionally, spaCy.

    Skill, experience and education extraction are purely regex based and work
    without any model. Entity extraction and the richer keyword extraction use
    the spaCy ``en_core_web_sm`` pipeline when it can be loaded; otherwise
    ``self.nlp`` stays ``None`` and those methods fall back to simpler
    behaviour (an empty result or a plain word-frequency count).
    """

    # Comprehensive skill database
    TECH_SKILLS = {
        'programming': [
            'python', 'java', 'javascript', 'typescript', 'c++', 'c#', 'ruby', 'go',
            'rust', 'php', 'swift', 'kotlin', 'scala', 'r', 'matlab', 'sql', 'bash'
        ],
        'web': [
            'html', 'css', 'react', 'angular', 'vue.js', 'node.js', 'express',
            'django', 'flask', 'fastapi', 'spring', 'asp.net', 'jquery'
        ],
        'data_science': [
            'machine learning', 'deep learning', 'neural networks', 'nlp',
            'computer vision', 'tensorflow', 'pytorch', 'keras', 'scikit-learn',
            'pandas', 'numpy', 'matplotlib', 'seaborn', 'data analysis',
            'statistical analysis', 'predictive modeling'
        ],
        'cloud': [
            'aws', 'azure', 'gcp', 'google cloud', 'docker', 'kubernetes',
            'terraform', 'jenkins', 'ci/cd', 'devops', 'lambda', 's3', 'ec2'
        ],
        'database': [
            'mysql', 'postgresql', 'mongodb', 'redis', 'cassandra', 'dynamodb',
            'oracle', 'sql server', 'sqlite', 'elasticsearch'
        ],
        'tools': [
            'git', 'github', 'gitlab', 'jira', 'confluence', 'slack', 'linux',
            'windows', 'macos', 'vscode', 'jupyter', 'postman'
        ],
        'soft_skills': [
            'leadership', 'communication', 'teamwork', 'problem solving',
            'critical thinking', 'project management', 'agile', 'scrum',
            'collaboration', 'presentation', 'negotiation'
        ]
    }

    # Words ignored by the regex-only fallback of extract_keywords.
    _FALLBACK_STOPWORDS = frozenset(
        {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for'}
    )

    # "Jan 2019" / "January 2019" / "Jan. 2019", or a bare four-digit year.
    # Years are limited to 19xx/20xx and word-bounded so that fragments of
    # phone numbers or zip codes are not reported as dates.
    _DATE_RE = re.compile(
        r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s+(?:19|20)\d{2}\b'
        r'|\b(?:19|20)\d{2}\b',
        re.IGNORECASE,
    )

    # Non-capturing group: findall() must return the whole year, not "19"/"20".
    _YEAR_RE = re.compile(r'\b(?:19|20)\d{2}\b')

    # Degree keywords. Lookarounds are used instead of \b because \b never
    # matches between a trailing "." and a following space ("B.S. in ...").
    _DEGREE_RE = re.compile(
        r'(?<!\w)(?:'
        r'bachelors?|b\.s\.?|b\.a\.?|bs|ba|undergraduate'
        r'|masters?|m\.s\.?|m\.a\.?|ms|ma|mba|graduate'
        r'|phd|ph\.d\.?|doctorate|doctoral'
        r'|associates?|a\.s\.?|a\.a\.?'
        r')(?!\w)',
        re.IGNORECASE,
    )

    # Heuristic for "the next section starts here": a line made only of
    # capital letters, spaces, "&" or "/" (at least four characters, optional
    # trailing colon), e.g. "SKILLS", "WORK EXPERIENCE:".
    _SECTION_HEADER_RE = re.compile(
        r'^[ \t]*[A-Z][A-Z \t&/]{3,}:?[ \t\r]*$', re.MULTILINE
    )

    def __init__(self, use_gpu: bool = False):
        """
        Initialize the processor and try to load the spaCy model.

        Args:
            use_gpu: When True, ask spaCy to use a GPU if one is available
                (``spacy.prefer_gpu()``) before the model is loaded.

        Any failure to import spaCy or load ``en_core_web_sm`` is logged and
        leaves ``self.nlp`` as ``None`` so the regex-based methods keep working.
        """
        self.nlp = None
        try:
            import spacy

            if use_gpu:
                # Must be called before the pipeline is loaded to take effect.
                gpu_enabled = spacy.prefer_gpu()
                logger.info("spaCy GPU requested; enabled=%s", gpu_enabled)
            self.nlp = spacy.load("en_core_web_sm")
            logger.info("Loaded spaCy model: en_core_web_sm")
        except Exception as e:
            # Deliberately broad: spaCy is optional, so any load problem
            # degrades to the fallback methods instead of crashing.
            logger.warning("spaCy model not available: %s. Using fallback methods.", e)

        # Flat list of every known skill across all categories.
        self.all_skills = [
            skill for skills in self.TECH_SKILLS.values() for skill in skills
        ]
        logger.info("NLPProcessor initialized")

    def extract_skills(self, text: str) -> Dict[str, List[str]]:
        """
        Find known skills mentioned in ``text``.

        Matching is case-insensitive and requires the skill to stand alone
        (not embedded in a longer word), so "java" does not match "javascript".

        Returns:
            Mapping of skill category to the skills found, in TECH_SKILLS
            order. Categories without a hit are omitted; empty or missing text
            yields ``{}``.
        """
        if not text:
            return {}

        text_lower = text.lower()
        found_skills = {}
        for category, skills in self.TECH_SKILLS.items():
            # Lookarounds rather than \b: \b cannot match after the trailing
            # symbol of skills such as "c++" or "c#".
            found = [
                skill
                for skill in skills
                if re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', text_lower)
            ]
            if found:
                found_skills[category] = found
        return found_skills

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract named entities with spaCy, grouped by entity label.

        Returns:
            Mapping of entity label to the distinct entity texts in order of
            first appearance. Returns ``{}`` when no spaCy model is loaded or
            the text is empty.
        """
        if self.nlp is None or not text:
            return {}

        doc = self.nlp(text)
        entities = {}
        for ent in doc.ents:
            entities.setdefault(ent.label_, []).append(ent.text)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return {label: list(dict.fromkeys(texts)) for label, texts in entities.items()}

    def extract_experience(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract work-experience entries from the experience section of ``text``.

        A new entry starts at every line containing a date (month + year or a
        bare year); following non-empty lines are collected as its description
        until a blank line or the next dated line.

        Returns:
            List of dicts with keys ``raw_text`` (the dated line), ``dates``
            (list of matched date strings) and ``description`` (list of lines).
            Empty when no experience section is found.
        """
        experiences = []
        exp_section = self._find_section(text, ['experience', 'employment', 'work history'])
        if not exp_section:
            return experiences

        current_entry = None
        for raw_line in exp_section.split('\n'):
            line = raw_line.strip()
            if not line:
                # A blank line closes the entry being built.
                if current_entry:
                    experiences.append(current_entry)
                    current_entry = None
                continue

            dates = self._DATE_RE.findall(line)
            if dates:
                if current_entry:
                    experiences.append(current_entry)
                current_entry = {
                    'raw_text': line,
                    'dates': dates,
                    'description': []
                }
            elif current_entry:
                current_entry['description'].append(line)

        if current_entry:
            experiences.append(current_entry)
        return experiences

    def extract_education(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract lines of the education section that mention a degree.

        Returns:
            One dict per matching line with keys ``raw_text`` and
            ``degree_mentioned`` (always True). Empty when no education
            section is found.
        """
        education = []
        edu_section = self._find_section(text, ['education', 'academic'])
        if not edu_section:
            return education

        for raw_line in edu_section.split('\n'):
            line = raw_line.strip()
            if line and self._DEGREE_RE.search(line):
                education.append({'raw_text': line, 'degree_mentioned': True})
        return education

    def _find_section(self, text: str, keywords: List[str]) -> Optional[str]:
        """
        Return the part of ``text`` belonging to the section named by ``keywords``.

        The section starts at the located heading (see ``_find_heading``) and
        runs up to, but not including, the next all-caps header line, or to the
        end of the text when there is none. Matching is case-insensitive and
        performed on the original text, so offsets always line up.

        Returns:
            The section text, or None when no keyword occurs.
        """
        if not text:
            return None

        located = self._find_heading(text, keywords)
        if located is None:
            return None

        start, body_start = located
        # Search only after the heading's own line so it cannot be mistaken
        # for the "next" header.
        next_header = self._SECTION_HEADER_RE.search(text, body_start)
        end = next_header.start() if next_header else len(text)
        return text[start:end]

    @staticmethod
    def _find_heading(text: str, keywords: List[str]) -> Optional[Tuple[int, int]]:
        """
        Locate where a section starts and where its body begins.

        Pass 1 prefers a heading-like line: the keyword alone on a line with
        at most two extra words on either side ("Work Experience",
        "EDUCATION:"). Pass 2 falls back to the first mention of a keyword
        anywhere, with the body starting at the end of that line.

        Returns:
            ``(section_start, body_start)`` offsets into ``text``, or None.
        """
        heading_flags = re.IGNORECASE | re.MULTILINE
        for keyword in keywords:
            heading = re.search(
                r'^[ \t]*(?:[\w&/-]+[ \t]+){0,2}'
                + re.escape(keyword)
                + r'(?:[ \t]+[\w&/-]+){0,2}[ \t]*:?[ \t\r]*$',
                text,
                heading_flags,
            )
            if heading:
                return heading.start(), heading.end()

        for keyword in keywords:
            mention = re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE)
            if mention:
                line_end = text.find('\n', mention.end())
                return mention.start(), (len(text) if line_end == -1 else line_end)
        return None

    def calculate_experience_years(self, text: str) -> float:
        """
        Estimate years of experience as the span between the earliest and
        latest four-digit year (19xx/20xx) found in ``text``.

        This is a rough heuristic: every year in the text counts, whatever it
        refers to. The result is capped at 50.

        Returns:
            The span in years as a float; 0.0 when fewer than two years occur.
        """
        years = [int(year) for year in self._YEAR_RE.findall(text or '')]
        if len(years) < 2:
            return 0.0
        return float(min(max(years) - min(years), 50))

    def extract_keywords(self, text: str, top_n: int = 20) -> List[Tuple[str, int]]:
        """
        Return the ``top_n`` most frequent keywords as ``(keyword, count)`` pairs.

        With spaCy loaded, keywords are lower-cased lemmas of nouns, proper
        nouns and adjectives that are not stop words and are longer than two
        characters. Without spaCy, they are lower-cased words longer than three
        characters that are not in a small built-in stop-word list.
        """
        if not text:
            return []

        if self.nlp is None:
            words = [
                word
                for word in re.findall(r'\b\w+\b', text.lower())
                if word not in self._FALLBACK_STOPWORDS and len(word) > 3
            ]
            return Counter(words).most_common(top_n)

        doc = self.nlp(text)
        important_tokens = [
            token.lemma_.lower()
            for token in doc
            if token.pos_ in ['NOUN', 'PROPN', 'ADJ']
            and not token.is_stop
            and len(token.text) > 2
        ]
        return Counter(important_tokens).most_common(top_n)