Akshay Chame
π Add LinkedIn Profile Enhancer Streamlit app with all agents and dependencies
035c4af
| # LinkedIn Data Parser | |
| import re | |
| from typing import Dict, Any, List, Optional | |
| from datetime import datetime | |
class LinkedInParser:
    """Utility class for parsing and cleaning LinkedIn profile data."""

    # Words ignored by ``extract_keywords``; built once instead of per call.
    _STOP_WORDS = frozenset({
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'between', 'among', 'within', 'without',
        'under', 'over', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
        'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their',
    })

    # Three-letter month prefix -> month number, used by ``parse_duration``.
    _MONTH_NUMBERS = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }

    # A four-digit year (19xx/20xx) optionally preceded by a month name.
    # Group 1 is the month name (or None), group 2 is the full year.
    _DATE_RE = re.compile(
        r'(?:\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
        r'\.?\s+)?'
        r'\b((?:19|20)\d{2})\b',
        re.IGNORECASE,
    )

    # Sentence boundary: whitespace after ., ! or ?, or any line break.
    _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+|\s*\n+\s*')

    # A sentence counts as an achievement when it contains any of these metrics.
    _ACHIEVEMENT_RE = re.compile(
        '|'.join([
            r'\b\d+(?:\.\d+)?\s?%',                # percentages, e.g. 20%
            r'\b\d+(?:\.\d+)?[km]\b\+?',           # thousands/millions, e.g. 10K+, 2M
            r'\$\s?\d',                            # money amounts, e.g. $500
            r'\b\d+\+?\s*(?:years?|months?)\b',    # time periods, e.g. 3 years
        ]),
        re.IGNORECASE,
    )

    def __init__(self):
        # Lower-case keywords; a skill belongs to the first category whose
        # keyword appears as a substring of the lower-cased skill name.
        self.skill_categories = {
            'technical': ['python', 'javascript', 'java', 'react', 'node.js', 'sql', 'aws', 'docker'],
            'management': ['leadership', 'project management', 'team management', 'agile', 'scrum'],
            'marketing': ['seo', 'social media', 'content marketing', 'digital marketing', 'analytics'],
            'design': ['ui/ux', 'photoshop', 'figma', 'adobe', 'design thinking'],
        }

    def clean_profile_data(self, raw_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean and standardize raw profile data

        Args:
            raw_data (Dict[str, Any]): Raw scraped data

        Returns:
            Dict[str, Any]: Cleaned profile data with the keys ``name``,
            ``headline``, ``location``, ``about``, ``experience``,
            ``education``, ``skills``, ``connections``, ``url`` and
            ``parsed_at`` (ISO timestamp of this call).
        """
        raw_data = raw_data or {}
        cleaned_data: Dict[str, Any] = {}

        # Basic free-text fields
        for field in ('name', 'headline', 'location', 'about'):
            cleaned_data[field] = self._clean_text(raw_data.get(field, ''))

        # Structured sections
        cleaned_data['experience'] = self._clean_experience_list(
            raw_data.get('experience', [])
        )
        cleaned_data['education'] = self._clean_education_list(
            raw_data.get('education', [])
        )
        # Skills are cleaned and de-duplicated here; use categorize_skills()
        # separately to group them.
        cleaned_data['skills'] = self._clean_skills_list(
            raw_data.get('skills', [])
        )

        # Additional info
        cleaned_data['connections'] = self._parse_connections(
            raw_data.get('connections', '')
        )
        cleaned_data['url'] = raw_data.get('url', '')
        cleaned_data['parsed_at'] = datetime.now().isoformat()
        return cleaned_data

    def extract_keywords(self, text: str, min_length: int = 3) -> List[str]:
        """
        Extract meaningful keywords from text

        Args:
            text (str): Input text; empty or None yields an empty list
            min_length (int): Minimum keyword length

        Returns:
            List[str]: Lower-cased keywords in first-seen order, without
            duplicates or stop words
        """
        if not text:
            return []

        # Replace punctuation with spaces so "a,b" splits into two words
        words = re.sub(r'[^\w\s]', ' ', str(text).lower()).split()
        candidates = (
            word for word in words
            if len(word) >= min_length and word not in self._STOP_WORDS
        )
        # dict.fromkeys drops duplicates while preserving insertion order
        return list(dict.fromkeys(candidates))

    def parse_duration(self, duration_str: str) -> Dict[str, Any]:
        """
        Parse duration strings like "2020 - Present" or "Jan 2020 - Dec 2022"

        Args:
            duration_str (str): Duration string

        Returns:
            Dict[str, Any]: Parsed duration info with keys:
                raw: the input as given
                start_date: first four-digit year found (str) or None
                end_date: second four-digit year found (str) or None
                is_current: True when the text contains "present"
                duration_months: whole months from start to end (the end
                    month itself is not counted; a missing month is treated
                    as January). For current positions the end is today.
                    0 when it cannot be determined.
        """
        duration_info = {
            'raw': duration_str,
            'start_date': None,
            'end_date': None,
            'is_current': False,
            'duration_months': 0,
        }
        if not duration_str:
            return duration_info

        text = str(duration_str)
        is_current = 'present' in text.lower()
        duration_info['is_current'] = is_current

        # (year, month-or-None) for every date mentioned, in order of appearance.
        # finditer + explicit groups avoids re.findall returning only the
        # century prefix of each year.
        dates = [
            (
                int(match.group(2)),
                self._MONTH_NUMBERS.get((match.group(1) or '')[:3].lower()),
            )
            for match in self._DATE_RE.finditer(text)
        ]
        if not dates:
            return duration_info

        start_year, start_month = dates[0]
        duration_info['start_date'] = str(start_year)
        if len(dates) > 1:
            duration_info['end_date'] = str(dates[1][0])

        if is_current:
            today = datetime.now()
            end = (today.year, today.month)
        elif len(dates) > 1:
            end = dates[1]
        else:
            end = None

        if end is not None:
            end_year, end_month = end
            months = (end_year - start_year) * 12 + (
                (end_month or 1) - (start_month or 1)
            )
            # Guard against reversed or malformed ranges
            duration_info['duration_months'] = max(months, 0)

        return duration_info

    def categorize_skills(self, skills: List[str]) -> Dict[str, List[str]]:
        """
        Categorize skills into different types

        Matching is a case-insensitive substring test against the keywords
        in ``self.skill_categories``; the first matching category wins and
        unmatched skills go to ``'other'``.

        Args:
            skills (List[str]): List of skills

        Returns:
            Dict[str, List[str]]: Categorized skills, one key per configured
            category plus ``'other'``
        """
        # Derive buckets from the configuration so added categories work too
        categorized: Dict[str, List[str]] = {
            category: [] for category in self.skill_categories
        }
        categorized.setdefault('other', [])

        for skill in skills or []:
            skill_lower = str(skill).lower()
            category = next(
                (
                    name
                    for name, keywords in self.skill_categories.items()
                    if any(keyword in skill_lower for keyword in keywords)
                ),
                'other',
            )
            categorized[category].append(skill)
        return categorized

    def extract_achievements(self, text: str) -> List[str]:
        """
        Extract achievements with numbers/metrics from text

        The text is split into sentences and every sentence containing a
        percentage, a K/M-suffixed number, a dollar amount or a number of
        years/months is returned once, in order of appearance, without its
        trailing period.

        Args:
            text (str): Input text; empty or None yields an empty list

        Returns:
            List[str]: List of achievements
        """
        if not text:
            return []

        achievements: List[str] = []
        seen = set()
        for sentence in self._SENTENCE_SPLIT_RE.split(str(text)):
            sentence = sentence.strip().rstrip('.').rstrip()
            if not sentence or sentence in seen:
                continue
            if self._ACHIEVEMENT_RE.search(sentence):
                achievements.append(sentence)
                seen.add(sentence)
        return achievements

    def _clean_text(self, text: Any) -> str:
        """Clean and normalize text.

        Collapses whitespace and removes characters other than word
        characters, whitespace and common punctuation. ``+ # % $ ' : ; @``
        are kept so that values such as "C++", "C#", "20%", "$5M" and
        "O'Neil" survive cleaning. Falsy input returns ""; non-string input
        is converted with ``str``.
        """
        if not text:
            return ""
        if not isinstance(text, str):
            text = str(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        return re.sub(r"[^\w\s\-.,!?()&/+#%$':;@]", '', text)

    def _clean_experience_list(self, experience_list: List[Dict]) -> List[Dict]:
        """Clean experience entries.

        Non-dict entries are skipped. Each cleaned entry also gets
        ``duration_info`` (from ``parse_duration``) and ``achievements``
        (from ``extract_achievements`` on the cleaned description).
        """
        cleaned_experience = []
        for exp in experience_list or []:
            if not isinstance(exp, dict):
                continue
            cleaned_exp = {
                key: self._clean_text(exp.get(key, ''))
                for key in ('title', 'company', 'duration', 'description', 'location')
            }
            cleaned_exp['duration_info'] = self.parse_duration(cleaned_exp['duration'])
            cleaned_exp['achievements'] = self.extract_achievements(
                cleaned_exp['description']
            )
            cleaned_experience.append(cleaned_exp)
        return cleaned_experience

    def _clean_education_list(self, education_list: List[Dict]) -> List[Dict]:
        """Clean education entries; non-dict entries are skipped."""
        return [
            {
                key: self._clean_text(edu.get(key, ''))
                for key in ('degree', 'school', 'year', 'field')
            }
            for edu in education_list or []
            if isinstance(edu, dict)
        ]

    def _clean_skills_list(self, skills_list: List[str]) -> List[str]:
        """Clean and deduplicate skills (case-insensitive, first spelling wins)."""
        if not skills_list:
            return []

        cleaned_skills = []
        seen_skills = set()
        for skill in skills_list:
            cleaned_skill = self._clean_text(str(skill))
            skill_key = cleaned_skill.lower()
            if cleaned_skill and skill_key not in seen_skills:
                cleaned_skills.append(cleaned_skill)
                seen_skills.add(skill_key)
        return cleaned_skills

    def _parse_connections(self, connections_str: Any) -> int:
        """Parse connection count from a string such as "500+ connections".

        Thousands separators are honoured ("1,234" -> 1234) and an integer
        input is returned as-is (clamped at 0). Returns 0 when no number is
        present.
        """
        if not connections_str:
            return 0
        if isinstance(connections_str, int):
            return max(connections_str, 0)

        # First run of digits, allowing embedded thousands separators
        match = re.search(r'\d[\d,]*', str(connections_str))
        if match is None:
            return 0
        return int(match.group().replace(',', ''))