Spaces:
Sleeping
Sleeping
| import pymupdf as fitz | |
| import re | |
| from difflib import get_close_matches | |
def extract_text_from_pdf(file_path: str) -> str:
    """Return the plain text of every page in the PDF at *file_path*.

    Pages are read in document order and their text is concatenated with
    no separator added between pages.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The concatenation of ``page.get_text()`` for each page; an empty
        string when the document has no pages.
    """
    # Using the document as a context manager guarantees it is closed even
    # when text extraction raises part-way through the pages.
    with fitz.open(file_path) as doc:
        # join() avoids re-copying the accumulated text once per page.
        return "".join(page.get_text() for page in doc)
# Skills the parser recognises; results always use this canonical spelling.
_VALID_SKILLS = (
    'FastAPI', 'React', 'Next.js', 'Flask', 'MongoDB', 'Tailwind CSS',
    'Machine Learning', 'Python', 'JavaScript', 'HTML', 'CSS', 'Node.js',
    'Docker', 'Kubernetes', 'AWS', 'Git', 'GitHub', 'TensorFlow', 'PyTorch',
    'Streamlit', 'Qdrant', 'LangChain', 'Gemini API', 'OpenAI', 'Gradio',
    'Pandas', 'NumPy', 'Scikit-learn', 'OpenCV', 'Django', 'Vue.js',
    'Angular', 'TypeScript', 'PostgreSQL', 'MySQL', 'Redis', 'GraphQL',
    'RESTful API', 'Microservices', 'CI/CD', 'Linux', 'Ubuntu', 'Nginx',
    'Apache', 'Jenkins', 'Terraform', 'Ansible', 'Elasticsearch',
)

# Maximum number of skills returned by parse_resume_text.
_MAX_SKILLS = 12

# Lines containing any of these words are section labels, not a person's name.
_NAME_SKIP_KEYWORDS = ('course', 'email', 'mobile', 'cgpa', 'academic', 'details')

_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Ten-digit Indian mobile number with an optional "+91"/"91" prefix.
# Lookarounds replace \b because \b can never match in front of "+",
# which would make the "+91" alternative unreachable.
_PHONE_RE = re.compile(r'(?<!\w)(?:\+91|91)?[6-9]\d{9}(?!\w)')

# An all-caps line of at least two characters, e.g. "JOHN DOE".
_NAME_RE = re.compile(r'^[A-Z][A-Z\s]+$')

# Phrases that usually introduce a technology list. Each capture stops at
# the first '.', ',', ';' or newline after the phrase.
_SKILL_CONTEXT_RES = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        r'built with (.*?)(?:\.|,|;|\n)',
        r'using (.*?)(?:\.|,|;|\n)',
        r'technologies?:?\s*(.*?)(?:\.|,|;|\n)',
        r'skills?:?\s*(.*?)(?:\.|,|;|\n)',
        r'stack:?\s*(.*?)(?:\.|,|;|\n)',
    )
)

# Separators between skill tokens: punctuation, whitespace and the whole
# word "and" (not the individual letters a/n/d).
_TOKEN_SPLIT_RE = re.compile(r'(?:[,.&\s]|\band\b)+', re.IGNORECASE)

_EXPERIENCE_RES = (
    re.compile(r'(\d+)\+?\s*years?\s*(?:of\s*)?experience'),
    re.compile(r'experience\s*:?\s*(\d+)\+?\s*years?'),
)


def _extract_name(lines):
    """Return the first all-caps, multi-word line among the first ten, title-cased."""
    for line in lines[:10]:
        lowered = line.lower()
        if any(keyword in lowered for keyword in _NAME_SKIP_KEYWORDS):
            continue
        if _NAME_RE.match(line) and len(line.split()) >= 2:
            return line.title()
    return ""


def _skills_named_in_text(text_lower):
    """Return known skills that occur in the lower-cased text as whole words."""
    # A letter on either side means the hit is part of a longer word
    # ("git" in "github", "aws" in "laws"), so it is rejected. Digits are
    # allowed so that forms like "html5" still count as "HTML".
    return [
        skill
        for skill in _VALID_SKILLS
        if re.search(
            r'(?<![a-z])' + re.escape(skill.lower()) + r'(?![a-z])',
            text_lower,
        )
    ]


def _skills_from_context(text):
    """Return known skills fuzzy-matched from phrases such as "using ..."."""
    found = []
    for context_re in _SKILL_CONTEXT_RES:
        for fragment in context_re.findall(text):
            for token in _TOKEN_SPLIT_RE.split(fragment.strip()):
                # Very short tokens fuzzy-match too many unrelated skills.
                if len(token) > 2:
                    found.extend(
                        get_close_matches(token, _VALID_SKILLS, n=1, cutoff=0.7)
                    )
    return found


def _extract_experience(text_lower):
    """Return a human-readable experience label derived from the lower-cased text."""
    for experience_re in _EXPERIENCE_RES:
        match = experience_re.search(text_lower)
        if match:
            return f"{match.group(1)} years"
    if 'intern' in text_lower and 'b.tech' in text_lower:
        return "0-1 years (Student/Intern)"
    return "Fresher"


def parse_resume_text(text: str) -> dict:
    """Extract contact details, skills and experience from resume text.

    Args:
        text: Plain text of a resume, e.g. as extracted from a PDF.

    Returns:
        A dict with the keys:

        * ``"name"``: first all-caps, multi-word line among the first ten
          non-empty lines, title-cased; ``""`` if none is found.
        * ``"email"``: first e-mail address found, or ``""``.
        * ``"phone"``: first ten-digit mobile number starting with 6-9,
          including an optional ``+91``/``91`` prefix, or ``""``.
        * ``"skills"``: up to 12 entries from the built-in skill list,
          de-duplicated, in a deterministic order (skills named in the
          text first, then fuzzy matches from skill-introducing phrases).
        * ``"experience"``: ``"<n> years"`` when stated explicitly,
          ``"0-1 years (Student/Intern)"`` when the text mentions both
          "intern" and "b.tech", otherwise ``"Fresher"``.
    """
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    text_lower = text.lower()

    email_match = _EMAIL_RE.search(text)
    phone_match = _PHONE_RE.search(text)

    raw_skills = _skills_named_in_text(text_lower) + _skills_from_context(text)

    return {
        "name": _extract_name(lines),
        "email": email_match.group() if email_match else "",
        "phone": phone_match.group() if phone_match else "",
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # truncation below always keeps the same skills for the same input.
        "skills": list(dict.fromkeys(raw_skills))[:_MAX_SKILLS],
        "experience": _extract_experience(text_lower),
    }