Spaces:

Jayanthk2004
/

TalentScout-Ai

Sleeping

GitHub Actions

Deploy FastAPI backend (backend.main:app) via GitHub Actions

31e79c4 6 months ago

3.89 kB

	import pymupdf as fitz
	import re
	from difflib import get_close_matches

	def extract_text_from_pdf(file_path: str) -> str:
	    """Return the plain text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        The result of ``page.get_text()`` for each page, joined without any
	        separator. An empty string when the document has no pages.

	    Raises:
	        Whatever ``fitz.open`` or ``page.get_text`` raises; errors are not
	        swallowed here.
	    """
	    doc = fitz.open(file_path)
	    try:
	        # Join once instead of growing a string with += on every page.
	        return "".join(page.get_text() for page in doc)
	    finally:
	        # Release the file handle even when text extraction fails midway.
	        doc.close()

	def _find_skills(text, valid_skills, limit=12):
	    """Return up to *limit* canonical skill names found in *text*, in a stable order."""
	    text_lower = text.lower()
	    found = []

	    # Pass 1: whole-token lookup of every known skill. Plain substring tests
	    # produce false hits such as "Git" in "digital" or "AWS" in "laws".
	    for skill in valid_skills:
	        token = r'(?<![a-z0-9])' + re.escape(skill.lower()) + r'(?![a-z0-9])'
	        if re.search(token, text_lower):
	            found.append(skill)

	    # Pass 2: fuzzy-match what follows common trigger phrases so that small
	    # misspellings ("Pythn") still resolve to a known skill.
	    canonical = {skill.lower(): skill for skill in valid_skills}
	    known = list(canonical)
	    # A span ends at a sentence-ending period, ';', a newline or end of text.
	    # A period followed by a letter (as in "Next.js") does not end the span.
	    terminator = r'(?:\.(?=\s|$)|;|\n|$)'
	    skill_patterns = [
	        r'\bbuilt with\s+(.*?)' + terminator,
	        r'\busing\s+(.*?)' + terminator,
	        r'\btechnolog(?:y|ies)\s*:?\s*(.*?)' + terminator,
	        r'\bskills?\s*:?\s*(.*?)' + terminator,
	        r'\bstack\s*:?\s*(.*?)' + terminator,
	    ]

	    for pattern in skill_patterns:
	        for span in re.findall(pattern, text, re.IGNORECASE):
	            # Split list items on ',', '&', '|' or the word "and" (never on
	            # the individual letters a/n/d).
	            for phrase in re.split(r'\s*(?:[,&|]|\band\b)\s*', span):
	                phrase = phrase.strip()
	                words = phrase.split()
	                # Try the whole phrase first (multi-word skills such as
	                # "Machine Learning"), then each individual word.
	                candidates = [phrase] if len(words) <= 1 else [phrase] + words
	                for candidate in candidates:
	                    candidate = candidate.strip(" \t()[]{}:-.")
	                    if len(candidate) <= 2:
	                        continue
	                    # Compare in lowercase: difflib is case-sensitive.
	                    close = get_close_matches(candidate.lower(), known, n=1, cutoff=0.7)
	                    if close:
	                        found.append(canonical[close[0]])

	    # dict.fromkeys de-duplicates while keeping first-seen order, so the
	    # skills kept after truncation are the same on every run.
	    return list(dict.fromkeys(found))[:limit]


	def parse_resume_text(text: str) -> dict:
	    """Extract basic candidate details from the raw text of a resume.

	    Args:
	        text: Full resume text (for example the output of a PDF text
	            extractor). ``None`` or an empty string is treated as no text.

	    Returns:
	        A dict with the keys:

	        * ``"name"``: first line among the first ten non-empty lines that is
	          written entirely in capitals and has at least two words, converted
	          to title case; ``""`` when none qualifies.
	        * ``"email"``: first e-mail address found, or ``""``.
	        * ``"phone"``: first 10-digit number starting with 6-9, including an
	          optional ``+91``/``91`` prefix when directly attached, or ``""``.
	        * ``"skills"``: at most 12 entries from the built-in skill list, in a
	          deterministic order.
	        * ``"experience"``: ``"<n> years"`` when a year count is stated,
	          ``"0-1 years (Student/Intern)"`` when the text mentions both an
	          internship and "b.tech", otherwise ``"Fresher"``.
	    """
	    text = text or ""
	    lines = [line.strip() for line in text.split('\n') if line.strip()]
	    text_lower = text.lower()

	    extracted = {
	        "name": "",
	        "email": "",
	        "phone": "",
	        "skills": [],
	        "experience": "",
	    }

	    # Valid skills database for matching
	    valid_skills = [
	        'FastAPI', 'React', 'Next.js', 'Flask', 'MongoDB', 'Tailwind CSS',
	        'Machine Learning', 'Python', 'JavaScript', 'HTML', 'CSS', 'Node.js',
	        'Docker', 'Kubernetes', 'AWS', 'Git', 'GitHub', 'TensorFlow', 'PyTorch',
	        'Streamlit', 'Qdrant', 'LangChain', 'Gemini API', 'OpenAI', 'Gradio',
	        'Pandas', 'NumPy', 'Scikit-learn', 'OpenCV', 'Django', 'Vue.js',
	        'Angular', 'TypeScript', 'PostgreSQL', 'MySQL', 'Redis', 'GraphQL',
	        'RESTful API', 'Microservices', 'CI/CD', 'Linux', 'Ubuntu', 'Nginx',
	        'Apache', 'Jenkins', 'Terraform', 'Ansible', 'Elasticsearch',
	    ]

	    # Email: the TLD class is letters only (a '|' inside [...] is a literal).
	    email_match = re.search(
	        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text
	    )
	    if email_match:
	        extracted["email"] = email_match.group()

	    # Phone: a lookbehind is used instead of a leading \b because \b can
	    # never match in front of '+', which made the "+91" prefix unreachable.
	    phone_match = re.search(r'(?<!\w)(?:\+91|91)?[6-9]\d{9}\b', text)
	    if phone_match:
	        extracted["phone"] = phone_match.group()

	    # Name: only the top of the document is considered, and lines that look
	    # like section headers or contact rows are skipped.
	    skip_keywords = ('course', 'email', 'mobile', 'cgpa', 'academic', 'details')
	    for line in lines[:10]:
	        lowered = line.lower()
	        if any(keyword in lowered for keyword in skip_keywords):
	            continue
	        if re.match(r'^[A-Z][A-Z\s]+$', line) and len(line.split()) >= 2:
	            extracted["name"] = line.title()
	            break

	    # Skills: known-skill scan plus fuzzy matching, de-duplicated and capped.
	    extracted["skills"] = _find_skills(text, valid_skills, limit=12)

	    # Experience: explicit year counts win over the student/fresher fallback.
	    exp_patterns = [
	        r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
	        r'experience\s*:?\s*(\d+)\+?\s*years?',
	    ]
	    for pattern in exp_patterns:
	        match = re.search(pattern, text_lower)
	        if match:
	            extracted["experience"] = f"{match.group(1)} years"
	            break

	    if not extracted["experience"]:
	        # Match "intern" as a word so "international"/"internal" do not count.
	        is_intern = re.search(r'\bintern(?:s|ship|ships)?\b', text_lower)
	        if is_intern and 'b.tech' in text_lower:
	            extracted["experience"] = "0-1 years (Student/Intern)"
	        else:
	            extracted["experience"] = "Fresher"

	    return extracted