# cv_analyzer / app.py
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
import os
from werkzeug.utils import secure_filename
import PyPDF2
import docx
import re
import numpy as np
from typing import List, Dict, Any
import uuid
import logging
from logging.handlers import RotatingFileHandler
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__)
CORS(app)
# Configuration
UPLOAD_FOLDER = os.path.join("/tmp", "uploads")
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'doc', 'docx'}
MAX_FILE_SIZE = 16 * 1024 * 1024 # 16MB
os.environ["HF_HOME"] = "/tmp/hf_home" # writable in Hugging Face Spaces
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE
# Create upload directory if it doesn't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
# Try to load AI models (optional)
ai_models_loaded = False
classifier = None

try:
    from transformers import pipeline

    # Use a smaller, more efficient model
    classifier = pipeline(
        "zero-shot-classification",
        # model="facebook/bart-large-mnli",
        model="valhalla/distilbart-mnli-12-1",  # ✅ Lighter model than bart-large-mnli
        device=-1,  # Use CPU
        framework="pt"
    )
    ai_models_loaded = True
    logger.info("AI models loaded successfully (using distilbart-mnli-12-1)")
except ImportError:
    logger.warning("Transformers not installed, using fallback methods")
except Exception as e:
    logger.error(f"Error loading AI models: {e}, using fallback")

def allowed_file(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def extract_text_from_file(file_path, filename):
    """Extract text from various file types"""
    text = ""

    if filename.endswith('.pdf'):
        try:
            with open(file_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
        except Exception as e:
            logger.error(f"Error reading PDF: {e}")
            raise Exception(f"Failed to extract text from PDF: {e}")
    elif filename.endswith(('.doc', '.docx')):
        # Note: python-docx only parses .docx; legacy .doc files will raise here
        # and be reported as an extraction failure.
        try:
            doc = docx.Document(file_path)
            for paragraph in doc.paragraphs:
                if paragraph.text:
                    text += paragraph.text + "\n"
        except Exception as e:
            logger.error(f"Error reading DOCX: {e}")
            raise Exception(f"Failed to extract text from DOCX: {e}")
    elif filename.endswith('.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
        except Exception as e:
            logger.error(f"Error reading TXT: {e}")
            raise Exception(f"Failed to extract text from TXT: {e}")

    if not text.strip():
        raise Exception("No text could be extracted from the file")

    # Clean up text
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_skills(text):
    """Extract skills from text using pattern matching"""
    # Comprehensive skills list with improved matching
    common_skills = [
        'python', 'java', 'javascript', 'typescript', 'react', 'angular', 'vue',
        'node.js', 'express', 'django', 'flask', 'spring', 'laravel', 'ruby',
        'php', 'html', 'css', 'sass', 'less', 'bootstrap', 'tailwind',
        'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'oracle',
        'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes',
        'jenkins', 'git', 'github', 'gitlab', 'ci/cd', 'devops',
        'machine learning', 'ml', 'ai', 'deep learning', 'tensorflow',
        'pytorch', 'keras', 'scikit-learn', 'data analysis', 'pandas',
        'numpy', 'r', 'tableau', 'power bi', 'excel',
        'agile', 'scrum', 'kanban', 'project management',
        'rest api', 'graphql', 'microservices', 'api development',
        'c++', 'c#', 'net', 'swift', 'kotlin', 'go', 'rust'
    ]
    found_skills = set()
    text_lower = text.lower()

    # Match whole terms only to avoid false positives. Lookarounds are used instead
    # of \b because skills ending in non-word characters (e.g. 'c++', 'c#') can
    # never match a trailing \b.
    for skill in common_skills:
        if re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', text_lower):
            found_skills.add(skill.title())

    return list(found_skills)
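
# Illustrative example (not executed anywhere in this module):
#   extract_skills("Built REST API services in Python and Docker")
#   -> ['Python', 'Rest Api', 'Docker']   (a set is used internally, so order may vary)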

def calculate_score(job_description, candidate_text, skills):
    """Calculate relevance score using AI models or fallback methods"""
    if classifier and ai_models_loaded:
        try:
            # Use AI model for scoring with better error handling
            sequence_to_classify = candidate_text[:512]  # Limit text length for the model

            # More specific labels for better classification
            candidate_labels = [
                "highly relevant candidate for the job",
                "somewhat relevant candidate",
                "irrelevant candidate for this position"
            ]
            result = classifier(sequence_to_classify, candidate_labels)

            # The pipeline returns labels sorted by score, so map labels back to
            # their scores rather than relying on positional order, then weight the
            # "highly relevant" probability most heavily.
            label_scores = dict(zip(result['labels'], result['scores']))
            relevance_score = (label_scores[candidate_labels[0]] * 0.7 +
                               label_scores[candidate_labels[1]] * 0.3) * 100

            # Skills matching with better approach
            if skills:
                skill_match_score = min(100, len(skills) * 5)  # Cap at 100
            else:
                skill_match_score = 30

            # Combine scores (weighted average)
            final_score = (relevance_score * 0.7) + (skill_match_score * 0.3)
            return min(100, max(0, int(final_score)))
        except Exception as e:
            logger.error(f"Error in AI scoring: {e}, using fallback")

    # Fallback scoring method
    return calculate_fallback_score(job_description, candidate_text, skills)

def calculate_fallback_score(job_description, candidate_text, skills):
    """Fallback scoring method without AI"""
    score = 40  # Lower base score

    # Simple keyword matching with better approach
    job_lower = job_description.lower()
    candidate_lower = candidate_text.lower()

    # Extract meaningful words (4+ characters)
    job_words = set(re.findall(r'\b[a-z]{4,}\b', job_lower))
    candidate_words = set(re.findall(r'\b[a-z]{4,}\b', candidate_lower))

    # Remove common stop words
    stop_words = {'with', 'this', 'that', 'have', 'from', 'they', 'which', 'were', 'their'}
    job_words = job_words - stop_words
    candidate_words = candidate_words - stop_words

    common_words = job_words & candidate_words
    if job_words:
        keyword_match = len(common_words) / len(job_words) * 40  # Increased weight
        score += min(40, keyword_match)

    # Skills bonus
    if skills:
        score += min(20, len(skills) * 3)  # Increased bonus per skill

    # Experience indicators with context
    experience_indicators = [
        'experience', 'years', 'worked', 'developed', 'created', 'built',
        'managed', 'led', 'implemented', 'designed'
    ]
    for indicator in experience_indicators:
        if re.search(r'\b' + indicator + r'\b', candidate_lower):
            score += 2  # Increased points per indicator

    return min(100, max(0, int(score)))

def extract_candidate_info(text, filename):
    """Extract candidate information from text with improved patterns"""
    # Extract name with better pattern
    name_patterns = [
        r'(?:^|\n)[\s]*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)[\s]*(?:\n|$)',
        r'Resume[\s\S]{0,500}?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
        r'Name[:]?[\s]*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)'
    ]
    name = filename.split('.')[0]  # Default to filename
    for pattern in name_patterns:
        # The patterns rely on capitalization, so match them case-sensitively
        name_match = re.search(pattern, text)
        if name_match:
            name = name_match.group(1).strip()
            break

    # Extract email
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    email = email_match.group(0) if email_match else "No email found"

    # Improved phone regex for international numbers
    phone_patterns = [
        r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        r'(\+?\d{1,3}[-.\s]?)?\(?\d{2}\)?[-.\s]?\d{4}[-.\s]?\d{4}',
        r'(\+?\d{1,3}[-.\s]?)?\(?\d{4}\)?[-.\s]?\d{3}[-.\s]?\d{3}'
    ]
    phone = "No phone found"
    for pattern in phone_patterns:
        phone_match = re.search(pattern, text)
        if phone_match:
            phone = phone_match.group(0)
            break

    return name, email, phone

def analyze_candidate(job_description, candidate_text, filename):
    """Analyze a single candidate"""
    try:
        skills = extract_skills(candidate_text)
        score = calculate_score(job_description, candidate_text, skills)
        name, email, phone = extract_candidate_info(candidate_text, filename)

        return {
            'id': str(uuid.uuid4()),
            'name': name,
            'email': email,
            'phone': phone,
            'skills': skills,
            'score': score,
            'text_preview': candidate_text[:200] + '...' if len(candidate_text) > 200 else candidate_text
        }
    except Exception as e:
        logger.error(f"Error analyzing candidate: {e}")
        return {
            'id': str(uuid.uuid4()),
            'name': filename.split('.')[0],
            'email': "Error in extraction",
            'phone': "Error in extraction",
            'skills': [],
            'score': 0,
            'text_preview': "Error processing file",
            'error': str(e)
        }

@app.route('/api/process-resumes', methods=['POST'])
def process_resumes():
    """Process uploaded resumes against job description"""
    try:
        # Check if files are present
        if 'resumes' not in request.files:
            return jsonify({'error': 'Missing resume files'}), 400
        if 'jobDescription' not in request.files:
            return jsonify({'error': 'Missing job description file'}), 400

        job_desc_file = request.files['jobDescription']
        resume_files = request.files.getlist('resumes')

        # Validate job description file
        if job_desc_file.filename == '':
            return jsonify({'error': 'No job description file selected'}), 400
        if not allowed_file(job_desc_file.filename):
            return jsonify({'error': 'Invalid job description file type'}), 400

        # Validate resume files
        valid_resumes = []
        for file in resume_files:
            if file.filename != '' and allowed_file(file.filename):
                valid_resumes.append(file)
        if not valid_resumes:
            return jsonify({'error': 'No valid resume files'}), 400

        # Save and process job description
        job_desc_filename = secure_filename(job_desc_file.filename)
        job_desc_path = os.path.join(app.config['UPLOAD_FOLDER'], job_desc_filename)
        job_desc_file.save(job_desc_path)

        try:
            job_description = extract_text_from_file(job_desc_path, job_desc_filename)
        except Exception as e:
            return jsonify({'error': f'Failed to process job description: {str(e)}'}), 400

        # Process each resume
        candidates = []
        for resume_file in valid_resumes:
            resume_filename = secure_filename(resume_file.filename)
            resume_path = os.path.join(app.config['UPLOAD_FOLDER'], resume_filename)
            resume_file.save(resume_path)

            try:
                # Extract text from resume
                resume_text = extract_text_from_file(resume_path, resume_filename)
                # Analyze candidate
                candidate = analyze_candidate(job_description, resume_text, resume_filename)
                candidates.append(candidate)
            except Exception as e:
                logger.error(f"Error processing {resume_filename}: {e}")
                candidates.append({
                    'id': str(uuid.uuid4()),
                    'name': resume_filename.split('.')[0],
                    'email': "Processing error",
                    'phone': "Processing error",
                    'skills': [],
                    'score': 0,
                    'text_preview': f"Error: {str(e)}",
                    'error': str(e)
                })

            # Clean up resume file
            try:
                os.remove(resume_path)
            except OSError:
                pass

        # Clean up job description file
        try:
            os.remove(job_desc_path)
        except OSError:
            pass

        # Sort candidates by score
        candidates.sort(key=lambda x: x['score'], reverse=True)

        return jsonify({
            'candidates': candidates,
            'job_description': job_description[:500] + '...' if len(job_description) > 500 else job_description,
            'total_processed': len(candidates),
            'ai_used': ai_models_loaded
        })
    except Exception as e:
        logger.error(f"Error processing resumes: {e}")
        return jsonify({'error': 'Internal server error'}), 500
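
# Example request (illustrative; the file names and local port are assumptions, not
# part of this app): the endpoint expects multipart/form-data with one 'jobDescription'
# file and one or more 'resumes' files, e.g.
#
#   curl -X POST http://localhost:10000/api/process-resumes \
#        -F "jobDescription=@job.txt" \
#        -F "resumes=@candidate1.pdf" \
#        -F "resumes=@candidate2.docx"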

@app.route('/api/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'ai_models_loaded': ai_models_loaded,
        'upload_folder_exists': os.path.exists(UPLOAD_FOLDER)
    })
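
# Example response shape (values depend on the runtime environment):
#   {"status": "healthy", "ai_models_loaded": true, "upload_folder_exists": true}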

@app.route('/')
def index():
    return jsonify({'message': 'Resume Analyzer API is running'})

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 10000))
    app.run(host="0.0.0.0", port=port, debug=False)