"""Resume Analyzer API.

Flask service that scores uploaded candidate resumes against a job
description. Uses a zero-shot transformer classifier when the optional
`transformers` package is importable; otherwise falls back to a
keyword/skill heuristic.
"""

from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
import contextlib
import os
from werkzeug.utils import secure_filename
import PyPDF2
import docx
import re
import numpy as np
from typing import List, Dict, Any
import uuid
import logging
from logging.handlers import RotatingFileHandler

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
CORS(app)

# Configuration
UPLOAD_FOLDER = os.path.join("/tmp", "uploads")
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'doc', 'docx'}
MAX_FILE_SIZE = 16 * 1024 * 1024  # 16MB

# Must be set before `transformers` is imported so model downloads/caches
# land in a writable location (e.g. Hugging Face Spaces containers).
os.environ["HF_HOME"] = "/tmp/hf_home"

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE

# Create upload directory if it doesn't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Try to load AI models (optional) -- the service degrades gracefully to
# keyword heuristics when transformers is missing or the model fails to load.
ai_models_loaded = False
classifier = None
try:
    from transformers import pipeline

    classifier = pipeline(
        "zero-shot-classification",
        # model="facebook/bart-large-mnli",
        model="valhalla/distilbart-mnli-12-1",  # lighter than bart-large-mnli
        device=-1,  # CPU
        framework="pt",
    )
    ai_models_loaded = True
    logger.info("AI models loaded successfully (using distilbart-mnli-12-1)")
except ImportError:
    logger.warning("Transformers not installed, using fallback methods")
except Exception as e:
    logger.error(f"Error loading AI models: {e}, using fallback")


def allowed_file(filename):
    """Return True if *filename* has an extension in ALLOWED_EXTENSIONS."""
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def extract_text_from_file(file_path, filename):
    """Extract plain text from a PDF, DOC/DOCX, or TXT file.

    *filename* is used only to pick the parser by extension (compared
    case-insensitively, so `Resume.PDF` works).  Raises Exception when the
    file cannot be parsed or yields no text.
    """
    text = ""
    # Lowercase once so uppercase extensions (accepted by allowed_file)
    # don't silently fall through every branch below.
    lower_name = filename.lower()

    if lower_name.endswith('.pdf'):
        try:
            with open(file_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                pages = []
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pages.append(page_text)
                text = "\n".join(pages) + ("\n" if pages else "")
        except Exception as e:
            logger.error(f"Error reading PDF: {e}")
            raise Exception(f"Failed to extract text from PDF: {e}")
    elif lower_name.endswith(('.doc', '.docx')):
        try:
            doc = docx.Document(file_path)
            paragraphs = [p.text for p in doc.paragraphs if p.text]
            text = "\n".join(paragraphs) + ("\n" if paragraphs else "")
        except Exception as e:
            logger.error(f"Error reading DOCX: {e}")
            raise Exception(f"Failed to extract text from DOCX: {e}")
    elif lower_name.endswith('.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
        except Exception as e:
            logger.error(f"Error reading TXT: {e}")
            raise Exception(f"Failed to extract text from TXT: {e}")

    if not text.strip():
        raise Exception("No text could be extracted from the file")

    # Collapse all whitespace runs to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Comprehensive skills list with improved matching
COMMON_SKILLS = [
    'python', 'java', 'javascript', 'typescript', 'react', 'angular', 'vue',
    'node.js', 'express', 'django', 'flask', 'spring', 'laravel', 'ruby',
    'php', 'html', 'css', 'sass', 'less', 'bootstrap', 'tailwind',
    'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'oracle',
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'jenkins',
    'git', 'github', 'gitlab', 'ci/cd', 'devops',
    'machine learning', 'ml', 'ai', 'deep learning', 'tensorflow', 'pytorch',
    'keras', 'scikit-learn', 'data analysis', 'pandas', 'numpy', 'r',
    'tableau', 'power bi', 'excel',
    'agile', 'scrum', 'kanban', 'project management',
    'rest api', 'graphql', 'microservices', 'api development',
    'c++', 'c#', 'net', 'swift', 'kotlin', 'go', 'rust',
]

# Pre-compiled per-skill patterns (hoisted so they are built once, not per
# call).  Lookarounds are used instead of \b because \b after a non-word
# character ('c++', 'c#') would require a *word* character to follow, so
# "c++ " in a resume would never match with the naive \b...\b form.
_SKILL_PATTERNS = [
    (skill, re.compile(r'(?<!\w)' + re.escape(skill) + r'(?!\w)'))
    for skill in COMMON_SKILLS
]


def extract_skills(text):
    """Return a list of recognized skill names found in *text*.

    Matching is case-insensitive and anchored at word edges to avoid
    false positives (e.g. 'go' inside 'google').  Results are title-cased.
    """
    text_lower = text.lower()
    found_skills = {
        skill.title()
        for skill, pattern in _SKILL_PATTERNS
        if pattern.search(text_lower)
    }
    return list(found_skills)


def calculate_score(job_description, candidate_text, skills):
    """Score a candidate 0-100 against *job_description*.

    Uses the zero-shot classifier when loaded; otherwise (or on any model
    error) falls back to calculate_fallback_score.
    """
    if classifier and ai_models_loaded:
        try:
            # Model input is truncated; long resumes exceed the model limit.
            sequence_to_classify = candidate_text[:512]

            # More specific labels for better classification
            candidate_labels = [
                "highly relevant candidate for the job",
                "somewhat relevant candidate",
                "irrelevant candidate for this position",
            ]
            result = classifier(sequence_to_classify, candidate_labels)

            # Weight the scores (highest for most relevant)
            relevance_score = (result['scores'][0] * 0.7 +
                               result['scores'][1] * 0.3) * 100

            # Skills matching: 5 points per skill, capped at 100.
            if skills:
                skill_match_score = min(100, len(skills) * 5)
            else:
                skill_match_score = 30

            # Combine scores (weighted average)
            final_score = (relevance_score * 0.7) + (skill_match_score * 0.3)
            return min(100, max(0, int(final_score)))
        except Exception as e:
            logger.error(f"Error in AI scoring: {e}, using fallback")

    # Fallback scoring method
    return calculate_fallback_score(job_description, candidate_text, skills)


def calculate_fallback_score(job_description, candidate_text, skills):
    """Heuristic 0-100 score used when no AI model is available.

    Combines keyword overlap with the job description, a skills bonus,
    and small bonuses for experience-indicating words.
    """
    score = 40  # Lower base score

    job_lower = job_description.lower()
    candidate_lower = candidate_text.lower()

    # Extract meaningful words (4+ characters)
    job_words = set(re.findall(r'\b[a-z]{4,}\b', job_lower))
    candidate_words = set(re.findall(r'\b[a-z]{4,}\b', candidate_lower))

    # Remove common stop words
    stop_words = {'with', 'this', 'that', 'have', 'from', 'they',
                  'which', 'were', 'their'}
    job_words = job_words - stop_words
    candidate_words = candidate_words - stop_words

    common_words = job_words & candidate_words
    if job_words:
        keyword_match = len(common_words) / len(job_words) * 40
        score += min(40, keyword_match)

    # Skills bonus
    if skills:
        score += min(20, len(skills) * 3)

    # Experience indicators with context
    experience_indicators = [
        'experience', 'years', 'worked', 'developed', 'created',
        'built', 'managed', 'led', 'implemented', 'designed',
    ]
    for indicator in experience_indicators:
        if re.search(r'\b' + indicator + r'\b', candidate_lower):
            score += 2

    return min(100, max(0, int(score)))


def extract_candidate_info(text, filename):
    """Extract (name, email, phone) from resume *text*.

    Falls back to the filename stem for the name and placeholder strings
    when email/phone patterns do not match.
    """
    # Candidate name: try several patterns, first match wins.
    name_patterns = [
        r'(?:^|\n)[\s]*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)[\s]*(?:\n|$)',
        r'Resume[\s\S]{0,500}?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
        r'Name[:]?[\s]*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',
    ]
    name = filename.split('.')[0]  # Default to filename
    for pattern in name_patterns:
        name_match = re.search(pattern, text, re.IGNORECASE)
        if name_match:
            name = name_match.group(1).strip()
            break

    # Extract email
    email_match = re.search(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    email = email_match.group(0) if email_match else "No email found"

    # Phone: several international formats, first match wins.
    phone_patterns = [
        r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        r'(\+?\d{1,3}[-.\s]?)?\(?\d{2}\)?[-.\s]?\d{4}[-.\s]?\d{4}',
        r'(\+?\d{1,3}[-.\s]?)?\(?\d{4}\)?[-.\s]?\d{3}[-.\s]?\d{3}',
    ]
    phone = "No phone found"
    for pattern in phone_patterns:
        phone_match = re.search(pattern, text)
        if phone_match:
            phone = phone_match.group(0)
            break

    return name, email, phone


def analyze_candidate(job_description, candidate_text, filename):
    """Build the result dict for one candidate (never raises).

    On any internal error, returns a zero-score record carrying the error
    message so one bad resume cannot fail the whole batch.
    """
    try:
        skills = extract_skills(candidate_text)
        score = calculate_score(job_description, candidate_text, skills)
        name, email, phone = extract_candidate_info(candidate_text, filename)
        return {
            'id': str(uuid.uuid4()),
            'name': name,
            'email': email,
            'phone': phone,
            'skills': skills,
            'score': score,
            'text_preview': candidate_text[:200] + '...'
                            if len(candidate_text) > 200 else candidate_text,
        }
    except Exception as e:
        logger.error(f"Error analyzing candidate: {e}")
        return {
            'id': str(uuid.uuid4()),
            'name': filename.split('.')[0],
            'email': "Error in extraction",
            'phone': "Error in extraction",
            'skills': [],
            'score': 0,
            'text_preview': "Error processing file",
            'error': str(e),
        }


@app.route('/api/process-resumes', methods=['POST'])
def process_resumes():
    """Process uploaded resumes against a job description.

    Expects multipart form data with a 'jobDescription' file and one or
    more 'resumes' files.  Returns candidates sorted by score (desc).
    """
    try:
        # Check if files are present
        if 'resumes' not in request.files:
            return jsonify({'error': 'Missing resume files'}), 400
        if 'jobDescription' not in request.files:
            return jsonify({'error': 'Missing job description file'}), 400

        job_desc_file = request.files['jobDescription']
        resume_files = request.files.getlist('resumes')

        # Validate job description file
        if job_desc_file.filename == '':
            return jsonify({'error': 'No job description file selected'}), 400
        if not allowed_file(job_desc_file.filename):
            return jsonify({'error': 'Invalid job description file type'}), 400

        # Validate resume files
        valid_resumes = [f for f in resume_files
                         if f.filename != '' and allowed_file(f.filename)]
        if not valid_resumes:
            return jsonify({'error': 'No valid resume files'}), 400

        # Save and process the job description.  A uuid prefix keeps
        # concurrent requests from clobbering each other's files in the
        # shared upload folder; a finally block guarantees cleanup even
        # when extraction fails (the original leaked the file then).
        job_desc_filename = secure_filename(job_desc_file.filename)
        job_desc_path = os.path.join(
            app.config['UPLOAD_FOLDER'],
            f"{uuid.uuid4().hex}_{job_desc_filename}")
        job_desc_file.save(job_desc_path)
        try:
            try:
                job_description = extract_text_from_file(
                    job_desc_path, job_desc_filename)
            except Exception as e:
                return jsonify(
                    {'error': f'Failed to process job description: {str(e)}'}
                ), 400

            # Process each resume
            candidates = []
            for resume_file in valid_resumes:
                resume_filename = secure_filename(resume_file.filename)
                resume_path = os.path.join(
                    app.config['UPLOAD_FOLDER'],
                    f"{uuid.uuid4().hex}_{resume_filename}")
                resume_file.save(resume_path)
                try:
                    resume_text = extract_text_from_file(
                        resume_path, resume_filename)
                    candidates.append(analyze_candidate(
                        job_description, resume_text, resume_filename))
                except Exception as e:
                    logger.error(f"Error processing {resume_filename}: {e}")
                    candidates.append({
                        'id': str(uuid.uuid4()),
                        'name': resume_filename.split('.')[0],
                        'email': "Processing error",
                        'phone': "Processing error",
                        'skills': [],
                        'score': 0,
                        'text_preview': f"Error: {str(e)}",
                        'error': str(e),
                    })
                finally:
                    # Best-effort cleanup; missing file is not an error.
                    with contextlib.suppress(OSError):
                        os.remove(resume_path)
        finally:
            with contextlib.suppress(OSError):
                os.remove(job_desc_path)

        # Sort candidates by score
        candidates.sort(key=lambda x: x['score'], reverse=True)

        return jsonify({
            'candidates': candidates,
            'job_description': job_description[:500] + '...'
                               if len(job_description) > 500
                               else job_description,
            'total_processed': len(candidates),
            'ai_used': ai_models_loaded,
        })
    except Exception as e:
        logger.error(f"Error processing resumes: {e}")
        return jsonify({'error': 'Internal server error'}), 500


@app.route('/api/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'ai_models_loaded': ai_models_loaded,
        'upload_folder_exists': os.path.exists(UPLOAD_FOLDER),
    })


@app.route('/')
def index():
    return jsonify({'message': 'Resume Analyzer API is running'})


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 10000))
    app.run(host="0.0.0.0", port=port, debug=False)