Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import numpy as np | |
| if not hasattr(np, 'bool'): | |
| np.bool = bool | |
| import requests | |
| from pymongo import MongoClient | |
| from dotenv import load_dotenv | |
| from bson import ObjectId | |
| import transformers | |
| import re | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity | |
| import faiss | |
| from sentence_transformers import SentenceTransformer, util | |
| from flask import Flask, request, jsonify | |
| # ========== Google Generative AI (Gemini) imports ========== | |
| import google.generativeai as genai | |
# Load environment variables (load_dotenv reads them into the process env).
load_dotenv()

# Get API keys from environment variables.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
MONGO_URI = os.getenv('MONGO_URI')

# Configure the Gemini client with the API key.
genai.configure(api_key=GOOGLE_API_KEY)

# Connect to MongoDB and bind the collections used by the rest of the module.
# NOTE(review): MongoClient presumably connects lazily, so reaching the success
# message may not prove the server is reachable — confirm against pymongo docs.
try:
    mongo_client = MongoClient(MONGO_URI)
    db = mongo_client["resume_database"]
    resumes_collection = db["resumes"]
    jobs_collection = db["jobs"]
    grouped_candidates_collection = db["grouped_candidates"]
    print("✅ Connected to MongoDB successfully.")
except Exception as e:
    # Chain the original error so the root cause stays visible in the traceback.
    raise Exception(f"❌ MongoDB connection failed: {str(e)}") from e

# Create Flask app.
app = Flask(__name__)
| # ============================================================================= | |
| # Revised Evaluation Rubric for Accurate, Job-Specific Scoring (Total 100) | |
| # ============================================================================= | |
| NEW_RUBRIC = """ | |
| You are an expert resume evaluator and technical recruiter. Evaluate the candidate's resume against the job description with extreme precision, and deduct points aggressively for any misalignment. Consider the resume a poor match if it does not include key skills, relevant work experience, or projects that demonstrate a close fit with the job description. Use the following criteria to assign an exact score out of 100: | |
| 1. Skills (25 pts): | |
| - Direct match with required technical skills: up to 12 pts. Deduct points sharply for missing or weak skills. | |
| - Soft skills alignment: up to 5 pts. Deduct if soft skills are not clearly demonstrated. | |
| - Relevant years of experience: up to 8 pts. Deduct proportionally if experience is insufficient. | |
| 2. Work Experience (25 pts): | |
| - Relevance of roles and industry: up to 12 pts. Deduct heavily for roles not matching the job requirements. | |
| - Impact, achievements, and career progression: up to 13 pts. Deduct if achievements are unquantifiable or progression is unclear. | |
| 3. Project Relevance (30 pts): | |
| - Applicability to job, technical depth, deployment status, and innovation: up to 30 pts. Deduct sharply if projects lack demonstrable outcomes or technical complexity. | |
| 4. Education & Certifications (15 pts): | |
| - Alignment of education with job requirements: up to 10 pts. | |
| - Relevant certifications: up to 5 pts. Deduct for missing or irrelevant certifications. | |
| 5. Presentation & Clarity (5 pts): | |
| - Formatting, clarity, conciseness, and professionalism: 5 pts. Deduct for errors, inconsistencies, or unclear presentation. | |
| Return only valid JSON with the following keys: | |
| { | |
| "overall_score": "XX/100", | |
| "score_breakdown": { | |
| "skills_match": "XX/25", | |
| "work_experience": "XX/25", | |
| "project_relevance": "XX/30", | |
| "education_certifications": "XX/15", | |
| "presentation_clarity": "XX/5" | |
| }, | |
| "key_matching_skills": ["skill1", "skill2"], | |
| "missing_skills": ["skill3", "skill4"], | |
| "strengths": "Specific, evidence-based strengths with examples", | |
| "weaknesses": "Concrete gaps or shortcomings with specific examples", | |
| "notable_projects": [ | |
| { | |
| "title": "Project 1", | |
| "relevance_score": "XX/30", | |
| "reason": "Why this project is relevant or not" | |
| } | |
| ], | |
| "gaps": [ | |
| "Specific gap 1 with evidence", | |
| "Specific gap 2 with evidence" | |
| ], | |
| "recommendation": "One of: Strong Reject, Reject, Consider with Reservations, Interview, Strong Recommendation" | |
| } | |
| Be highly critical and deduct points aggressively for any deviation from the ideal candidate profile. | |
| """ | |
| # ============================================================================= | |
| # Text Cleaning Helper Function | |
| # ============================================================================= | |
def clean_text(text, extra_stopwords=None):
    """
    Normalize text for keyword and TF-IDF comparison.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, and tokens present in the stopword list are
    dropped.

    Args:
        text: The string to clean.
        extra_stopwords: Optional iterable of additional words to remove.
            They are lowercased before use so they match the lowercased text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    stopwords = {'the', 'and', 'is', 'in', 'at', 'of', 'a', 'an', 'to', 'for', 'with', 'on', 'by'}
    if extra_stopwords:
        # Tokens are already lowercase here, so a stopword supplied as
        # "Python" would otherwise never match anything.
        stopwords.update(word.lower() for word in extra_stopwords)
    return " ".join(token for token in text.split() if token not in stopwords)
| # ============================================================================= | |
| # FAISS and Embedding Helper Functions with Hybrid Score | |
| # ============================================================================= | |
# Initialize the embedding model: use the copy under ./models when it exists,
# otherwise load it by name.
MODEL_PATH = os.path.join(os.getcwd(), "models", "all-MiniLM-L6-v2")
if not os.path.exists(MODEL_PATH):
    print("Local model not found, downloading from Hugging Face")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
else:
    print(f"Loading model from local path: {MODEL_PATH}")
    embedding_model = SentenceTransformer(MODEL_PATH)
def get_embedding(text, prefix="passage: ", normalize=True):
    """
    Encode text with the module-level embedding model.

    Args:
        text: The text to embed; surrounding whitespace is stripped.
        prefix: String prepended to the stripped text before encoding.
        normalize: When true, divide the vector by its norm (skipped when the
            norm is zero).

    Returns:
        The vector returned by ``embedding_model.encode``, optionally scaled
        to unit norm.
    """
    vector = embedding_model.encode(f"{prefix}{text.strip()}", convert_to_numpy=True)
    if not normalize:
        return vector
    length = np.linalg.norm(vector)
    # A zero vector cannot be scaled to unit length; return it unchanged.
    return vector / length if length > 0 else vector
def build_faiss_index(embeddings, use_inner_product=True):
    """
    Build a flat FAISS index over a matrix of embeddings.

    Args:
        embeddings: 2-D array-like; its second dimension is used as the
            vector dimensionality.
        use_inner_product: When true an ``IndexFlatIP`` is created, otherwise
            an ``IndexFlatL2``.

    Returns:
        The populated FAISS index.
    """
    # FAISS expects C-contiguous float32 matrices; coerce here so float64 or
    # non-contiguous input does not fail inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim) if use_inner_product else faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index
def compute_tfidf_similarity(job_text, resume_text):
    """
    Return the TF-IDF cosine similarity between a job text and a resume text.

    Both texts are passed through ``clean_text`` and vectorized together with
    English stop words removed.

    Args:
        job_text: Job description text.
        resume_text: Resume text.

    Returns:
        The cosine similarity as a plain float, or 0.0 when the texts cannot
        be vectorized.
    """
    clean_job = clean_text(job_text)
    clean_resume = clean_text(resume_text)
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([clean_job, clean_resume])
    except ValueError:
        # scikit-learn raises ValueError when no terms survive (for example
        # both texts are empty or contain only stop words). Texts with no
        # usable vocabulary share nothing, so score them as 0.
        return 0.0
    cosine_sim = sklearn_cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return float(cosine_sim)
def compute_keyword_score(job_text, resume_text, keyword_list):
    """
    Return the fraction of keywords that occur in both texts.

    Args:
        job_text: Job description text.
        resume_text: Resume text.
        keyword_list: Keywords to look for; each is lowercased before lookup.

    Returns:
        Matches divided by ``len(keyword_list)``, or 0 when the list is empty.
    """
    job_tokens = set(clean_text(job_text).split())
    resume_tokens = set(clean_text(resume_text).split())
    # A keyword counts only when it appears in both cleaned token sets.
    shared_tokens = job_tokens & resume_tokens
    hits = 0
    for keyword in keyword_list:
        if keyword.lower() in shared_tokens:
            hits += 1
    if not keyword_list:
        return 0
    return hits / len(keyword_list)
def compute_hybrid_score(embedding_sim, tfidf_sim, keyword_sim, llm_confidence, weights):
    """
    Combine the four component scores into one weighted sum.

    Args:
        embedding_sim: Embedding similarity component.
        tfidf_sim: TF-IDF similarity component.
        keyword_sim: Keyword overlap component.
        llm_confidence: LLM score component.
        weights: Mapping with the keys 'embedding', 'tfidf', 'keyword', 'llm'.

    Returns:
        The sum of each component multiplied by its weight.
    """
    score = weights['embedding'] * embedding_sim
    score = score + weights['tfidf'] * tfidf_sim
    score = score + weights['keyword'] * keyword_sim
    score = score + weights['llm'] * llm_confidence
    return score
| # ============================================================================= | |
| # Resume Evaluation Function Using LLM (Gemini) and FAISS Similarity | |
| # ============================================================================= | |
| gemini_model = genai.GenerativeModel(model_name="gemini-2.0-flash") | |
def is_resume_substantive(resume_text, min_word_count=50):
    """
    Check whether a resume has enough words to be worth evaluating.

    Args:
        resume_text: The resume text.
        min_word_count: Minimum number of whitespace-separated words required.

    Returns:
        False (after printing a warning) when the word count is below the
        minimum, True otherwise.
    """
    word_total = len(resume_text.lower().split())
    too_short = word_total < min_word_count
    if too_short:
        print(f"⚠ Resume failed word count check: {word_total} words (min {min_word_count} required)")
    return not too_short
def _extract_first_json_object(text):
    """Return the JSON value starting at the first '{' in *text*, or None if there is no '{'."""
    start = text.find('{')
    if start == -1:
        return None
    candidate = text[start:]
    try:
        # raw_decode understands string literals and escapes, so braces inside
        # values cannot end the object early, and trailing text is ignored.
        _, end = json.JSONDecoder().raw_decode(candidate)
    except json.JSONDecodeError:
        # Hand back the undecodable text so the caller's own JSON parsing
        # fails and its parse-error handling takes over.
        return candidate
    return candidate[:end]


def evaluate_resume(resume_text, job_description, evaluation_rubric=NEW_RUBRIC):
    """
    Ask Gemini to score a resume against a job description.

    Args:
        resume_text: The resume text.
        job_description: The job description text.
        evaluation_rubric: Rubric text embedded in the prompt.

    Returns:
        A JSON string. Resumes that fail ``is_resume_substantive`` get a
        fixed zero-score result without calling the model. Otherwise the JSON
        object found in the model response is returned; "{}" is returned when
        the response is empty, contains no '{', or an exception occurs.
    """
    if not is_resume_substantive(resume_text):
        empty_result = {
            "overall_score": "0/100",
            "score_breakdown": {
                "skills_match": "0/25",
                "work_experience": "0/25",
                "project_relevance": "0/30",
                "education_certifications": "0/15",
                "presentation_clarity": "0/5"
            },
            "key_matching_skills": [],
            "missing_skills": [],
            "irrelevant_experience": "",
            "strengths": "Resume text is too short or lacks essential sections.",
            "weaknesses": "Insufficient or unstructured content provided in resume.",
            "notable_projects": [],
            "gaps": ["The resume has no substantive information."],
            "recommendation": "Reject",
            "numerical_score": 0
        }
        print("⚠ Resume did not pass the substantive content checks.")
        return json.dumps(empty_result)
    try:
        prompt = f"""
You are an expert resume evaluator and technical recruiter.
Below are the original resume text and job description, followed by Evaluation Criteria.
Be extremely critical. Deduct points aggressively if key skills, work experience, projects, or education do not match the job description.
Assign an exact score out of 100 (do not provide a generic score) and provide detailed evidence for each scoring category.
Original Resume Text:
{resume_text}
Original Job Description:
{job_description}
Evaluation Criteria:
{evaluation_rubric}
Return only valid JSON with the following keys:
overall_score, score_breakdown (skills_match, work_experience, project_relevance, education_certifications, presentation_clarity),
key_matching_skills, missing_skills, irrelevant_experience, strengths, weaknesses, notable_projects (list of objects with title, relevance_score, reason), gaps, recommendation.
"""
        response = gemini_model.generate_content(prompt)
        generated_text = response.text.strip() if response.text else ""
        if not generated_text:
            print("❌ No generated text found in the response from Gemini.")
            return "{}"
        extracted_json = _extract_first_json_object(generated_text)
        if extracted_json is None:
            print(f"❌ No JSON found in the generated text: {generated_text}")
            return "{}"
        return extracted_json
    except Exception as e:
        # Best-effort boundary: any failure yields an empty result rather
        # than propagating to the caller.
        print(f"❌ Exception during evaluation: {e}")
        return "{}"
| # ============================================================================= | |
| # Resume Processing & Candidate Grouping with FAISS Integration | |
| # ============================================================================= | |
| from bson import ObjectId | |
def get_full_job_text(job_doc):
    """
    Concatenate the text fields of a job document into a single string.

    String values are used as-is; for list values only the string items are
    joined. Values of any other type are ignored.

    Args:
        job_doc: Mapping of field names to values, or a falsy value.

    Returns:
        The collected text separated by spaces and stripped, or
        "No description available." when ``job_doc`` is falsy.
    """
    if not job_doc:
        return "No description available."
    text_parts = []
    for value in job_doc.values():
        if isinstance(value, str):
            text_parts.append(value)
        elif isinstance(value, list):
            # Skip non-string items: str.join raises TypeError on them, and
            # non-string top-level values are ignored in the same way.
            text_parts.append(" ".join(item for item in value if isinstance(item, str)))
    return " ".join(text_parts).strip()
def _parse_llm_score(analysis_result):
    """Return the numerator of an "NN/100" overall_score as an int in 0..100, or 0 if unusable."""
    raw_score = analysis_result.get("overall_score", "0/100")
    try:
        # str() tolerates a bare number; float() tolerates values like "85.5".
        score = int(float(str(raw_score).split('/')[0].strip()))
    except (ValueError, TypeError, OverflowError):
        return 0
    return max(0, min(score, 100))


def process_resumes(job_id_filter):
    """
    Score every resume stored for a job and save the analysis on each resume.

    For each resume the function combines embedding similarity, TF-IDF
    similarity, keyword overlap and the LLM score into a hybrid score, then
    writes the result to the resume document under ``analysis_result``.

    Args:
        job_id_filter: Job identifier. It is converted to an ObjectId to look
            up the job and used as-is to match resumes on ``jobId``.

    Returns:
        True when processing finished (including when there was nothing to
        process), False when an exception occurred.
    """
    try:
        # Retrieve the job document by _id instead of jobId.
        job_doc = jobs_collection.find_one({"_id": ObjectId(job_id_filter)})
        job_description = get_full_job_text(job_doc) if job_doc else "No description available."
        print(f"ℹ Using job description: {job_description}")
        resumes = list(resumes_collection.find({"jobId": job_id_filter}))
        print(f"📂 Found {len(resumes)} resumes to process for Job ID: {job_id_filter}.")
        resume_texts = []
        resume_ids = []
        for resume in resumes:
            if not resume.get("jobId"):
                print(f"⚠ Skipping resume (ID: {resume.get('_id')}) - Missing jobId.")
                continue
            resume_text = resume.get("resume_data", "")
            if isinstance(resume_text, (dict, list)):
                resume_text = json.dumps(resume_text)
            elif not isinstance(resume_text, str):
                # A null or scalar field must not abort the whole batch.
                resume_text = "" if resume_text is None else str(resume_text)
            resume_text = resume_text.strip()
            if not resume_text:
                print(f"⚠ Skipping resume (ID: {resume.get('_id')}) - No resume text found.")
                continue
            resume_texts.append(resume_text)
            resume_ids.append(resume["_id"])
        if not resume_texts:
            print("⚠ No new resumes to process.")
            # Nothing to do is not a failure; the API endpoint treats a falsy
            # return value as an error.
            return True
        # Prepare keywords from the cleaned job description.
        job_keywords = list(set(clean_text(job_description).split()))
        # Compute embeddings with distinct prefixes and normalize them.
        job_embedding = get_embedding(job_description, prefix="query: ", normalize=True)
        resume_embeddings = np.array(
            [get_embedding(text, prefix="passage: ", normalize=True) for text in resume_texts]
        )
        index = build_faiss_index(resume_embeddings, use_inner_product=True)
        job_embedding_expanded = np.expand_dims(job_embedding, axis=0)
        similarities, indices = index.search(job_embedding_expanded, len(resume_texts))
        # search() returns hits ranked by similarity, and `indices` gives each
        # hit's row in the index. Map every score back to its own resume
        # instead of assuming the hits arrive in insertion order.
        embedding_similarity_scores = [0.0] * len(resume_texts)
        for similarity, row in zip(similarities[0], indices[0]):
            if row >= 0:
                embedding_similarity_scores[int(row)] = max(float(similarity), 0.0)
        # Component weights for the hybrid score. They sum to 1.15, so the raw
        # weighted sum can exceed 1.0.
        weights = {
            'embedding': 0.25,
            'tfidf': 0.25,
            'keyword': 0.15,
            'llm': 0.50
        }
        # Boost factor to calibrate final scores.
        boost_factor = 1.5
        for idx, resume_text in enumerate(resume_texts):
            embedding_sim = embedding_similarity_scores[idx]
            tfidf_sim = float(compute_tfidf_similarity(job_description, resume_text))
            keyword_sim = compute_keyword_score(job_description, resume_text, job_keywords)
            analysis_result_text = evaluate_resume(resume_text, job_description)
            try:
                analysis_result = json.loads(analysis_result_text)
                if not isinstance(analysis_result, dict):
                    raise ValueError("analysis result is not a JSON object")
            except (ValueError, TypeError) as e:
                # json.JSONDecodeError is a ValueError subclass.
                print(f"❌ Error parsing analysis result for resume ID {resume_ids[idx]}: {e}")
                analysis_result = {
                    "overall_score": "XX/100",
                    "score_breakdown": {
                        "skills_match": "XX/25",
                        "work_experience": "XX/25",
                        "project_relevance": "XX/30",
                        "education_certifications": "XX/15",
                        "presentation_clarity": "XX/5"
                    },
                    "strengths": "Unable to properly evaluate - parsing error",
                    "weaknesses": "System could not properly analyze this resume",
                    "notable_projects": [],
                    "gaps": ["Analysis failed - please review manually"],
                    "recommendation": "Consider with Reservations",
                }
            llm_score = _parse_llm_score(analysis_result)
            llm_confidence = llm_score / 100.0
            # Compute the hybrid score using the weighted sum of the four components.
            hybrid_score = compute_hybrid_score(embedding_sim, tfidf_sim, keyword_sim, llm_confidence, weights)
            # If the LLM score is below 50, do not apply the boost factor.
            if llm_score < 50:
                final_score = int(hybrid_score * 100)
            else:
                final_score = min(int(hybrid_score * boost_factor * 100), 100)
            analysis_result["embedding_similarity"] = round(embedding_sim, 3)
            analysis_result["tfidf_similarity"] = round(tfidf_sim, 3)
            analysis_result["keyword_similarity"] = round(keyword_sim, 3)
            analysis_result["llm_confidence"] = f"{llm_score}/100"
            analysis_result["overall_score"] = f"{final_score}/100"
            analysis_result["numerical_score"] = final_score
            resumes_collection.update_one(
                {"_id": resume_ids[idx]},
                {"$set": {"analysis_result": analysis_result}}
            )
            print(f"✅ Processed resume (ID: {resume_ids[idx]}) with Hybrid Score: {analysis_result.get('overall_score', 'N/A')} (Embedding: {round(embedding_sim, 3)}, TF-IDF: {round(tfidf_sim, 3)}, Keyword: {round(keyword_sim, 3)})")
        print("🎯 All resumes processed successfully.")
        return True
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"❌ Error processing resumes: {str(e)}")
        import traceback
        print(traceback.format_exc())
        return False
def _normalize_analysis_result(raw_result):
    """Return *raw_result* as a dict, parsing JSON strings and replacing anything unusable with {}."""
    if isinstance(raw_result, str):
        try:
            raw_result = json.loads(raw_result)
        except (json.JSONDecodeError, TypeError):
            return {}
    return raw_result if isinstance(raw_result, dict) else {}


def _numerical_score_from(analysis_result):
    """Return the numerator of an "NN/100" overall_score as an int, or 0 if unusable."""
    raw_score = analysis_result.get("overall_score", "0/100")
    try:
        return int(float(str(raw_score).split('/')[0].strip()))
    except (ValueError, TypeError, OverflowError):
        return 0


def group_and_store_candidates(job_id_filter):
    """
    Group a job's resumes into one ranked candidate document and store it.

    Resumes matching ``jobId`` are aggregated by (jobId, jobTitle), ranked by
    ``analysis_result.numerical_score`` and bucketed by recommendation tier.
    Existing grouped documents for the job are deleted, then each group is
    upserted into the grouped-candidates collection.

    Args:
        job_id_filter: Value matched against the ``jobId`` field of resumes.

    Returns:
        True on success, False when an exception occurred.
    """
    try:
        pipeline = [
            {"$match": {"jobId": job_id_filter}},
            {
                "$group": {
                    "_id": {
                        "jobId": "$jobId",
                        "jobTitle": "$jobTitle"
                    },
                    "candidates": {
                        "$push": {
                            "candidate_id": "$_id",
                            "candidate_name": "$formName",
                            "candidate_email": "$formEmail",
                            "candidate_phone": "$formPhone",
                            "resume_data": "$resume_data",
                            "analysis_result": "$analysis_result"
                        }
                    }
                }
            }
        ]
        grouped_resumes = resumes_collection.aggregate(pipeline)
        grouped_candidates_collection.delete_many({"jobId": job_id_filter})
        # Insertion order defines the order of tiers in tiered_candidates.
        tier_names = (
            "Strong Recommendation",
            "Interview",
            "Consider with Reservations",
            "Reject",
            "Strong Reject",
            "Uncategorized",
        )
        for group in grouped_resumes:
            _id = group.get("_id", {})
            job_id = _id.get("jobId")
            job_title = _id.get("jobTitle", "No Title Provided")
            if not job_id:
                print(f"⚠ Skipping grouped set without jobId: {group}")
                continue
            candidates = group["candidates"]
            for candidate in candidates:
                # Always store a dict back on the candidate so the ranking
                # below never calls .get on a string or None.
                analysis_result = _normalize_analysis_result(candidate.get("analysis_result"))
                candidate["analysis_result"] = analysis_result
                if not analysis_result.get("numerical_score"):
                    analysis_result["numerical_score"] = _numerical_score_from(analysis_result)
            sorted_candidates = sorted(
                candidates,
                key=lambda c: c["analysis_result"].get("numerical_score", 0),
                reverse=True
            )
            recommendation_tiers = {tier: [] for tier in tier_names}
            for candidate in sorted_candidates:
                recommendation = candidate["analysis_result"].get("recommendation", "Uncategorized")
                if not isinstance(recommendation, str) or recommendation not in recommendation_tiers:
                    recommendation = "Uncategorized"
                recommendation_tiers[recommendation].append(candidate)
            # Each tier list was filled from the already-ranked list, so it is
            # in descending score order without another sort.
            tiered_candidates = []
            for tier_members in recommendation_tiers.values():
                tiered_candidates.extend(tier_members)
            # NOTE(review): groups are keyed by (jobId, jobTitle) but stored by
            # jobId only, so two titles for one job would overwrite each other
            # — confirm this cannot happen in the data.
            grouped_candidates_collection.update_one(
                {"jobId": job_id},
                {
                    "$set": {
                        "jobId": job_id,
                        "jobTitle": job_title,
                        "candidates": sorted_candidates,
                        "tiered_candidates": tiered_candidates,
                        "candidate_tiers": {
                            tier: len(members) for tier, members in recommendation_tiers.items() if members
                        },
                        "total_candidates": len(candidates),
                        "top_candidates": sorted_candidates[:5]
                    }
                },
                upsert=True
            )
            print(f"✅ Grouped {len(candidates)} candidates for Job: {job_title} (Job ID: {job_id})")
        print("🎯 All grouped candidates stored successfully.")
        return True
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"❌ Error grouping candidates: {str(e)}")
        import traceback
        print(traceback.format_exc())
        return False
| # API endpoints | |
def api_process_resumes():
    """
    Handle a request to score and group the resumes of one job.

    Reads ``job_id`` from the JSON request body, runs ``process_resumes`` and
    then ``group_and_store_candidates`` for it.

    Returns:
        A JSON response: status "success" when both steps succeed, a 400
        error when ``job_id`` is missing, and a 500 error otherwise.
    """
    # NOTE(review): no route registration for this handler is visible in this
    # view of the file — confirm it is attached to the Flask app.
    # silent=True yields None for a missing or malformed JSON body; the type
    # guard also covers bodies that are valid JSON but not an object.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    job_id = data.get('job_id')
    if not job_id:
        return jsonify({"status": "error", "message": "Job ID is required"}), 400
    process_success = process_resumes(job_id)
    # Grouping runs even when processing failed.
    grouping_success = group_and_store_candidates(job_id)
    if process_success and grouping_success:
        return jsonify({
            "status": "success",
            "message": f"Resumes processed successfully for job ID: {job_id}"
        })
    return jsonify({
        "status": "error",
        "message": f"Error processing resumes for job ID: {job_id}"
    }), 500
if __name__ == "__main__":
    # The server binds to all interfaces, and Flask's debug mode exposes an
    # interactive debugger that can execute code. Keep it off unless it is
    # explicitly requested through the FLASK_DEBUG environment variable.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes"}
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)