"""Backend for screening resumes against a job description.

Text is extracted from PDF/DOCX resumes, each resume is scored against the
job description with a sentence-embedding model, and the run produces a CSV
report plus a ZIP archive of the shortlisted files.
"""

import os
import re
import shutil
import zipfile
from datetime import datetime

import fitz
import pandas as pd
from docx import Document
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# ================== Models ==================
# UI label -> Hugging Face model id passed to SentenceTransformer.
MODELS = {
    "Fast (MiniLM)": "sentence-transformers/all-MiniLM-L6-v2",
    "Balanced (Recommended)": "sentence-transformers/all-mpnet-base-v2",
    "High Accuracy": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
}

# Process-wide caches so each model is loaded at most once.
loaded_models = {}
skills_classifier = None

# ================== Constants ==================
_SELECTED_DIR = "outputs/selected_cvs"
_REPORTS_DIR = "outputs/reports"

# Resumes shorter than this (in characters) are treated as failed extractions.
_MIN_RESUME_CHARS = 100

_DEGREE_PATTERN = re.compile(
    r"\b(bba|bs|be|bit|bsc|ba|b\.?a\.?|b\.?sc\.?|bachelor|master|mba|msc|"
    r"m\.?sc\.?|mphil|phd|doctorate|intermediate|matric)\b",
    re.IGNORECASE,
)
# These abbreviations are also everyday English words ("to be", "a bit"), so
# they only count as degrees when written in capitals ("BE", "BIT").
_AMBIGUOUS_DEGREE_TOKENS = frozenset({"be", "bs", "ba", "bit"})

_COURSE_KEYWORDS = (
    "course",
    "certification",
    "certificate",
    "certified",
    "training",
    "workshop",
    "bootcamp",
    "specialization",
    "diploma",
)
_SENTENCE_SPLIT = re.compile(r"[.!?;\n]")
_WHITESPACE_RUN = re.compile(r"\s+")

# Whole-word matches (optional plural "s") instead of raw substrings, so that
# e.g. "collaborate" no longer counts as "labor".
_LOW_EDU_PATTERN = re.compile(
    r"\b(?:guard|security|watchman|peon|driver|helper|labor|laborer)s?\b",
    re.IGNORECASE,
)
_DEGREE_WORD_PATTERN = re.compile(
    r"\b(?:bba|bachelor|master|mba|phd|degree|graduation|qualification|"
    r"engineering)s?\b",
    re.IGNORECASE,
)
# Case-sensitive on purpose: lowercase "be" is just the verb.
_DEGREE_ABBREV_PATTERN = re.compile(r"\b(?:BS|BE)\b")

_SKILL_LABELS = [
    "Python",
    "Machine Learning",
    "Deep Learning",
    "Data Science",
    "SQL",
    "FastAPI",
    "AWS",
    "Docker",
    "Communication",
    "Leadership",
]


def get_model(model_name: str):
    """Return the cached SentenceTransformer for a key of ``MODELS``.

    The model is loaded on first use and kept in ``loaded_models``.

    Raises:
        KeyError: if ``model_name`` is not a key of ``MODELS``.
    """
    if model_name not in loaded_models:
        print(f"🔄 Loading model: {model_name}")
        loaded_models[model_name] = SentenceTransformer(MODELS[model_name])
    return loaded_models[model_name]


def get_skills_classifier():
    """Return the shared zero-shot classification pipeline, creating it once."""
    global skills_classifier
    if skills_classifier is None:
        skills_classifier = pipeline(
            "zero-shot-classification", model="facebook/bart-large-mnli"
        )
    return skills_classifier


def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a ``.pdf``, ``.docx`` or ``.doc`` file.

    Returns the stripped text, or ``""`` for an unsupported extension or when
    extraction raises (the error is printed, not propagated).
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            doc = fitz.open(file_path)
            try:
                text = "\n".join(page.get_text("text") for page in doc)
            finally:
                # Release the file handle even if a page fails to parse.
                doc.close()
            return text.strip()

        if ext in (".docx", ".doc"):
            # NOTE(review): python-docx targets the .docx format; a genuine
            # legacy binary .doc is expected to raise here and end up in the
            # handler below -- confirm whether .doc support is really needed.
            doc = Document(file_path)
            full_text = [
                para.text.strip() for para in doc.paragraphs if para.text.strip()
            ]
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        cell_text = cell.text.strip()
                        if cell_text:
                            full_text.append(cell_text)
            return "\n".join(full_text).strip()

        return ""
    except Exception as e:  # best-effort boundary: one bad file must not abort a run
        print(f"Extraction error: {e}")
        return ""


def extract_qualifications(text: str):
    """Extract only Degrees / Qualifications.

    Returns a comma-separated string of upper-cased degree tokens with dots
    removed, in order of first appearance, or ``"Not mentioned"`` when the
    text is empty or nothing matches.
    """
    if not text:
        return "Not mentioned"

    degrees = []
    for match in _DEGREE_PATTERN.finditer(text):
        token = match.group(1)
        # Skip "be"/"bs"/"ba"/"bit" unless capitalised; otherwise the verb
        # "be" alone would make every resume look like it lists a degree.
        if token.lower() in _AMBIGUOUS_DEGREE_TOKENS and not token.isupper():
            continue
        degrees.append(token.upper().replace(".", ""))

    # dict.fromkeys de-duplicates while keeping a stable, reproducible order.
    unique_degrees = list(dict.fromkeys(degrees))
    return ", ".join(unique_degrees) if unique_degrees else "Not mentioned"


def extract_courses(text: str):
    """Extract Courses, Certifications & Trainings.

    The text is split on ``. ! ? ;`` and newlines; fragments that mention a
    course keyword and are between 16 and 109 characters long (after
    whitespace normalisation) are kept. At most 8 unique fragments are
    returned, joined by ``" | "``.
    """
    if not text:
        return "Not mentioned"

    courses = []
    for sentence in _SENTENCE_SPLIT.split(text):
        candidate = sentence.strip()
        lowered = candidate.lower()
        if not any(kw in lowered for kw in _COURSE_KEYWORDS):
            continue
        cleaned = _WHITESPACE_RUN.sub(" ", candidate)
        if 15 < len(cleaned) < 110:  # reasonable length
            courses.append(cleaned)

    unique_courses = list(dict.fromkeys(courses))[:8]  # max 8 courses
    if unique_courses:
        return " | ".join(unique_courses)
    return "No courses/certifications mentioned"


def get_required_qualifications(job_description: str):
    """Decide from the job description whether a degree is required.

    Returns a ``(degree_required, note)`` tuple. Low-education role keywords
    are checked first and win over degree keywords, as before.
    """
    if not job_description:
        return False, "Qualification preferred"

    if _LOW_EDU_PATTERN.search(job_description):
        return False, "Degree not required"

    if _DEGREE_WORD_PATTERN.search(job_description) or _DEGREE_ABBREV_PATTERN.search(
        job_description
    ):
        return True, "Degree required"

    return False, "Qualification preferred"


def extract_skills(resume_text: str):
    """Tag a resume with skills using the zero-shot classifier.

    Only the first 2000 characters are classified. Labels scoring above 0.40
    are returned as a comma-separated string.
    """
    if not resume_text or len(resume_text) < _MIN_RESUME_CHARS:
        return "Insufficient content"

    classifier = get_skills_classifier()
    try:
        result = classifier(resume_text[:2000], _SKILL_LABELS, multi_label=True)
        skills = [
            label
            for label, score in zip(result["labels"], result["scores"])
            if score > 0.40
        ]
    except Exception as exc:
        # Narrower than a bare except: Ctrl-C / SystemExit still propagate.
        print(f"Skills extraction error: {exc}")
        return "Skills extraction failed"

    return ", ".join(skills[:12]) if skills else "Basic skills"


# ================== Main Function ==================
def screen_resumes_backend(
    job_description: str, cv_folder, model_option: str, threshold: float = 0.65
):
    """Score every CV against the job description and shortlist matches.

    Args:
        job_description: Text of the job posting.
        cv_folder: A single file or a list of files; each item is either a
            path or an object exposing the path as ``.name``.
        model_option: Key of ``MODELS`` selecting the embedding model.
        threshold: Minimum cosine similarity (0-1) required for shortlisting.

    Returns:
        ``(results, summary, selected_dir, zip_path)`` where ``results`` is a
        list of per-candidate dicts, ``summary`` is Markdown text,
        ``selected_dir`` is ``"outputs/selected_cvs/"`` (or ``""`` when nothing
        was shortlisted) and ``zip_path`` is the archive path or ``None``.
        When ``cv_folder`` is empty the result is
        ``([], "No CVs found.", None, None)``.
    """
    if not cv_folder:
        return [], "No CVs found.", None, None

    files = cv_folder if isinstance(cv_folder, list) else [cv_folder]
    model = get_model(model_option)

    os.makedirs(_SELECTED_DIR, exist_ok=True)
    os.makedirs(_REPORTS_DIR, exist_ok=True)

    job_embedding = model.encode(job_description, convert_to_tensor=True)
    degree_required, _ = get_required_qualifications(job_description)

    results = []
    shortlisted_paths = []  # copies made during THIS run only

    for file_obj in files:
        source_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        filename = os.path.basename(source_path)
        resume_text = extract_text_from_file(source_path)

        if len(resume_text) < _MIN_RESUME_CHARS:
            results.append({
                "Candidate": filename,
                "Score (%)": 0.0,
                "Qualifications": "❌ Extraction Failed",
                "Courses": "N/A",
                "Status": "❌ Extraction Failed",
                "Extracted Skills": "N/A",
                "Model Used": model_option,
            })
            continue

        qualifications = extract_qualifications(resume_text)
        courses = extract_courses(resume_text)
        skills = extract_skills(resume_text)

        resume_embedding = model.encode(resume_text, convert_to_tensor=True)
        similarity = util.cos_sim(job_embedding, resume_embedding)[0][0].item()

        # A missing qualification is reported in preference to a low score.
        final_status = "✅ Shortlisted"
        if degree_required and qualifications == "Not mentioned":
            final_status = "❌ Rejected (Missing Qualification)"
        elif similarity < threshold:
            final_status = "❌ Rejected (Low Score)"

        if final_status == "✅ Shortlisted":
            destination = os.path.join(_SELECTED_DIR, filename)
            shutil.copy2(source_path, destination)
            shortlisted_paths.append(destination)

        results.append({
            "Candidate": filename,
            "Score (%)": round(similarity * 100, 2),
            "Qualifications": qualifications,
            "Courses": courses,
            "Status": final_status,
            "Extracted Skills": skills,
            "Model Used": model_option,
        })

    # One timestamp so the report and the archive of a run always share it.
    stamp = datetime.now().strftime("%Y%m%d_%H%M")

    df = pd.DataFrame(results)
    df.to_csv(f"{_REPORTS_DIR}/report_{stamp}.csv", index=False)

    shortlisted = len(shortlisted_paths)

    # One Markdown paragraph per entry so each renders on its own line.
    # round() hides float noise such as 0.55 * 100 == 55.00000000000001.
    summary = "\n\n".join([
        "**✅ Screening Completed Successfully!**",
        f"**Model Used:** {model_option}",
        f"**Total CVs Processed:** {len(files)}",
        f"**Shortlisted:** {shortlisted}",
        f"**Threshold:** {round(threshold * 100, 2)}%",
        "**Note:** Qualifications and Courses are now shown in separate columns.",
    ])

    zip_path = None
    if shortlisted > 0:
        zip_path = f"outputs/shortlisted_cvs_{stamp}.zip"
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            # Archive only this run's copies; the folder itself may still hold
            # files from earlier runs. fromkeys drops repeated basenames.
            for path in dict.fromkeys(shortlisted_paths):
                zipf.write(path, os.path.basename(path))

    return (
        results,
        summary.strip(),
        "outputs/selected_cvs/" if shortlisted > 0 else "",
        zip_path,
    )