| import os |
| import fitz |
| from docx import Document |
| from sentence_transformers import SentenceTransformer, util |
| from transformers import pipeline |
| import pandas as pd |
| from datetime import datetime |
| import zipfile |
| import shutil |
| import re |
|
|
| |
# UI-facing option label -> Hugging Face model identifier passed to
# SentenceTransformer when the option is first requested.
MODELS = {
    "Fast (MiniLM)": "sentence-transformers/all-MiniLM-L6-v2",
    "Balanced (Recommended)": "sentence-transformers/all-mpnet-base-v2",
    "High Accuracy": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
}

# Process-wide caches so each model is constructed at most once.
# loaded_models maps an option label (a key of MODELS) to its loaded model.
loaded_models = {}
# Zero-shot classification pipeline, created lazily on first use.
skills_classifier = None
|
|
def get_model(model_name: str):
    """Return the sentence-embedding model for a ``MODELS`` option label.

    Models are constructed on first request and cached in ``loaded_models``,
    so repeated calls with the same label reuse the same instance.

    Args:
        model_name: One of the keys of ``MODELS`` (the option label, not the
            Hugging Face identifier).

    Returns:
        The cached or newly constructed ``SentenceTransformer``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``. The message
            lists the valid options.
    """
    if model_name in loaded_models:
        return loaded_models[model_name]

    # Validate before announcing the load so an invalid option fails fast
    # with an actionable message instead of a bare KeyError.
    if model_name not in MODELS:
        valid_options = ", ".join(repr(option) for option in MODELS)
        raise KeyError(
            f"Unknown model option {model_name!r}; expected one of: {valid_options}"
        )

    print(f"Loading model: {model_name}")
    loaded_models[model_name] = SentenceTransformer(MODELS[model_name])
    return loaded_models[model_name]
|
|
def get_skills_classifier():
    """Return the shared zero-shot classification pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``skills_classifier`` variable; later calls return that same object.
    """
    global skills_classifier
    if skills_classifier is not None:
        return skills_classifier

    skills_classifier = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
    )
    return skills_classifier
|
|
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a PDF or Word document.

    The parser is chosen from the file extension (case-insensitive):
    ``.pdf`` is read page by page with ``fitz``; ``.docx``/``.doc`` are read
    with ``docx.Document``, collecting non-empty paragraphs followed by
    non-empty table cells.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or ``""``
        when the extension is unsupported or extraction fails for any reason
        (the error is printed, never raised).
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            doc = fitz.open(file_path)
            try:
                text = "\n".join(page.get_text("text") for page in doc)
            finally:
                # Release the file handle even when a page fails to parse.
                doc.close()
            return text.strip()

        if ext in (".docx", ".doc"):
            # NOTE(review): python-docx targets .docx; a legacy binary .doc
            # will most likely raise here and fall through to the handler
            # below, yielding "" - confirm whether .doc needs real support.
            doc = Document(file_path)
            full_text = [
                para.text.strip() for para in doc.paragraphs if para.text.strip()
            ]
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        cell_text = cell.text.strip()
                        if cell_text:
                            full_text.append(cell_text)
            return "\n".join(full_text).strip()

        # Unsupported extension.
        return ""
    except Exception as e:
        # Deliberate best-effort: one unreadable CV must not abort a batch.
        print(f"Extraction error: {e}")
        return ""
|
|
def extract_qualifications(text: str):
    """Extract degree / qualification keywords from resume text.

    Matching is case-insensitive and whole-word. Each match is upper-cased
    and stripped of dots (so ``b.sc`` and ``BSc`` both become ``BSC``), then
    de-duplicated.

    Args:
        text: Raw resume text; may be empty or None.

    Returns:
        A comma-separated string of the distinct degrees in order of first
        appearance, or ``"Not mentioned"`` when the text is empty or contains
        no recognised degree keyword.
    """
    if not text:
        return "Not mentioned"

    # NOTE(review): short entries such as "be", "ba" and "bit" are also
    # ordinary English words, so they can produce false positives.
    degree_pattern = r'\b(bba|bs|be|bit|bsc|ba|b\.?a\.?|b\.?sc\.?|bachelor|master|mba|msc|m\.?sc\.?|mphil|phd|doctorate|intermediate|matric)\b'

    matches = re.findall(degree_pattern, text.lower())
    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would make the output order vary between interpreter runs.
    degrees = list(dict.fromkeys(m.upper().replace('.', '') for m in matches))

    return ", ".join(degrees) if degrees else "Not mentioned"
|
|
def extract_courses(text: str):
    """Extract course, certification and training mentions from resume text.

    The text is split on sentence punctuation and newlines. A fragment is
    kept when it contains one of the course keywords (case-insensitive) and,
    after collapsing runs of whitespace, is longer than 15 and shorter than
    110 characters.

    Args:
        text: Raw resume text; may be empty or None.

    Returns:
        Up to eight distinct fragments, in order of appearance, joined with
        ``" | "``. Returns ``"Not mentioned"`` for empty input and
        ``"No courses/certifications mentioned"`` when nothing qualifies.
    """
    if not text:
        return "Not mentioned"

    course_keywords = ("course", "certification", "certificate", "certified", "training", "workshop", "bootcamp", "specialization", "diploma")

    courses = []
    for sentence in re.split(r'[.!?;\n]', text):
        sent = sentence.strip()
        sent_lower = sent.lower()
        if not any(kw in sent_lower for kw in course_keywords):
            continue
        clean_sent = re.sub(r'\s+', ' ', sent)
        # Length bounds drop bare headings and whole paragraphs.
        if 15 < len(clean_sent) < 110:
            courses.append(clean_sent)

    # De-duplicate in first-seen order, then cap the list at eight entries.
    courses = list(dict.fromkeys(courses))[:8]
    return " | ".join(courses) if courses else "No courses/certifications mentioned"
|
|
# Keywords are anchored at a word start so they no longer fire in the middle
# of unrelated words ("labor" in "collaborate", "bs" in "jobs", "be" in
# "member"). Longer keywords stay open-ended so inflections such as
# "drivers", "bachelor's" or "degrees" still match; the short degree
# abbreviations must match as whole words.
_LOW_EDU_JOB_RE = re.compile(
    r"\b(?:guard|security|watchman|peon|driver|helper|labor)"
)
_DEGREE_JOB_RE = re.compile(
    r"\b(?:(?:bba|bs|be|mba|phd)\b"
    r"|bachelor|master|degree|graduation|qualification|engineering)"
)


def get_required_qualifications(job_description: str):
    """Decide from a job description whether a degree is mandatory.

    Low-education role keywords take precedence over degree keywords.

    Args:
        job_description: Free-text job description; may be empty or None.

    Returns:
        A ``(degree_required, note)`` tuple:
        ``(False, "Degree not required")`` when a low-education role keyword
        is present, ``(True, "Degree required")`` when a degree keyword is
        present, otherwise ``(False, "Qualification preferred")``.
    """
    jd_lower = (job_description or "").lower()

    if _LOW_EDU_JOB_RE.search(jd_lower):
        return False, "Degree not required"

    # NOTE(review): "be" is also the English verb, so a description that
    # says "you will be ..." still counts as requiring a degree. Resolving
    # that needs a product decision (e.g. case-sensitive "BE").
    if _DEGREE_JOB_RE.search(jd_lower):
        return True, "Degree required"
    return False, "Qualification preferred"
|
|
def extract_skills(resume_text: str):
    """Tag a resume with skills using the zero-shot classifier.

    Only the first 2000 characters of the resume are classified, against a
    fixed list of candidate labels in multi-label mode. Labels scoring above
    0.40 are reported.

    Args:
        resume_text: Extracted resume text; may be empty or None.

    Returns:
        A comma-separated string of matching labels, or one of the sentinel
        strings ``"Insufficient content"`` (fewer than 100 characters),
        ``"Basic skills"`` (no label passed the score cut-off) or
        ``"Skills extraction failed"`` (the classifier raised).
    """
    if not resume_text or len(resume_text) < 100:
        return "Insufficient content"

    classifier = get_skills_classifier()
    labels = ["Python", "Machine Learning", "Deep Learning", "Data Science", "SQL", "FastAPI", "AWS", "Docker", "Communication", "Leadership"]
    try:
        result = classifier(resume_text[:2000], labels, multi_label=True)
        skills = [label for label, score in zip(result['labels'], result['scores']) if score > 0.40]
        return ", ".join(skills[:12]) if skills else "Basic skills"
    except Exception as e:
        # Best-effort: a classifier failure must not abort the screening run,
        # but report it instead of swallowing it (and let KeyboardInterrupt /
        # SystemExit propagate, which a bare except would not).
        print(f"Skills extraction error: {e}")
        return "Skills extraction failed"
|
|
| |
def screen_resumes_backend(job_description: str, cv_folder, model_option: str, threshold: float = 0.65):
    """Score CVs against a job description and collect the shortlisted ones.

    For each CV the text is extracted, qualifications / courses / skills are
    derived, and the cosine similarity between the job-description embedding
    and the resume embedding is computed. A CV is rejected when a degree is
    required but none was found, or when its similarity is below
    ``threshold``; otherwise it is shortlisted and copied to
    ``outputs/selected_cvs``. A CSV report is written to ``outputs/reports``
    and the CVs shortlisted in this call are zipped under ``outputs/``.

    Args:
        job_description: Free-text job description.
        cv_folder: A single CV or a list of CVs. Each item is either a path
            or an object exposing the path as ``.name``.
        model_option: Key of ``MODELS`` selecting the embedding model.
        threshold: Minimum cosine similarity (0-1) for shortlisting.

    Returns:
        A 4-tuple ``(results, summary, selected_dir, zip_path)``:
        ``results`` is a list of per-CV dicts (the report rows), ``summary``
        is a Markdown string, ``selected_dir`` is
        ``"outputs/selected_cvs/"`` or ``""`` when nothing was shortlisted,
        and ``zip_path`` is the archive path or ``None``. When there are no
        CVs or the job description is blank, returns
        ``([], <message>, None, None)`` without loading any model.
    """
    if not cv_folder:
        return [], "No CVs found.", None, None
    # A blank description cannot be matched meaningfully; bail out before
    # loading a model and producing arbitrary scores.
    if not job_description or not job_description.strip():
        return [], "No job description provided.", None, None

    selected_dir = "outputs/selected_cvs"
    reports_dir = "outputs/reports"
    # Status glyphs are written as escapes (U+2705 check mark, U+274C cross
    # mark) so the literals survive re-encoding of this source file.
    shortlisted_status = "\u2705 Shortlisted"
    extraction_failed_status = "\u274c Extraction Failed"

    files = cv_folder if isinstance(cv_folder, list) else [cv_folder]
    model = get_model(model_option)
    results = []
    # Destination paths copied during THIS call; drives the zip contents.
    selected_paths = []

    os.makedirs(selected_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    job_embedding = model.encode(job_description, convert_to_tensor=True)
    degree_required, _qual_note = get_required_qualifications(job_description)

    for file_obj in files:
        source_path = file_obj.name if hasattr(file_obj, 'name') else str(file_obj)
        filename = os.path.basename(source_path)

        resume_text = extract_text_from_file(source_path)

        # Too little text means extraction failed (or the file is empty);
        # record the CV with a zero score instead of embedding noise.
        if len(resume_text) < 100:
            results.append({
                "Candidate": filename,
                "Score (%)": 0.0,
                "Qualifications": extraction_failed_status,
                "Courses": "N/A",
                "Status": extraction_failed_status,
                "Extracted Skills": "N/A",
                "Model Used": model_option
            })
            continue

        qualifications = extract_qualifications(resume_text)
        courses = extract_courses(resume_text)
        skills = extract_skills(resume_text)

        resume_embedding = model.encode(resume_text, convert_to_tensor=True)
        similarity = util.cos_sim(job_embedding, resume_embedding)[0][0].item()

        # The missing-qualification check takes precedence over the score.
        if degree_required and qualifications == "Not mentioned":
            final_status = "\u274c Rejected (Missing Qualification)"
        elif similarity < threshold:
            final_status = "\u274c Rejected (Low Score)"
        else:
            final_status = shortlisted_status
            destination = os.path.join(selected_dir, filename)
            shutil.copy2(source_path, destination)
            selected_paths.append(destination)

        results.append({
            "Candidate": filename,
            "Score (%)": round(similarity * 100, 2),
            "Qualifications": qualifications,
            "Courses": courses,
            "Status": final_status,
            "Extracted Skills": skills,
            "Model Used": model_option
        })

    # One timestamp for both artefacts so the report and the zip of a single
    # run always share the same suffix.
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M')

    df = pd.DataFrame(results)
    df.to_csv(f"{reports_dir}/report_{run_stamp}.csv", index=False)

    shortlisted = sum(1 for r in results if "Shortlisted" in r["Status"])

    summary = "\n".join([
        "**\u2705 Screening Completed Successfully!**",
        f"**Model Used:** {model_option}",
        f"**Total CVs Processed:** {len(files)}",
        f"**Shortlisted:** {shortlisted}",
        # round() hides float noise such as 0.55 * 100 == 55.00000000000001.
        f"**Threshold:** {round(threshold * 100, 2)}%",
        "**Note:** Qualifications and Courses are now shown in separate columns.",
    ])

    zip_path = None
    if shortlisted > 0:
        zip_path = f"outputs/shortlisted_cvs_{run_stamp}.zip"
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Archive only what this run copied: the selected_cvs folder is
            # never cleared, so listing it would also pull in CVs left over
            # from earlier runs. dict.fromkeys drops repeats when two inputs
            # share a file name.
            for path in dict.fromkeys(selected_paths):
                zipf.write(path, os.path.basename(path))

    # NOTE(review): the returned folder may still hold files from previous
    # runs; only the zip is guaranteed to be limited to this run.
    return results, summary, "outputs/selected_cvs/" if shortlisted > 0 else "", zip_path