# Update model.py (commit 97597a5, verified), uploaded by nadish1210
import os
import fitz
from docx import Document
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import pandas as pd
from datetime import datetime
import zipfile
import shutil
import re
# ================== Models ==================
# Human-readable option label -> Hugging Face model id handed to
# SentenceTransformer by get_model().
MODELS = {
    "Fast (MiniLM)": "sentence-transformers/all-MiniLM-L6-v2",
    "Balanced (Recommended)": "sentence-transformers/all-mpnet-base-v2",
    "High Accuracy": "sentence-transformers/multi-qa-mpnet-base-dot-v1"
}
# Cache of already-constructed SentenceTransformer instances, keyed by the
# same labels as MODELS; populated lazily by get_model().
loaded_models = {}
# Zero-shot classification pipeline; stays None until the first call to
# get_skills_classifier() creates it.
skills_classifier = None
def get_model(model_name: str):
    """Return the sentence-embedding model registered under *model_name*.

    The model is constructed on the first request for a given label and kept
    in the module-level ``loaded_models`` cache for later calls.

    Args:
        model_name: A key of ``MODELS``.

    Returns:
        The cached ``SentenceTransformer`` instance for that label.

    Raises:
        KeyError: If *model_name* is not a key of ``MODELS``.
    """
    # Fast path: this label has already been loaded.
    if model_name in loaded_models:
        return loaded_models[model_name]

    print(f"πŸ”„ Loading model: {model_name}")
    loaded_models[model_name] = SentenceTransformer(MODELS[model_name])
    return loaded_models[model_name]
def get_skills_classifier():
    """Return the shared zero-shot classification pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``skills_classifier`` variable so later calls reuse the same object.
    """
    global skills_classifier
    # Reuse the pipeline when a previous call has already created it.
    if skills_classifier is not None:
        return skills_classifier

    skills_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return skills_classifier
def extract_text_from_file(file_path: str) -> str:
    """Return the plain text of a PDF or Word file, or "" if it cannot be read.

    Args:
        file_path: Path of the file; the type is chosen from its extension
            (case-insensitive).

    Returns:
        The extracted text with surrounding whitespace stripped. An empty
        string is returned for unsupported extensions and whenever extraction
        raises (the error is printed, not propagated).
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            # The context manager closes the document even when a page fails
            # to render, so a bad PDF does not leak the open file handle.
            with fitz.open(file_path) as doc:
                text = "\n".join(page.get_text("text") for page in doc)
            return text.strip()

        if ext in (".docx", ".doc"):
            # NOTE(review): ".doc" is routed through python-docx as well; if
            # the library rejects the legacy format, the except below turns
            # that into an empty result.
            doc = Document(file_path)
            full_text = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
            # Table contents are not part of doc.paragraphs, so collect every
            # non-empty cell separately.
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        cell_text = cell.text.strip()
                        if cell_text:
                            full_text.append(cell_text)
            return "\n".join(full_text).strip()

        return ""
    except Exception as e:
        # Deliberate best-effort: one unreadable CV must not abort the batch.
        print(f"Extraction error: {e}")
        return ""
def extract_qualifications(text: str):
    """Extract only Degrees / Qualifications.

    Args:
        text: Resume text; may be empty or None.

    Returns:
        A comma-separated string of the distinct degree tokens found,
        upper-cased with dots removed, in order of first appearance (so the
        result is the same on every run). Returns "Not mentioned" when *text*
        is empty or contains no degree token.
    """
    if not text:
        return "Not mentioned"

    # NOTE(review): matching is done on lower-cased text, so short tokens such
    # as "be" or "ba" also match ordinary words spelled the same way.
    degree_pattern = r'\b(bba|bs|be|bit|bsc|ba|b\.?a\.?|b\.?sc\.?|bachelor|master|mba|msc|m\.?sc\.?|mphil|phd|doctorate|intermediate|matric)\b'
    matches = re.findall(degree_pattern, text.lower())

    # Normalise first ("b.sc" -> "BSC"), then de-duplicate while keeping the
    # order of first appearance; a plain set() would give an arbitrary order.
    normalized = (match.upper().replace('.', '') for match in matches)
    degrees = list(dict.fromkeys(normalized))
    return ", ".join(degrees) if degrees else "Not mentioned"
def extract_courses(text: str):
    """Extract Courses, Certifications & Trainings.

    The text is split on sentence punctuation and newlines; a fragment is kept
    when it contains one of the course keywords (case-insensitive substring
    match) and, after collapsing whitespace, is between 16 and 109 characters
    long.

    Args:
        text: Resume text; may be empty or None.

    Returns:
        Up to eight distinct fragments joined by " | ", in order of
        appearance. "Not mentioned" for empty input, and
        "No courses/certifications mentioned" when nothing qualifies.
    """
    if not text:
        return "Not mentioned"

    course_keywords = ("course", "certification", "certificate", "certified", "training", "workshop", "bootcamp", "specialization", "diploma")
    courses = []
    for fragment in re.split(r'[.!?;\n]', text):
        candidate = fragment.strip()
        lowered = candidate.lower()
        if not any(kw in lowered for kw in course_keywords):
            continue
        clean_sent = re.sub(r'\s+', ' ', candidate)
        if 15 < len(clean_sent) < 110:  # reasonable length
            courses.append(clean_sent)

    # dict.fromkeys removes duplicates but keeps first-seen order.
    courses = list(dict.fromkeys(courses))[:8]  # max 8 courses
    return " | ".join(courses) if courses else "No courses/certifications mentioned"
def get_required_qualifications(job_description: str):
    """Decide from the job description whether a degree is required.

    Keywords are matched as whole words (with an optional plural suffix), not
    as raw substrings, so "labor" no longer fires on "collaborate" and
    "bs"/"be" no longer fire on "jobs"/"member".

    Args:
        job_description: Text of the job posting; None is treated as empty.

    Returns:
        A ``(degree_required, note)`` tuple. Low-education role keywords take
        precedence over degree keywords.
    """
    jd_lower = (job_description or "").lower()
    degree_keywords = ["bba", "bachelor", "bs", "be", "master", "mba", "phd", "degree", "graduation", "qualification", "engineering"]
    low_edu_keywords = ["guard", "security", "watchman", "peon", "driver", "helper", "labor"]

    def mentions_any(keywords, suffix):
        # One alternation per keyword list, anchored on word boundaries.
        alternatives = "|".join(re.escape(kw) for kw in keywords)
        return re.search(rf"\b(?:{alternatives}){suffix}\b", jd_lower) is not None

    # Role nouns may appear as "drivers" or "laborer(s)".
    if mentions_any(low_edu_keywords, r"(?:s|ers?)?"):
        return False, "Degree not required"
    # NOTE(review): "be" still matches the ordinary verb "be" as a whole word;
    # telling it apart from the degree needs case information that the
    # lower-casing above discards.
    if mentions_any(degree_keywords, r"s?"):
        return True, "Degree required"
    return False, "Qualification preferred"
def extract_skills(resume_text: str):
    """Label a resume with skills using the zero-shot classifier.

    Args:
        resume_text: Resume text; only the first 2000 characters are
            classified.

    Returns:
        A comma-separated string of the labels scoring above 0.40,
        "Basic skills" when none do, "Insufficient content" when the text is
        missing or shorter than 100 characters, or "Skills extraction failed"
        when classification raises.
    """
    if not resume_text or len(resume_text) < 100:
        return "Insufficient content"

    classifier = get_skills_classifier()
    labels = ["Python", "Machine Learning", "Deep Learning", "Data Science", "SQL", "FastAPI", "AWS", "Docker", "Communication", "Leadership"]
    try:
        result = classifier(resume_text[:2000], labels, multi_label=True)
        skills = [label for label, score in zip(result['labels'], result['scores']) if score > 0.40]
        return ", ".join(skills[:12]) if skills else "Basic skills"
    except Exception as e:
        # Best-effort: report the failure but keep screening. `Exception`
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print(f"Skills extraction error: {e}")
        return "Skills extraction failed"
# ================== Main Function ==================
def screen_resumes_backend(job_description: str, cv_folder, model_option: str, threshold: float = 0.65):
    """Score CVs against a job description and export the shortlisted ones.

    Each CV is converted to text, embedded with the selected model and
    compared to the job description by cosine similarity. A CV is shortlisted
    when its similarity reaches *threshold* and, if the job description calls
    for a degree, at least one qualification was found in it. Shortlisted
    files are copied to ``outputs/selected_cvs``, a CSV report is written to
    ``outputs/reports`` and the shortlisted files of this run are zipped.

    Args:
        job_description: Text of the job posting.
        cv_folder: One file or a list of files. Each item is either an object
            with a ``name`` attribute holding the path, or a value whose
            ``str()`` is the path.
        model_option: Key of ``MODELS`` selecting the embedding model.
        threshold: Minimum cosine similarity required for shortlisting.

    Returns:
        ``(results, summary, selected_dir, zip_path)`` where *results* is a
        list of per-CV dicts, *summary* is a Markdown string, *selected_dir*
        is ``"outputs/selected_cvs/"`` (or ``""`` when nothing was
        shortlisted) and *zip_path* is the archive path or ``None``. When
        *cv_folder* is empty the result is
        ``([], "No CVs found.", None, None)``.
    """
    if not cv_folder:
        return [], "No CVs found.", None, None

    files = cv_folder if isinstance(cv_folder, list) else [cv_folder]
    model = get_model(model_option)

    selected_dir = "outputs/selected_cvs"
    reports_dir = "outputs/reports"
    os.makedirs(selected_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    job_embedding = model.encode(job_description, convert_to_tensor=True)
    degree_required, _ = get_required_qualifications(job_description)

    results = []
    # Archive name -> copied path for files shortlisted in THIS run only, so
    # the zip never picks up leftovers from earlier runs in selected_dir.
    shortlisted_copies = {}

    for file_obj in files:
        source_path = file_obj.name if hasattr(file_obj, 'name') else str(file_obj)
        filename = os.path.basename(source_path)
        resume_text = extract_text_from_file(source_path)

        # Too little text means the file was unreadable or effectively empty.
        if len(resume_text) < 100:
            results.append({
                "Candidate": filename,
                "Score (%)": 0.0,
                "Qualifications": "❌ Extraction Failed",
                "Courses": "N/A",
                "Status": "❌ Extraction Failed",
                "Extracted Skills": "N/A",
                "Model Used": model_option
            })
            continue

        qualifications = extract_qualifications(resume_text)
        courses = extract_courses(resume_text)
        skills = extract_skills(resume_text)

        resume_embedding = model.encode(resume_text, convert_to_tensor=True)
        similarity = util.cos_sim(job_embedding, resume_embedding)[0][0].item()

        # A missing qualification is checked before the score, so it is the
        # reported reason when both apply.
        if degree_required and qualifications == "Not mentioned":
            final_status = "❌ Rejected (Missing Qualification)"
        elif similarity < threshold:
            final_status = "❌ Rejected (Low Score)"
        else:
            final_status = "βœ… Shortlisted"
            destination = os.path.join(selected_dir, filename)
            shutil.copy2(source_path, destination)
            shortlisted_copies[filename] = destination

        results.append({
            "Candidate": filename,
            "Score (%)": round(similarity * 100, 2),
            "Qualifications": qualifications,
            "Courses": courses,
            "Status": final_status,
            "Extracted Skills": skills,
            "Model Used": model_option
        })

    # One timestamp for both artefacts so the report and zip of a run match.
    stamp = datetime.now().strftime('%Y%m%d_%H%M')
    df = pd.DataFrame(results)
    df.to_csv(f"{reports_dir}/report_{stamp}.csv", index=False)

    shortlisted = sum(1 for r in results if "Shortlisted" in r["Status"])

    # round() keeps "65.0%" for clean values and removes float noise such as
    # 56.99999999999999.
    summary = "\n".join([
        "**βœ… Screening Completed Successfully!**",
        f"**Model Used:** {model_option}",
        f"**Total CVs Processed:** {len(files)}",
        f"**Shortlisted:** {shortlisted}",
        f"**Threshold:** {round(threshold * 100, 2)}%",
        "**Note:** Qualifications and Courses are now shown in separate columns.",
    ])

    zip_path = None
    if shortlisted > 0:
        zip_path = f"outputs/shortlisted_cvs_{stamp}.zip"
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for arcname, copied_path in shortlisted_copies.items():
                zipf.write(copied_path, arcname)

    return results, summary.strip(), "outputs/selected_cvs/" if shortlisted > 0 else "", zip_path