import os
import glob
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
import docx
import re
from collections import Counter
# -------------------------
# CONFIG
# -------------------------
DATASET_FOLDER: str = "resumes" # Folder with 288 resumes (relative path; scanned once at import time)
TOP_K: int = 3 # Top 3 recommendations returned by match_resume
# -------------------------
# HELPER FUNCTIONS
# -------------------------
def extract_text_from_pdf(file):
    """Extract all page text from a PDF, best-effort.

    Parameters:
        file: A binary file object or filesystem path accepted by
            PdfReader (callers pass both: an upload object and a path).

    Returns:
        Concatenated page text (each page followed by a space), or ""
        when nothing could be parsed. If parsing fails partway through,
        the text extracted so far is still returned — same behavior as
        the original accumulate-then-fail loop.
    """
    pages = []
    try:
        reader = PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            # extract_text() may return None for image-only pages.
            if page_text:
                pages.append(page_text)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; corrupt PDFs stay a soft failure.
        pass
    return "".join(p + " " for p in pages)
def extract_text_from_docx(file):
    """Extract all paragraph text from a DOCX file, best-effort.

    Parameters:
        file: A file object or filesystem path accepted by docx.Document.

    Returns:
        Paragraph texts joined with single spaces, or "" when the file
        cannot be parsed as DOCX.
    """
    try:
        document = docx.Document(file)
    except Exception:
        # Narrowed from a bare `except:`; unparseable input yields "".
        return ""
    return " ".join(para.text for para in document.paragraphs)
def extract_text(file):
    """Dispatch text extraction based on the uploaded file's extension.

    Parameters:
        file: Uploaded file object. If it has a ``.name`` attribute the
            extension is read from it; otherwise the content is treated as
            plain text (historical default, preserved).

    Returns:
        Extracted text, or "" for unsupported extensions or read/decode
        failures.
    """
    ext = file.name.rsplit('.', 1)[-1].lower() if hasattr(file, "name") else "txt"
    if ext == "pdf":
        return extract_text_from_pdf(file)
    if ext == "docx":
        return extract_text_from_docx(file)
    if ext == "txt":
        try:
            file.seek(0)  # rewind in case the stream was already consumed
            raw = file.read()
            # Uploads normally yield bytes, but tolerate text-mode objects
            # too (the old code silently returned "" for those).
            return raw.decode("utf-8") if isinstance(raw, bytes) else raw
        except (OSError, ValueError, UnicodeDecodeError, AttributeError):
            # Narrowed from a bare `except:`; unreadable input yields "".
            return ""
    # Unknown extension (including dot-less names): nothing we can do.
    return ""
def load_resume_dataset(folder_path):
    """Load every readable resume under *folder_path*.

    Supports .pdf, .docx and .txt files; other extensions, directories,
    and files whose extracted text is empty/whitespace are skipped.

    Parameters:
        folder_path: Directory containing the resume files.

    Returns:
        (names, resumes): index-aligned lists of file basenames and their
        extracted text.
    """
    names, resumes = [], []
    # Sort for deterministic order: embeddings and roles built downstream
    # are index-aligned with these lists, so ordering must not vary
    # between runs (raw glob() order is filesystem-dependent).
    for path in sorted(glob.glob(os.path.join(folder_path, "*"))):
        if not os.path.isfile(path):
            continue  # skip sub-directories and other non-files
        ext = os.path.splitext(path)[1].lstrip('.').lower()
        text = ""
        try:
            if ext == "pdf":
                text = extract_text_from_pdf(path)
            elif ext == "docx":
                text = extract_text_from_docx(path)
            elif ext == "txt":
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
        except Exception:
            # One unreadable file should not abort the whole dataset load.
            continue
        if text.strip():
            resumes.append(text)
            names.append(os.path.basename(path))
    return names, resumes
# -------------------------
# DYNAMIC JOB ROLE EXTRACTION
# -------------------------
def infer_job_from_text(text):
    """Guess a probable job title/role from raw resume text.

    Heuristic only: scans the first five lines for capitalized phrases
    (words/phrases starting with an uppercase letter, allowing spaces,
    '&', '/', '-') and returns the most frequent one. Falls back to
    "Other" when no such phrase is found.
    """
    title_pattern = re.compile(r'\b[A-Z][a-zA-Z &/-]{2,}\b')
    # Collect every capitalized phrase appearing in the top of the resume.
    candidates = [
        phrase
        for line in text.split("\n")[:5]
        for phrase in title_pattern.findall(line)
    ]
    if not candidates:
        return "Other"
    # Ties resolve to the first-seen phrase (Counter preserves insertion order).
    (best, _count), = Counter(candidates).most_common(1)
    return best
# -------------------------
# LOAD MODEL & DATASET
# -------------------------
# Module-level init: runs once at import time. SentenceTransformer downloads
# the model on first use. NOTE(review): an empty or missing DATASET_FOLDER
# produces empty lists here and an empty embedding matrix — confirm startup
# behavior on a fresh checkout before deploying.
st_model = SentenceTransformer('all-MiniLM-L6-v2')
resume_names, resume_texts = load_resume_dataset(DATASET_FOLDER)
# One embedding row per resume; index-aligned with resume_names/resume_texts.
resume_embeddings = st_model.encode(resume_texts, convert_to_numpy=True)
# Build dynamic job roles dict: resume basename -> heuristically inferred role.
resume_roles = {name: infer_job_from_text(text) for name, text in zip(resume_names, resume_texts)}
# -------------------------
# MATCH FUNCTION
# -------------------------
def match_resume(file):
    """Gradio handler: find the TOP_K dataset resumes most similar to an upload.

    Parameters:
        file: The uploaded resume file (PDF, DOCX, or TXT).

    Returns:
        A pandas DataFrame with one row per match ("Matched Resume",
        "Recommended Job", "Confidence Score" as a percent string), or a
        single-row DataFrame with an "Error" column when nothing useful
        can be computed.
    """
    input_text = extract_text(file)
    if not input_text.strip():
        return pd.DataFrame([{"Error": "Could not extract text from this resume."}])
    if len(resume_texts) == 0:
        # Guard: cosine_similarity raises on an empty embedding matrix,
        # which happens when DATASET_FOLDER is missing or empty.
        return pd.DataFrame([{"Error": "Resume dataset is empty; nothing to match against."}])
    input_emb = st_model.encode([input_text], convert_to_numpy=True)
    sims = cosine_similarity(input_emb, resume_embeddings)[0]
    # Indices of the TOP_K highest similarities, best first. argsort is
    # ascending, so take the tail and reverse it.
    top_indices = sims.argsort()[-TOP_K:][::-1]
    results = []
    for idx in top_indices:
        matched_name = resume_names[idx]
        results.append({
            "Matched Resume": matched_name,
            "Recommended Job": resume_roles[matched_name],
            "Confidence Score": f"{sims[idx] * 100:.2f}%",
        })
    return pd.DataFrame(results)
# -------------------------
# GRADIO UI
# -------------------------
# Custom page styling injected into the Gradio app via the `css` kwarg below.
css = """
body {background-color: #f7f9fc;}
h1 {color: #333; text-align: center;}
.gr-button {background-color: #4CAF50; color: white;}
"""
title = "<h1>AI Resume Analyzer & Job Matcher</h1>"  # NOTE(review): defined but never used — gr.Interface gets a plain-text title below
iface = gr.Interface(
    fn=match_resume,
    inputs=gr.File(label="Upload Your Resume (PDF, DOCX, TXT)"),
    outputs=gr.Dataframe(label="Top Job Matches"),
    title="AI Resume Analyzer & Job Matcher",
    description="Upload a resume to get top 3 job recommendations with confidence scores.",
    css=css,
)
# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()