# Hugging Face Space artifact header (was plain text that broke the Python file):
# srinikesh1432 — "Update app.py" — commit 04b4b5c (verified)
import os
import glob
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
import docx
import re
from collections import Counter
# -------------------------
# CONFIG
# -------------------------
# Directory scanned once at startup for the reference resume corpus.
DATASET_FOLDER = "resumes" # Folder with 288 resumes
# Number of best matches returned per uploaded resume.
TOP_K = 3 # Top 3 recommendations
# -------------------------
# HELPER FUNCTIONS
# -------------------------
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Path string or binary file-like object accepted by ``PdfReader``.

    Returns:
        str: Page texts joined with single spaces, or "" when the PDF
        cannot be parsed (deliberate best-effort: the caller filters
        empty results instead of handling errors).
    """
    parts = []
    try:
        reader = PdfReader(file)
        for page in reader.pages:
            # extract_text() may return None (e.g. image-only pages).
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
    except Exception:
        # Narrowed from a bare `except:`; corrupt/encrypted PDFs yield "".
        return ""
    return " ".join(parts)
def extract_text_from_docx(file):
    """Return all paragraph text of a .docx file.

    Args:
        file: Path string or file-like object accepted by ``docx.Document``.

    Returns:
        str: Paragraph texts joined with single spaces, "" on any
        read/parse failure (best-effort by design).
    """
    try:
        doc = docx.Document(file)
    except Exception:
        # Narrowed from a bare `except:`; unreadable documents yield "".
        return ""
    return " ".join(para.text for para in doc.paragraphs)
def extract_text(file):
    """Dispatch text extraction based on the upload's file extension.

    Args:
        file: Either a filesystem path (str) or a file-like object with a
            ``name`` attribute, as Gradio's File component provides either
            depending on version/configuration.

    Returns:
        str: Extracted text, or "" for unsupported or unreadable files.
    """
    # Newer Gradio versions pass a plain filepath string; older ones pass
    # a file object whose .name holds the path.
    name = file if isinstance(file, str) else getattr(file, "name", "")
    ext = name.rsplit(".", 1)[-1].lower() if name else "txt"
    if ext == "pdf":
        return extract_text_from_pdf(file)
    if ext == "docx":
        return extract_text_from_docx(file)
    if ext == "txt":
        try:
            if isinstance(file, str):
                with open(file, "r", encoding="utf-8", errors="ignore") as f:
                    return f.read()
            file.seek(0)
            return file.read().decode("utf-8")
        except Exception:
            # Narrowed from a bare `except:`; undecodable input yields "".
            return ""
    return ""
def load_resume_dataset(folder_path):
    """Load every readable resume file from ``folder_path``.

    Supports .pdf, .docx and .txt; files with other extensions produce
    empty text and are filtered out. Paths are processed in sorted order
    so the corpus (and hence the embedding matrix indices) is
    deterministic across runs — ``glob`` alone gives arbitrary order.

    Args:
        folder_path: Directory containing resume files.

    Returns:
        tuple[list[str], list[str]]: Parallel lists of file basenames and
        their extracted texts, only for files that yielded non-empty text.
    """
    names, resumes = [], []
    for path in sorted(glob.glob(os.path.join(folder_path, "*"))):
        ext = path.rsplit(".", 1)[-1].lower()
        text = ""
        try:
            if ext == "pdf":
                text = extract_text_from_pdf(path)
            elif ext == "docx":
                text = extract_text_from_docx(path)
            elif ext == "txt":
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
        except Exception:
            # Narrowed from a bare `except:`: skip unreadable files
            # instead of aborting the whole load.
            continue
        if text.strip():
            names.append(os.path.basename(path))
            resumes.append(text)
    return names, resumes
# -------------------------
# DYNAMIC JOB ROLE EXTRACTION
# -------------------------
def infer_job_from_text(text):
    """Guess a probable job title/role from raw resume text.

    Heuristic: scan only the first five lines for capitalized phrases
    (e.g. "Software Engineer") and pick the most frequent one.

    Args:
        text: Full resume text.

    Returns:
        str: The most common capitalized phrase found, or "Other" when
        the opening lines contain none.
    """
    title_pattern = re.compile(r'\b[A-Z][a-zA-Z &/-]{2,}\b')
    candidates = [
        phrase
        for line in text.split("\n")[:5]
        for phrase in title_pattern.findall(line)
    ]
    if not candidates:
        return "Other"
    # most_common(1) keeps first-seen order on ties, matching scan order.
    (best, _count), = Counter(candidates).most_common(1)
    return best
# -------------------------
# LOAD MODEL & DATASET
# -------------------------
# Everything below runs once at import time (model download + corpus embed),
# so app startup is slow but each request only embeds the uploaded resume.
st_model = SentenceTransformer('all-MiniLM-L6-v2')
# Parallel lists: corpus file basenames and their extracted texts.
resume_names, resume_texts = load_resume_dataset(DATASET_FOLDER)
# Pre-computed corpus embeddings, compared against each upload via cosine similarity.
resume_embeddings = st_model.encode(resume_texts, convert_to_numpy=True)
# Build dynamic job roles dict
# Map each corpus file name to a heuristically inferred role string.
resume_roles = {name: infer_job_from_text(text) for name, text in zip(resume_names, resume_texts)}
# -------------------------
# MATCH FUNCTION
# -------------------------
def match_resume(file):
    """Match an uploaded resume against the corpus and recommend jobs.

    Args:
        file: The uploaded file (path string or file-like) from Gradio.

    Returns:
        pandas.DataFrame: Up to TOP_K rows with columns "Matched Resume",
        "Recommended Job" and "Confidence Score" (best match first), or a
        single-row frame with an "Error" column when matching is impossible.
    """
    input_text = extract_text(file)
    if not input_text.strip():
        return pd.DataFrame([{"Error": "Could not extract text from this resume."}])
    if not resume_texts:
        # Guard: with an empty corpus, cosine_similarity would raise.
        return pd.DataFrame([{"Error": "Resume dataset is empty."}])
    input_emb = st_model.encode([input_text], convert_to_numpy=True)
    sims = cosine_similarity(input_emb, resume_embeddings)[0]
    # argsort is ascending: take the last TOP_K indices, reversed so the
    # highest-similarity match comes first.
    top_indices = sims.argsort()[-TOP_K:][::-1]
    results = [
        {
            "Matched Resume": resume_names[idx],
            "Recommended Job": resume_roles[resume_names[idx]],
            "Confidence Score": f"{sims[idx] * 100:.2f}%",
        }
        for idx in top_indices
    ]
    return pd.DataFrame(results)
# -------------------------
# GRADIO UI
# -------------------------
# Cosmetic CSS overrides applied to the Gradio page.
css = """
body {background-color: #f7f9fc;}
h1 {color: #333; text-align: center;}
.gr-button {background-color: #4CAF50; color: white;}
"""
# NOTE(review): `title` below is never referenced in this file — the
# Interface uses its own title= string literal. Kept as-is; likely safe to remove.
title = "<h1>AI Resume Analyzer & Job Matcher</h1>"
# Single-input/single-output app: upload one file, receive a DataFrame of matches.
iface = gr.Interface(
    fn=match_resume,
    inputs=gr.File(label="Upload Your Resume (PDF, DOCX, TXT)"),
    outputs=gr.Dataframe(label="Top Job Matches"),
    title="AI Resume Analyzer & Job Matcher",
    description="Upload a resume to get top 3 job recommendations with confidence scores.",
    css=css,
)
# Launch the server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()