"""AI Resume Analyzer & Job Matcher.

Loads a folder of reference resumes, embeds them with a sentence-transformer
model, and serves a Gradio UI that matches an uploaded resume (PDF/DOCX/TXT)
against the dataset and returns the TOP_K closest matches with a heuristic
job-role label and a cosine-similarity confidence score.
"""

import glob
import os
import re
from collections import Counter

import docx
import gradio as gr
import numpy as np
import pandas as pd
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------
# CONFIG
# -------------------------
DATASET_FOLDER = "resumes"  # Folder with 288 resumes
TOP_K = 3                   # Top 3 recommendations


# -------------------------
# HELPER FUNCTIONS
# -------------------------
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    ``file`` may be a path or a binary file object (PdfReader accepts both).
    Best-effort: on a corrupt/encrypted PDF, returns whatever text was
    gathered before the failure ("" at worst).
    """
    text = ""
    try:
        reader = PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            # extract_text() returns None for image-only pages — skip those.
            if page_text:
                text += page_text + " "
    except Exception:
        # Best-effort extraction: a broken PDF must not crash the app.
        pass
    return text


def extract_text_from_docx(file):
    """Return the space-joined paragraph text of a .docx file, or ""."""
    try:
        doc = docx.Document(file)
        return " ".join(para.text for para in doc.paragraphs)
    except Exception:
        # Not a readable .docx — treat as empty rather than crashing.
        return ""


def extract_text(file):
    """Dispatch text extraction for an uploaded file by its extension.

    Supports .pdf, .docx and .txt; any other extension yields "".
    Objects without a ``name`` attribute are assumed to be plain text.
    """
    # rsplit handles names containing dots; [-1] alone would return the
    # whole name for extension-less files.
    ext = file.name.rsplit(".", 1)[-1].lower() if hasattr(file, "name") else "txt"
    if ext == "pdf":
        return extract_text_from_pdf(file)
    if ext == "docx":
        return extract_text_from_docx(file)
    if ext == "txt":
        try:
            file.seek(0)
            return file.read().decode("utf-8")
        except Exception:
            # Unreadable / non-UTF-8 upload: degrade to empty text.
            return ""
    return ""


def load_resume_dataset(folder_path):
    """Load every readable resume in ``folder_path``.

    Returns a pair ``(names, resumes)`` of parallel lists: file basenames
    and their extracted text. Files that yield no text are skipped.
    """
    resumes = []
    names = []
    for path in glob.glob(os.path.join(folder_path, "*")):
        text = ""
        ext = path.rsplit(".", 1)[-1].lower()
        try:
            if ext == "pdf":
                text = extract_text_from_pdf(path)
            elif ext == "docx":
                text = extract_text_from_docx(path)
            elif ext == "txt":
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
        except Exception:
            # Skip unreadable files; one bad resume must not abort loading.
            continue
        if text.strip():
            resumes.append(text)
            names.append(os.path.basename(path))
    return names, resumes


# -------------------------
# DYNAMIC JOB ROLE EXTRACTION
# -------------------------
def infer_job_from_text(text):
    """Extract a probable job/role label from resume text.

    Heuristic: scan the first 5 lines for capitalized phrases and return
    the most frequent one; fall back to "Other" when none is found.
    """
    lines = text.split("\n")
    candidate_lines = lines[:5]
    # Capitalized word followed by 2+ letters/spaces/&/- (e.g. "Data Analyst").
    pattern = re.compile(r'\b[A-Z][a-zA-Z &/-]{2,}\b')
    roles = []
    for line in candidate_lines:
        roles.extend(pattern.findall(line))
    if roles:
        # Most common capitalized phrase wins.
        return Counter(roles).most_common(1)[0][0]
    return "Other"


# -------------------------
# LOAD MODEL & DATASET
# -------------------------
st_model = SentenceTransformer('all-MiniLM-L6-v2')

resume_names, resume_texts = load_resume_dataset(DATASET_FOLDER)
resume_embeddings = st_model.encode(resume_texts, convert_to_numpy=True)

# Map each dataset resume to its inferred role label.
resume_roles = {name: infer_job_from_text(text)
                for name, text in zip(resume_names, resume_texts)}


# -------------------------
# MATCH FUNCTION
# -------------------------
def match_resume(file):
    """Match an uploaded resume against the dataset.

    Returns a DataFrame of the TOP_K most similar dataset resumes with
    their inferred roles and cosine-similarity confidence scores, or a
    single-row error frame when extraction fails or the dataset is empty.
    """
    # Guard: with no loaded resumes, cosine_similarity would fail below.
    if not resume_names:
        return pd.DataFrame([{"Error": "No resumes found in the dataset folder."}])

    input_text = extract_text(file)
    if not input_text.strip():
        return pd.DataFrame([{"Error": "Could not extract text from this resume."}])

    input_emb = st_model.encode([input_text], convert_to_numpy=True)
    sims = cosine_similarity(input_emb, resume_embeddings)[0]
    # argsort ascending; take the last TOP_K and reverse for best-first order.
    top_indices = sims.argsort()[-TOP_K:][::-1]

    results = []
    for idx in top_indices:
        matched_resume_name = resume_names[idx]
        similarity_score = sims[idx]
        results.append({
            "Matched Resume": matched_resume_name,
            "Recommended Job": resume_roles[matched_resume_name],
            "Confidence Score": f"{similarity_score*100:.2f}%"
        })
    return pd.DataFrame(results)


# -------------------------
# GRADIO UI
# -------------------------
css = """
body {background-color: #f7f9fc;}
h1 {color: #333; text-align: center;}
.gr-button {background-color: #4CAF50; color: white;}
"""

iface = gr.Interface(
    fn=match_resume,
    inputs=gr.File(label="Upload Your Resume (PDF, DOCX, TXT)"),
    outputs=gr.Dataframe(label="Top Job Matches"),
    title="AI Resume Analyzer & Job Matcher",
    description="Upload a resume to get top 3 job recommendations with confidence scores.",
    css=css,
)

if __name__ == "__main__":
    iface.launch()