File size: 4,734 Bytes
04b4b5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os
import glob
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
import docx
import re
from collections import Counter

# -------------------------
# CONFIG
# -------------------------
DATASET_FOLDER = "resumes"  # Folder with 288 resumes; scanned once at import time by load_resume_dataset()
TOP_K = 3                   # Number of top matches returned by match_resume()

# -------------------------
# HELPER FUNCTIONS
# -------------------------

def extract_text_from_pdf(file):
    """Best-effort extraction of text from every page of a PDF.

    Accepts anything PdfReader accepts (path or file-like object).
    Returns the page texts concatenated with trailing spaces, or ""
    when the PDF cannot be parsed at all.
    """
    text = ""
    try:
        reader = PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            # extract_text() can return None (e.g. image-only pages).
            if page_text:
                text += page_text + " "
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate;
    # any parse failure keeps the best-effort "" result.
    except Exception:
        pass
    return text

def extract_text_from_docx(file):
    """Best-effort extraction of text from a .docx document.

    Joins all paragraph texts with single spaces. Returns "" when the
    document cannot be opened or parsed.
    """
    text = ""
    try:
        doc = docx.Document(file)
        text = " ".join(para.text for para in doc.paragraphs)
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
    except Exception:
        pass
    return text

def extract_text(file):
    """Extract plain text from an uploaded file-like object.

    The format is chosen from the extension of ``file.name`` (pdf, docx
    or txt); objects without a ``name`` attribute are treated as txt.
    Returns "" for unsupported formats or on any extraction failure so
    callers can surface a friendly error instead of crashing.
    """
    if hasattr(file, "name"):
        # splitext is robust for names with no dot or multiple dots
        # (the old split('.')[-1] returned the whole name when dotless).
        ext = os.path.splitext(file.name)[1].lstrip(".").lower()
    else:
        ext = "txt"

    if ext == "pdf":
        return extract_text_from_pdf(file)
    if ext == "docx":
        return extract_text_from_docx(file)
    if ext == "txt":
        try:
            file.seek(0)
            data = file.read()
            # Text-mode handles already yield str; binary handles need decoding.
            return data.decode("utf-8") if isinstance(data, bytes) else data
        except Exception:  # unreadable or non-UTF-8 content -> treat as empty
            return ""
    return ""

def load_resume_dataset(folder_path):
    """Load every readable resume found directly inside *folder_path*.

    Supports .pdf, .docx and .txt files; files of other types, files
    that fail to parse, and files with only whitespace are skipped.
    Returns parallel lists ``(file_names, extracted_texts)``. Paths are
    sorted so dataset/embedding order is deterministic across runs
    (glob.glob order is platform-dependent).
    """
    resumes = []
    names = []
    for path in sorted(glob.glob(os.path.join(folder_path, "*"))):
        ext = os.path.splitext(path)[1].lstrip(".").lower()
        text = ""
        try:
            if ext == "pdf":
                text = extract_text_from_pdf(path)
            elif ext == "docx":
                text = extract_text_from_docx(path)
            elif ext == "txt":
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
        # Skip unreadable files rather than abort the whole dataset load.
        except Exception:
            continue
        if text.strip():
            resumes.append(text)
            names.append(os.path.basename(path))
    return names, resumes

# -------------------------
# DYNAMIC JOB ROLE EXTRACTION
# -------------------------

def infer_job_from_text(text):
    """Guess a probable job/role title from resume text.

    Heuristic: scan the first five lines for capitalized phrases and
    return the most frequent one; if none are found, return "Other".
    """
    title_pattern = re.compile(r'\b[A-Z][a-zA-Z &/-]{2,}\b')
    candidates = [
        phrase
        for line in text.split("\n")[:5]
        for phrase in title_pattern.findall(line)
    ]
    if not candidates:
        # No capitalized phrase near the top of the resume.
        return "Other"
    return Counter(candidates).most_common(1)[0][0]

# -------------------------
# LOAD MODEL & DATASET
# -------------------------

# NOTE(review): everything below runs at import time — the first run
# downloads the model, and encoding the whole dataset can take a while.
st_model = SentenceTransformer('all-MiniLM-L6-v2')  # compact sentence-embedding model
resume_names, resume_texts = load_resume_dataset(DATASET_FOLDER)
# One embedding row per dataset resume; match_resume() compares uploads against these.
resume_embeddings = st_model.encode(resume_texts, convert_to_numpy=True)

# Map resume file name -> heuristically inferred job title (see infer_job_from_text).
resume_roles = {name: infer_job_from_text(text) for name, text in zip(resume_names, resume_texts)}

# -------------------------
# MATCH FUNCTION
# -------------------------

def match_resume(file):
    """Embed an uploaded resume and return the TOP_K nearest dataset resumes.

    Returns a pandas DataFrame with the matched file name, its inferred
    job title and a percentage confidence score — or a single-row error
    frame when no text could be extracted from the upload.
    """
    uploaded_text = extract_text(file)
    if not uploaded_text.strip():
        return pd.DataFrame([{"Error": "Could not extract text from this resume."}])

    query_embedding = st_model.encode([uploaded_text], convert_to_numpy=True)
    scores = cosine_similarity(query_embedding, resume_embeddings)[0]
    # Indices of the TOP_K highest-similarity resumes, best first.
    best_indices = scores.argsort()[-TOP_K:][::-1]

    rows = [
        {
            "Matched Resume": resume_names[i],
            "Recommended Job": resume_roles[resume_names[i]],
            "Confidence Score": f"{scores[i]*100:.2f}%",
        }
        for i in best_indices
    ]
    return pd.DataFrame(rows)

# -------------------------
# GRADIO UI
# -------------------------
# Light cosmetic styling injected into the Gradio page.
css = """
body {background-color: #f7f9fc;}
h1 {color: #333; text-align: center;}
.gr-button {background-color: #4CAF50; color: white;}
"""

# (An unused `title` HTML snippet was removed here — the Interface below
# already receives its title as a plain string.)

iface = gr.Interface(
    fn=match_resume,
    inputs=gr.File(label="Upload Your Resume (PDF, DOCX, TXT)"),
    outputs=gr.Dataframe(label="Top Job Matches"),
    title="AI Resume Analyzer & Job Matcher",
    description="Upload a resume to get top 3 job recommendations with confidence scores.",
    css=css,
)

if __name__ == "__main__":
    iface.launch()