srinikesh1432 committed on
Commit
b2cba08
·
verified ·
1 Parent(s): f0afd6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -62
app.py CHANGED
@@ -1,62 +1,163 @@
1
- import gradio as gr
2
- import os
3
- from pathlib import Path
4
- import numpy as np
5
- import pandas as pd
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sklearn.metrics.pairwise import cosine_similarity
8
- import PyPDF2, docx
9
-
10
# --- Configuration --------------------------------------------------
# Directory scanned (recursively) for resume files (pdf/docx/doc/txt).
RESUME_DIR = Path("resumes")
# Vocabulary cap for the TF-IDF vectorizer.
MAX_FEATURES = 20000
# ---------------------------------------------------------------------
14
-
15
def extract_text(path):
    """Return the plain text of a resume file.

    Supports PDF (via PyPDF2), DOCX/DOC (via python-docx), and falls back
    to reading anything else as UTF-8 text.

    Args:
        path: pathlib.Path to the resume file.

    Returns:
        Extracted text as a single string (may be empty, e.g. for
        image-only PDFs).
    """
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        # Fix: the original called open(path, "rb") without ever closing
        # the handle; a context manager closes it even if PyPDF2 raises.
        with open(path, "rb") as fh:
            pdf = PyPDF2.PdfReader(fh)
            # extract_text() can return None for pages with no text layer.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    elif suffix in (".docx", ".doc"):
        # NOTE(review): python-docx only parses .docx; a legacy .doc will
        # raise here and be skipped by the caller's try/except.
        document = docx.Document(path)
        return "\n".join(p.text for p in document.paragraphs)
    else:
        return path.read_text(encoding="utf-8", errors="ignore")
24
-
25
def load_resumes():
    """Scan RESUME_DIR recursively and extract text from every resume.

    Returns:
        (names, texts): parallel lists of file names and their
        whitespace-normalized, lower-cased contents. Files that fail to
        parse are silently skipped (best effort).
    """
    supported = {".pdf", ".docx", ".doc", ".txt"}
    names, texts = [], []
    for candidate in RESUME_DIR.glob("**/*"):
        if candidate.suffix.lower() not in supported:
            continue
        try:
            raw = extract_text(candidate)
            # Collapse all runs of whitespace and lower-case for TF-IDF.
            texts.append(" ".join(raw.lower().split()))
            names.append(candidate.name)
        except Exception:
            continue  # unreadable/corrupt file: skip, keep going
    return names, texts
36
-
37
# Build the TF-IDF index over the resume corpus once, at import time;
# match_resume() then only pays for transforming the query.
filenames, texts = load_resumes()
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=MAX_FEATURES,
)
X = vectorizer.fit_transform(texts)
40
-
41
def match_resume(job_description):
    """Rank the indexed resumes against a job description.

    Args:
        job_description: free-text job posting.

    Returns:
        pandas.DataFrame of the top 10 resumes with the raw cosine match
        and a sigmoid-normalized confidence, sorted by match descending.
    """
    normalized = " ".join(job_description.lower().split())
    query_vec = vectorizer.transform([normalized])
    scores = cosine_similarity(query_vec, X).flatten()
    # Standardize the scores, then squash through a sigmoid so the
    # confidence column is comparable across queries; `or 1e-6` guards
    # against a zero standard deviation.
    mu = scores.mean()
    sigma = scores.std() or 1e-6
    confidence = 1 / (1 + np.exp(-((scores - mu) / sigma)))
    table = pd.DataFrame({
        "Resume": filenames,
        "Match (%)": (scores * 100).round(2),
        "Confidence (%)": (confidence * 100).round(2),
    })
    return table.sort_values("Match (%)", ascending=False).head(10)
53
-
54
# Wire the matcher into a simple single-input Gradio app.
iface = gr.Interface(
    fn=match_resume,
    inputs=gr.Textbox(lines=6, label="Paste Job Description"),
    outputs=gr.Dataframe(label="Top Matching Resumes"),
    title="AI Resume Analyzer + Job Matcher",
    description="Upload your resume dataset and match against a job description using NLP (TF-IDF + Confidence Scoring).",
)

# Launched unconditionally (no __main__ guard), matching the original.
iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import gradio as gr
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from sentence_transformers import SentenceTransformer
8
+ from PyPDF2 import PdfReader
9
+ import docx
10
+ import re
11
+ from collections import Counter
12
+
13
# -------------------------
# CONFIG
# -------------------------
# Folder containing the reference resume dataset (~288 files).
DATASET_FOLDER = "resumes"
# Number of nearest resumes returned per uploaded resume.
TOP_K = 3
18
+
19
+ # -------------------------
20
+ # HELPER FUNCTIONS
21
+ # -------------------------
22
+
23
def extract_text_from_pdf(file):
    """Best-effort extraction of all page text from a PDF.

    Args:
        file: filesystem path or binary file object accepted by
            PyPDF2.PdfReader.

    Returns:
        Concatenated page text (each page followed by a space), or the
        partial/empty text accumulated before a parse failure.
    """
    text = ""
    try:
        reader = PdfReader(file)
        for page in reader.pages:
            chunk = page.extract_text()
            if chunk:
                text += chunk + " "
    except Exception:
        # Fix: narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit); extraction stays best-effort.
        pass
    return text
34
+
35
def extract_text_from_docx(file):
    """Best-effort extraction of paragraph text from a .docx file.

    Args:
        file: filesystem path or file object accepted by docx.Document.

    Returns:
        Space-joined paragraph text, or "" when the document cannot be
        parsed.
    """
    text = ""
    try:
        doc = docx.Document(file)
        text = " ".join(para.text for para in doc.paragraphs)
    except Exception:
        # Fix: narrowed from a bare `except:`; keep the best-effort
        # contract of returning "" on failure.
        pass
    return text
43
+
44
def extract_text(file):
    """Extract text from an uploaded resume.

    Accepts either a file-like object (with a ``name`` attribute, as
    Gradio's File component historically provided) or a plain filesystem
    path string (what newer Gradio versions pass) — the original silently
    returned "" for path strings because ``file.seek`` raised.

    Args:
        file: uploaded file object or path string.

    Returns:
        Extracted text, or "" for unsupported/unreadable input.
    """
    # Resolve the name used to sniff the extension.
    if hasattr(file, "name"):
        ref = file.name
    elif isinstance(file, str):
        ref = file
    else:
        ref = None
    # No reference name at all: assume plain text (original behavior).
    ext = ref.rsplit(".", 1)[-1].lower() if ref is not None else "txt"
    if ext == "pdf":
        return extract_text_from_pdf(file)
    elif ext == "docx":
        return extract_text_from_docx(file)
    elif ext == "txt":
        try:
            if isinstance(file, str):
                # Bare path: open and read it directly.
                with open(file, "r", encoding="utf-8", errors="ignore") as fh:
                    return fh.read()
            file.seek(0)
            data = file.read()
            # Binary-mode file objects yield bytes; text-mode yield str.
            return data.decode("utf-8") if isinstance(data, bytes) else data
        except (OSError, UnicodeDecodeError, ValueError, AttributeError):
            # Narrowed from a bare `except:`; unreadable input -> "".
            return ""
    else:
        return ""
58
+
59
def load_resume_dataset(folder_path):
    """Load every readable resume directly under *folder_path*.

    Supported extensions: pdf, docx, txt. Unreadable or empty files are
    skipped so one corrupt resume cannot abort the whole index build.

    Args:
        folder_path: directory containing the resume files.

    Returns:
        (names, resumes): parallel lists of base file names and raw text,
        in sorted path order (deterministic, unlike raw glob order, so the
        embedding matrix rows are stable across runs).
    """
    names, resumes = [], []
    for path in sorted(glob.glob(os.path.join(folder_path, "*"))):
        # splitext is robust for dot-less names, unlike split('.')[-1].
        ext = os.path.splitext(path)[1].lstrip(".").lower()
        text = ""
        try:
            if ext == "pdf":
                text = extract_text_from_pdf(path)
            elif ext == "docx":
                text = extract_text_from_docx(path)
            elif ext == "txt":
                with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                    text = fh.read()
        except Exception:
            # Fix: narrowed from a bare `except:`; skip unparseable files.
            continue
        if text.strip():
            resumes.append(text)
            names.append(os.path.basename(path))
    return names, resumes
80
+
81
+ # -------------------------
82
+ # DYNAMIC JOB ROLE EXTRACTION
83
+ # -------------------------
84
+
85
def infer_job_from_text(text):
    """Guess a probable job title/role from resume text.

    Heuristic: scan the first five lines for capitalized phrases and
    return the most frequent one; fall back to "Other" when none match.
    """
    title_pattern = re.compile(r'\b[A-Z][a-zA-Z &/-]{2,}\b')
    candidates = []
    for line in text.split("\n")[:5]:
        candidates.extend(title_pattern.findall(line))
    if not candidates:
        # No capitalized phrase found in the header lines.
        return "Other"
    # Counter.most_common is insertion-ordered for ties, so the earliest
    # phrase wins when counts are equal — same behavior as the original.
    return Counter(candidates).most_common(1)[0][0]
105
+
106
# -------------------------
# LOAD MODEL & DATASET
# -------------------------

# The embedding model and resume corpus are built once at import time;
# each Gradio request then only pays for encoding the uploaded resume.
st_model = SentenceTransformer('all-MiniLM-L6-v2')
resume_names, resume_texts = load_resume_dataset(DATASET_FOLDER)
resume_embeddings = st_model.encode(resume_texts, convert_to_numpy=True)

# Map each dataset resume to a heuristically inferred role label.
resume_roles = {
    name: infer_job_from_text(body)
    for name, body in zip(resume_names, resume_texts)
}
116
+
117
+ # -------------------------
118
+ # MATCH FUNCTION
119
+ # -------------------------
120
+
121
def match_resume(file):
    """Match an uploaded resume against the indexed resume dataset.

    Args:
        file: uploaded resume (PDF/DOCX/TXT) from the Gradio File input.

    Returns:
        pandas.DataFrame with the TOP_K most similar dataset resumes, the
        role inferred for each, and a cosine-similarity confidence score.
        On failure, a single-row DataFrame with an "Error" column.
    """
    input_text = extract_text(file)
    if not input_text.strip():
        return pd.DataFrame([{"Error": "Could not extract text from this resume."}])

    # Fix: with an empty dataset, cosine_similarity on an empty embedding
    # matrix raises; report it instead of crashing the request.
    if len(resume_names) == 0:
        return pd.DataFrame([{"Error": "Resume dataset is empty."}])

    input_emb = st_model.encode([input_text], convert_to_numpy=True)
    sims = cosine_similarity(input_emb, resume_embeddings)[0]
    # argsort is ascending, so take the tail and reverse it to get the
    # TOP_K highest similarities, best first (fewer if the corpus is small).
    top_indices = sims.argsort()[-TOP_K:][::-1]

    results = []
    for idx in top_indices:
        matched_name = resume_names[idx]
        results.append({
            "Matched Resume": matched_name,
            "Recommended Job": resume_roles[matched_name],
            "Confidence Score": f"{sims[idx] * 100:.2f}%",
        })
    return pd.DataFrame(results)
141
+
142
# -------------------------
# GRADIO UI
# -------------------------

# Light cosmetic overrides for the default Gradio theme.
css = """
body {background-color: #f7f9fc;}
h1 {color: #333; text-align: center;}
.gr-button {background-color: #4CAF50; color: white;}
"""

# NOTE(review): `title` is defined but never passed to gr.Interface (the
# `title=` kwarg below uses a plain string); kept for compatibility.
title = "<h1>AI Resume Analyzer & Job Matcher</h1>"

iface = gr.Interface(
    fn=match_resume,
    inputs=gr.File(label="Upload Your Resume (PDF, DOCX, TXT)"),
    outputs=gr.Dataframe(label="Top Job Matches"),
    title="AI Resume Analyzer & Job Matcher",
    description="Upload a resume to get top 3 job recommendations with confidence scores.",
    css=css,
)

if __name__ == "__main__":
    iface.launch()