SoS13 committed on
Commit
74f28d3
·
verified ·
1 Parent(s): ba6ad42

Upload 10 files

Browse files
Files changed (10) hide show
  1. app.py +136 -0
  2. ats_scorer.pkl +3 -0
  3. clf.pkl +3 -0
  4. encoder.pkl +3 -0
  5. get-pip.py +0 -0
  6. prototypes.pkl +3 -0
  7. requirements.txt +7 -0
  8. tfidf.pkl +3 -0
  9. train_ats_model.py +94 -0
  10. train_model.py +69 -0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Imports grouped per PEP 8 (stdlib, then third-party); the original placed
# `import os` after the first Streamlit call.
import os
import pickle
import re

import docx
import PyPDF2
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity

# 1. CONFIG
# set_page_config must be the first Streamlit command executed in the app.
st.set_page_config(page_title="AI Resume Screening", layout="wide")

# NOTE(review): retraining hook kept for reference — uncomment to rebuild
# missing model artifacts on startup (slow: downloads training datasets).
# def ensure_models():
#     if not os.path.exists("clf.pkl") or not os.path.exists("tfidf.pkl"):
#         os.system("python train_model.py")
#     if not os.path.exists("ats_scorer.pkl"):
#         os.system("python train_ats_model.py")
#
# ensure_models()
20
# 2. LOAD RESOURCES
@st.cache_resource
def load_resources():
    """Load all pickled model artifacts once per server process.

    Returns:
        Tuple ``(clf, tfidf, le, ats, prototypes)`` on success, or a tuple
        of five ``None`` values when any artifact file is missing.
    """
    artifact_files = ('clf.pkl', 'tfidf.pkl', 'encoder.pkl',
                      'ats_scorer.pkl', 'prototypes.pkl')
    try:
        loaded = []
        for path in artifact_files:
            # 'with' closes each handle promptly; the original used
            # pickle.load(open(...)) and leaked five file objects.
            with open(path, 'rb') as fh:
                loaded.append(pickle.load(fh))
        return tuple(loaded)
    except FileNotFoundError:
        return None, None, None, None, None

clf, tfidf, le, ats_model, prototypes = load_resources()
34
+
35
# 3. UTILS
def clean_text(txt):
    """Return *txt* lower-cased, with URLs and punctuation blanked out."""
    without_urls = re.sub(r'http\S+\s', ' ', txt)
    word_chars_only = re.sub(r'[^\w\s]', ' ', without_urls)
    return word_chars_only.lower()
40
+
41
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX or TXT file.

    Parameters:
        file: file-like object exposing a ``name`` attribute (e.g. a
            Streamlit ``UploadedFile``).

    Returns:
        The extracted text, or ``""`` when the extension is unsupported or
        extraction fails (e.g. scanned/corrupt files).
    """
    try:
        if file.name.endswith('.pdf'):
            reader = PyPDF2.PdfReader(file)
            # extract_text() can return None for image-only pages; coalesce
            # to "" so the join does not raise TypeError.
            return " ".join([(page.extract_text() or "") for page in reader.pages])
        elif file.name.endswith('.docx'):
            doc = docx.Document(file)
            return " ".join([p.text for p in doc.paragraphs])
        elif file.name.endswith('.txt'):
            return file.read().decode('utf-8')
        # Unsupported extension: the original fell through and returned None,
        # which crashed len(text) in the caller. Return "" instead.
        return ""
    except Exception:
        # Narrowed from a bare 'except:' which also swallowed SystemExit
        # and KeyboardInterrupt.
        return ""
53
+
54
def calculate_scores(text, category):
    """Score *text* against the master profile of the predicted *category*.

    Parameters:
        text: raw resume text.
        category: role label; must be a key of the module-level
            ``prototypes`` dict (built by train_model.py).

    Returns:
        ``(final_score, cosine_pct, keyword_pct)`` rounded to one decimal,
        each on a 0-100 scale, or ``(0, 0, 0)`` if no prototype exists.
    """
    # Retrieve the "Master Profile" for the predicted category.
    if category not in prototypes:
        return 0, 0, 0

    master_profile = prototypes[category]
    cleaned_resume = clean_text(text)

    # 1. Cosine similarity between resume and master profile vectors.
    vecs = tfidf.transform([cleaned_resume, master_profile])
    cosine_sim = cosine_similarity(vecs[0], vecs[1])[0][0]

    # 2. Keyword match: fraction of profile tokens present in the resume.
    res_tokens = set(cleaned_resume.split())
    mp_tokens = set(master_profile.split())
    keyword_match = len(res_tokens.intersection(mp_tokens)) / len(mp_tokens) if mp_tokens else 0

    # 3. AI prediction from the trained ATS regressor.
    try:
        ml_score = ats_model.predict([[cosine_sim, keyword_match]])[0]
    except Exception:
        # Narrowed from a bare 'except:'; a broken/missing model falls back
        # to the mathematical score below instead of crashing the app.
        ml_score = 0

    # 4. Fallback logic (prevent 0 scores): if the model predicts extremely
    # low but similarity is okay, trust the raw cosine similarity instead.
    if ml_score < 10:
        final_score = cosine_sim * 100
    else:
        final_score = ml_score

    # Visual scaling: a 0-1 style score (e.g. 0.85) is mapped to 0-100.
    if final_score < 1:
        final_score *= 100

    return round(final_score, 1), round(cosine_sim * 100, 1), round(keyword_match * 100, 1)
90
+
91
# 4. MAIN APP
def main():
    """Render the Streamlit UI: upload a resume, classify it, score it."""
    st.title("📄 AI Resume Classifier & ATS Scorer")
    st.markdown("Powered by `AzharAli05` (Classification) & `0xnbk` (Scoring)")

    # Explicit None check: truthiness of an sklearn estimator object is not
    # a reliable "models loaded" signal (the original used `if not clf:`).
    if clf is None:
        st.error("⚠️ Models missing! Run `train_model.py` then `train_ats_model.py`.")
        st.stop()

    file = st.file_uploader("Upload Resume", type=['pdf', 'docx', 'txt'])

    if file:
        text = extract_text(file)
        # Guard against None/empty extraction before calling len().
        if text and len(text) > 20:
            # Predict category.
            clean = clean_text(text)
            vec = tfidf.transform([clean])
            cat_id = clf.predict(vec)[0]
            category = le.inverse_transform([cat_id])[0]

            # Predict ATS score for the predicted role.
            ats_score, raw_sim, key_match = calculate_scores(text, category)

            # Display results.
            st.success(f"### Predicted Role: {category}")

            col1, col2, col3 = st.columns(3)
            col1.metric("ATS Score (AI)", f"{ats_score}%")
            col2.metric("Content Match", f"{raw_sim}%")
            col3.metric("Keyword Overlap", f"{key_match}%")

            st.progress(min(ats_score / 100, 1.0))

            if ats_score > 75:
                st.balloons()
                st.info("Great match!")
            elif ats_score < 40:
                st.warning("Low match. Try adding more relevant keywords.")

            with st.expander("Show Extracted Text"):
                st.text(text)
        else:
            st.warning("Could not extract text. File might be an image/scan.")

if __name__ == "__main__":
    main()
ats_scorer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f396d5ab19b12933b5a71789d5f0254411d0db7e365ddc0a3da905b8075b59
3
+ size 122242
clf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9512f20d5f66e6ec170d176dac997b427778a66df5542a4e0bd76fcbd26c5557
3
+ size 473258954
encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ff3d1c7cb00d72279224dbd00003f2e36838db699fe5005211d58e18e4de34
3
+ size 1103
get-pip.py ADDED
The diff for this file is too large to render. See raw diff
 
prototypes.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3061636877e02cc7eef55acb57a6ba899c63ce14c310021816e258d89250cfd4
3
+ size 27298599
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ datasets
6
+ PyPDF2
7
+ python-docx
tfidf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7100b313e4a3deb761bb098b95af1f34503e17faeca4c57debdcb0cb9b4f2463
3
+ size 261920
train_ats_model.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pickle
3
+ import numpy as np
4
+ from datasets import load_dataset
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sklearn.ensemble import GradientBoostingRegressor
7
+ from sklearn.model_selection import train_test_split
8
+ import re
9
+ import time
10
+
11
def train_ats_scorer():
    """Train the ATS score regressor and save it to 'ats_scorer.pkl'.

    Requires 'tfidf.pkl' (produced by train_model.py) and downloads the
    0xnbk/resume-ats-score-v1-en dataset. Each resume contributes two
    features — cosine similarity and keyword overlap against a per-category
    "prototype" built by concatenating all resumes of that category.

    Raises:
        SystemExit: if the vectorizer file or the dataset is unavailable.
    """
    # 1. Load the vectorizer produced by the classification step.
    print("Loading TF-IDF Vectorizer (from Step 1)...")
    try:
        # 'with' closes the handle; the original pickle.load(open(...)) leaked it.
        with open('tfidf.pkl', 'rb') as fh:
            tfidf = pickle.load(fh)
    except FileNotFoundError:
        # raise SystemExit works without the 'site' builtins that provide
        # exit(), and sets a non-zero status (bare exit() returned 0).
        raise SystemExit("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")

    # 2. Load the ATS dataset (0xnbk).
    print("Loading 0xnbk/resume-ats-score-v1-en...")
    try:
        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} rows.")
    except Exception as e:
        raise SystemExit(f"Error loading dataset: {e}")

    # 3. Pre-process: drop rows with missing text or unparseable scores.
    res_col = 'text'
    score_col = 'ats_score'
    cat_col = 'original_label'

    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
    df.dropna(subset=[score_col, res_col], inplace=True)

    # 4. Generate training prototypes: group resumes by label to simulate
    # "job descriptions".
    print("Generating Training Prototypes...")
    train_prototypes = df.groupby(cat_col)[res_col].apply(lambda x: ' '.join(x)).to_dict()

    # Optimization: pre-calculate each prototype's vector and token set once.
    print("Pre-calculating vectors...")
    proto_vectors = {}
    proto_tokens = {}
    for cat, text in train_prototypes.items():
        proto_vectors[cat] = tfidf.transform([text])
        proto_tokens[cat] = set(re.findall(r'\w+', text.lower()))

    # 5. Feature engineering: two features per resume.
    print("Calculating features...")
    cosine_sims = []
    keyword_matches = []

    for row in df.itertuples():  # unused enumerate() index removed
        text = str(getattr(row, res_col))
        cat = getattr(row, cat_col)

        if cat in proto_vectors:
            # Feature 1: cosine similarity to the category prototype.
            vec = tfidf.transform([text])
            sim = cosine_similarity(vec, proto_vectors[cat])[0][0]

            # Feature 2: fraction of prototype tokens present in the resume.
            tokens = set(re.findall(r'\w+', text.lower()))
            target_tokens = proto_tokens[cat]
            match = len(tokens.intersection(target_tokens)) / len(target_tokens) if target_tokens else 0
        else:
            sim = 0
            match = 0

        cosine_sims.append(sim)
        keyword_matches.append(match)

    df['cosine_sim'] = cosine_sims
    df['keyword_match'] = keyword_matches

    # 6. Train the regressor on the two engineered features.
    print("Training ATS Regressor...")
    X = df[['cosine_sim', 'keyword_match']]
    y = df[score_col]

    reg = GradientBoostingRegressor()
    reg.fit(X, y)

    # 7. Persist the model.
    with open('ats_scorer.pkl', 'wb') as fh:
        pickle.dump(reg, fh)
    print("SUCCESS: 'ats_scorer.pkl' saved.")

if __name__ == "__main__":
    train_ats_scorer()
train_model.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pickle
3
+ import re
4
+ from datasets import load_dataset
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.multiclass import OneVsRestClassifier
7
+ from sklearn.neighbors import KNeighborsClassifier
8
+ from sklearn.preprocessing import LabelEncoder
9
+
10
def train_classifier():
    """Train the resume-category classifier and persist all artifacts.

    Downloads AzharAli05/Resume-Screening-Dataset, then saves:
      - prototypes.pkl: per-role concatenated "master profile" text
      - clf.pkl:        OneVsRest KNN classifier
      - tfidf.pkl:      fitted TF-IDF vectorizer (200 features)
      - encoder.pkl:    LabelEncoder mapping role names <-> ids

    Raises:
        SystemExit: if the dataset cannot be loaded.
    """
    # 1. Load dataset (AzharAli05).
    print("Loading AzharAli05/Resume-Screening-Dataset...")
    try:
        ds = load_dataset("AzharAli05/Resume-Screening-Dataset")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} resumes.")
    except Exception as e:
        # raise SystemExit sets a non-zero exit status; the original
        # exit() call returned status 0 on failure.
        raise SystemExit(f"Error loading dataset: {e}")

    # 2. Column names in this dataset: Text='Resume', Label='Role'.
    text_col = 'Resume'
    label_col = 'Role'

    # 3. Cleaning: strip URLs, RT/cc markers, hashtags, mentions,
    # punctuation and non-ASCII, then collapse whitespace.
    def clean_resume(txt):
        cleanText = re.sub(r'http\S+\s', ' ', str(txt))
        cleanText = re.sub(r'RT|cc', ' ', cleanText)
        cleanText = re.sub(r'#\S+\s', ' ', cleanText)
        cleanText = re.sub(r'@\S+', ' ', cleanText)
        cleanText = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', cleanText)
        cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
        cleanText = re.sub(r'\s+', ' ', cleanText)
        return cleanText

    print("Cleaning data...")
    df['cleaned_resume'] = df[text_col].apply(clean_resume)

    # 4. Generate & save prototypes (crucial for the app): combine all
    # resumes of a role into one "Master Profile".
    print("Generating Master Profiles (Prototypes)...")
    prototypes = df.groupby(label_col)['cleaned_resume'].apply(lambda x: ' '.join(x)).to_dict()
    # 'with' closes each handle; the original pickle.dump(..., open(...)) leaked them.
    with open('prototypes.pkl', 'wb') as fh:
        pickle.dump(prototypes, fh)

    # 5. Encode labels.
    le = LabelEncoder()
    df['Category_ID'] = le.fit_transform(df[label_col])

    # 6. Vectorize.
    print("Vectorizing...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=200)
    tfidf.fit(df['cleaned_resume'])
    requiredText = tfidf.transform(df['cleaned_resume'])

    # 7. Train.
    print("Training Classifier...")
    clf = OneVsRestClassifier(KNeighborsClassifier())
    clf.fit(requiredText, df['Category_ID'])

    # 8. Save models.
    print("Saving models...")
    with open('clf.pkl', 'wb') as fh:
        pickle.dump(clf, fh)
    with open('tfidf.pkl', 'wb') as fh:
        pickle.dump(tfidf, fh)
    with open('encoder.pkl', 'wb') as fh:
        pickle.dump(le, fh)
    print("SUCCESS: Classification models + Prototypes saved.")

if __name__ == "__main__":
    train_classifier()