Upload 10 files

Browse files

Files changed (10) hide show

app.py +136 -0
ats_scorer.pkl +3 -0
clf.pkl +3 -0
encoder.pkl +3 -0
get-pip.py +0 -0
prototypes.pkl +3 -0
requirements.txt +7 -0
tfidf.pkl +3 -0
train_ats_model.py +94 -0
train_model.py +69 -0

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import streamlit as st
+import pickle
+import re
+import docx
+import PyPDF2
+from sklearn.metrics.pairwise import cosine_similarity
+# 1. CONFIG
+# Sets the browser-tab title and the wide page layout. This is the first st.* call in the file.
+# NOTE(review): Streamlit documents set_page_config as something to call before other
+# Streamlit commands; presumably that is why it sits above the remaining import - confirm
+# against the installed Streamlit version.
+st.set_page_config(page_title="AI Resume Screening", layout="wide")
+import os
+# Disabled bootstrap helper, kept for reference: it would shell out to the two training
+# scripts whenever the expected .pkl files were missing from the working directory.
+# def ensure_models():
+#     if not os.path.exists("clf.pkl") or not os.path.exists("tfidf.pkl"):
+#         os.system("python train_model.py")
+#     if not os.path.exists("ats_scorer.pkl"):
+#         os.system("python train_ats_model.py")
+# ensure_models()
+# 2. LOAD RESOURCES
+@st.cache_resource
+def load_resources():
+    """Load the pickled models and category prototypes used by the app.
+
+    Returns:
+        A 5-tuple ``(clf, tfidf, le, ats, prototypes)`` unpickled from
+        ``clf.pkl``, ``tfidf.pkl``, ``encoder.pkl``, ``ats_scorer.pkl`` and
+        ``prototypes.pkl`` in the current working directory, or a tuple of
+        five ``None`` values when any of those files does not exist.
+    """
+    # SECURITY: pickle.load can execute arbitrary code stored in the file.
+    # Only load artifacts produced by this project's own training scripts.
+    artifact_names = ('clf.pkl', 'tfidf.pkl', 'encoder.pkl', 'ats_scorer.pkl', 'prototypes.pkl')
+    loaded = []
+    try:
+        for artifact_name in artifact_names:
+            # `with` closes each handle promptly, even if unpickling raises.
+            with open(artifact_name, 'rb') as handle:
+                loaded.append(pickle.load(handle))
+    except FileNotFoundError:
+        return None, None, None, None, None
+    return tuple(loaded)
+clf, tfidf, le, ats_model, prototypes = load_resources()
+# 3. UTILS
+def clean_text(txt):
+    """Normalise resume text for vectorising and keyword matching.
+
+    URLs are removed, every character that is neither a word character nor
+    whitespace is replaced by a space, and the result is lower-cased.
+
+    Args:
+        txt: Raw text. ``None`` or an empty string yields ``""``.
+
+    Returns:
+        The cleaned, lower-cased string.
+    """
+    if not txt:
+        return ""
+    # `\s?` instead of `\s`: a URL that ends the text has no trailing whitespace and
+    # would otherwise survive, leaking "http", "com", ... into the token set.
+    txt = re.sub(r'http\S+\s?', ' ', txt)
+    txt = re.sub(r'[^\w\s]', ' ', txt)
+    return txt.lower()
+def extract_text(file):
+    """Extract plain text from an uploaded resume file.
+
+    The format is chosen from the file name's extension (case-insensitive):
+    ``.pdf`` is read with PyPDF2, ``.docx`` with python-docx, and ``.txt`` is
+    decoded as UTF-8 with undecodable bytes replaced.
+
+    Args:
+        file: A binary file-like object that also exposes a ``name`` attribute.
+
+    Returns:
+        The extracted text. An empty string is returned for an unsupported
+        extension or when parsing fails, so callers always receive a ``str``.
+    """
+    name = (getattr(file, 'name', '') or '').lower()
+    try:
+        if name.endswith('.pdf'):
+            reader = PyPDF2.PdfReader(file)
+            # Guard against a None page result so one image-only page cannot
+            # make the join raise and discard the text of every other page.
+            return " ".join((page.extract_text() or "") for page in reader.pages)
+        if name.endswith('.docx'):
+            doc = docx.Document(file)
+            return " ".join(p.text for p in doc.paragraphs)
+        if name.endswith('.txt'):
+            # errors='replace' keeps the readable part of files that are not valid UTF-8.
+            return file.read().decode('utf-8', errors='replace')
+    except Exception:
+        # Best effort by design: a corrupt upload is reported as "no text" by the caller.
+        return ""
+    # Unsupported extension: previously fell through and returned None.
+    return ""
+def calculate_scores(text, category):
+    """Score a resume against the "Master Profile" of its predicted category.
+
+    Args:
+        text: Raw resume text (cleaned internally with ``clean_text``).
+        category: Category label used as the key into ``prototypes``.
+
+    Returns:
+        A tuple ``(final_score, cosine_pct, keyword_pct)``, each rounded to one
+        decimal on a 0-100 scale. ``(0, 0, 0)`` when the category has no
+        stored prototype.
+    """
+    # Retrieve the "Master Profile" for the predicted category
+    if category not in prototypes:
+        return 0, 0, 0
+    master_profile = prototypes[category]
+    cleaned_resume = clean_text(text)
+    # 1. Cosine Similarity
+    vecs = tfidf.transform([cleaned_resume, master_profile])
+    cosine_sim = cosine_similarity(vecs[0], vecs[1])[0][0]
+    # 2. Keyword Match
+    # The resume tokens are lower-cased by clean_text, so the profile must be
+    # lower-cased as well; otherwise "Python" in the profile never matches "python".
+    res_tokens = set(cleaned_resume.split())
+    mp_tokens = set(master_profile.lower().split())
+    keyword_match = len(res_tokens & mp_tokens) / len(mp_tokens) if mp_tokens else 0
+    # 3. AI Prediction
+    try:
+        ml_score = float(ats_model.predict([[cosine_sim, keyword_match]])[0])
+    except Exception:
+        # Best effort: a failing regressor falls back to the similarity score below.
+        ml_score = 0
+    # 4. Fallback Logic (Prevent 0 Scores)
+    # If the model predicts extremely low, fall back to the cosine similarity as a percentage.
+    if ml_score < 10:
+        final_score = cosine_sim * 100
+    else:
+        final_score = ml_score
+    # The former "if final_score < 1: final_score *= 100" step is intentionally gone:
+    # it could only fire on the cosine fallback and turned a similarity of 0.005
+    # into a displayed 50%. Clamp to the displayable range instead.
+    final_score = min(max(final_score, 0), 100)
+    return round(final_score, 1), round(cosine_sim*100, 1), round(keyword_match*100, 1)
+# 4. MAIN APP
+def main():
+    """Render the Streamlit page: upload a resume, classify it and show its scores.
+
+    Stops the script with an error banner when the model artifacts are not
+    loaded. For an uploaded file it extracts the text, predicts the role with
+    the classifier, computes the ATS metrics and displays them.
+    """
+    st.title("📄 AI Resume Classifier & ATS Scorer")
+    st.markdown("Powered by `AzharAli05` (Classification) & `0xnbk` (Scoring)")
+    # Compare against None explicitly instead of relying on the truthiness of model objects.
+    if any(res is None for res in (clf, tfidf, le, ats_model, prototypes)):
+        st.error("⚠️ Models missing! Run `train_model.py` then `train_ats_model.py`.")
+        st.stop()
+    file = st.file_uploader("Upload Resume", type=['pdf', 'docx', 'txt'])
+    if not file:
+        return
+    # `or ""` keeps the length check below from raising if no text came back.
+    text = extract_text(file) or ""
+    # strip() first: joining the empty pages of a scanned PDF yields only spaces,
+    # which must not count as extracted content.
+    if len(text.strip()) <= 20:
+        st.warning("Could not extract text. File might be an image/scan.")
+        return
+    # Predict Category
+    clean = clean_text(text)
+    vec = tfidf.transform([clean])
+    cat_id = clf.predict(vec)[0]
+    category = le.inverse_transform([cat_id])[0]
+    # Predict Score
+    ats_score, raw_sim, key_match = calculate_scores(text, category)
+    # Display
+    st.success(f"### Predicted Role: {category}")
+    col1, col2, col3 = st.columns(3)
+    col1.metric("ATS Score (AI)", f"{ats_score}%")
+    col2.metric("Content Match", f"{raw_sim}%")
+    col3.metric("Keyword Overlap", f"{key_match}%")
+    # The progress bar is fed a fraction; keep it inside [0.0, 1.0].
+    st.progress(min(max(ats_score / 100, 0.0), 1.0))
+    if ats_score > 75:
+        st.balloons()
+        st.info("Great match!")
+    elif ats_score < 40:
+        st.warning("Low match. Try adding more relevant keywords.")
+    with st.expander("Show Extracted Text"):
+        st.text(text)
+if __name__ == "__main__":
+    main()

ats_scorer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28f396d5ab19b12933b5a71789d5f0254411d0db7e365ddc0a3da905b8075b59
+size 122242

clf.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9512f20d5f66e6ec170d176dac997b427778a66df5542a4e0bd76fcbd26c5557
+size 473258954

encoder.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18ff3d1c7cb00d72279224dbd00003f2e36838db699fe5005211d58e18e4de34
+size 1103

get-pip.py ADDED Viewed

The diff for this file is too large to render. See raw diff

prototypes.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3061636877e02cc7eef55acb57a6ba899c63ce14c310021816e258d89250cfd4
+size 27298599

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+streamlit
+pandas
+numpy
+scikit-learn
+datasets
+PyPDF2
+python-docx

tfidf.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7100b313e4a3deb761bb098b95af1f34503e17faeca4c57debdcb0cb9b4f2463
+size 261920

train_ats_model.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import pandas as pd
+import pickle
+import numpy as np
+from datasets import load_dataset
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.model_selection import train_test_split
+import re
+import time
+def train_ats_scorer():
+    """Train the ATS score regressor and save it to 'ats_scorer.pkl'.
+
+    Loads the fitted TF-IDF vectorizer from 'tfidf.pkl', downloads the
+    "0xnbk/resume-ats-score-v1-en" dataset, builds one prototype text per
+    label by joining that label's resumes, derives two features per resume
+    (cosine similarity to its label prototype and the share of prototype
+    tokens present in the resume) and fits a GradientBoostingRegressor on the
+    dataset's ATS score.
+
+    Raises:
+        SystemExit: With status 1 when 'tfidf.pkl' is missing, the dataset
+            cannot be loaded, or no usable rows remain after cleaning.
+    """
+    # 1. Load Dependencies
+    print("Loading TF-IDF Vectorizer (from Step 1)...")
+    try:
+        with open('tfidf.pkl', 'rb') as handle:
+            tfidf = pickle.load(handle)
+    except FileNotFoundError:
+        print("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")
+        # A bare exit() would end with status 0 and report success to the caller.
+        raise SystemExit(1)
+    # 2. Load ATS Dataset (0xnbk)
+    print("Loading 0xnbk/resume-ats-score-v1-en...")
+    try:
+        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
+        df = pd.DataFrame(ds['train'])
+        print(f"Loaded {len(df)} rows.")
+    except Exception as e:
+        print(f"Error loading dataset: {e}")
+        raise SystemExit(1)
+    # 3. Pre-Process
+    res_col = 'text'
+    score_col = 'ats_score'
+    cat_col = 'original_label'
+    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
+    df.dropna(subset=[score_col, res_col], inplace=True)
+    if df.empty:
+        print("ERROR: no rows with a numeric score and resume text remain.")
+        raise SystemExit(1)
+    # Coerce once so both the prototype join and the per-row features see strings.
+    df[res_col] = df[res_col].astype(str)
+    # 4. Generate Training Prototypes
+    print("Generating Training Prototypes...")
+    # Group resumes by label to simulate "Job Descriptions"
+    train_prototypes = df.groupby(cat_col)[res_col].apply(' '.join).to_dict()
+    # Optimization: Pre-calculate vectors
+    print("Pre-calculating vectors...")
+    proto_vectors = {}
+    proto_tokens = {}
+    for cat, text in train_prototypes.items():
+        proto_vectors[cat] = tfidf.transform([text])
+        proto_tokens[cat] = set(re.findall(r'\w+', text.lower()))
+    # 5. Feature Engineering
+    print("Calculating features...")
+    texts = df[res_col].tolist()
+    cats = df[cat_col].tolist()
+    # One batched transform instead of one call per row; each row's vector is unchanged.
+    resume_vectors = tfidf.transform(texts)
+    cosine_sims = []
+    keyword_matches = []
+    for idx, (text, cat) in enumerate(zip(texts, cats)):
+        if cat in proto_vectors:
+            # Feature 1: Similarity
+            sim = cosine_similarity(resume_vectors[idx], proto_vectors[cat])[0][0]
+            # Feature 2: Keyword Match
+            tokens = set(re.findall(r'\w+', text.lower()))
+            target_tokens = proto_tokens[cat]
+            match = len(tokens & target_tokens) / len(target_tokens) if target_tokens else 0
+        else:
+            # Rows whose label has no prototype (e.g. a missing label) get neutral features.
+            sim = 0
+            match = 0
+        cosine_sims.append(sim)
+        keyword_matches.append(match)
+    df['cosine_sim'] = cosine_sims
+    df['keyword_match'] = keyword_matches
+    # 6. Train Regressor
+    print("Training ATS Regressor...")
+    X = df[['cosine_sim', 'keyword_match']]
+    y = df[score_col]
+    reg = GradientBoostingRegressor()
+    reg.fit(X, y)
+    # 7. Save
+    # `with` guarantees the pickle is flushed and closed before success is reported.
+    with open('ats_scorer.pkl', 'wb') as handle:
+        pickle.dump(reg, handle)
+    print("SUCCESS: 'ats_scorer.pkl' saved.")
+if __name__ == "__main__":
+    train_ats_scorer()

train_model.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import pandas as pd
+import pickle
+import re
+from datasets import load_dataset
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import LabelEncoder
+def train_classifier():
+    """Train the resume role classifier and save its artifacts.
+
+    Downloads "AzharAli05/Resume-Screening-Dataset", cleans the 'Resume'
+    column, and writes four pickles to the working directory:
+    'prototypes.pkl' (role -> all cleaned resumes of that role joined into one
+    string), 'clf.pkl' (one-vs-rest k-nearest-neighbours classifier),
+    'tfidf.pkl' (fitted TF-IDF vectorizer) and 'encoder.pkl' (label encoder).
+
+    Raises:
+        SystemExit: With status 1 when the dataset cannot be loaded.
+    """
+    # 1. Load Dataset (AzharAli05)
+    print("Loading AzharAli05/Resume-Screening-Dataset...")
+    try:
+        ds = load_dataset("AzharAli05/Resume-Screening-Dataset")
+        df = pd.DataFrame(ds['train'])
+        print(f"Loaded {len(df)} resumes.")
+    except Exception as e:
+        print(f"Error loading dataset: {e}")
+        # A bare exit() would end with status 0 and report success to the caller.
+        raise SystemExit(1)
+    # 2. Setup Columns
+    # Based on your dataset check: Text='Resume', Label='Role'
+    text_col = 'Resume'
+    label_col = 'Role'
+    # 3. Cleaning Function
+    def clean_resume(txt):
+        """Strip URLs, RT/cc markers, hashtags, mentions, punctuation and non-ASCII characters."""
+        # `\s?` so a URL or hashtag that ends the text is removed as well.
+        cleanText = re.sub(r'http\S+\s?', ' ', str(txt))
+        # Word boundaries: the unanchored 'RT|cc' also cut letters out of ordinary
+        # words ("accounting" -> "a ounting", "SMART" -> "SMA ").
+        cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
+        cleanText = re.sub(r'#\S+\s?', ' ', cleanText)
+        cleanText = re.sub(r'@\S+', '  ', cleanText)
+        cleanText = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', cleanText)
+        cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
+        cleanText = re.sub(r'\s+', ' ', cleanText)
+        return cleanText
+    print("Cleaning data...")
+    df['cleaned_resume'] = df[text_col].apply(clean_resume)
+    # 4. Generate & Save Prototypes (Crucial for App)
+    print("Generating Master Profiles (Prototypes)...")
+    # We combine all resumes for a specific role to create a "Master Profile"
+    prototypes = df.groupby(label_col)['cleaned_resume'].apply(' '.join).to_dict()
+    with open('prototypes.pkl', 'wb') as handle:
+        pickle.dump(prototypes, handle)
+    # 5. Encoding Labels
+    le = LabelEncoder()
+    df['Category_ID'] = le.fit_transform(df[label_col])
+    # 6. Vectorizing
+    print("Vectorizing...")
+    tfidf = TfidfVectorizer(stop_words='english', max_features=200)
+    # fit_transform learns the vocabulary and vectorises in a single pass.
+    requiredText = tfidf.fit_transform(df['cleaned_resume'])
+    # 7. Training
+    print("Training Classifier...")
+    clf = OneVsRestClassifier(KNeighborsClassifier())
+    clf.fit(requiredText, df['Category_ID'])
+    # 8. Saving Models
+    print("Saving models...")
+    # `with` guarantees each pickle is flushed and closed before success is reported.
+    for artifact, path in ((clf, 'clf.pkl'), (tfidf, 'tfidf.pkl'), (le, 'encoder.pkl')):
+        with open(path, 'wb') as handle:
+            pickle.dump(artifact, handle)
+    print("SUCCESS: Classification models + Prototypes saved.")
+if __name__ == "__main__":
+    train_classifier()