File size: 4,549 Bytes
74f28d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
import pickle
import re
import docx
import PyPDF2
from sklearn.metrics.pairwise import cosine_similarity

# 1. CONFIG
st.set_page_config(page_title="AI Resume Screening", layout="wide")
import os

# def ensure_models():
#     if not os.path.exists("clf.pkl") or not os.path.exists("tfidf.pkl"):
#         os.system("python train_model.py")
#     if not os.path.exists("ats_scorer.pkl"):
#         os.system("python train_ats_model.py")

# ensure_models()

# 2. LOAD RESOURCES
@st.cache_resource
def load_resources():
    """Load all pickled model artifacts from the working directory.

    Returns a tuple (classifier, tfidf_vectorizer, label_encoder,
    ats_model, prototypes), or five Nones when any artifact is missing
    (main() then shows training instructions and stops).
    """
    def _unpickle(path):
        # Context manager so the file handle is always closed — the
        # original `pickle.load(open(...))` leaked descriptors.
        # NOTE: pickle is only safe here because these are local,
        # self-trained artifacts, never untrusted uploads.
        with open(path, 'rb') as f:
            return pickle.load(f)

    try:
        return (
            _unpickle('clf.pkl'),
            _unpickle('tfidf.pkl'),
            _unpickle('encoder.pkl'),
            _unpickle('ats_scorer.pkl'),
            _unpickle('prototypes.pkl'),
        )
    except FileNotFoundError:
        # Models have not been trained yet.
        return None, None, None, None, None

clf, tfidf, le, ats_model, prototypes = load_resources()

# 3. UTILS
def clean_text(txt):
    r"""Normalise resume text: strip URLs and punctuation, lower-case.

    Fix: the original URL pattern ``http\S+\s`` required trailing
    whitespace, so a URL at the very end of the text was never removed;
    ``\s*`` handles that case too.
    """
    txt = re.sub(r'http\S+\s*', ' ', txt)   # drop URLs, incl. at end of text
    txt = re.sub(r'[^\w\s]', ' ', txt)      # punctuation -> space
    return txt.lower()

def extract_text(file):
    """Extract raw text from an uploaded .pdf, .docx or .txt file.

    Returns "" for unsupported extensions or on any extraction failure
    (e.g. corrupt files) — the original returned None implicitly for
    unknown extensions, which crashed len(text) in the caller.
    """
    name = file.name.lower()  # accept .PDF / .Docx etc.
    try:
        if name.endswith('.pdf'):
            reader = PyPDF2.PdfReader(file)
            # extract_text() returns None for image-only pages; coalesce
            # to "" so the join cannot raise TypeError.
            return " ".join(page.extract_text() or "" for page in reader.pages)
        if name.endswith('.docx'):
            doc = docx.Document(file)
            return " ".join(p.text for p in doc.paragraphs)
        if name.endswith('.txt'):
            return file.read().decode('utf-8')
        return ""
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # SystemExit / KeyboardInterrupt). Best-effort: unreadable
        # file -> empty text; caller warns the user.
        return ""

def calculate_scores(text, category):
    """Score *text* against the master profile for *category*.

    Returns (final_score, cosine_similarity_pct, keyword_match_pct),
    each rounded to one decimal place and on a 0-100 scale, or
    (0, 0, 0) when no prototype exists for the category.
    """
    if category not in prototypes:
        return 0, 0, 0

    master_profile = prototypes[category]
    cleaned_resume = clean_text(text)

    # 1. Cosine similarity between resume and master profile (TF-IDF space).
    vecs = tfidf.transform([cleaned_resume, master_profile])
    cosine_sim = cosine_similarity(vecs[0], vecs[1])[0][0]

    # 2. Fraction of master-profile tokens that appear in the resume.
    res_tokens = set(cleaned_resume.split())
    mp_tokens = set(master_profile.split())
    keyword_match = len(res_tokens.intersection(mp_tokens)) / len(mp_tokens) if mp_tokens else 0

    # 3. Learned ATS score from the two features above.
    try:
        ml_score = ats_model.predict([[cosine_sim, keyword_match]])[0]
    except Exception:
        # Narrowed from a bare `except:`; a failed prediction falls
        # through to the cosine-similarity fallback below.
        ml_score = 0

    # 4. Fallback: if the model predicts implausibly low but the raw
    # similarity is reasonable, trust the math instead.
    if ml_score < 10:
        final_score = cosine_sim * 100
    else:
        final_score = ml_score

    # Visual scaling: a 0-1 style value (e.g. 0.85) is mapped to 0-100.
    if final_score < 1:
        final_score *= 100

    return round(final_score, 1), round(cosine_sim*100, 1), round(keyword_match*100, 1)

# 4. MAIN APP
def main():
    """Streamlit entry point: upload a resume, classify it, show ATS scores."""
    st.title("📄 AI Resume Classifier & ATS Scorer")
    st.markdown("Powered by `AzharAli05` (Classification) & `0xnbk` (Scoring)")

    # Explicit None check — `not clf` relied on estimator truthiness,
    # which is fragile (some estimators define __len__).
    if clf is None:
        st.error("⚠️ Models missing! Run `train_model.py` then `train_ats_model.py`.")
        st.stop()

    file = st.file_uploader("Upload Resume", type=['pdf', 'docx', 'txt'])

    if file:
        # extract_text may yield None/"" on failure; normalise so
        # len(text) below cannot raise TypeError.
        text = extract_text(file) or ""
        if len(text) > 20:
            # Predict the job category from the resume's TF-IDF vector.
            clean = clean_text(text)
            vec = tfidf.transform([clean])
            cat_id = clf.predict(vec)[0]
            category = le.inverse_transform([cat_id])[0]

            # Score the resume against the category's master profile.
            ats_score, raw_sim, key_match = calculate_scores(text, category)

            st.success(f"### Predicted Role: {category}")

            col1, col2, col3 = st.columns(3)
            col1.metric("ATS Score (AI)", f"{ats_score}%")
            col2.metric("Content Match", f"{raw_sim}%")
            col3.metric("Keyword Overlap", f"{key_match}%")

            st.progress(min(ats_score/100, 1.0))

            if ats_score > 75:
                st.balloons()
                st.info("Great match!")
            elif ats_score < 40:
                st.warning("Low match. Try adding more relevant keywords.")

            with st.expander("Show Extracted Text"):
                st.text(text)
        else:
            st.warning("Could not extract text. File might be an image/scan.")

if __name__ == "__main__":
    main()