lexicalspace committed on
Commit
5114bea
·
verified ·
1 Parent(s): a1b9a4e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +253 -0
app.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import pypdf
4
+ import re
5
+ import io
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import plotly.graph_objects as go
9
+ import plotly.express as px
10
+ import nltk
11
+ from nltk.corpus import stopwords
12
+ from nltk.tokenize import word_tokenize
13
+
14
+ # --- 1. SYSTEM CONFIGURATION & SETUP ---
15
# Page-level Streamlit settings, gathered in one mapping and applied in a
# single call.
_page_settings = {
    "page_title": "Smart ATS Optimizer",
    "page_icon": "🎯",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
21
+
22
+ # NLTK Setup (Runs once to download dictionary data)
23
@st.cache_resource
def setup_nltk():
    """
    Ensure the NLTK data packages used by this app are installed.

    Each resource is looked up locally first and downloaded only when the
    lookup raises LookupError. The function is wrapped in st.cache_resource
    so the check is not repeated on every Streamlit rerun.
    """
    required = (
        ('tokenizers/punkt', 'punkt'),
        # Newer NLTK releases make word_tokenize load 'punkt_tab' instead of
        # the pickled 'punkt' models; requesting both covers old and new
        # versions.
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    )
    for resource_path, package_id in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_id)


setup_nltk()
35
+
36
+ # --- 2. BACKEND LOGIC (The Complex Part) ---
37
+
38
def extract_text_from_pdf(uploaded_file):
    """
    Parse a PDF and return the text of all its pages.

    Args:
        uploaded_file: PDF source handed to pypdf.PdfReader (in this app,
            the object returned by st.file_uploader).

    Returns:
        The page texts joined by newlines (an empty string when no page
        yields any text), or None if the file could not be read. On failure
        the error is also reported to the user through st.error.
    """
    try:
        pdf_reader = pypdf.PdfReader(uploaded_file)
        page_texts = []
        for page in pdf_reader.pages:
            content = page.extract_text()
            if content:
                page_texts.append(content)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next, which would corrupt keyword matching.
        return "\n".join(page_texts)
    except Exception as e:
        # Deliberately broad: encrypted or corrupted files can fail in many
        # ways, and the UI should show a message instead of crashing.
        st.error(f"Error reading PDF: {str(e)}")
        return None
54
+
55
def clean_text(text):
    """
    Normalise raw text for keyword comparison.

    Pipeline:
    1. Lowercase
    2. Replace every non-alphanumeric, non-whitespace character with a space
    3. Tokenize (split into words)
    4. Drop English stopwords and tokens of two characters or fewer

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Substitute a space rather than deleting the character, so terms
    # separated only by punctuation ("python/sql", "full-stack") stay
    # separate tokens instead of being fused into one.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())

    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

    return " ".join(filtered_words)
72
+
73
def calculate_similarity(resume_text, job_desc_text):
    """
    Score how closely a resume matches a job description.

    Both texts are converted to TF-IDF vectors over a shared vocabulary and
    compared with cosine similarity.

    Args:
        resume_text: Cleaned resume text.
        job_desc_text: Cleaned job-description text.

    Returns:
        A tuple (match_percentage, missing_keywords, df):
        - match_percentage: cosine similarity of the two documents times 100.
        - missing_keywords: terms with a non-zero job score and a zero
          resume score.
        - df: DataFrame with columns 'Keyword', 'Resume Score' and
          'Job Score', one row per term scored in either document.

        When neither text contains a usable token the result is
        (0.0, [], <empty DataFrame with the same columns>).
    """
    columns = ['Keyword', 'Resume Score', 'Job Score']

    tfidf = TfidfVectorizer()
    try:
        vectors = tfidf.fit_transform([resume_text, job_desc_text])
    except ValueError:
        # TfidfVectorizer raises ValueError on an empty vocabulary, i.e. when
        # neither document has a usable token. Report "no match" rather than
        # crashing the app.
        return 0.0, [], pd.DataFrame(columns=columns)

    # cosine_similarity returns a 2x2 matrix; entry [0][1] compares
    # document 0 (resume) with document 1 (job description).
    match_percentage = cosine_similarity(vectors)[0][1] * 100

    # toarray() yields a plain ndarray; row 0 is the resume, row 1 the job.
    scores = vectors.toarray()
    df = pd.DataFrame({
        'Keyword': tfidf.get_feature_names_out(),
        'Resume Score': scores[0],
        'Job Score': scores[1],
    })

    # Keep only terms that carry weight in at least one document.
    df = df[(df['Resume Score'] > 0) | (df['Job Score'] > 0)]

    # Missing keywords: weighted in the job description, absent from the resume.
    missing_mask = (df['Job Score'] > 0) & (df['Resume Score'] == 0)
    missing_keywords = df[missing_mask]['Keyword'].tolist()

    return match_percentage, missing_keywords, df
113
+
114
def analyze_structure(text):
    """
    Scan resume text for basic contact details and section keywords.

    Returns a list of issue strings in the order email, phone, sections.
    The list is empty when nothing is missing.
    """
    email_pattern = r'[\w\.-]+@[\w\.-]+'
    phone_pattern = r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}'
    expected_sections = ('experience', 'education', 'skills', 'projects')

    findings = []

    if re.search(email_pattern, text) is None:
        findings.append("❌ Missing Email Address")

    if re.search(phone_pattern, text) is None:
        findings.append("⚠️ Missing Phone Number")

    # Case-insensitive substring search for each expected section name.
    lowered = text.lower()
    absent = []
    for heading in expected_sections:
        if heading not in lowered:
            absent.append(heading.capitalize())

    if absent:
        findings.append(f"⚠️ Missing Sections: {', '.join(absent)}")

    return findings
136
+
137
+ # --- 3. FRONTEND UI (Streamlit) ---
138
+
139
# Sidebar: explanatory note plus the target score used for the metric delta.
st.sidebar.header("⚙️ Controls")
st.sidebar.info(
    "This tool uses TF-IDF Vectorization and Cosine Similarity "
    "to analyze how well your resume matches a specific job description."
)
confidence_threshold = st.sidebar.slider("Match Threshold (Target)", 0, 100, 75)

# Main Content
st.title("🎯 Smart ATS Resume Optimizer")
st.markdown("Optimize your resume for Applicant Tracking Systems (ATS) using AI-driven text analysis.")

# Layout: Two Columns for Input
col1, col2 = st.columns(2)

with col1:
    st.subheader("1. Upload Resume")
    uploaded_file = st.file_uploader("Upload PDF", type=['pdf'], help="Only PDF files are supported")

with col2:
    st.subheader("2. Job Description")
    job_description = st.text_area("Paste JD here...", height=300, placeholder="Copy text from LinkedIn/Indeed...")

# Start Analysis Button
if st.button("🚀 Analyze Resume", type="primary"):

    # A whitespace-only job description counts as missing input.
    if uploaded_file and job_description.strip():
        with st.spinner("Parsing PDF and crunching numbers..."):

            # A. Text Extraction
            resume_text = extract_text_from_pdf(uploaded_file)

            if resume_text:
                # B. NLP Cleaning
                clean_resume = clean_text(resume_text)
                clean_jd = clean_text(job_description)

                # C. Analysis Engine
                match_score, missing_keywords, keyword_df = calculate_similarity(clean_resume, clean_jd)
                # Structure checks run on the raw text: cleaning strips the
                # '@' and punctuation the email/phone patterns rely on.
                structure_issues = analyze_structure(resume_text)

                # --- RESULTS DASHBOARD ---
                st.divider()
                st.markdown("### 📊 Analysis Report")

                # Top Metric Cards
                m1, m2, m3 = st.columns(3)
                m1.metric("Match Score", f"{match_score:.1f}%", delta=f"{match_score - confidence_threshold:.1f}% vs Target")
                m2.metric("Missing Keywords", len(missing_keywords), delta=-len(missing_keywords), delta_color="inverse")
                m3.metric("Structure Issues", len(structure_issues), delta=-len(structure_issues), delta_color="inverse")

                # Gauge Chart (Visual Appeal)
                fig = go.Figure(go.Indicator(
                    mode="gauge+number",
                    value=match_score,
                    domain={'x': [0, 1], 'y': [0, 1]},
                    title={'text': "ATS Confidence Score"},
                    gauge={
                        'axis': {'range': [None, 100]},
                        'bar': {'color': "#FF4B4B"},
                        'steps': [
                            {'range': [0, 50], 'color': "#fce4e4"},
                            {'range': [50, 75], 'color': "#fccfcf"},
                            {'range': [75, 100], 'color': "#ffb3b3"},
                        ],
                    },
                ))
                st.plotly_chart(fig, use_container_width=True)

                # Detail Tabs
                tab1, tab2, tab3 = st.tabs(["🔍 Keyword Gap", "📝 Resume Structure", "🛠️ Raw Data"])

                with tab1:
                    st.subheader("Missing Hard Skills & Keywords")
                    st.caption("These words appear frequently in the Job Description but are missing from your Resume.")

                    if missing_keywords:
                        # Display the first 20 as inline code "chips".
                        st.markdown(" ".join([f"`{k}`" for k in missing_keywords[:20]]))
                        if len(missing_keywords) > 20:
                            st.info(f"...and {len(missing_keywords)-20} more.")
                    else:
                        st.success("Amazing! You have all the key keywords.")

                    # Keyword Overlap Chart
                    st.subheader("Keyword Frequency Comparison")
                    # Top 15 keywords ranked by their weight in the JD
                    top_keywords = keyword_df.sort_values(by='Job Score', ascending=False).head(15)

                    bar_fig = px.bar(
                        top_keywords,
                        x='Keyword',
                        y=['Job Score', 'Resume Score'],
                        barmode='group',
                        title="Top Keyword Importance (Resume vs JD)"
                    )
                    st.plotly_chart(bar_fig, use_container_width=True)

                with tab2:
                    st.subheader("Formatting & Structure Check")
                    if structure_issues:
                        for issue in structure_issues:
                            st.error(issue)
                        st.info("Tip: Ensure your resume has clear headings for Experience, Education, and Skills.")
                    else:
                        st.success("✅ Your resume structure looks great! Essential contact info and sections detected.")

                with tab3:
                    st.subheader("Processed Text Debug")
                    with st.expander("View Cleaned Resume Text"):
                        st.write(clean_resume)
                    with st.expander("View Cleaned JD Text"):
                        st.write(clean_jd)

            elif resume_text is not None:
                # The PDF was readable but produced no text (None means
                # extract_text_from_pdf already reported a read error).
                st.warning(
                    "No extractable text was found in this PDF. "
                    "Scanned or image-only resumes need OCR before they can be analyzed."
                )

    else:
        st.warning("Please upload a resume and paste a job description.")