TheAllanB committed on
Commit
e7ee614
·
verified ·
1 Parent(s): 3db1942
Files changed (1) hide show
  1. app.py +16 -49
app.py CHANGED
@@ -16,12 +16,9 @@ import shutil
16
  import zipfile
17
 
18
 
19
-
20
- # Download necessary NLTK data
21
  nltk.download('punkt', quiet=True)
22
  nltk.download('stopwords', quiet=True)
23
 
24
- # Functions from the previous script
25
def extract_text_from_docx(docx_path):
    """Return the full text of a .docx file as one space-separated string.

    Opens *docx_path* with python-docx and concatenates the text of every
    paragraph in document order.
    """
    document = Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)
@@ -55,8 +52,7 @@ def preprocess_text(text):
55
 
56
  def classify_resume(text):
57
  classification = defaultdict(str)
58
-
59
- # Job role/industry
60
  job_roles = {
61
  "software": ["software engineer", "developer", "programmer"],
62
  "data": ["data scientist", "data analyst", "machine learning"],
@@ -68,32 +64,27 @@ def classify_resume(text):
68
  if any(keyword in text.lower() for keyword in keywords):
69
  classification["job role"] = role
70
  break
71
-
72
- # Education level
73
  education_levels = ["High School", "Associate", "Bachelor", "Master", "PhD"]
74
  for level in education_levels:
75
  if level.lower() in text.lower():
76
  classification["education"] = level
77
  break
78
-
79
- # Years of experience
80
  experience_match = re.search(r"(\d+)\s*(?:years?|yrs?)(?:\s+of)?\s+experience", text, re.IGNORECASE)
81
  if experience_match:
82
  classification["years_experience"] = experience_match.group(1)
83
-
84
- # Skills
85
  skills = ["Python", "Java", "C++", "JavaScript", "SQL", "AWS", "Docker", "Kubernetes",
86
  "Machine Learning", "Data Analysis", "Project Management", "Agile", "Scrum"]
87
  found_skills = [skill for skill in skills if skill.lower() in text.lower()]
88
  classification["skills"] = ", ".join(found_skills)
89
 
90
- # Phone number
91
  phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
92
  phone_match = re.search(phone_pattern, text)
93
  if phone_match:
94
  classification["phone number"] = phone_match.group()
95
 
96
- # Address (basic pattern, might need refinement)
97
  address_pattern = r'\d{1,5}\s\w+\s\w+\.?(?:\s\w+\.?)?\s*,?\s*\w+\s*,?\s*[A-Z]{2}\s*\d{5}'
98
  address_match = re.search(address_pattern, text)
99
  if address_match:
@@ -104,51 +95,37 @@ def classify_resume(text):
104
def create_resume_ranking_model(job_description, resume_directory):
    """Rank resumes in *resume_directory* by TF-IDF similarity to *job_description*.

    Parameters
    ----------
    job_description : str
        Free-text description of the role to match against.
    resume_directory : str
        Directory containing the resume files to process.

    Returns
    -------
    pandas.DataFrame
        One row per resume, sorted by descending ``similarity_score``,
        including the classified fields and the original ``filename``.
    """
    # Extract raw text from every resume file in the directory.
    resume_texts = process_resume_directory(resume_directory)

    # Pull structured fields (education, job role, skills, ...) per resume.
    classified_resumes = {filename: classify_resume(text)
                          for filename, text in resume_texts.items()}

    df = pd.DataFrame.from_dict(classified_resumes, orient='index')
    df['filename'] = df.index
    df.reset_index(drop=True, inplace=True)

    # classify_resume only sets the keys it actually matches, so an entire
    # column can be absent when no resume matched that category; without this
    # guard the column selections below raise KeyError.
    for col in ('education', 'job role', 'skills', 'years_experience'):
        if col not in df.columns:
            df[col] = ''
    # Rows that lack a value get NaN from from_dict; astype(str) would turn
    # that into the literal text "nan", polluting the combined text.
    df['years_experience'] = df['years_experience'].fillna('')

    # Fold the classified fields into one text blob per resume.
    df['combined_text'] = df[['education', 'job role', 'skills']].apply(
        lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    df['combined_text'] += ' ' + df['years_experience'].astype(str) + ' years experience'

    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = df['combined_text'].apply(preprocess_text)

    # Fit one vectorizer over the job description plus all resumes so they
    # share a vocabulary; row 0 of the matrix is the job description.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([preprocessed_jd] + list(preprocessed_resumes))

    # Cosine similarity of each resume (rows 1..n) against the JD (row 0).
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    df['similarity_score'] = cosine_similarities

    # Best match first.
    ranked_resumes = df.sort_values('similarity_score', ascending=False).reset_index(drop=True)

    return ranked_resumes
142
 
143
 
144
-
145
- #Streamlit App
146
-
147
  import streamlit as st
148
  import tempfile
149
  import os
150
 
151
- # Streamlit app
152
  st.title('Resume Ranking System')
153
 
154
  st.write("""
@@ -156,34 +133,27 @@ This app ranks resumes based on their similarity to a given job description.
156
  Upload resume files (PDF and DOCX formats) and enter a job description to get started.
157
  """)
158
 
159
- # Job description input
160
  job_description = st.text_area("Enter the job description:", height=200)
161
 
162
- # File uploader for resumes
163
  uploaded_files = st.file_uploader("Upload resume files", accept_multiple_files=True, type=['pdf', 'docx'])
164
 
165
  if st.button('Rank Resumes'):
166
  if job_description and uploaded_files:
167
  try:
168
- # Create a temporary directory to store uploaded files
169
  with tempfile.TemporaryDirectory() as temp_dir:
170
- # Save uploaded files to the temporary directory
171
  for uploaded_file in uploaded_files:
172
  file_path = os.path.join(temp_dir, uploaded_file.name)
173
  with open(file_path, "wb") as f:
174
  f.write(uploaded_file.getbuffer())
175
 
176
- # Process resumes
177
  with st.spinner('Processing resumes...'):
178
  ranked_resumes = create_resume_ranking_model(job_description, temp_dir)
179
 
180
  st.success('Resumes ranked successfully!')
181
 
182
- # Display results
183
  st.write("Top 5 Ranked Resumes:")
184
  st.dataframe(ranked_resumes.head())
185
 
186
- # Create a folder with ranked resumes
187
  output_folder = "ranked_resumes"
188
  if os.path.exists(output_folder):
189
  shutil.rmtree(output_folder)
@@ -193,11 +163,9 @@ if st.button('Rank Resumes'):
193
  src_file = os.path.join(temp_dir, row['filename'])
194
  dst_file = os.path.join(output_folder, f"{index+1:03d}_{row['filename']}")
195
  shutil.copy2(src_file, dst_file)
196
-
197
- # Create a zip file of the ranked resumes
198
  shutil.make_archive(output_folder, 'zip', output_folder)
199
 
200
- # Offer the zip file for download
201
  with open(f"{output_folder}.zip", "rb") as file:
202
  st.download_button(
203
  label="Download ranked resumes as ZIP",
@@ -205,8 +173,7 @@ if st.button('Rank Resumes'):
205
  file_name="ranked_resumes.zip",
206
  mime="application/zip"
207
  )
208
-
209
- # Option to download full results as CSV
210
  csv = ranked_resumes.to_csv(index=False)
211
  st.download_button(
212
  label="Download full results as CSV",
 
16
  import zipfile
17
 
18
 
 
 
19
  nltk.download('punkt', quiet=True)
20
  nltk.download('stopwords', quiet=True)
21
 
 
22
  def extract_text_from_docx(docx_path):
23
  doc = Document(docx_path)
24
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
 
52
 
53
  def classify_resume(text):
54
  classification = defaultdict(str)
55
+
 
56
  job_roles = {
57
  "software": ["software engineer", "developer", "programmer"],
58
  "data": ["data scientist", "data analyst", "machine learning"],
 
64
  if any(keyword in text.lower() for keyword in keywords):
65
  classification["job role"] = role
66
  break
67
+
 
68
  education_levels = ["High School", "Associate", "Bachelor", "Master", "PhD"]
69
  for level in education_levels:
70
  if level.lower() in text.lower():
71
  classification["education"] = level
72
  break
73
+
 
74
  experience_match = re.search(r"(\d+)\s*(?:years?|yrs?)(?:\s+of)?\s+experience", text, re.IGNORECASE)
75
  if experience_match:
76
  classification["years_experience"] = experience_match.group(1)
77
+
 
78
  skills = ["Python", "Java", "C++", "JavaScript", "SQL", "AWS", "Docker", "Kubernetes",
79
  "Machine Learning", "Data Analysis", "Project Management", "Agile", "Scrum"]
80
  found_skills = [skill for skill in skills if skill.lower() in text.lower()]
81
  classification["skills"] = ", ".join(found_skills)
82
 
 
83
  phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
84
  phone_match = re.search(phone_pattern, text)
85
  if phone_match:
86
  classification["phone number"] = phone_match.group()
87
 
 
88
  address_pattern = r'\d{1,5}\s\w+\s\w+\.?(?:\s\w+\.?)?\s*,?\s*\w+\s*,?\s*[A-Z]{2}\s*\d{5}'
89
  address_match = re.search(address_pattern, text)
90
  if address_match:
 
95
  def create_resume_ranking_model(job_description, resume_directory):
96
  # Process resumes
97
  resume_texts = process_resume_directory(resume_directory)
98
+
 
99
  classified_resumes = {filename: classify_resume(text) for filename, text in resume_texts.items()}
100
+
 
101
  df = pd.DataFrame.from_dict(classified_resumes, orient='index')
102
  df['filename'] = df.index
103
  df.reset_index(drop=True, inplace=True)
104
+
 
105
  df['combined_text'] = df[['education', 'job role', 'skills']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
106
+
 
107
  df['combined_text'] += ' ' + df['years_experience'].astype(str) + ' years experience'
108
+
 
109
  preprocessed_jd = preprocess_text(job_description)
110
  preprocessed_resumes = df['combined_text'].apply(preprocess_text)
111
+
 
112
  vectorizer = TfidfVectorizer()
113
+
 
114
  tfidf_matrix = vectorizer.fit_transform([preprocessed_jd] + list(preprocessed_resumes))
115
+
 
116
  cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
117
+
 
118
  df['similarity_score'] = cosine_similarities
119
+
 
120
  ranked_resumes = df.sort_values('similarity_score', ascending=False).reset_index(drop=True)
121
 
122
  return ranked_resumes
123
 
124
 
 
 
 
125
  import streamlit as st
126
  import tempfile
127
  import os
128
 
 
129
  st.title('Resume Ranking System')
130
 
131
  st.write("""
 
133
  Upload resume files (PDF and DOCX formats) and enter a job description to get started.
134
  """)
135
 
 
136
  job_description = st.text_area("Enter the job description:", height=200)
137
 
 
138
  uploaded_files = st.file_uploader("Upload resume files", accept_multiple_files=True, type=['pdf', 'docx'])
139
 
140
  if st.button('Rank Resumes'):
141
  if job_description and uploaded_files:
142
  try:
 
143
  with tempfile.TemporaryDirectory() as temp_dir:
 
144
  for uploaded_file in uploaded_files:
145
  file_path = os.path.join(temp_dir, uploaded_file.name)
146
  with open(file_path, "wb") as f:
147
  f.write(uploaded_file.getbuffer())
148
 
 
149
  with st.spinner('Processing resumes...'):
150
  ranked_resumes = create_resume_ranking_model(job_description, temp_dir)
151
 
152
  st.success('Resumes ranked successfully!')
153
 
 
154
  st.write("Top 5 Ranked Resumes:")
155
  st.dataframe(ranked_resumes.head())
156
 
 
157
  output_folder = "ranked_resumes"
158
  if os.path.exists(output_folder):
159
  shutil.rmtree(output_folder)
 
163
  src_file = os.path.join(temp_dir, row['filename'])
164
  dst_file = os.path.join(output_folder, f"{index+1:03d}_{row['filename']}")
165
  shutil.copy2(src_file, dst_file)
166
+
 
167
  shutil.make_archive(output_folder, 'zip', output_folder)
168
 
 
169
  with open(f"{output_folder}.zip", "rb") as file:
170
  st.download_button(
171
  label="Download ranked resumes as ZIP",
 
173
  file_name="ranked_resumes.zip",
174
  mime="application/zip"
175
  )
176
+
 
177
  csv = ranked_resumes.to_csv(index=False)
178
  st.download_button(
179
  label="Download full results as CSV",