TheAllanB committed on
Commit
e7ee614
·
verified ·
1 Parent(s): 3db1942
Files changed (1) hide show
  1. app.py +16 -49
app.py CHANGED
@@ -16,12 +16,9 @@ import shutil
16
  import zipfile
17
 
18
 
19
-
20
- # Download necessary NLTK data
21
  nltk.download('punkt', quiet=True)
22
  nltk.download('stopwords', quiet=True)
23
 
24
- # Functions from the previous script
25
def extract_text_from_docx(docx_path):
    """Return the full text of a .docx file as one space-separated string.

    Opens *docx_path* with python-docx and concatenates the text of every
    paragraph in document order.
    """
    document = Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)
@@ -55,8 +52,7 @@ def preprocess_text(text):
55
 
56
  def classify_resume(text):
57
  classification = defaultdict(str)
58
-
59
- # Job role/industry
60
  job_roles = {
61
  "software": ["software engineer", "developer", "programmer"],
62
  "data": ["data scientist", "data analyst", "machine learning"],
@@ -68,32 +64,27 @@ def classify_resume(text):
68
  if any(keyword in text.lower() for keyword in keywords):
69
  classification["job role"] = role
70
  break
71
-
72
- # Education level
73
  education_levels = ["High School", "Associate", "Bachelor", "Master", "PhD"]
74
  for level in education_levels:
75
  if level.lower() in text.lower():
76
  classification["education"] = level
77
  break
78
-
79
- # Years of experience
80
  experience_match = re.search(r"(\d+)\s*(?:years?|yrs?)(?:\s+of)?\s+experience", text, re.IGNORECASE)
81
  if experience_match:
82
  classification["years_experience"] = experience_match.group(1)
83
-
84
- # Skills
85
  skills = ["Python", "Java", "C++", "JavaScript", "SQL", "AWS", "Docker", "Kubernetes",
86
  "Machine Learning", "Data Analysis", "Project Management", "Agile", "Scrum"]
87
  found_skills = [skill for skill in skills if skill.lower() in text.lower()]
88
  classification["skills"] = ", ".join(found_skills)
89
 
90
- # Phone number
91
  phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
92
  phone_match = re.search(phone_pattern, text)
93
  if phone_match:
94
  classification["phone number"] = phone_match.group()
95
 
96
- # Address (basic pattern, might need refinement)
97
  address_pattern = r'\d{1,5}\s\w+\s\w+\.?(?:\s\w+\.?)?\s*,?\s*\w+\s*,?\s*[A-Z]{2}\s*\d{5}'
98
  address_match = re.search(address_pattern, text)
99
  if address_match:
@@ -104,51 +95,37 @@ def classify_resume(text):
104
def create_resume_ranking_model(job_description, resume_directory):
    """Rank resumes in *resume_directory* by TF-IDF similarity to *job_description*.

    Parameters
    ----------
    job_description : str
        Free-text description of the role to match against.
    resume_directory : str
        Directory containing the resume files to process.

    Returns
    -------
    pandas.DataFrame
        One row per resume, sorted by descending ``similarity_score``,
        including the classified fields and the original ``filename``.
    """
    # Extract raw text from every resume file in the directory.
    resume_texts = process_resume_directory(resume_directory)

    # Pull structured fields (education, job role, skills, ...) per resume.
    classified_resumes = {filename: classify_resume(text)
                          for filename, text in resume_texts.items()}

    df = pd.DataFrame.from_dict(classified_resumes, orient='index')
    df['filename'] = df.index
    df.reset_index(drop=True, inplace=True)

    # classify_resume only sets the keys it actually matches, so an entire
    # column can be absent when no resume matched that category; without this
    # guard the column selections below raise KeyError.
    for col in ('education', 'job role', 'skills', 'years_experience'):
        if col not in df.columns:
            df[col] = ''
    # Rows that lack a value get NaN from from_dict; astype(str) would turn
    # that into the literal text "nan", polluting the combined text.
    df['years_experience'] = df['years_experience'].fillna('')

    # Fold the classified fields into one text blob per resume.
    df['combined_text'] = df[['education', 'job role', 'skills']].apply(
        lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    df['combined_text'] += ' ' + df['years_experience'].astype(str) + ' years experience'

    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = df['combined_text'].apply(preprocess_text)

    # Fit one vectorizer over the job description plus all resumes so they
    # share a vocabulary; row 0 of the matrix is the job description.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([preprocessed_jd] + list(preprocessed_resumes))

    # Cosine similarity of each resume (rows 1..n) against the JD (row 0).
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    df['similarity_score'] = cosine_similarities

    # Best match first.
    ranked_resumes = df.sort_values('similarity_score', ascending=False).reset_index(drop=True)

    return ranked_resumes
142
 
143
 
144
-
145
- #Streamlit App
146
-
147
  import streamlit as st
148
  import tempfile
149
  import os
150
 
151
- # Streamlit app
152
  st.title('Resume Ranking System')
153
 
154
  st.write("""
@@ -156,34 +133,27 @@ This app ranks resumes based on their similarity to a given job description.
156
  Upload resume files (PDF and DOCX formats) and enter a job description to get started.
157
  """)
158
 
159
- # Job description input
160
  job_description = st.text_area("Enter the job description:", height=200)
161
 
162
- # File uploader for resumes
163
  uploaded_files = st.file_uploader("Upload resume files", accept_multiple_files=True, type=['pdf', 'docx'])
164
 
165
  if st.button('Rank Resumes'):
166
  if job_description and uploaded_files:
167
  try:
168
- # Create a temporary directory to store uploaded files
169
  with tempfile.TemporaryDirectory() as temp_dir:
170
- # Save uploaded files to the temporary directory
171
  for uploaded_file in uploaded_files:
172
  file_path = os.path.join(temp_dir, uploaded_file.name)
173
  with open(file_path, "wb") as f:
174
  f.write(uploaded_file.getbuffer())
175
 
176
- # Process resumes
177
  with st.spinner('Processing resumes...'):
178
  ranked_resumes = create_resume_ranking_model(job_description, temp_dir)
179
 
180
  st.success('Resumes ranked successfully!')
181
 
182
- # Display results
183
  st.write("Top 5 Ranked Resumes:")
184
  st.dataframe(ranked_resumes.head())
185
 
186
- # Create a folder with ranked resumes
187
  output_folder = "ranked_resumes"
188
  if os.path.exists(output_folder):
189
  shutil.rmtree(output_folder)
@@ -193,11 +163,9 @@ if st.button('Rank Resumes'):
193
  src_file = os.path.join(temp_dir, row['filename'])
194
  dst_file = os.path.join(output_folder, f"{index+1:03d}_{row['filename']}")
195
  shutil.copy2(src_file, dst_file)
196
-
197
- # Create a zip file of the ranked resumes
198
  shutil.make_archive(output_folder, 'zip', output_folder)
199
 
200
- # Offer the zip file for download
201
  with open(f"{output_folder}.zip", "rb") as file:
202
  st.download_button(
203
  label="Download ranked resumes as ZIP",
@@ -205,8 +173,7 @@ if st.button('Rank Resumes'):
205
  file_name="ranked_resumes.zip",
206
  mime="application/zip"
207
  )
208
-
209
- # Option to download full results as CSV
210
  csv = ranked_resumes.to_csv(index=False)
211
  st.download_button(
212
  label="Download full results as CSV",
 
16
  import zipfile
17
 
18
 
 
 
19
  nltk.download('punkt', quiet=True)
20
  nltk.download('stopwords', quiet=True)
21
 
 
22
  def extract_text_from_docx(docx_path):
23
  doc = Document(docx_path)
24
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
 
52
 
53
  def classify_resume(text):
54
  classification = defaultdict(str)
55
+
 
56
  job_roles = {
57
  "software": ["software engineer", "developer", "programmer"],
58
  "data": ["data scientist", "data analyst", "machine learning"],
 
64
  if any(keyword in text.lower() for keyword in keywords):
65
  classification["job role"] = role
66
  break
67
+
 
68
  education_levels = ["High School", "Associate", "Bachelor", "Master", "PhD"]
69
  for level in education_levels:
70
  if level.lower() in text.lower():
71
  classification["education"] = level
72
  break
73
+
 
74
  experience_match = re.search(r"(\d+)\s*(?:years?|yrs?)(?:\s+of)?\s+experience", text, re.IGNORECASE)
75
  if experience_match:
76
  classification["years_experience"] = experience_match.group(1)
77
+
 
78
  skills = ["Python", "Java", "C++", "JavaScript", "SQL", "AWS", "Docker", "Kubernetes",
79
  "Machine Learning", "Data Analysis", "Project Management", "Agile", "Scrum"]
80
  found_skills = [skill for skill in skills if skill.lower() in text.lower()]
81
  classification["skills"] = ", ".join(found_skills)
82
 
 
83
  phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
84
  phone_match = re.search(phone_pattern, text)
85
  if phone_match:
86
  classification["phone number"] = phone_match.group()
87
 
 
88
  address_pattern = r'\d{1,5}\s\w+\s\w+\.?(?:\s\w+\.?)?\s*,?\s*\w+\s*,?\s*[A-Z]{2}\s*\d{5}'
89
  address_match = re.search(address_pattern, text)
90
  if address_match:
 
95
  def create_resume_ranking_model(job_description, resume_directory):
96
  # Process resumes
97
  resume_texts = process_resume_directory(resume_directory)
98
+
 
99
  classified_resumes = {filename: classify_resume(text) for filename, text in resume_texts.items()}
100
+
 
101
  df = pd.DataFrame.from_dict(classified_resumes, orient='index')
102
  df['filename'] = df.index
103
  df.reset_index(drop=True, inplace=True)
104
+
 
105
  df['combined_text'] = df[['education', 'job role', 'skills']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
106
+
 
107
  df['combined_text'] += ' ' + df['years_experience'].astype(str) + ' years experience'
108
+
 
109
  preprocessed_jd = preprocess_text(job_description)
110
  preprocessed_resumes = df['combined_text'].apply(preprocess_text)
111
+
 
112
  vectorizer = TfidfVectorizer()
113
+
 
114
  tfidf_matrix = vectorizer.fit_transform([preprocessed_jd] + list(preprocessed_resumes))
115
+
 
116
  cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
117
+
 
118
  df['similarity_score'] = cosine_similarities
119
+
 
120
  ranked_resumes = df.sort_values('similarity_score', ascending=False).reset_index(drop=True)
121
 
122
  return ranked_resumes
123
 
124
 
 
 
 
125
  import streamlit as st
126
  import tempfile
127
  import os
128
 
 
129
  st.title('Resume Ranking System')
130
 
131
  st.write("""
 
133
  Upload resume files (PDF and DOCX formats) and enter a job description to get started.
134
  """)
135
 
 
136
  job_description = st.text_area("Enter the job description:", height=200)
137
 
 
138
  uploaded_files = st.file_uploader("Upload resume files", accept_multiple_files=True, type=['pdf', 'docx'])
139
 
140
  if st.button('Rank Resumes'):
141
  if job_description and uploaded_files:
142
  try:
 
143
  with tempfile.TemporaryDirectory() as temp_dir:
 
144
  for uploaded_file in uploaded_files:
145
  file_path = os.path.join(temp_dir, uploaded_file.name)
146
  with open(file_path, "wb") as f:
147
  f.write(uploaded_file.getbuffer())
148
 
 
149
  with st.spinner('Processing resumes...'):
150
  ranked_resumes = create_resume_ranking_model(job_description, temp_dir)
151
 
152
  st.success('Resumes ranked successfully!')
153
 
 
154
  st.write("Top 5 Ranked Resumes:")
155
  st.dataframe(ranked_resumes.head())
156
 
 
157
  output_folder = "ranked_resumes"
158
  if os.path.exists(output_folder):
159
  shutil.rmtree(output_folder)
 
163
  src_file = os.path.join(temp_dir, row['filename'])
164
  dst_file = os.path.join(output_folder, f"{index+1:03d}_{row['filename']}")
165
  shutil.copy2(src_file, dst_file)
166
+
 
167
  shutil.make_archive(output_folder, 'zip', output_folder)
168
 
 
169
  with open(f"{output_folder}.zip", "rb") as file:
170
  st.download_button(
171
  label="Download ranked resumes as ZIP",
 
173
  file_name="ranked_resumes.zip",
174
  mime="application/zip"
175
  )
176
+
 
177
  csv = ranked_resumes.to_csv(index=False)
178
  st.download_button(
179
  label="Download full results as CSV",