Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import networkx as nx | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.neighbors import NearestNeighbors | |
| from sklearn.cluster import KMeans | |
| from collections import Counter | |
| from io import StringIO | |
| import pdfplumber | |
| import docx | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Load SBERT Model
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def _load_sentence_model(name):
    """Instantiate the SentenceTransformer identified by ``name``.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the instance as a resource keeps a single model
    object alive across reruns instead of constructing it again each time.
    """
    return SentenceTransformer(name)


# Module-level handle used by the encoding helpers in this file.
model = _load_sentence_model(MODEL_NAME)
| # Load Model & Data | |
def load_model(path="models/"):
    """Load the job table, the embedding array and the pickled KNN object.

    Args:
        path: Directory that contains ``knn_model.pkl``, ``embeddings.npy``
            and ``job_data.csv``. A trailing separator is optional.

    Returns:
        A ``(data, embeddings, knn)`` tuple: the DataFrame read from
        ``job_data.csv``, the array loaded from ``embeddings.npy`` and the
        object unpickled from ``knn_model.pkl``.

    Raises:
        FileNotFoundError: If any of the three files is missing.
    """
    from pathlib import Path

    # Join with pathlib so "models" and "models/" both resolve correctly;
    # plain string concatenation silently produced "modelsknn_model.pkl".
    base = Path(path)

    # SECURITY: unpickling executes arbitrary code. Only point this at
    # artifacts produced by a trusted training run.
    with open(base / "knn_model.pkl", "rb") as f:
        knn = pickle.load(f)
    embeddings = np.load(base / "embeddings.npy")
    data = pd.read_csv(base / "job_data.csv")
    print("loaded model")
    return data, embeddings, knn
def cluster_skills(skills, n_clusters=10):
    """Cluster skills with KMeans over their sentence embeddings.

    Args:
        skills: Iterable of skill strings.
        n_clusters: Requested number of clusters. It is reduced to
            ``len(skills)`` when fewer skills are supplied, because KMeans
            cannot fit more clusters than it has samples.

    Returns:
        Dict mapping each skill to its cluster label. Empty when ``skills``
        is empty. Duplicate skills collapse to one key (last label wins).
    """
    skills = list(skills)
    if not skills:
        return {}

    # Encode the whole list in one call so the model can batch the work
    # instead of running one forward pass per skill.
    vectorized_skills = np.asarray(model.encode(skills))

    effective_clusters = max(1, min(n_clusters, len(skills)))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    labels = kmeans.fit_predict(vectorized_skills)
    return dict(zip(skills, labels))
def gap_analysis(user_skills, job_skills):
    """Return the job's skills that are absent from the user's skills.

    Skills are compared after trimming surrounding whitespace and folding
    case, so ``" SQL "`` satisfies a requirement spelled ``"sql"``. Callers
    that split free text on commas routinely produce such variants.

    Args:
        user_skills: Iterable of skill strings the user has.
        job_skills: Iterable of skill strings the job asks for.

    Returns:
        List of missing skills in the order they appear in ``job_skills``,
        whitespace-trimmed, without duplicates. Blank and non-string entries
        are ignored.
    """
    def _key(skill):
        return skill.strip().casefold()

    owned = {_key(s) for s in (user_skills or ()) if isinstance(s, str)}

    missing = []
    reported = set()
    for skill in job_skills or ():
        if not isinstance(skill, str):
            continue
        key = _key(skill)
        if not key or key in owned or key in reported:
            continue
        reported.add(key)
        missing.append(skill.strip())
    return missing
def find_matching_jobs(user_skills, data, embeddings, knn, top_n=5):
    """Return the rows of ``data`` nearest to the user's skills.

    The skills are joined into one comma-separated string, embedded with the
    module-level sentence model, and looked up with ``knn.kneighbors``.

    Args:
        user_skills: Iterable of skill strings; blank entries are dropped
            and surrounding whitespace is trimmed before embedding.
        data: DataFrame of jobs. Assumes row positions line up with the
            samples ``knn`` was fitted on -- TODO confirm against training.
        embeddings: Unused here; kept so existing callers keep working.
        knn: Fitted object exposing ``kneighbors(X, n_neighbors=...)``.
        top_n: Maximum number of jobs to return; capped at ``len(data)``.

    Returns:
        DataFrame with the matching rows. Empty (same columns) when no
        usable skill is supplied or no neighbours can be requested.
    """
    cleaned = [s.strip() for s in user_skills if isinstance(s, str) and s.strip()]
    n_neighbors = min(top_n, len(data))

    # Embedding an empty string would still return "matches", which are
    # meaningless; asking for more neighbours than rows raises in kneighbors.
    if not cleaned or n_neighbors <= 0:
        return data.iloc[0:0]

    user_embedding = model.encode(", ".join(cleaned)).reshape(1, -1)
    _, indices = knn.kneighbors(user_embedding, n_neighbors=n_neighbors)
    return data.iloc[indices[0]]
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded TXT, PDF or DOCX file.

    Args:
        uploaded_file: Object with a ``name`` attribute; ``.txt`` inputs
            must also provide ``getvalue()`` returning bytes. PDF and DOCX
            inputs are handed to ``pdfplumber.open`` / ``docx.Document``.

    Returns:
        The extracted text. PDF pages and DOCX paragraphs are joined with a
        single space. An empty string is returned for any other extension.
    """
    # Match the extension case-insensitively so "CV.PDF" is not ignored.
    name = uploaded_file.name.lower()

    if name.endswith(".txt"):
        # utf-8-sig drops a leading BOM; undecodable bytes become U+FFFD
        # instead of aborting the upload with UnicodeDecodeError.
        return uploaded_file.getvalue().decode("utf-8-sig", errors="replace")

    if name.endswith(".pdf"):
        with pdfplumber.open(uploaded_file) as pdf:
            # Extract each page once; pages without text yield a falsy value.
            page_texts = (page.extract_text() for page in pdf.pages)
            return " ".join(text for text in page_texts if text)

    if name.endswith(".docx"):
        doc = docx.Document(uploaded_file)
        return " ".join(para.text for para in doc.paragraphs)

    return ""
def show_skill_trends(data):
    """Render the most frequent skills and their per-day posting counts.

    Draws a bar chart of the 15 most frequent skills, then a line chart of
    how often each of those skills appears per posting day. The caller's
    DataFrame is left unmodified.

    Args:
        data: DataFrame with an ``original_listed_time`` column (parsed as
            milliseconds since the epoch) and a ``skills_desc`` column
            holding ``", "``-separated skill strings. Missing or
            non-string skill cells are treated as having no skills.
    """
    def _as_skill_list(value):
        # Non-string cells (e.g. NaN from the CSV) carry no skills.
        if isinstance(value, str):
            return [skill for skill in value.split(", ") if skill]
        if isinstance(value, list):
            return value
        return []

    # Work on a copy: other code reads data['skills_desc'] as a string.
    frame = data[['original_listed_time', 'skills_desc']].copy()
    # Bucket by calendar day. Grouping on raw millisecond timestamps puts
    # nearly every posting in its own group, so every count would be 1.
    frame['original_listed_time'] = pd.to_datetime(
        frame['original_listed_time'], unit='ms', errors='coerce'
    ).dt.floor('D')
    frame['skills_desc'] = frame['skills_desc'].apply(_as_skill_list)
    # Rows with an empty skill list explode to NaN; drop them.
    per_skill = frame.explode('skills_desc').dropna(subset=['skills_desc'])

    skill_counts = Counter(per_skill['skills_desc'])
    top_skills = pd.DataFrame(skill_counts.most_common(15), columns=['Skill', 'Count'])
    st.subheader("Top In-Demand Skills")
    st.bar_chart(top_skills.set_index("Skill"))

    skill_trends = (
        per_skill.groupby(['original_listed_time', 'skills_desc'])
        .size()
        .reset_index(name='count')
    )
    top_trending_skills = skill_trends[skill_trends['skills_desc'].isin(top_skills['Skill'])]
    if top_trending_skills.empty:
        st.warning("No skill trends found. Try using a different dataset.")
        return

    # Use an explicit figure rather than pyplot's implicit global one, and
    # close it after rendering so figures do not pile up across reruns.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(
        data=top_trending_skills,
        x='original_listed_time',
        y='count',
        hue='skills_desc',
        ax=ax,
    )
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_title("Skill Demand Trends Over Time")
    ax.set_xlabel("Job Posting Date")
    st.pyplot(fig)
    plt.close(fig)
# Streamlit UI
st.title("Job Skill Matching System")
st.write("Upload your resume or manually enter skills to analyze skill gaps and job relationships.")
data, embeddings, knn = load_model()


def _split_skills(text):
    """Split comma-separated text into trimmed, non-empty skill names."""
    return [part.strip() for part in text.split(",") if part.strip()]


# Upload Resume
uploaded_file = st.file_uploader("Upload Resume (PDF, DOCX, or TXT format)", type=["pdf", "docx", "txt"])
user_skills = []
if uploaded_file is not None:
    extracted_text = extract_text_from_file(uploaded_file)
    # Trim each entry: "python, sql" must yield "sql", not " sql", or it
    # will never equal a job skill during gap analysis.
    user_skills = _split_skills(extracted_text)

# Manual Skill Entry
manual_input = st.text_area("Or Enter Your Skills (comma-separated)")
if manual_input:
    user_skills.extend(_split_skills(manual_input))

# Select Job Role
# Drop missing titles: a NaN option never equals itself, so selecting it
# would match no rows and crash on .iloc[0].
target_job = st.selectbox("Select a Job Role for finding missing Skills", data["title"].dropna().unique())
job_rows = data[data["title"] == target_job]
job_skills = []
if not job_rows.empty:
    raw_job_skills = job_rows.iloc[0]["skills_desc"]
    # skills_desc may be NaN (a float) when the CSV cell is empty.
    if isinstance(raw_job_skills, str):
        job_skills = raw_job_skills.split(", ")

if st.button("Find Matching Jobs"):
    if not user_skills:
        # Without skills the query embedding is meaningless.
        st.warning("Please upload a resume or enter at least one skill first.")
    else:
        matching_jobs = find_matching_jobs(user_skills, data, embeddings, knn)
        st.write("### Best Matching Jobs:")
        st.dataframe(matching_jobs[["title", "company_name", "location", "skills_desc", "job_posting_url"]])

if st.button("Analyze Skill Gap"):
    missing_skills = gap_analysis(user_skills, job_skills)
    st.write("### Missing Skills:", missing_skills if missing_skills else "None")

if st.button("Show Skill Demand Trends"):
    show_skill_trends(data)