Spaces:

Nikhil-Murade
/

TalentEdge

Sleeping

"""Streamlit front end that maps two job titles to KMeans clusters and compares them."""
import logging

import joblib
import numpy as np
import streamlit as st

from preprocessing import preprocess_single_title

# Configure logging for errors
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Number of highest-weighted vocabulary terms shown for each cluster.
TOP_WORDS_PER_CLUSTER = 5


@st.cache_resource
def load_models():
    """Load the fitted vectorizer and KMeans model from disk.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps the unpickled objects alive so the two files
    are read once per server process instead of once per rerun.

    Returns:
        A ``(vectorizer, kmeans_model)`` tuple.
    """
    # NOTE: joblib.load unpickles arbitrary objects and can execute code while
    # doing so; only ship model files that come from a trusted source.
    return joblib.load('vectorizer_model.pkl'), joblib.load('kmeans_model.pkl')


def get_top_words(cluster, vectorizer, kmeans_model, n_words=TOP_WORDS_PER_CLUSTER):
    """Return the vocabulary terms with the largest weight in a cluster centroid.

    Args:
        cluster: Index of the cluster in ``kmeans_model.cluster_centers_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        n_words: How many terms to return.

    Returns:
        A list of at most ``n_words`` terms, heaviest first.
    """
    feature_names = vectorizer.get_feature_names_out()
    centroid = kmeans_model.cluster_centers_[cluster]
    # argsort is ascending, so reverse it before taking the leading entries.
    top_word_indices = np.argsort(centroid)[::-1][:n_words]
    return [feature_names[i] for i in top_word_indices]


def show_comparison(job_title_1, job_title_2):
    """Cluster both titles and render the comparison in the Streamlit page.

    Any exception raised while preprocessing or predicting is logged with its
    traceback and reported to the user instead of crashing the script run.
    """
    try:
        # Preprocess the input job titles
        clean_title_1 = preprocess_single_title(job_title_1)
        clean_title_2 = preprocess_single_title(job_title_2)
        # Log the preprocessed titles
        logger.info("Preprocessed Title 1: %s", clean_title_1)
        logger.info("Preprocessed Title 2: %s", clean_title_2)

        # A title made only of stripped words (e.g. "Remote job") cleans down
        # to an empty string; predicting on it would still return a cluster
        # and present it as if it were meaningful.
        if not clean_title_1 or not clean_title_2:
            st.error(
                "A job title contains no usable words after cleaning. "
                "Please enter a more descriptive title."
            )
            return

        # Vectorize the preprocessed job titles
        title_vector_1 = vectorizer.transform([clean_title_1])
        title_vector_2 = vectorizer.transform([clean_title_2])
        # Predict clusters for each job title
        cluster_1 = kmeans_model.predict(title_vector_1)[0]
        cluster_2 = kmeans_model.predict(title_vector_2)[0]

        # Display results
        st.write(f"Cluster for '{job_title_1}': {cluster_1}")
        st.write(f"Cluster for '{job_title_2}': {cluster_2}")
        if cluster_1 == cluster_2:
            st.success(f"The job titles '{job_title_1}' and '{job_title_2}' belong to the same cluster!")
        else:
            st.warning(f"The job titles '{job_title_1}' and '{job_title_2}' do not belong to the same cluster.")

        # Display top words for the predicted clusters
        top_words_1 = get_top_words(cluster_1, vectorizer, kmeans_model)
        top_words_2 = get_top_words(cluster_2, vectorizer, kmeans_model)
        st.write(f"Top words in Cluster {cluster_1}: {', '.join(top_words_1)}")
        st.write(f"Top words in Cluster {cluster_2}: {', '.join(top_words_2)}")
    except Exception as e:  # UI boundary: report the failure rather than crash
        logger.exception("Error occurred: %s", e)
        st.error(f"An error occurred: {e}")


# Load the pre-trained models (cached across reruns)
vectorizer, kmeans_model = load_models()

# Streamlit app title
st.title("Job Title Clustering App")
# Display Silhouette Score in the sidebar
st.sidebar.header("Insights")  # This creates a big heading in the sidebar
# NOTE(review): the score is hard-coded; presumably it must be updated by hand
# whenever the model files are retrained.
st.sidebar.write("Silhouette Score: 0.5840")
# Input fields for job titles
job_title_1 = st.text_input("Enter the first job title:")
job_title_2 = st.text_input("Enter the second job title:")
# Button to process the inputs
if st.button("Submit"):
    # Strip first so whitespace-only input is treated as missing.
    if not job_title_1.strip() or not job_title_2.strip():
        st.error("Please enter both job titles.")
    else:
        show_comparison(job_title_1, job_title_2)

preprocessing.py ADDED Viewed

	@@ -0,0 +1,92 @@

import logging
import re

# Configure logging: timestamp, severity, then the message text.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)
class JobTitlePreprocessor():
    """Normalise raw job titles before they are vectorized.

    Cleaning happens in two passes that callers run in this order:

    1. ``remove_location_unwanted_words_brackets`` drops work-arrangement
       words, bracketed asides, punctuation and numbers greater than 10.
    2. ``preprocess`` lowercases the title, expands abbreviations
       (``sr`` -> ``Senior``, ``ml`` -> ``Machine Learning``, Roman numerals
       -> digits) and collapses data-science role variants to canonical names.
    """

    # Bracketed asides such as "(Remote)"; non-greedy so that several bracket
    # groups in one title are removed independently of each other.
    _BRACKETED = re.compile(r'\[.*?\]|\(.*?\)|\{.*?\}')
    _NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9\s]')
    # Integers strictly greater than 10: 11-19, 20-99, then three or more
    # digits. A plain "two or more digits" alternative would also delete 10.
    _NUMBER_ABOVE_TEN = re.compile(r'\b(?:1[1-9]|[2-9][0-9]|[1-9][0-9]{2,})\b')
    _WHITESPACE_RUN = re.compile(r'\s+')

    # Abbreviation and Roman-numeral expansions, applied in this order.
    _TERM_REPLACEMENTS = tuple(
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            # The optional dot sits after the word boundary so that "sr." is
            # consumed whole instead of leaving "Senior." behind.
            (r'\b(?:sr|senior)\b\.?', 'Senior'),
            (r'\b(?:jr|junior)\b\.?', 'Junior'),
            (r'\b(?:aiml|ml|machinelearning|machine[_\-]learning)\b', 'Machine Learning'),
            (r'\b(?:genai|generative[_\-]ai|generativeai)\b', 'Generative AI'),
            (r'\b(?:nlp|natural[_\-]language[_\-]processing|natural language processing)\b', 'NLP'),
            (r'\bi\b', '1'),
            (r'\bii\b', '2'),
            (r'\biii\b', '3'),
            (r'\biv\b', '4'),
            (r'\bv\b', '5'),
        )
    )

    # Each rule rewrites "<seniority word> ... data scientist/science" to one
    # canonical title, discarding whatever words sat in between.
    _DATA_SCIENTIST_RULES = tuple(
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r'\b(director|dir\.?|dir)\b.*?(data\s*scientist|data\s*science)', 'Director Data Scientist'),
            (r'\b(manager|mgr)\b.*?(data\s*scientist|data\s*science)', 'Manager Data Scientist'),
            (r'\b(lead)\b.*?(data\s*scientist|data\s*science)', 'Lead Data Scientist'),
            (r'\b(associate|associates?)\b.*?(data\s*scientist|data\s*science)', 'Associate Data Scientist'),
            (r'\b(applied)\b.*?(data\s*scientist|data\s*science)', 'Applied Data Scientist'),
            (r'\b(intern|internship|trainee)\b.*?(data\s*scientist|data\s*science)', 'Intern Data Scientist'),
        )
    )

    def __init__(self):
        # Whole words/phrases that describe the work arrangement, not the role.
        self.unwanted_words = ['remote', 'hybrid', 'flexible location', 'location', 'open to work', 'role', 'job', 'level', 'remot']

    def _strip_unwanted_words(self, title: str) -> str:
        """Delete every whole-word occurrence of ``self.unwanted_words``, ignoring case."""
        # Built from the list on each call so that later edits to
        # ``unwanted_words`` are honoured; ``re`` caches the compiled patterns.
        for word in self.unwanted_words:
            pattern = r'\b{}\b'.format(re.escape(word))
            title = re.sub(pattern, '', title, flags=re.IGNORECASE)
        return title

    def remove_location_unwanted_words_brackets(self, title: str) -> str:
        """Strip unwanted words, bracketed content, symbols and numbers above 10.

        Args:
            title: Raw job title. Non-string values are returned unchanged,
                matching ``preprocess``.

        Returns:
            The title with single spaces between the remaining tokens and no
            leading or trailing whitespace.
        """
        if not isinstance(title, str):
            return title
        title = self._strip_unwanted_words(title)
        # Remove content within brackets
        title = self._BRACKETED.sub('', title)
        # Remove any non-alphanumeric characters (keeping spaces)
        title = self._NON_ALPHANUMERIC.sub('', title)
        # Remove numbers greater than 10
        title = self._NUMBER_ABOVE_TEN.sub('', title)
        # Clean up extra spaces
        return self._WHITESPACE_RUN.sub(' ', title).strip()

    def preprocess(self, title: str) -> str:
        """Lowercase the title, expand abbreviations and standardise role names.

        Args:
            title: Job title. Non-string values are returned unchanged.

        Returns:
            The normalised title. Text inserted by a replacement keeps its
            canonical capitalisation (e.g. ``Senior``); everything else is
            lowercase.
        """
        if not isinstance(title, str):
            return title
        title = self._strip_unwanted_words(title.lower())
        # Replace specific terms and Roman numerals
        for pattern, replacement in self._TERM_REPLACEMENTS:
            title = pattern.sub(replacement, title)
        # Handle specific Data Scientist cases
        for pattern, replacement in self._DATA_SCIENTIST_RULES:
            title = pattern.sub(replacement, title)
        # Clean up extra spaces
        return self._WHITESPACE_RUN.sub(' ', title).strip()
def preprocess_single_title(title: str) -> str:
    """Run both cleaning passes of ``JobTitlePreprocessor`` on one title.

    The stripping pass (unwanted words, brackets, symbols, large numbers)
    runs first; its output is then fed to the normalisation pass.
    """
    cleaner = JobTitlePreprocessor()
    stripped = cleaner.remove_location_unwanted_words_brackets(title)
    return cleaner.preprocess(stripped)
if __name__ == "__main__":
    # Manual smoke test: clean one sample title and log it before and after.
    title = "Senior Remote Machine Learning Data Scientist (Manager)"
    clean_title = preprocess_single_title(title)
    for message in (
        f"Original title: {title}",
        f"Preprocessed title: {clean_title}",
    ):
        logger.info(message)

requirements.txt ADDED Viewed

	@@ -0,0 +1,427 @@

+aiohttp==3.9.5
+aiosignal==1.3.1
+alembic==1.13.2
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+appdirs==1.4.4
+apturl==0.5.2
+argcomplete==1.10.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+arxiv==2.1.3
+asgiref==3.8.1
+asttokens==2.4.1
+async-lru==2.0.4
+async-timeout==4.0.3
+attrs==23.2.0
+auth0-python==4.7.1
+Babel==2.15.0
+backoff==2.2.1
+bcrypt==4.2.0
+beautifulsoup4==4.12.3
+black==24.4.2
+bleach==6.1.0
+blinker==1.8.2
+blis==0.7.11
+boto3==1.35.14
+botocore==1.35.14
+Brlapi==0.8.3
+build==1.2.2
+cachetools==5.3.3
+catalogue==2.0.10
+certifi==2024.6.2
+cffi==1.16.0
+chardet==3.0.4
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.4.24
+click==8.1.7
+cloudpathlib==0.18.1
+cog==0.9.7
+cohere==5.9.1
+colorama==0.4.4
+coloredlogs==15.0.1
+comm==0.2.2
+command-not-found==0.3
+compressed_rtf==1.0.6
+confection==0.1.5
+crewai==0.55.2
+crewai-tools==0.12.0
+cryptography==42.0.8
+cupshelpers==1.0
+cymem==2.0.8
+dataclasses-json==0.6.6
+datasets==3.0.0
+dbus-python==1.2.18
+debugpy==1.8.1
+decorator==5.1.1
+defer==1.0.6
+defusedxml==0.7.1
+Deprecated==1.2.14
+deprecation==2.1.0
+dill==0.3.8
+dirtyjson==1.0.8
+distlib==0.3.8
+distro==1.9.0
+distro-info==1.1+ubuntu0.2
+dnspython==2.6.1
+docker==7.1.0
+docstring_parser==0.16
+docx2txt==0.8
+duplicity==0.8.21
+ebcdic==1.1.1
+embedchain==0.1.121
+entrypoints==0.4
+exceptiongroup==1.2.1
+executing==2.0.1
+extract-msg==0.28.7
+fastapi==0.114.0
+fastavro==1.9.7
+fasteners==0.14.1
+fastjsonschema==2.19.1
+fasttext==0.9.3
+feedparser==6.0.11
+filelock==3.14.0
+fireworks-ai==0.15.1
+Flask==3.0.3
+flatbuffers==24.3.25
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.0
+future==0.18.2
+gitdb==4.0.11
+GitPython==3.1.43
+google-api-core==2.19.2
+google-auth==2.34.0
+google-cloud-aiplatform==1.65.0
+google-cloud-bigquery==3.25.0
+google-cloud-core==2.4.1
+google-cloud-resource-manager==1.12.5
+google-cloud-storage==2.18.2
+google-crc32c==1.6.0
+google-resumable-media==2.7.2
+googleapis-common-protos==1.65.0
+gptcache==0.1.44
+greenlet==3.0.3
+grpc-google-iam-v1==0.13.1
+grpcio==1.64.1
+grpcio-status==1.62.3
+grpcio-tools==1.62.2
+h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
+httpcore==1.0.5
+httplib2==0.20.2
+httptools==0.6.1
+httpx==0.27.0
+httpx-sse==0.4.0
+huggingface-hub==0.24.6
+humanfriendly==10.0
+hyperframe==6.0.1
+idna==3.7
+IMAPClient==2.1.0
+importlib_metadata==7.1.0
+importlib_resources==6.4.4
+iniconfig==2.0.0
+instructor==1.3.3
+ipykernel==6.29.4
+ipython==8.24.0
+ipywidgets==8.1.2
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jeepney==0.7.1
+Jinja2==3.1.4
+jiter==0.4.2
+jmespath==1.0.1
+joblib==1.4.2
+json5==0.9.25
+json_repair==0.25.3
+jsonpatch==1.33
+jsonpointer==2.4
+jsonref==1.1.0
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+jupyter==1.0.0
+jupyter-console==6.6.3
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.1
+jupyter_core==5.7.2
+jupyter_server==2.14.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.1.8
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.1
+jupyterlab_widgets==3.0.10
+kazam==1.4.5
+keyring==23.5.0
+kubernetes==30.1.0
+lancedb==0.5.7
+langchain==0.2.16
+langchain-cohere==0.1.9
+langchain-community==0.2.16
+langchain-core==0.2.38
+langchain-experimental==0.0.65
+langchain-fireworks==0.1.7
+langchain-mongodb==0.1.9
+langchain-openai==0.1.9
+langchain-text-splitters==0.2.4
+langcodes==3.4.0
+langsmith==0.1.120
+language-selector==0.1
+language_data==1.2.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+llama-index==0.10.43
+llama-index-agent-openai==0.2.7
+llama-index-cli==0.1.12
+llama-index-core==0.10.43
+llama-index-embeddings-openai==0.1.10
+llama-index-indices-managed-llama-cloud==0.1.6
+llama-index-legacy==0.9.48
+llama-index-llms-openai==0.1.22
+llama-index-multi-modal-llms-openai==0.1.6
+llama-index-program-openai==0.1.6
+llama-index-question-gen-openai==0.1.3
+llama-index-readers-file==0.1.23
+llama-index-readers-llama-parse==0.1.4
+llama-index-vector-stores-mongodb==0.1.5
+llama-index-vector-stores-qdrant==0.2.8
+llama-parse==0.4.4
+llamaindex-py-client==0.1.19
+llmsherpa==0.1.4
+load-dotenv==0.1.0
+lockfile==0.12.2
+loguru==0.7.2
+louis==3.20.0
+lxml==5.2.2
+macaroonbakery==1.3.1
+Mako==1.1.3
+marisa-trie==1.2.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mem0ai==0.0.20
+mistune==3.0.2
+mmh3==4.1.0
+monotonic==1.6
+more-itertools==8.10.0
+MouseInfo==0.1.3
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+murmurhash==1.0.10
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+netifaces==0.11.0
+networkx==3.3
+nltk==3.8.1
+nodeenv==1.9.1
+notebook==7.1.3
+notebook_shim==0.2.4
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.46
+onnxruntime==1.19.2
+openai==1.45.0
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-exporter-otlp-proto-http==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-instrumentation-asgi==0.48b0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+opentelemetry-util-http==0.48b0
+orjson==3.10.7
+outcome==1.3.0.post0
+overrides==7.7.0
+packaging==24.0
+pandas==2.2.2
+pandocfilters==1.5.1
+parameterized==0.9.0
+paramiko==2.9.3
+parso==0.8.4
+pathspec==0.12.1
+pdfminer.six==20191110
+pexpect==4.8.0
+pillow==10.3.0
+pipenv==2024.0.0
+platformdirs==4.2.2
+pluggy==1.5.0
+portalocker==2.8.2
+posthog==3.6.3
+preshed==3.0.9
+prometheus_client==0.20.0
+prompt-toolkit==3.0.43
+proto-plus==1.24.0
+protobuf==4.25.3
+psutil==5.9.8
+ptyprocess==0.7.0
+pulsar-client==3.5.0
+pure-eval==0.2.2
+py==1.11.0
+pyarrow==16.1.0
+pyasn1==0.6.0
+pyasn1_modules==0.4.0
+PyAutoGUI==0.9.54
+pybind11==2.13.5
+pycairo==1.20.1
+pycparser==2.22
+pycryptodome==3.20.0
+pycups==2.0.1
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydeck==0.9.1
+PyGetWindow==0.0.9
+Pygments==2.18.0
+PyGObject==3.42.1
+PyJWT==2.9.0
+pylance==0.9.18
+pymacaroons==0.13.0
+pymongo==4.7.3
+Pympler==1.0.1
+PyMsgBox==1.0.9
+PyMuPDF==1.24.10
+PyMuPDFb==1.24.10
+PyNaCl==1.5.0
+pyparsing==2.4.7
+pypdf==4.2.0
+pyperclip==1.9.0
+PyPika==0.48.9
+pyproject_hooks==1.1.0
+PyRect==0.2.0
+pyRFC3339==1.1
+pyright==1.1.379
+pysbd==0.3.4
+PyScreeze==0.1.30
+PySocks==1.7.1
+pytest==8.3.2
+python-apt==2.4.0+ubuntu3
+python-dateutil==2.9.0.post0
+python-debian==0.1.43+ubuntu1.1
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-pptx==0.6.23
+python3-xlib==0.15
+pytube==15.0.0
+pytweening==1.2.0
+pytz==2024.1
+pyxdg==0.27
+PyYAML==6.0.1
+pyzmq==26.0.3
+qdrant-client==1.9.1
+qtconsole==5.5.2
+QtPy==2.4.1
+ratelimiter==1.2.0.post0
+referencing==0.35.1
+regex==2024.7.24
+reportlab==3.6.8
+requests==2.32.3
+requests-oauthlib==2.0.0
+retry==0.9.2
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.7.1
+rpds-py==0.18.1
+rsa==4.9
+s3transfer==0.10.2
+schema==0.7.7
+scikit-learn==1.0.2
+scipy==1.13.1
+SecretStorage==3.3.1
+selenium==4.23.1
+semver==3.0.2
+Send2Trash==1.8.3
+sgmllib3k==1.0.0
+shapely==2.0.6
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.4
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.5
+spacy==3.7.5
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+SpeechRecognition==3.8.1
+SQLAlchemy==2.0.34
+srsly==2.4.8
+stack-data==0.6.3
+starlette==0.38.5
+streamlit==1.35.0
+striprtf==0.0.26
+structlog==24.1.0
+sympy==1.13.2
+systemd-python==234
+tabulate==0.9.0
+tenacity==8.3.0
+termcolor==2.4.0
+terminado==0.18.1
+textract==1.6.5
+thinc==8.2.4
+threadpoolctl==3.5.0
+tiktoken==0.7.0
+tinycss2==1.3.0
+tokenizers==0.20.0
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+tornado==6.4
+tqdm==4.66.4
+traitlets==5.14.3
+trio==0.26.2
+trio-websocket==0.11.1
+trubrics==1.3.6
+typeguard==2.13.3
+typer==0.12.3
+types-python-dateutil==2.9.0.20240316
+types-requests==2.32.0.20240907
+typing-inspect==0.9.0
+typing_extensions==4.12.1
+tzdata==2024.1
+tzlocal==5.2
+ubuntu-drivers-common==0.0.0
+ubuntu-pro-client==8001
+ufw==0.36.1
+unattended-upgrades==0.1
+uri-template==1.3.0
+urllib3==2.2.1
+usb-creator==0.3.7
+uvicorn==0.29.0
+uvloop==0.19.0
+validators==0.28.3
+virtualenv==20.26.1
+wadllib==1.3.6
+wasabi==1.1.3
+watchdog==4.0.1
+watchfiles==0.21.0
+wcwidth==0.2.13
+weasel==0.4.1
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.8.0
+websockets==12.0
+Werkzeug==3.0.4
+widgetsnbextension==4.0.10
+wrapt==1.16.0
+wsproto==1.2.0
+xdg==5
+xkit==0.0.0
+xlrd==1.2.0
+XlsxWriter==3.2.0
+xxhash==3.5.0
+yarl==1.9.4
+zipp==3.19.2

vectorizer_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f8ad0016f36e0a63823957b2d7b4184cc1bdfe1e6b0f7b67af25f2adfbf1e40
+size 143174