|
|
import streamlit as st |
|
|
from transformers import pipeline |
|
|
import json |
|
|
import os |
|
|
|
|
|
|
|
|
# Configure the browser-tab title and use the full page width for the app.
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")
|
|
|
|
|
@st.cache_resource
def load_models():
    """Load and cache the two Hugging Face pipelines used by the app.

    Cached via ``st.cache_resource`` so the models are loaded once per
    Streamlit session, not on every rerun.

    Returns:
        tuple: ``(ner_pipeline, zero_shot_pipeline)`` on success, or
        ``(None, None)`` if loading fails (the error is shown in the UI).
    """
    try:
        st.info("Loading AI Models (Hugging Face local models)... This may take a moment.")

        # Stage 1: token-level NER model tuned for job postings.
        token_tagger = pipeline(
            "token-classification",
            model="jjzha/jobbert-base-cased",
            aggregation_strategy="simple",
        )

        # Stage 2: zero-shot classifier used to bucket extracted terms.
        zero_shot = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",
        )

        return token_tagger, zero_shot
    except Exception as e:
        # Boundary handler: surface the failure in the UI instead of crashing.
        st.error(f"FATAL: Error loading models. Ensure 'transformers', 'accelerate', 'streamlit', and 'torch' are installed. Details: {e}")
        return None, None
|
|
|
|
|
# Heuristic overrides for terms the zero-shot classifier frequently mislabels.
# Checked by exact, case-sensitive match before any model call, so these
# classifications are deterministic. frozensets give O(1) membership tests
# and are built once at import time instead of on every call.
_SKILL_OVERRIDES = frozenset([
    "RAG", "function calling", "LoRA", "CI/CD pipelines",
    "DeepEval", "RAGAS", "Azure", "AWS",
])
_KNOWLEDGE_OVERRIDES = frozenset([
    "clean code practices", "English fluency", "async code",
    "team leadership", "agile methodologies",
])

# Labels offered to the zero-shot classifier; the first one maps to SKILL.
_CLASSIFICATION_LABELS = ["software tool or technology", "concept or knowledge"]


def process_text(text, ner_pipe, classifier_pipe):
    """Extract SKILL and KNOWLEDGE entities from a job-description text.

    Two-stage pipeline: 1) the NER pipeline proposes candidate terms;
    2) each candidate is routed through the override sets and, failing
    those, the zero-shot classifier to bucket it as SKILL or KNOWLEDGE.

    Args:
        text: Job-description text. Empty/None yields empty buckets.
        ner_pipe: Token-classification pipeline; called as ``ner_pipe(text)``
            and expected to return dicts with a ``'word'`` key.
        classifier_pipe: Zero-shot classification pipeline; called with
            ``candidate_labels`` and expected to return a dict whose
            ``'labels'`` list is ordered best-first.

    Returns:
        dict: ``{"SKILL": [...], "KNOWLEDGE": [...]}`` with each list
        sorted and de-duplicated.
    """
    if not text:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Collect unique candidate terms, dropping short single tokens
    # (<= 2 chars) that are usually tokenizer noise.
    candidates = set()
    for entity in ner_pipe(text):
        word = entity['word'].strip()
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)

    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    skills, knowledge = [], []
    for candidate in candidates:
        # Exact-match overrides take precedence over the model.
        if candidate in _SKILL_OVERRIDES:
            skills.append(candidate)
            continue
        if candidate in _KNOWLEDGE_OVERRIDES:
            knowledge.append(candidate)
            continue

        # Keep the try body to just the call that can raise.
        try:
            result = classifier_pipe(candidate, candidate_labels=_CLASSIFICATION_LABELS)
            top_label = result['labels'][0]
        except Exception:
            # Best-effort: on classifier failure, default the candidate to
            # KNOWLEDGE rather than dropping it.
            knowledge.append(candidate)
            continue

        if top_label == "software tool or technology":
            skills.append(candidate)
        else:
            knowledge.append(candidate)

    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }
|
|
|
|
|
|
|
|
# --- Main UI script ---------------------------------------------------------
st.title("💡 AI Job Description Analyzer")

# Load (or fetch from cache) both pipelines; (None, None) signals failure.
ner_pipe, classifier_pipe = load_models()

if ner_pipe and classifier_pipe:
    # Brief methodology note so users understand where results come from.
    st.markdown("""
***Methodology:*** *This application uses a two-stage NLP pipeline: 1) The `jjzha/jobbert-base-cased` NER model to identify relevant terms, followed by 2) The `valhalla/distilbart-mnli-12-1` Zero-Shot Classifier to categorize them as 'SKILL' or 'KNOWLEDGE'. A heuristic post-processing layer ensures high precision for key technical terms.*
""")

    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here...",
    )

    if st.button("Analyze and Extract Entities", type="primary"):
        # Guard clause: reject whitespace-only input before doing any work.
        if not job_description.strip():
            st.warning("Please paste a job description into the text area.")
        else:
            with st.spinner("Analyzing text and running classification..."):
                results = process_text(job_description, ner_pipe, classifier_pipe)

            st.subheader("Extraction Output (JSON)")
            st.json(results)

            # Offer the same result as a downloadable JSON file.
            payload = json.dumps(results, indent=2)
            st.download_button(
                label="Download JSON Output",
                data=payload,
                file_name="extracted_entities.json",
                mime="application/json",
            )