Spaces:
Running
Running
File size: 4,858 Bytes
682a2d1 dfff558 682a2d1 dfff558 682a2d1 dfff558 cf174b3 dfff558 cf174b3 dfff558 cf174b3 682a2d1 cf174b3 dfff558 682a2d1 dfff558 682a2d1 dfff558 6f4dbd2 682a2d1 dfff558 cf174b3 682a2d1 dfff558 682a2d1 dfff558 682a2d1 dfff558 682a2d1 dfff558 682a2d1 dfff558 cf174b3 dfff558 682a2d1 dfff558 682a2d1 04e4783 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import streamlit as st
from transformers import pipeline
import json
import os
# --- Page Configuration ---
# Sets the browser-tab title and a wide page layout for the app.
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")
@st.cache_resource
def load_models():
    """Build and cache the two Hugging Face pipelines used by the app.

    Returns:
        (ner_pipe, classifier_pipe) on success, or (None, None) when
        either model fails to load (an error is shown in the UI).
    """
    try:
        st.info("Loading AI Models (Hugging Face local models)... This may take a moment.")
        # Stage 1: token-level NER tuned for job postings; surfaces the
        # candidate terms that will later be categorized.
        extractor = pipeline(
            "token-classification",
            model="jjzha/jobbert-base-cased",
            aggregation_strategy="simple",
        )
        # Stage 2: zero-shot classifier used to bucket each candidate term.
        categorizer = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",
        )
    except Exception as e:
        # Likely causes: missing packages, no network access, or not enough
        # memory to download/hold these large models.
        st.error(f"FATAL: Error loading models. Ensure 'transformers', 'accelerate', 'streamlit', and 'torch' are installed. Details: {e}")
        return None, None
    return extractor, categorizer
def process_text(text, ner_pipe, classifier_pipe):
    """Extract and categorize skill/knowledge terms from job-description text.

    Two-stage pipeline: the NER pipe proposes candidate terms, then each
    candidate is bucketed into SKILL or KNOWLEDGE — first by a manual
    override table (highest precision), otherwise by the zero-shot
    classifier.

    Args:
        text: Raw job-description text; may be None, empty, or whitespace.
        ner_pipe: Callable returning token-classification entities, each a
            mapping with a 'word' key.
        classifier_pipe: Zero-shot classifier callable accepting
            (sequence, candidate_labels=...) and returning a mapping whose
            'labels' list is ranked best-first.

    Returns:
        {"SKILL": [...], "KNOWLEDGE": [...]} with sorted, de-duplicated
        term lists.
    """
    # Whitespace-only input carries no candidates; skip the pipelines.
    if not text or not text.strip():
        return {"SKILL": [], "KNOWLEDGE": []}

    # 1. Extract candidate terms with the NER model.
    candidates = set()
    for entity in ner_pipe(text):
        word = entity['word'].strip()
        # Keep multi-word phrases, or single tokens longer than 2 chars,
        # to discard sub-word fragments and stray characters.
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)
    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    # --- THESIS ENHANCEMENT: Heuristic Post-Processing Overrides ---
    # These sets correct the known (and often variable) biases of the
    # zero-shot classifier for specific technical terms (O(1) membership).
    skill_overrides = {"RAG", "function calling", "LoRA", "CI/CD pipelines",
                       "DeepEval", "RAGAS", "Azure", "AWS"}
    knowledge_overrides = {"clean code practices", "English fluency",
                           "async code", "team leadership",
                           "agile methodologies"}

    skills, knowledge = [], []
    classification_labels = ["software tool or technology", "concept or knowledge"]
    for candidate in candidates:
        # Overrides take priority over the model's opinion.
        if candidate in skill_overrides:
            skills.append(candidate)
            continue
        if candidate in knowledge_overrides:
            knowledge.append(candidate)
            continue
        # 2. Categorize with the zero-shot model; only the calls that can
        # raise are inside the try.
        try:
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result['labels'][0]
        except Exception:
            # Best-effort fallback: on any classifier failure, file the
            # term under KNOWLEDGE rather than aborting the analysis.
            knowledge.append(candidate)
            continue
        if top_label == "software tool or technology":
            skills.append(candidate)
        else:
            knowledge.append(candidate)

    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }
# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")

# Both pipes are None when model loading failed (load_models shows the
# error), in which case the rest of the UI is not rendered.
ner_pipe, classifier_pipe = load_models()

if ner_pipe and classifier_pipe:
    # Methodology blurb shown above the input area.
    st.markdown("""
***Methodology:*** *This application uses a two-stage NLP pipeline: 1) The `jjzha/jobbert-base-cased` NER model to identify relevant terms, followed by 2) The `valhalla/distilbart-mnli-12-1` Zero-Shot Classifier to categorize them as 'SKILL' or 'KNOWLEDGE'. A heuristic post-processing layer ensures high precision for key technical terms.*
""")
    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here..."
    )
    if st.button("Analyze and Extract Entities", type="primary"):
        if job_description.strip():
            with st.spinner("Analyzing text and running classification..."):
                output = process_text(job_description, ner_pipe, classifier_pipe)
            # Render the result and offer it as a JSON download.
            st.subheader("Extraction Output (JSON)")
            st.json(output)
            json_str = json.dumps(output, indent=2)
            st.download_button(
                label="Download JSON Output",
                data=json_str,
                file_name="extracted_entities.json",
                mime="application/json"
            )
        else:
            st.warning("Please paste a job description into the text area.")