Spaces:
Running
Running
| import streamlit as st | |
| from transformers import pipeline | |
| import json | |
| import os | |
# --- Page Configuration ---
# Set the browser-tab title and use the wide (full-width) page layout.
st.set_page_config(
    layout="wide",
    page_title="Skill vs Knowledge Extractor",
)
def _build_pipelines():
    """Construct the two Hugging Face pipelines used by the app.

    Returns:
        A ``(ner_pipe, classifier_pipe)`` tuple:

        * ``ner_pipe`` -- token-classification pipeline for
          ``jjzha/jobbert-base-cased`` with ``aggregation_strategy="simple"``;
          it finds the candidate terms.
        * ``classifier_pipe`` -- zero-shot-classification pipeline for
          ``valhalla/distilbart-mnli-12-1``; it categorizes the terms.

    Any exception raised while building either pipeline propagates to the
    caller.
    """
    # Model 1: Named Entity Recognition for finding candidate terms.
    ner_pipe = pipeline(
        "token-classification",
        model="jjzha/jobbert-base-cased",
        aggregation_strategy="simple",
    )
    # Model 2: Zero-Shot Classification for categorizing terms.
    classifier_pipe = pipeline(
        "zero-shot-classification",
        model="valhalla/distilbart-mnli-12-1",
    )
    return ner_pipe, classifier_pipe


def load_models():
    """Return the NER and zero-shot pipelines, building them at most once.

    Streamlit re-executes the whole script on every widget interaction, so
    the pipelines are obtained through ``st.cache_resource``: they are built
    on the first call and reused on later reruns instead of being reloaded
    each time.

    Returns:
        ``(ner_pipe, classifier_pipe)`` on success, or ``(None, None)`` if
        loading failed (an error message is shown in the page in that case).
    """
    try:
        st.info("Loading AI Models (Hugging Face local models)... This may take a moment.")
        # The cache wrapper is applied here rather than as a decorator so the
        # plain builder stays directly callable. The try/except sits outside
        # the cached call, so a failed load is never stored and the next
        # rerun retries it.
        cached_builder = st.cache_resource(_build_pipelines)
        return cached_builder()
    except Exception as e:
        # Note: If the error persists, check your internet connection and ensure
        # your device has enough memory to download these large models.
        st.error(f"FATAL: Error loading models. Ensure 'transformers', 'accelerate', 'streamlit', and 'torch' are installed. Details: {e}")
        return None, None
# --- THESIS ENHANCEMENT: Heuristic Post-Processing Overrides ---
# Hand-curated terms whose category is fixed, used to correct the known (and
# often variable) biases of the zero-shot classifier for specific technical
# terms. Stored case-folded so that lookups ignore the casing used in the
# job description.
_SKILL_OVERRIDES = frozenset(
    term.casefold()
    for term in (
        "RAG",
        "function calling",
        "LoRA",
        "CI/CD pipelines",
        "DeepEval",
        "RAGAS",
        "Azure",
        "AWS",
    )
)
_KNOWLEDGE_OVERRIDES = frozenset(
    term.casefold()
    for term in (
        "clean code practices",
        "English fluency",
        "async code",
        "team leadership",
        "agile methodologies",
    )
)

# Candidate labels handed to the zero-shot classifier.
_SKILL_LABEL = "software tool or technology"
_KNOWLEDGE_LABEL = "concept or knowledge"


def process_text(text, ner_pipe, classifier_pipe):
    """Extract terms from *text* and sort them into skills and knowledge.

    Args:
        text: The job description to analyze. Empty, ``None`` or
            whitespace-only input yields an empty result without calling
            either model.
        ner_pipe: Callable invoked as ``ner_pipe(text)``; must return an
            iterable of mappings that each have a ``"word"`` entry.
        classifier_pipe: Callable invoked as
            ``classifier_pipe(term, candidate_labels=[...])``; must return a
            mapping whose ``"labels"`` list has the top label first.

    Returns:
        ``{"SKILL": [...], "KNOWLEDGE": [...]}`` where each list holds unique
        terms in sorted order.
    """
    if not text or not text.strip():
        return {"SKILL": [], "KNOWLEDGE": []}

    # 1. Extract candidates (NER model). Keep multi-word terms and single
    #    words longer than two characters; drop anything shorter.
    candidates = set()
    for entity in ner_pipe(text):
        word = entity["word"].strip()
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)
    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    skills, knowledge = set(), set()
    labels = [_SKILL_LABEL, _KNOWLEDGE_LABEL]

    for candidate in candidates:
        # Overrides take priority over the classifier; the match is
        # case-insensitive so "aws" and "AWS" are treated alike.
        folded = candidate.casefold()
        if folded in _SKILL_OVERRIDES:
            skills.add(candidate)
            continue
        if folded in _KNOWLEDGE_OVERRIDES:
            knowledge.add(candidate)
            continue

        # 2. Classify (zero-shot model).
        try:
            result = classifier_pipe(candidate, candidate_labels=labels)
            top_label = result["labels"][0]
        except Exception:
            # Deliberate best-effort fallback: a term the classifier cannot
            # handle (error or empty result) is kept as knowledge rather
            # than aborting the whole analysis.
            knowledge.add(candidate)
            continue

        if top_label == _SKILL_LABEL:
            skills.add(candidate)
        else:
            knowledge.add(candidate)

    return {
        "SKILL": sorted(skills),
        "KNOWLEDGE": sorted(knowledge),
    }
# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")

# Both pipelines are needed; when either is missing, nothing below the title
# is rendered.
ner_pipe, classifier_pipe = load_models()

if ner_pipe and classifier_pipe:
    st.markdown("""
***Methodology:*** *This application uses a two-stage NLP pipeline: 1) The `jjzha/jobbert-base-cased` NER model to identify relevant terms, followed by 2) The `valhalla/distilbart-mnli-12-1` Zero-Shot Classifier to categorize them as 'SKILL' or 'KNOWLEDGE'. A heuristic post-processing layer ensures high precision for key technical terms.*
""")

    job_description = st.text_area(
        "Job Description Text",
        placeholder="Paste a job description here...",
        height=300,
    )

    analyze_clicked = st.button("Analyze and Extract Entities", type="primary")
    if analyze_clicked:
        if not job_description.strip():
            # Nothing to analyze: prompt the user instead of running the models.
            st.warning("Please paste a job description into the text area.")
        else:
            with st.spinner("Analyzing text and running classification..."):
                output = process_text(job_description, ner_pipe, classifier_pipe)

            # Show the result on the page and offer the same JSON as a download.
            st.subheader("Extraction Output (JSON)")
            st.json(output)
            st.download_button(
                label="Download JSON Output",
                data=json.dumps(output, indent=2),
                file_name="extracted_entities.json",
                mime="application/json",
            )