import streamlit as st
from transformers import pipeline
import json
import os

# --- Page Configuration ---
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")

# --- THESIS ENHANCEMENT: Heuristic Post-Processing Overrides ---
# These sets are used to correct the known (and often variable) biases of the
# zero-shot classifier for specific technical terms. Hoisted to module level
# (and made frozensets) so they are built once and membership tests are O(1),
# instead of rebuilding two lists on every call to process_text().
SKILL_OVERRIDES = frozenset([
    "RAG", "function calling", "LoRA", "CI/CD pipelines",
    "DeepEval", "RAGAS", "Azure", "AWS",
])
KNOWLEDGE_OVERRIDES = frozenset([
    "clean code practices", "English fluency", "async code",
    "team leadership", "agile methodologies",
])


@st.cache_resource
def load_models():
    """Load and cache the two Hugging Face pipelines used by the app.

    Model 1 (NER) finds candidate terms; Model 2 (zero-shot) categorizes them.
    `@st.cache_resource` ensures the (large) models are downloaded/loaded only
    once per Streamlit session, not on every rerun.

    Returns:
        tuple: ``(ner_pipe, classifier_pipe)`` on success, or ``(None, None)``
        if loading fails (the error is shown in the UI via ``st.error``).
    """
    try:
        st.info("Loading AI Models (Hugging Face local models)... This may take a moment.")
        # Model 1: Named Entity Recognition for finding candidate terms
        # CORRECTED MODEL ID: "jjzha/jobbert-base-cased"
        ner_pipe = pipeline(
            "token-classification",
            model="jjzha/jobbert-base-cased",
            aggregation_strategy="simple",
        )
        # Model 2: Zero-Shot Classification for categorizing terms
        classifier_pipe = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",
        )
        return ner_pipe, classifier_pipe
    except Exception as e:
        # Note: If the error persists, check your internet connection and ensure
        # your device has enough memory to download these large models.
        st.error(f"FATAL: Error loading models. Ensure 'transformers', 'accelerate', 'streamlit', and 'torch' are installed. Details: {e}")
        return None, None


def process_text(text, ner_pipe, classifier_pipe):
    """Extract terms from *text* and split them into SKILL vs KNOWLEDGE.

    Two-stage pipeline with a heuristic override layer:
      1. The NER pipeline proposes candidate terms.
      2. Candidates found in SKILL_OVERRIDES / KNOWLEDGE_OVERRIDES are
         assigned directly (highest priority for accuracy).
      3. Remaining candidates are categorized by the zero-shot classifier.

    Args:
        text: Job-description text to analyze (may be empty or ``None``).
        ner_pipe: Token-classification pipeline yielding entity dicts.
        classifier_pipe: Zero-shot classification pipeline.

    Returns:
        dict: ``{"SKILL": [...], "KNOWLEDGE": [...]}`` with sorted,
        de-duplicated term lists.
    """
    if not text:
        return {"SKILL": [], "KNOWLEDGE": []}

    # 1. Extract Candidates (Using NER Model); a set de-duplicates them.
    candidates = set()
    for entity in ner_pipe(text):
        word = entity['word'].strip()
        # Filter out short or single-character entities
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)

    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    skills, knowledge = [], []
    classification_labels = ["software tool or technology", "concept or knowledge"]

    for candidate in candidates:
        # Check Overrides First (Highest priority for accuracy)
        if candidate in SKILL_OVERRIDES:
            skills.append(candidate)
            continue
        if candidate in KNOWLEDGE_OVERRIDES:
            knowledge.append(candidate)
            continue

        # 2. Classify (Zero-Shot Model)
        try:
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result['labels'][0]  # labels are sorted by score, best first
            # The zero-shot model determines the category
            if top_label == "software tool or technology":
                skills.append(candidate)
            else:
                knowledge.append(candidate)
        except Exception:
            # Fallback for errors or empty results: default to KNOWLEDGE so
            # the term is never silently dropped from the output.
            knowledge.append(candidate)

    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }


# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")

ner_pipe, classifier_pipe = load_models()

# Only render the analyzer UI when both models loaded successfully;
# load_models() has already shown an error message otherwise.
if ner_pipe and classifier_pipe:
    st.markdown("""
    ***Methodology:*** *This application uses a two-stage NLP pipeline: 1) The `jjzha/jobbert-base-cased` NER model to identify relevant terms, followed by 2) The `valhalla/distilbart-mnli-12-1` Zero-Shot Classifier to categorize them as 'SKILL' or 'KNOWLEDGE'. A heuristic post-processing layer ensures high precision for key technical terms.*
    """)

    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here...",
    )

    if st.button("Analyze and Extract Entities", type="primary"):
        if job_description.strip():
            with st.spinner("Analyzing text and running classification..."):
                output = process_text(job_description, ner_pipe, classifier_pipe)

            st.subheader("Extraction Output (JSON)")
            st.json(output)

            # Offer the result as a downloadable JSON file as well.
            json_str = json.dumps(output, indent=2)
            st.download_button(
                label="Download JSON Output",
                data=json_str,
                file_name="extracted_entities.json",
                mime="application/json",
            )
        else:
            st.warning("Please paste a job description into the text area.")