import json

import streamlit as st
from transformers import pipeline

# --- Page Configuration ---
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")


@st.cache_resource
def load_models():
    """
    Loads two Hugging Face models:
    1. NER Model: To extract potential technical terms (candidates).
    2. Zero-Shot Classifier: To categorize each term as SKILL or KNOWLEDGE.

    Returns:
        tuple: (ner_pipeline, zero_shot_pipeline), or (None, None) if loading fails.
    """
    try:
        # 1. Entity Extraction Model (NER - finds the terms)
        st.info("Loading Entity Extraction Model...")
        ner_pipe = pipeline(
            "token-classification",
            model="jjzha/jobbert-base-cased-v2",
            aggregation_strategy="simple",  # Merges sub-word tokens
        )

        # 2. Zero-Shot Classification Model (Categorizes the terms)
        st.info("Loading Zero-Shot Classification Model...")
        classifier_pipe = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",  # Smaller, faster classification model
        )
        return ner_pipe, classifier_pipe
    except Exception as e:
        st.error(f"Error loading models. Check your requirements.txt. Details: {e}")
        return None, None


def process_text(text, ner_pipe, classifier_pipe):
    """
    Runs the extraction and classification pipeline.

    Args:
        text: Raw job-description text to analyze.
        ner_pipe: Token-classification pipeline used to find candidate terms.
        classifier_pipe: Zero-shot pipeline used to label each term.

    Returns:
        dict: {"SKILL": [...], "KNOWLEDGE": [...]} with de-duplicated,
        alphabetically sorted terms. Both lists are empty for empty input.
    """
    if not text:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 1: Extract Entities (Candidates)
    ner_results = ner_pipe(text)

    # Filter and clean extracted words, removing very short, possibly meaningless terms
    candidates = set()
    for entity in ner_results:
        word = entity['word'].strip()
        # Keep multi-word phrases or single words longer than 2 chars
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)
    candidates = list(candidates)

    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 2: Classify each entity as SKILL or KNOWLEDGE using Zero-Shot
    skills = []
    knowledge = []
    # These are the labels the Zero-Shot model will use for classification
    classification_labels = ["software tool or technology", "concept or knowledge"]

    for candidate in candidates:
        try:
            # Classify the term; the top-scoring label decides the bucket
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result['labels'][0]

            # Append to the correct list
            if top_label == "software tool or technology":
                skills.append(candidate)
            else:
                knowledge.append(candidate)
        except Exception:
            # Fallback for classification errors: best-effort bucket, don't abort the run
            knowledge.append(candidate)

    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }


# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")
st.markdown("Paste a job description below to extract and categorize entities.")

# 1. Load Models (Cached)
ner_pipe, classifier_pipe = load_models()

if ner_pipe and classifier_pipe:
    # 2. Input Area
    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here (e.g., 'We require a Python developer proficient in FastAPI and experienced with Kafka and RAG systems...')"
    )

    # 3. Process Button
    if st.button("Analyze and Extract Entities", type="primary"):
        if job_description.strip():
            with st.spinner("Analyzing text and running classification..."):
                output = process_text(job_description, ner_pipe, classifier_pipe)

            # Display Result
            st.subheader("Extraction Output (JSON)")
            st.json(output)

            # Option to download
            json_str = json.dumps(output, indent=2)
            st.download_button(
                label="Download JSON Output",
                data=json_str,
                file_name="extracted_entities.json",
                mime="application/json",
            )
        else:
            st.warning("Please enter a job description first.")