File size: 4,241 Bytes
682a2d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
from transformers import pipeline
import json

# --- Page Configuration ---
# Sets the browser-tab title and switches to the full-width ("wide") layout.
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")

@st.cache_resource
def load_models():
    """
    Load and cache the two Hugging Face pipelines the app relies on.

    Returns:
        tuple: (ner_pipeline, zero_shot_pipeline) on success, or
        (None, None) if either model fails to load.
    """
    try:
        # Pipeline 1: token classification (NER) — surfaces candidate terms.
        st.info("Loading Entity Extraction Model...")
        entity_extractor = pipeline(
            "token-classification",
            model="jjzha/jobbert-base-cased-v2",
            aggregation_strategy="simple",  # merge sub-word tokens into whole terms
        )

        # Pipeline 2: zero-shot classification — labels each candidate term.
        st.info("Loading Zero-Shot Classification Model...")
        zero_shot = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",  # compact model, faster inference
        )
    except Exception as e:
        st.error(f"Error loading models. Check your requirements.txt. Details: {e}")
        return None, None
    return entity_extractor, zero_shot

def process_text(text, ner_pipe, classifier_pipe):
    """
    Extract candidate terms from *text* and sort each into SKILL or KNOWLEDGE.

    Args:
        text: Raw job-description text; may be None, empty, or whitespace-only.
        ner_pipe: Callable token-classification pipeline; returns an iterable
            of dicts each carrying a 'word' key.
        classifier_pipe: Callable zero-shot pipeline; takes a string plus a
            ``candidate_labels`` keyword and returns a dict whose 'labels'
            list is ranked best-first.

    Returns:
        dict: {"SKILL": [...], "KNOWLEDGE": [...]} with each list sorted
        and de-duplicated.
    """
    # Robustness fix: whitespace-only input previously slipped past the
    # `if not text` guard and ran both models for nothing.
    if not text or not text.strip():
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 1: extract entity candidates. Keep multi-word phrases, or single
    # words longer than 2 chars, to drop noisy sub-word fragments.
    candidates = set()
    for entity in ner_pipe(text):
        word = entity['word'].strip()
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)

    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 2: zero-shot classify each candidate into one of two buckets.
    classification_labels = ["software tool or technology", "concept or knowledge"]
    skills = []
    knowledge = []
    # Sorted iteration makes the classification call order deterministic.
    for candidate in sorted(candidates):
        try:
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result['labels'][0]
        except Exception:
            # Best-effort fallback: an unclassifiable term defaults to KNOWLEDGE.
            knowledge.append(candidate)
            continue
        if top_label == "software tool or technology":
            skills.append(candidate)
        else:
            knowledge.append(candidate)

    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }

# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")
st.markdown("Paste a job description below to extract and categorize entities.")

# 1. Load models once; @st.cache_resource keeps them across reruns.
ner_pipe, classifier_pipe = load_models()

if ner_pipe and classifier_pipe:
    # 2. Text input for the job description.
    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here (e.g., 'We require a Python developer proficient in FastAPI and experienced with Kafka and RAG systems...')"
    )

    # 3. Run extraction + classification on demand.
    if st.button("Analyze and Extract Entities", type="primary"):
        if not job_description.strip():
            st.warning("Please enter a job description first.")
        else:
            with st.spinner("Analyzing text and running classification..."):
                extracted = process_text(job_description, ner_pipe, classifier_pipe)

                # Show the result inline as interactive JSON.
                st.subheader("Extraction Output (JSON)")
                st.json(extracted)

                # Offer the same result as a downloadable file.
                st.download_button(
                    label="Download JSON Output",
                    data=json.dumps(extracted, indent=2),
                    file_name="extracted_entities.json",
                    mime="application/json"
                )