File size: 4,858 Bytes
682a2d1
 
 
dfff558
682a2d1
 
 
 
 
 
dfff558
682a2d1
dfff558
cf174b3
dfff558
cf174b3
 
 
 
dfff558
 
cf174b3
 
682a2d1
 
 
cf174b3
 
dfff558
682a2d1
 
 
 
 
 
dfff558
682a2d1
 
 
 
dfff558
6f4dbd2
682a2d1
 
 
 
 
dfff558
 
 
cf174b3
 
682a2d1
dfff558
682a2d1
 
 
dfff558
 
 
 
 
 
 
 
 
 
682a2d1
 
 
 
dfff558
682a2d1
 
 
 
dfff558
 
 
682a2d1
 
 
 
 
 
 
 
 
 
 
dfff558
cf174b3
dfff558
682a2d1
 
 
dfff558
682a2d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04e4783
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import streamlit as st
from transformers import pipeline
import json
import os

# --- Page Configuration ---
# Sets the browser-tab title and switches to the full-width layout.
# NOTE(review): Streamlit requires set_page_config to run before any other
# st.* call in the script — keep this at the top.
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")

@st.cache_resource
def load_models():
    """Build and cache the two Hugging Face pipelines this app relies on.

    ``st.cache_resource`` ensures the (expensive) model downloads happen only
    once per server process, not on every Streamlit rerun.

    Returns:
        A ``(ner_pipeline, zero_shot_pipeline)`` pair on success, or
        ``(None, None)`` if either pipeline fails to load.
    """
    try:
        st.info("Loading AI Models (Hugging Face local models)... This may take a moment.")

        # Stage 1: token-classification model that surfaces candidate terms
        # from raw job-description text.
        entity_finder = pipeline("token-classification",
                                 model="jjzha/jobbert-base-cased",
                                 aggregation_strategy="simple")

        # Stage 2: zero-shot classifier that buckets each candidate term.
        category_scorer = pipeline("zero-shot-classification",
                                   model="valhalla/distilbart-mnli-12-1")
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app; common
        # causes are missing packages, no network access, or low memory.
        st.error(f"FATAL: Error loading models. Ensure 'transformers', 'accelerate', 'streamlit', and 'torch' are installed. Details: {e}")
        return None, None

    return entity_finder, category_scorer

def process_text(text, ner_pipe, classifier_pipe):
    """Extract candidate terms from *text* and split them into skills vs knowledge.

    Two-stage pipeline: (1) the NER pipe proposes candidate terms, (2) a small
    heuristic override list and the zero-shot pipe assign each term to either
    the SKILL or KNOWLEDGE bucket.

    Args:
        text: Raw job-description text; falsy input yields empty buckets.
        ner_pipe: Callable; ``ner_pipe(text)`` must yield dicts containing a
            ``'word'`` key (Hugging Face token-classification output shape).
        classifier_pipe: Callable; ``classifier_pipe(term, candidate_labels=...)``
            must return a dict whose ``'labels'`` list is ordered best-first.

    Returns:
        ``{"SKILL": [...], "KNOWLEDGE": [...]}`` with each list sorted and
        de-duplicated.
    """
    if not text:
        return {"SKILL": [], "KNOWLEDGE": []}

    # 1. Extract candidates via NER. Keep multi-word entities, or single
    #    tokens longer than two characters (drops stray sub-word fragments).
    candidates = set()
    for entity in ner_pipe(text):
        word = entity['word'].strip()
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)
    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    # --- THESIS ENHANCEMENT: Heuristic Post-Processing Overrides ---
    # Hard-coded corrections for terms the zero-shot classifier is known to
    # mislabel. Frozensets give O(1) membership tests in the loop below.
    SKILL_OVERRIDES = frozenset(["RAG", "function calling", "LoRA", "CI/CD pipelines", "DeepEval", "RAGAS", "Azure", "AWS"])
    KNOWLEDGE_OVERRIDES = frozenset(["clean code practices", "English fluency", "async code", "team leadership", "agile methodologies"])

    skills, knowledge = [], []
    classification_labels = ["software tool or technology", "concept or knowledge"]

    for candidate in candidates:
        # Overrides take precedence over the model's opinion (skill list first,
        # mirroring the original priority order).
        if candidate in SKILL_OVERRIDES:
            skills.append(candidate)
            continue
        if candidate in KNOWLEDGE_OVERRIDES:
            knowledge.append(candidate)
            continue

        # 2. Zero-shot classification; 'labels' comes back sorted by score.
        try:
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result['labels'][0]
        except Exception:
            # Best-effort fallback: an unclassifiable term defaults to KNOWLEDGE
            # rather than aborting the whole extraction.
            knowledge.append(candidate)
            continue

        if top_label == "software tool or technology":
            skills.append(candidate)
        else:
            knowledge.append(candidate)

    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }

# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")
ner_model, zero_shot_model = load_models()

# Render the main UI only when both pipelines loaded successfully;
# otherwise the st.error from load_models() is all the user sees.
if ner_model and zero_shot_model:
    st.markdown("""
    ***Methodology:*** *This application uses a two-stage NLP pipeline: 1) The `jjzha/jobbert-base-cased` NER model to identify relevant terms, followed by 2) The `valhalla/distilbart-mnli-12-1` Zero-Shot Classifier to categorize them as 'SKILL' or 'KNOWLEDGE'. A heuristic post-processing layer ensures high precision for key technical terms.*
    """)
    jd_input = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here..."
    )

    if st.button("Analyze and Extract Entities", type="primary"):
        if not jd_input.strip():
            # Guard: nothing to analyze.
            st.warning("Please paste a job description into the text area.")
        else:
            with st.spinner("Analyzing text and running classification..."):
                result = process_text(jd_input, ner_model, zero_shot_model)

                st.subheader("Extraction Output (JSON)")
                st.json(result)

                st.download_button(
                    label="Download JSON Output",
                    data=json.dumps(result, indent=2),
                    file_name="extracted_entities.json",
                    mime="application/json"
                )