# NOTE(review): the lines below were scraped Hugging Face Spaces page chrome
# ("Spaces: Running") — kept here as a comment so the module stays valid Python.
import json

import streamlit as st
from transformers import pipeline

# --- Page Configuration ---
st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")
@st.cache_resource
def load_models():
    """Load and cache the two Hugging Face pipelines used by the app.

    The ``@st.cache_resource`` decorator is essential: Streamlit re-executes
    the whole script on every user interaction, and without it both models
    would be reloaded on every rerun. (The UI section already labels this
    call "Cached", so the decorator makes the code match that intent.)

    Models:
        1. NER model: extracts candidate technical terms from free text.
        2. Zero-shot classifier: categorizes each term as SKILL or KNOWLEDGE.

    Returns:
        tuple: ``(ner_pipe, classifier_pipe)`` on success, or ``(None, None)``
        if either model fails to load (an error is shown in the UI).
    """
    try:
        # 1. Entity Extraction Model (NER - finds the terms)
        st.info("Loading Entity Extraction Model...")
        ner_pipe = pipeline(
            "token-classification",
            model="jjzha/jobbert-base-cased-v2",
            aggregation_strategy="simple",  # Merges sub-word tokens
        )
        # 2. Zero-Shot Classification Model (Categorizes the terms)
        st.info("Loading Zero-Shot Classification Model...")
        classifier_pipe = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",  # Smaller, faster classification model
        )
        return ner_pipe, classifier_pipe
    except Exception as e:
        # Surface the failure in the UI instead of crashing the script;
        # callers are expected to check for the (None, None) sentinel.
        st.error(f"Error loading models. Check your requirements.txt. Details: {e}")
        return None, None
def process_text(text, ner_pipe, classifier_pipe):
    """Extract candidate terms from *text* and classify each as SKILL or KNOWLEDGE.

    Args:
        text: Raw job-description text; falsy input short-circuits to empty output.
        ner_pipe: Token-classification pipeline. Called as ``ner_pipe(text)`` and
            expected to yield dicts with a ``'word'`` key.
        classifier_pipe: Zero-shot classification pipeline. Called once per
            candidate with ``candidate_labels=...`` and expected to return a
            dict whose ``'labels'`` list is ranked best-first.

    Returns:
        dict: ``{"SKILL": [...], "KNOWLEDGE": [...]}`` with sorted,
        de-duplicated term lists.
    """
    if not text:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 1: extract entity candidates, dropping very short single words that
    # are unlikely to be meaningful (keep multi-word phrases or words > 2 chars).
    candidates = set()
    for entity in ner_pipe(text):
        word = entity["word"].strip()
        if len(word.split()) > 1 or len(word) > 2:
            candidates.add(word)

    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 2: zero-shot classify each candidate into one of two buckets.
    classification_labels = ["software tool or technology", "concept or knowledge"]
    skills = []
    knowledge = []
    for candidate in candidates:
        try:
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result["labels"][0]
            if top_label == "software tool or technology":
                skills.append(candidate)
            else:
                knowledge.append(candidate)
        except Exception:
            # Deliberate best-effort fallback: an unclassifiable term is
            # filed under KNOWLEDGE rather than aborting the whole run.
            knowledge.append(candidate)

    # sorted(set(...)) de-duplicates and gives deterministic ordering.
    return {
        "SKILL": sorted(set(skills)),
        "KNOWLEDGE": sorted(set(knowledge)),
    }
# --- UI Layout ---
st.title("💡 AI Job Description Analyzer")
st.markdown("Paste a job description below to extract and categorize entities.")

# 1. Load Models (Cached)
ner_pipe, classifier_pipe = load_models()

if ner_pipe and classifier_pipe:
    # 2. Input Area
    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder=(
            "Paste a job description here (e.g., 'We require a Python developer "
            "proficient in FastAPI and experienced with Kafka and RAG systems...')"
        ),
    )

    # 3. Process Button
    if st.button("Analyze and Extract Entities", type="primary"):
        if not job_description.strip():
            st.warning("Please enter a job description first.")
        else:
            with st.spinner("Analyzing text and running classification..."):
                output = process_text(job_description, ner_pipe, classifier_pipe)

                # Display Result
                st.subheader("Extraction Output (JSON)")
                st.json(output)

                # Option to download
                st.download_button(
                    label="Download JSON Output",
                    data=json.dumps(output, indent=2),
                    file_name="extracted_entities.json",
                    mime="application/json",
                )