NotRev committed on
Commit
682a2d1
·
verified ·
1 Parent(s): d5d887f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +117 -76
src/streamlit_app.py CHANGED
@@ -1,76 +1,117 @@
1
- import streamlit as st
2
- from transformers import pipeline
3
- import json
4
-
5
- # Page Config
6
- st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")
7
-
8
- @st.cache_resource
9
- def load_models():
10
- # 1. Load the Entity Extraction Model (Finds the terms)
11
- ner_pipe = pipeline("token-classification", model="jjzha/jobbert-base-cased-v2", aggregation_strategy="simple")
12
-
13
- # 2. Load the Zero-Shot Classification Model (Categorizes them)
14
- # We use a smaller, faster model for speed
15
- classifier_pipe = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1")
16
-
17
- return ner_pipe, classifier_pipe
18
-
19
- def process_text(text, ner_pipe, classifier_pipe):
20
- if not text:
21
- return {"SKILL": [], "KNOWLEDGE": []}
22
-
23
- # Step 1: Extract Entities (Candidates)
24
- ner_results = ner_pipe(text)
25
-
26
- # Filter and clean extracted words
27
- candidates = list(set([entity['word'].strip() for entity in ner_results if len(entity['word']) > 2]))
28
-
29
- if not candidates:
30
- return {"SKILL": [], "KNOWLEDGE": []}
31
-
32
- # Step 2: Classify each entity as SKILL or KNOWLEDGE
33
- skills = []
34
- knowledge = []
35
-
36
- # We classify the terms against these labels
37
- labels = ["software tool or technology", "concept or knowledge"]
38
-
39
- # Batch classification can be slow, so we do it simply here
40
- for candidate in candidates:
41
- # Ask the AI: Is this a tool/technology OR a concept/knowledge?
42
- result = classifier_pipe(candidate, candidate_labels=labels)
43
- top_label = result['labels'][0]
44
-
45
- if top_label == "software tool or technology":
46
- skills.append(candidate)
47
- else:
48
- knowledge.append(candidate)
49
-
50
- return {
51
- "SKILL": skills,
52
- "KNOWLEDGE": knowledge
53
- }
54
-
55
- # --- UI Layout ---
56
- st.title("Job Description Analyzer")
57
- st.markdown("Extracts entities and categorizes them into **SKILL** (Tools) and **KNOWLEDGE** (Concepts).")
58
-
59
- with st.spinner("Loading Models... this may take a minute first time..."):
60
- ner_pipe, classifier_pipe = load_models()
61
-
62
- job_description = st.text_area("Job Description", height=250, placeholder="Paste job description here...")
63
-
64
- if st.button("Analyze"):
65
- if job_description.strip():
66
- with st.spinner("Processing..."):
67
- output = process_text(job_description, ner_pipe, classifier_pipe)
68
-
69
- # Show formatted JSON
70
- st.json(output)
71
-
72
- # Download button
73
- json_str = json.dumps(output, indent=2)
74
- st.download_button("Download JSON", json_str, file_name="output.json", mime="application/json")
75
- else:
76
- st.warning("Please enter text first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+ import json
4
+
5
+ # --- Page Configuration ---
6
+ st.set_page_config(page_title="Skill vs Knowledge Extractor", layout="wide")
7
+
8
@st.cache_resource(show_spinner=False)
def _load_pipelines():
    """Build the two Hugging Face pipelines used by the app.

    This helper deliberately lets exceptions propagate: ``st.cache_resource``
    only stores a value when the function returns, so a failed load is never
    cached and will be retried on the next rerun. It also emits no Streamlit
    elements, because elements created inside a cached function are replayed
    on every cache hit.

    Returns:
        A ``(ner_pipe, classifier_pipe)`` tuple.
    """
    # 1. Entity Extraction Model (NER - finds the terms)
    ner_pipe = pipeline(
        "token-classification",
        model="jjzha/jobbert-base-cased-v2",
        aggregation_strategy="simple",  # Merges sub-word tokens
    )

    # 2. Zero-Shot Classification Model (Categorizes the terms)
    classifier_pipe = pipeline(
        "zero-shot-classification",
        model="valhalla/distilbart-mnli-12-1",  # Smaller, faster classification model
    )
    return ner_pipe, classifier_pipe


def load_models():
    """
    Loads two Hugging Face models:
    1. NER Model: To extract potential technical terms (candidates).
    2. Zero-Shot Classifier: To categorize each term as SKILL or KNOWLEDGE.

    Returns:
        ``(ner_pipe, classifier_pipe)`` on success, or ``(None, None)`` after
        showing an error in the UI when loading fails. The failure result is
        not cached, so the next rerun attempts the load again.
    """
    try:
        with st.spinner("Loading models... this may take a minute the first time."):
            return _load_pipelines()
    except Exception as e:  # UI boundary: report the problem instead of crashing the app
        st.error(f"Error loading models. Check your requirements.txt. Details: {e}")
        return None, None
34
+
35
def process_text(text, ner_pipe, classifier_pipe):
    """
    Runs the extraction and classification pipeline.

    Args:
        text: Raw job-description text.
        ner_pipe: Callable taking the text and returning an iterable of
            dicts that each carry a ``'word'`` entry (the extracted term).
        classifier_pipe: Callable invoked as
            ``classifier_pipe(term, candidate_labels=[...])`` and returning a
            dict whose ``'labels'`` list is ordered best-first.

    Returns:
        ``{"SKILL": [...], "KNOWLEDGE": [...]}`` with each list sorted and
        free of duplicates. Both lists are empty when the text is blank or
        no usable term is extracted.
    """
    import logging  # local import keeps this block self-contained

    skill_label = "software tool or technology"
    knowledge_label = "concept or knowledge"

    # Blank (including whitespace-only) input: nothing to send to the models.
    if not text or not text.strip():
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 1: Extract Entities (Candidates)
    # Keep terms longer than 2 characters. A stripped multi-word phrase is
    # always at least 3 characters, so one length check covers both cases.
    stripped_words = (entity['word'].strip() for entity in ner_pipe(text))
    candidates = sorted({word for word in stripped_words if len(word) > 2})
    if not candidates:
        return {"SKILL": [], "KNOWLEDGE": []}

    # Step 2: Classify each entity as SKILL or KNOWLEDGE using Zero-Shot
    skills = []
    knowledge = []
    classification_labels = [skill_label, knowledge_label]

    for candidate in candidates:
        try:
            result = classifier_pipe(candidate, candidate_labels=classification_labels)
            top_label = result['labels'][0]
        except Exception:
            # Best-effort fallback: keep the term as KNOWLEDGE, but record the
            # failure instead of swallowing it silently.
            logging.getLogger(__name__).warning(
                "Classification failed for %r; defaulting to KNOWLEDGE",
                candidate,
                exc_info=True,
            )
            top_label = knowledge_label

        if top_label == skill_label:
            skills.append(candidate)
        else:
            knowledge.append(candidate)

    # candidates was unique and sorted, so both lists already are as well.
    return {
        "SKILL": skills,
        "KNOWLEDGE": knowledge
    }
82
+
83
# --- UI Layout ---
# Top-level Streamlit script: renders the page, runs the analysis on demand,
# and keeps the latest result visible across reruns.
st.title("💡 AI Job Description Analyzer")
st.markdown("Paste a job description below to extract and categorize entities.")

# 1. Load Models (Cached)
ner_pipe, classifier_pipe = load_models()

# load_models() signals failure with (None, None); test for that explicitly
# rather than relying on the truthiness of pipeline objects.
if ner_pipe is not None and classifier_pipe is not None:
    # 2. Input Area
    job_description = st.text_area(
        "Job Description Text",
        height=300,
        placeholder="Paste a job description here (e.g., 'We require a Python developer proficient in FastAPI and experienced with Kafka and RAG systems...')"
    )

    # 3. Process Button
    if st.button("Analyze and Extract Entities", type="primary"):
        if job_description.strip():
            with st.spinner("Analyzing text and running classification..."):
                # Stored in session state so the result survives the rerun
                # that any later widget interaction (e.g. the download
                # button) triggers.
                st.session_state["extraction_output"] = process_text(
                    job_description, ner_pipe, classifier_pipe
                )
        else:
            # Drop any stale result so it is not shown next to the warning.
            if "extraction_output" in st.session_state:
                del st.session_state["extraction_output"]
            st.warning("Please enter a job description first.")

    # 4. Display the most recent result, if there is one
    if "extraction_output" in st.session_state:
        output = st.session_state["extraction_output"]

        # Display Result
        st.subheader("Extraction Output (JSON)")
        st.json(output)

        # Option to download
        json_str = json.dumps(output, indent=2)
        st.download_button(
            label="Download JSON Output",
            data=json_str,
            file_name="extracted_entities.json",
            mime="application/json"
        )