Update app.py
Browse files
app.py
CHANGED
|
@@ -11,10 +11,9 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
from nltk.stem import PorterStemmer
|
| 13 |
import gradio as gr
|
| 14 |
-
import spacy
|
| 15 |
|
| 16 |
-
# ---
|
| 17 |
-
# This revised block is more direct and ensures all packages are downloaded.
|
| 18 |
for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
|
| 19 |
try:
|
| 20 |
nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
|
|
@@ -32,7 +31,7 @@ model = None
|
|
| 32 |
combined_job_embeddings = None
|
| 33 |
original_job_title_embeddings = None
|
| 34 |
LLM_PIPELINE = None
|
| 35 |
-
NLP_MODEL = None #
|
| 36 |
LLM_MODEL_NAME = "microsoft/phi-2"
|
| 37 |
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
|
| 38 |
KNOWN_WORDS = set()
|
|
@@ -147,7 +146,6 @@ def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd
|
|
| 147 |
ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
|
| 148 |
return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
| 149 |
|
| 150 |
-
# --- REPLACED: Skill extraction now uses spaCy for much better accuracy ---
|
| 151 |
def extract_skills_from_text(text: str):
|
| 152 |
global NLP_MODEL
|
| 153 |
if not isinstance(text, str) or not NLP_MODEL:
|
|
@@ -179,20 +177,17 @@ def extract_skills_from_text(text: str):
|
|
| 179 |
stemmed_skills[stemmed_phrase] = skill
|
| 180 |
|
| 181 |
return sorted(list(stemmed_skills.values()))
|
| 182 |
-
# --- END REPLACEMENT ---
|
| 183 |
|
| 184 |
def initialize_data_and_model():
|
| 185 |
global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, NLP_MODEL
|
| 186 |
print("--- Initializing LLM Client ---")
|
| 187 |
if not initialize_llm_client(): print("Warning: LLM Client failed to initialize.")
|
| 188 |
|
| 189 |
-
# --- MODIFIED: Load spaCy model ---
|
| 190 |
print("--- Loading spaCy Model for Skill Extraction ---")
|
| 191 |
try:
|
| 192 |
NLP_MODEL = spacy.load("en_core_web_sm")
|
| 193 |
except Exception as e:
|
| 194 |
print(f"🚨 ERROR loading spaCy model: {e}. Skill extraction will be disabled.")
|
| 195 |
-
# --- END MODIFICATION ---
|
| 196 |
|
| 197 |
print("--- Loading Datasets ---")
|
| 198 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
|
@@ -208,10 +203,8 @@ def initialize_data_and_model():
|
|
| 208 |
combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
|
| 209 |
original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
|
| 210 |
|
| 211 |
-
# --- MODIFIED: Apply new skill extraction function ---
|
| 212 |
print("--- Extracting Skills using spaCy (this may take a moment)... ---")
|
| 213 |
original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
|
| 214 |
-
# --- END MODIFICATION ---
|
| 215 |
|
| 216 |
print("--- Loading Fine-Tuned Sentence Transformer Model ---")
|
| 217 |
model = SentenceTransformer(FINETUNED_MODEL_ID)
|
|
@@ -227,7 +220,7 @@ def _course_links_for(skill: str) -> str:
|
|
| 227 |
links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
|
| 228 |
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
|
| 229 |
|
| 230 |
-
# --- GRADIO INTERFACE FUNCTIONS
|
| 231 |
|
| 232 |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
| 233 |
status = "Searching using hybrid model..."
|
|
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
from nltk.stem import PorterStemmer
|
| 13 |
import gradio as gr
|
| 14 |
+
import spacy
|
| 15 |
|
| 16 |
+
# --- Download necessary NLTK data ---
|
|
|
|
| 17 |
for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
|
| 18 |
try:
|
| 19 |
nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
|
|
|
|
| 31 |
combined_job_embeddings = None
|
| 32 |
original_job_title_embeddings = None
|
| 33 |
LLM_PIPELINE = None
|
| 34 |
+
NLP_MODEL = None # Global variable for the spaCy model
|
| 35 |
LLM_MODEL_NAME = "microsoft/phi-2"
|
| 36 |
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
|
| 37 |
KNOWN_WORDS = set()
|
|
|
|
| 146 |
ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
|
| 147 |
return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
| 148 |
|
|
|
|
| 149 |
def extract_skills_from_text(text: str):
|
| 150 |
global NLP_MODEL
|
| 151 |
if not isinstance(text, str) or not NLP_MODEL:
|
|
|
|
| 177 |
stemmed_skills[stemmed_phrase] = skill
|
| 178 |
|
| 179 |
return sorted(list(stemmed_skills.values()))
|
|
|
|
| 180 |
|
| 181 |
def initialize_data_and_model():
|
| 182 |
global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, NLP_MODEL
|
| 183 |
print("--- Initializing LLM Client ---")
|
| 184 |
if not initialize_llm_client(): print("Warning: LLM Client failed to initialize.")
|
| 185 |
|
|
|
|
| 186 |
print("--- Loading spaCy Model for Skill Extraction ---")
|
| 187 |
try:
|
| 188 |
NLP_MODEL = spacy.load("en_core_web_sm")
|
| 189 |
except Exception as e:
|
| 190 |
print(f"🚨 ERROR loading spaCy model: {e}. Skill extraction will be disabled.")
|
|
|
|
| 191 |
|
| 192 |
print("--- Loading Datasets ---")
|
| 193 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
|
|
|
| 203 |
combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
|
| 204 |
original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
|
| 205 |
|
|
|
|
| 206 |
print("--- Extracting Skills using spaCy (this may take a moment)... ---")
|
| 207 |
original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
|
|
|
|
| 208 |
|
| 209 |
print("--- Loading Fine-Tuned Sentence Transformer Model ---")
|
| 210 |
model = SentenceTransformer(FINETUNED_MODEL_ID)
|
|
|
|
| 220 |
links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
|
| 221 |
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
|
| 222 |
|
| 223 |
+
# --- GRADIO INTERFACE FUNCTIONS ---
|
| 224 |
|
| 225 |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
| 226 |
status = "Searching using hybrid model..."
|