zlf18 committed on
Commit
28dfdee
·
verified ·
1 Parent(s): 54b682d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -11
app.py CHANGED
@@ -11,10 +11,9 @@ from sklearn.metrics.pairwise import cosine_similarity
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
  from nltk.stem import PorterStemmer
13
  import gradio as gr
14
- import spacy # --- NEW: Import spaCy ---
15
 
16
- # --- CORRECTED: Download necessary NLTK data ---
17
- # This revised block is more direct and ensures all packages are downloaded.
18
  for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
19
  try:
20
  nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
@@ -32,7 +31,7 @@ model = None
32
  combined_job_embeddings = None
33
  original_job_title_embeddings = None
34
  LLM_PIPELINE = None
35
- NLP_MODEL = None # --- NEW: Global variable for the spaCy model ---
36
  LLM_MODEL_NAME = "microsoft/phi-2"
37
  FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
38
  KNOWN_WORDS = set()
@@ -147,7 +146,6 @@ def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd
147
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
148
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
149
 
150
- # --- REPLACED: Skill extraction now uses spaCy for much better accuracy ---
151
  def extract_skills_from_text(text: str):
152
  global NLP_MODEL
153
  if not isinstance(text, str) or not NLP_MODEL:
@@ -179,20 +177,17 @@ def extract_skills_from_text(text: str):
179
  stemmed_skills[stemmed_phrase] = skill
180
 
181
  return sorted(list(stemmed_skills.values()))
182
- # --- END REPLACEMENT ---
183
 
184
  def initialize_data_and_model():
185
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, NLP_MODEL
186
  print("--- Initializing LLM Client ---")
187
  if not initialize_llm_client(): print("Warning: LLM Client failed to initialize.")
188
 
189
- # --- MODIFIED: Load spaCy model ---
190
  print("--- Loading spaCy Model for Skill Extraction ---")
191
  try:
192
  NLP_MODEL = spacy.load("en_core_web_sm")
193
  except Exception as e:
194
  print(f"🚨 ERROR loading spaCy model: {e}. Skill extraction will be disabled.")
195
- # --- END MODIFICATION ---
196
 
197
  print("--- Loading Datasets ---")
198
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
@@ -208,10 +203,8 @@ def initialize_data_and_model():
208
  combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
209
  original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
210
 
211
- # --- MODIFIED: Apply new skill extraction function ---
212
  print("--- Extracting Skills using spaCy (this may take a moment)... ---")
213
  original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
214
- # --- END MODIFICATION ---
215
 
216
  print("--- Loading Fine-Tuned Sentence Transformer Model ---")
217
  model = SentenceTransformer(FINETUNED_MODEL_ID)
@@ -227,7 +220,7 @@ def _course_links_for(skill: str) -> str:
227
  links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
228
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
229
 
230
- # --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
231
 
232
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
233
  status = "Searching using hybrid model..."
 
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
  from nltk.stem import PorterStemmer
13
  import gradio as gr
14
+ import spacy
15
 
16
+ # --- Download necessary NLTK data ---
 
17
  for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
18
  try:
19
  nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
 
31
  combined_job_embeddings = None
32
  original_job_title_embeddings = None
33
  LLM_PIPELINE = None
34
+ NLP_MODEL = None # Global variable for the spaCy model
35
  LLM_MODEL_NAME = "microsoft/phi-2"
36
  FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
37
  KNOWN_WORDS = set()
 
146
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
147
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
148
 
 
149
  def extract_skills_from_text(text: str):
150
  global NLP_MODEL
151
  if not isinstance(text, str) or not NLP_MODEL:
 
177
  stemmed_skills[stemmed_phrase] = skill
178
 
179
  return sorted(list(stemmed_skills.values()))
 
180
 
181
  def initialize_data_and_model():
182
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, NLP_MODEL
183
  print("--- Initializing LLM Client ---")
184
  if not initialize_llm_client(): print("Warning: LLM Client failed to initialize.")
185
 
 
186
  print("--- Loading spaCy Model for Skill Extraction ---")
187
  try:
188
  NLP_MODEL = spacy.load("en_core_web_sm")
189
  except Exception as e:
190
  print(f"🚨 ERROR loading spaCy model: {e}. Skill extraction will be disabled.")
 
191
 
192
  print("--- Loading Datasets ---")
193
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
 
203
  combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
204
  original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
205
 
 
206
  print("--- Extracting Skills using spaCy (this may take a moment)... ---")
207
  original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
 
208
 
209
  print("--- Loading Fine-Tuned Sentence Transformer Model ---")
210
  model = SentenceTransformer(FINETUNED_MODEL_ID)
 
220
  links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
221
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
222
 
223
+ # --- GRADIO INTERFACE FUNCTIONS ---
224
 
225
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
226
  status = "Searching using hybrid model..."