zlf18 committed on
Commit
0522165
·
verified ·
1 Parent(s): dd9737e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -46
app.py CHANGED
@@ -94,58 +94,58 @@ def llm_expand_query(user_input: str) -> str:
94
  except Exception:
95
  return user_input
96
 
97
- def extract_fallback_keywords(text: str, top_n=10) -> list[str]:
98
- """Fallback function to extract keywords if the primary method fails."""
99
- if not isinstance(text, str) or not nlp:
100
- return []
101
 
102
- # Basic keyword extraction: find noun chunks, filter them, and return the most common.
 
 
 
 
 
103
  doc = nlp(text.lower())
104
-
105
- # Add more job-specific junk words to the filter
106
- junk_words = STOPWORDS.union({'experience', 'ability', 'knowledge', 'skill', 'skills', 'degree', 'education', 'work', 'year', 'years', 'job', 'role', 'team', 'company', 'duties', 'responsibilities', 'requirements', 'qualifications', 'description'})
107
-
108
- candidates = []
109
  for chunk in doc.noun_chunks:
110
- chunk_text = chunk.text
111
- # Filter out chunks that are just junk words or too short
112
- if chunk_text not in junk_words and len(chunk_text) > 2 and not chunk_text.isnumeric():
113
- candidates.append(chunk_text)
114
-
115
- if not candidates:
116
- return []
117
-
118
- # Return the most frequent candidates
119
- most_common = [word for word, count in Counter(candidates).most_common(top_n)]
120
- return sorted(most_common)
121
-
122
- def get_skills_from_text(row: pd.Series) -> list[str]:
123
- """
124
- Primary skill extraction function. Tries the high-precision AI method first,
125
- then uses a fallback keyword extractor if needed.
126
- """
127
- # 1. Broaden the Search: Combine text from multiple fields
128
- full_text = " ".join([
129
- str(row.get('qualifications', '')),
130
- str(row.get('Duties', '')),
131
- str(row.get('Description', ''))
132
- ])
133
 
134
- if not full_text.strip():
135
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- # 2. Try the high-precision method first
138
  if nlp and matcher:
139
  doc = nlp(full_text.lower())
140
  matches = matcher(doc)
141
  skills = {doc[start:end].text.strip() for _, start, end in matches}
142
  validated_skills = sorted([s for s in skills if s in AI_VALIDATED_SKILLS])
143
-
144
  if validated_skills:
145
  return validated_skills
146
 
147
- # 3. If no skills found, use the fallback "safety net" method
148
- return extract_fallback_keywords(full_text)
149
 
150
  def initialize_data_and_model():
151
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, AI_VALIDATED_SKILLS
@@ -157,8 +157,7 @@ def initialize_data_and_model():
157
  AI_VALIDATED_SKILLS = set(json.load(f))
158
  print(f"--- Loaded {len(AI_VALIDATED_SKILLS)} AI-validated skills ---")
159
  except FileNotFoundError:
160
- print("🚨 ERROR: validated_skills.json not found. App functionality will be degraded.")
161
- # Don't fail completely, allow the fallback to work
162
  AI_VALIDATED_SKILLS = set()
163
 
164
  print("--- Loading Datasets ---")
@@ -166,9 +165,8 @@ def initialize_data_and_model():
166
  original_df = ds["original"].to_pandas()
167
  augmented_df = ds["augmented"].to_pandas()
168
 
169
- print("--- Mapping skills to each job description using two-layer method ---")
170
- # UPDATED: Apply the function to each row
171
- original_df['Skills'] = original_df.apply(get_skills_from_text, axis=1)
172
 
173
  original_df['job_id'] = original_df.index
174
  max_id = len(original_df) - 1
@@ -213,6 +211,10 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
213
  def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
214
  if df_to_rank is None or df_to_rank.empty: return pd.DataFrame()
215
  ranked_df = df_to_rank.copy()
 
 
 
 
216
  if 'Skills' not in ranked_df.columns: return ranked_df.sort_values(by='Similarity Score', ascending=False)
217
  def calculate_match(row, user_tokens):
218
  job_skills = row.get('Skills', [])
@@ -275,12 +277,16 @@ def find_matches_and_rank_anyway(dream_job, top_n, skills_text):
275
  def on_select_job(job_id, skills_text):
276
  if job_id is None:
277
  return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
 
278
  row = original_df.loc[job_id]
279
  details = f"### {row.get('job_title', '')} — {row.get('company', '')}"
280
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
281
- job_skills = row.get("Skills", [])
 
 
 
282
  if not job_skills:
283
- plan = "<p><i>No specific skills were extracted for this job. (Fallback keywords may be shown).</i></p>"
284
  return details, row.get('Duties', ''), row.get('qualifications', ''), row.get('Description', ''), plan, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
285
 
286
  missing = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=str.lower)
 
94
  except Exception:
95
  return user_input
96
 
97
+ def extract_fallback_keywords(text: str, user_skills: list[str], top_n=7) -> list[str]:
98
+ """Smarter fallback that prioritizes keywords semantically similar to the user's input."""
99
+ if not isinstance(text, str) or not nlp: return []
 
100
 
101
+ junk_words = STOPWORDS.union({
102
+ 'experience', 'ability', 'knowledge', 'skill', 'skills', 'degree', 'education', 'work', 'year', 'years', 'job', 'role', 'team',
103
+ 'company', 'duties', 'responsibilities', 'requirements', 'qualifications', 'description', 'position', 'opportunity', 'candidate',
104
+ 'application', 'applications', 'university', 'college', 'school', 'department', 'program', 'field', 'service', 'level'
105
+ })
106
+
107
  doc = nlp(text.lower())
108
+ candidates = set()
109
+ for ent in doc.ents:
110
+ if ent.label_ in ['GPE', 'ORG', 'DATE', 'PERSON', 'MONEY', 'CARDINAL', 'TIME']:
111
+ junk_words.add(ent.text)
112
+
113
  for chunk in doc.noun_chunks:
114
+ chunk_text = chunk.text.strip()
115
+ if len(chunk_text) > 3 and not any(junk in chunk_text.split() for junk in junk_words) and not chunk_text.isnumeric():
116
+ candidates.add(chunk_text)
117
+
118
+ if not candidates: return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ candidates = list(candidates)
121
+
122
+ if user_skills and model:
123
+ user_skills_embedding = model.encode(user_skills, convert_to_tensor=True)
124
+ candidate_embeddings = model.encode(candidates, convert_to_tensor=True)
125
+
126
+ cos_scores = util.cos_sim(candidate_embeddings, user_skills_embedding)
127
+ top_scores, _ = torch.max(cos_scores, dim=1)
128
+
129
+ scored_candidates = sorted(zip(candidates, top_scores.tolist()), key=lambda x: x[1], reverse=True)
130
+
131
+ return [candidate for candidate, score in scored_candidates if score > 0.2][:top_n]
132
+
133
+ return sorted(candidates)[:top_n]
134
+
135
+ def get_skills_from_text(row: pd.Series, user_skills: list[str]) -> list[str]:
136
+ """Primary skill extraction: uses AI-validated list first, then a smart fallback."""
137
+ full_text = " ".join([str(row.get(col, '')) for col in ['qualifications', 'Duties', 'Description']])
138
+ if not full_text.strip(): return []
139
 
 
140
  if nlp and matcher:
141
  doc = nlp(full_text.lower())
142
  matches = matcher(doc)
143
  skills = {doc[start:end].text.strip() for _, start, end in matches}
144
  validated_skills = sorted([s for s in skills if s in AI_VALIDATED_SKILLS])
 
145
  if validated_skills:
146
  return validated_skills
147
 
148
+ return extract_fallback_keywords(full_text, user_skills)
 
149
 
150
  def initialize_data_and_model():
151
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, AI_VALIDATED_SKILLS
 
157
  AI_VALIDATED_SKILLS = set(json.load(f))
158
  print(f"--- Loaded {len(AI_VALIDATED_SKILLS)} AI-validated skills ---")
159
  except FileNotFoundError:
160
+ print("🚨 WARNING: validated_skills.json not found. Skill extraction will rely on fallback method.")
 
161
  AI_VALIDATED_SKILLS = set()
162
 
163
  print("--- Loading Datasets ---")
 
165
  original_df = ds["original"].to_pandas()
166
  augmented_df = ds["augmented"].to_pandas()
167
 
168
+ print("--- Mapping skills to each job description (initial pass) ---")
169
+ original_df['Skills'] = original_df.apply(lambda row: get_skills_from_text(row, user_skills=[]), axis=1)
 
170
 
171
  original_df['job_id'] = original_df.index
172
  max_id = len(original_df) - 1
 
211
  def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
212
  if df_to_rank is None or df_to_rank.empty: return pd.DataFrame()
213
  ranked_df = df_to_rank.copy()
214
+
215
+ # Re-extract skills for the ranked DF using the user's context for better fallback results
216
+ ranked_df['Skills'] = ranked_df.apply(lambda row: get_skills_from_text(row, user_skills=user_tokens), axis=1)
217
+
218
  if 'Skills' not in ranked_df.columns: return ranked_df.sort_values(by='Similarity Score', ascending=False)
219
  def calculate_match(row, user_tokens):
220
  job_skills = row.get('Skills', [])
 
277
  def on_select_job(job_id, skills_text):
278
  if job_id is None:
279
  return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
280
+
281
  row = original_df.loc[job_id]
282
  details = f"### {row.get('job_title', '')} — {row.get('company', '')}"
283
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
284
+
285
+ # Re-run skill extraction with user context to ensure the learning plan is relevant
286
+ job_skills = get_skills_from_text(row, user_skills)
287
+
288
  if not job_skills:
289
+ plan = "<p><i>No specific skills were extracted for this job.</i></p>"
290
  return details, row.get('Duties', ''), row.get('qualifications', ''), row.get('Description', ''), plan, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
291
 
292
  missing = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=str.lower)