Update app.py
app.py CHANGED
@@ -94,58 +94,58 @@ def llm_expand_query(user_input: str) -> str:
     except Exception:
         return user_input

-def extract_fallback_keywords(text: str, top_n=…
-    """…
-    if not isinstance(text, str) or not nlp:
-        return []
-
-    doc = nlp(text.lower())
-    …
-    for chunk in doc.noun_chunks:
-        chunk_text = chunk.text
-        …
-    if not candidates:
-        return []
-
-    # Return the most frequent candidates
-    most_common = [word for word, count in Counter(candidates).most_common(top_n)]
-    return sorted(most_common)
-
-def get_skills_from_text(row: pd.Series) -> list[str]:
-    """
-    Primary skill extraction function. Tries the high-precision AI method first,
-    then uses a fallback keyword extractor if needed.
-    """
-    # 1. Broaden the Search: Combine text from multiple fields
-    full_text = " ".join([
-        str(row.get('qualifications', '')),
-        str(row.get('Duties', '')),
-        str(row.get('Description', ''))
-    ])
-
-    # 2. Try the high-precision method first
+def extract_fallback_keywords(text: str, user_skills: list[str], top_n=7) -> list[str]:
+    """Smarter fallback that prioritizes keywords semantically similar to the user's input."""
+    if not isinstance(text, str) or not nlp: return []
+
+    junk_words = STOPWORDS.union({
+        'experience', 'ability', 'knowledge', 'skill', 'skills', 'degree', 'education', 'work', 'year', 'years', 'job', 'role', 'team',
+        'company', 'duties', 'responsibilities', 'requirements', 'qualifications', 'description', 'position', 'opportunity', 'candidate',
+        'application', 'applications', 'university', 'college', 'school', 'department', 'program', 'field', 'service', 'level'
+    })
+
+    doc = nlp(text.lower())
+    candidates = set()
+    for ent in doc.ents:
+        if ent.label_ in ['GPE', 'ORG', 'DATE', 'PERSON', 'MONEY', 'CARDINAL', 'TIME']:
+            junk_words.add(ent.text)
+
+    for chunk in doc.noun_chunks:
+        chunk_text = chunk.text.strip()
+        if len(chunk_text) > 3 and not any(junk in chunk_text.split() for junk in junk_words) and not chunk_text.isnumeric():
+            candidates.add(chunk_text)
+
+    if not candidates: return []
+
+    candidates = list(candidates)
+
+    if user_skills and model:
+        user_skills_embedding = model.encode(user_skills, convert_to_tensor=True)
+        candidate_embeddings = model.encode(candidates, convert_to_tensor=True)
+
+        cos_scores = util.cos_sim(candidate_embeddings, user_skills_embedding)
+        top_scores, _ = torch.max(cos_scores, dim=1)
+
+        scored_candidates = sorted(zip(candidates, top_scores.tolist()), key=lambda x: x[1], reverse=True)
+
+        return [candidate for candidate, score in scored_candidates if score > 0.2][:top_n]
+
+    return sorted(candidates)[:top_n]
+
+def get_skills_from_text(row: pd.Series, user_skills: list[str]) -> list[str]:
+    """Primary skill extraction: uses AI-validated list first, then a smart fallback."""
+    full_text = " ".join([str(row.get(col, '')) for col in ['qualifications', 'Duties', 'Description']])
+    if not full_text.strip(): return []

     if nlp and matcher:
         doc = nlp(full_text.lower())
         matches = matcher(doc)
         skills = {doc[start:end].text.strip() for _, start, end in matches}
         validated_skills = sorted([s for s in skills if s in AI_VALIDATED_SKILLS])
-
         if validated_skills:
             return validated_skills

-
-    return extract_fallback_keywords(full_text)
+    return extract_fallback_keywords(full_text, user_skills)

 def initialize_data_and_model():
     global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, AI_VALIDATED_SKILLS
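The heart of this hunk is the new fallback: collect noun-chunk candidates with spaCy, filter out junk words and noisy entities, then rank the survivors by cosine similarity to the user's own skills instead of by raw frequency. Below is a minimal standalone sketch of that pipeline; it assumes en_core_web_sm is installed and uses all-MiniLM-L6-v2 as a stand-in for whatever SentenceTransformer app.py actually loads, with made-up sample text and a trimmed junk list.

import spacy
import torch
from sentence_transformers import SentenceTransformer, util

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

text = ("5+ years of experience building data pipelines in Python. "
        "Knowledge of statistical modeling and cloud infrastructure required.")
user_skills = ["python", "machine learning"]
junk_words = {"experience", "knowledge", "years", "ability"}  # trimmed stand-in for STOPWORDS.union(...)

doc = nlp(text.lower())

# Named entities (places, orgs, dates, quantities) are treated as noise.
for ent in doc.ents:
    if ent.label_ in ["GPE", "ORG", "DATE", "PERSON", "MONEY", "CARDINAL", "TIME"]:
        junk_words.add(ent.text)

# Keep noun chunks that are long enough, non-numeric, and junk-free.
candidates = []
for chunk in doc.noun_chunks:
    chunk_text = chunk.text.strip()
    if len(chunk_text) > 3 and not any(j in chunk_text.split() for j in junk_words) and not chunk_text.isnumeric():
        candidates.append(chunk_text)

# Score each candidate by its best cosine similarity to any user skill.
cand_emb = model.encode(candidates, convert_to_tensor=True)
user_emb = model.encode(user_skills, convert_to_tensor=True)
cos_scores = util.cos_sim(cand_emb, user_emb)   # shape: (n_candidates, n_user_skills)
top_scores, _ = torch.max(cos_scores, dim=1)    # best match per candidate

ranked = sorted(zip(candidates, top_scores.tolist()), key=lambda x: x[1], reverse=True)
print([c for c, s in ranked if s > 0.2][:7])

The 0.2 threshold keeps loosely related candidates while discarding noun chunks unrelated to the user's profile; app.py additionally deduplicates candidates by collecting them into a set before ranking.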
@@ -157,8 +157,7 @@ def initialize_data_and_model():
         AI_VALIDATED_SKILLS = set(json.load(f))
         print(f"--- Loaded {len(AI_VALIDATED_SKILLS)} AI-validated skills ---")
     except FileNotFoundError:
-        print("🚨 …
-        # Don't fail completely, allow the fallback to work
+        print("🚨 WARNING: validated_skills.json not found. Skill extraction will rely on fallback method.")
         AI_VALIDATED_SKILLS = set()

     print("--- Loading Datasets ---")
@@ -166,9 +165,8 @@ def initialize_data_and_model():
     original_df = ds["original"].to_pandas()
     augmented_df = ds["augmented"].to_pandas()

-    print("--- Mapping skills to each job description …
-
-    original_df['Skills'] = original_df.apply(get_skills_from_text, axis=1)
+    print("--- Mapping skills to each job description (initial pass) ---")
+    original_df['Skills'] = original_df.apply(lambda row: get_skills_from_text(row, user_skills=[]), axis=1)

     original_df['job_id'] = original_df.index
     max_id = len(original_df) - 1
@@ -213,6 +211,10 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
 def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
     if df_to_rank is None or df_to_rank.empty: return pd.DataFrame()
     ranked_df = df_to_rank.copy()
+
+    # Re-extract skills for the ranked DF using the user's context for better fallback results
+    ranked_df['Skills'] = ranked_df.apply(lambda row: get_skills_from_text(row, user_skills=user_tokens), axis=1)
+
     if 'Skills' not in ranked_df.columns: return ranked_df.sort_values(by='Similarity Score', ascending=False)
     def calculate_match(row, user_tokens):
         job_skills = row.get('Skills', [])
@@ -275,12 +277,16 @@ def find_matches_and_rank_anyway(dream_job, top_n, skills_text):
 def on_select_job(job_id, skills_text):
     if job_id is None:
         return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
+
     row = original_df.loc[job_id]
     details = f"### {row.get('job_title', '')} — {row.get('company', '')}"
     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
-    …
+
+    # Re-run skill extraction with user context to ensure the learning plan is relevant
+    job_skills = get_skills_from_text(row, user_skills)
+
     if not job_skills:
-        plan = "<p><i>No specific skills were extracted for this job. …
+        plan = "<p><i>No specific skills were extracted for this job.</i></p>"
         return details, row.get('Duties', ''), row.get('qualifications', ''), row.get('Description', ''), plan, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

     missing = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=str.lower)
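End to end, the user's skills now influence extraction in three places: the startup pass (empty context), per-query re-ranking in score_jobs_by_skills, and the learning plan in on_select_job. A hypothetical smoke test of the new signature, assuming app.py's globals (nlp, matcher, model, AI_VALIDATED_SKILLS) are already initialized; the row content is invented but uses the column names app.py reads:

import pandas as pd

row = pd.Series({
    "qualifications": "Experience with Python and statistical modeling.",
    "Duties": "Build and maintain data pipelines.",
    "Description": "Join our analytics team.",
})

# Startup-style call: no user context, so the fallback returns an alphabetical top-N.
print(get_skills_from_text(row, user_skills=[]))

# Per-query call: fallback keywords are re-ranked against this user's skills.
print(get_skills_from_text(row, user_skills=["python", "data engineering"]))

Because extraction now depends on the query, score_jobs_by_skills recomputes the Skills column on every request rather than reusing the one built at startup.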