Update app.py
app.py CHANGED
@@ -94,58 +94,58 @@ def llm_expand_query(user_input: str) -> str:
     except Exception:
         return user_input

-def extract_fallback_keywords(text: str, top_n=…
-    """…
-    if not isinstance(text, str) or not nlp:
-        return []
-
-    doc = nlp(text.lower())
-    …
-    for chunk in doc.noun_chunks:
-        chunk_text = chunk.text
-        …
-    if not candidates:
-        return []
-
-    # Return the most frequent candidates
-    most_common = [word for word, count in Counter(candidates).most_common(top_n)]
-    return sorted(most_common)
-
-def get_skills_from_text(row: pd.Series) -> list[str]:
-    """
-    Primary skill extraction function. Tries the high-precision AI method first,
-    then uses a fallback keyword extractor if needed.
-    """
-    # 1. Broaden the Search: Combine text from multiple fields
-    full_text = " ".join([
-        str(row.get('qualifications', '')),
-        str(row.get('Duties', '')),
-        str(row.get('Description', ''))
-    ])
-
-    # 2. Try the high-precision method first
+def extract_fallback_keywords(text: str, user_skills: list[str], top_n=7) -> list[str]:
+    """Smarter fallback that prioritizes keywords semantically similar to the user's input."""
+    if not isinstance(text, str) or not nlp: return []
+
+    junk_words = STOPWORDS.union({
+        'experience', 'ability', 'knowledge', 'skill', 'skills', 'degree', 'education', 'work', 'year', 'years', 'job', 'role', 'team',
+        'company', 'duties', 'responsibilities', 'requirements', 'qualifications', 'description', 'position', 'opportunity', 'candidate',
+        'application', 'applications', 'university', 'college', 'school', 'department', 'program', 'field', 'service', 'level'
+    })
+
+    doc = nlp(text.lower())
+    candidates = set()
+    for ent in doc.ents:
+        if ent.label_ in ['GPE', 'ORG', 'DATE', 'PERSON', 'MONEY', 'CARDINAL', 'TIME']:
+            junk_words.add(ent.text)
+
+    for chunk in doc.noun_chunks:
+        chunk_text = chunk.text.strip()
+        if len(chunk_text) > 3 and not any(junk in chunk_text.split() for junk in junk_words) and not chunk_text.isnumeric():
+            candidates.add(chunk_text)
+
+    if not candidates: return []
+
+    candidates = list(candidates)
+
+    if user_skills and model:
+        user_skills_embedding = model.encode(user_skills, convert_to_tensor=True)
+        candidate_embeddings = model.encode(candidates, convert_to_tensor=True)
+
+        cos_scores = util.cos_sim(candidate_embeddings, user_skills_embedding)
+        top_scores, _ = torch.max(cos_scores, dim=1)
+
+        scored_candidates = sorted(zip(candidates, top_scores.tolist()), key=lambda x: x[1], reverse=True)
+
+        return [candidate for candidate, score in scored_candidates if score > 0.2][:top_n]
+
+    return sorted(candidates)[:top_n]
+
+def get_skills_from_text(row: pd.Series, user_skills: list[str]) -> list[str]:
+    """Primary skill extraction: uses AI-validated list first, then a smart fallback."""
+    full_text = " ".join([str(row.get(col, '')) for col in ['qualifications', 'Duties', 'Description']])
+    if not full_text.strip(): return []

     if nlp and matcher:
         doc = nlp(full_text.lower())
         matches = matcher(doc)
         skills = {doc[start:end].text.strip() for _, start, end in matches}
         validated_skills = sorted([s for s in skills if s in AI_VALIDATED_SKILLS])
-
         if validated_skills:
             return validated_skills

-
-    return extract_fallback_keywords(full_text)
+    return extract_fallback_keywords(full_text, user_skills)

 def initialize_data_and_model():
     global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, AI_VALIDATED_SKILLS
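The heart of this hunk is the new fallback: collect noun-chunk candidates with spaCy, filter out junk words and noisy entities, then rank the survivors by cosine similarity to the user's own skills instead of by raw frequency. Below is a minimal standalone sketch of that pipeline; it assumes en_core_web_sm is installed and uses all-MiniLM-L6-v2 as a stand-in for whatever SentenceTransformer app.py actually loads, with made-up sample text and a trimmed junk list.

import spacy
import torch
from sentence_transformers import SentenceTransformer, util

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

text = ("5+ years of experience building data pipelines in Python. "
        "Knowledge of statistical modeling and cloud infrastructure required.")
user_skills = ["python", "machine learning"]
junk_words = {"experience", "knowledge", "years", "ability"}  # trimmed stand-in for STOPWORDS.union(...)

doc = nlp(text.lower())

# Named entities (places, orgs, dates, quantities) are treated as noise.
for ent in doc.ents:
    if ent.label_ in ["GPE", "ORG", "DATE", "PERSON", "MONEY", "CARDINAL", "TIME"]:
        junk_words.add(ent.text)

# Keep noun chunks that are long enough, non-numeric, and junk-free.
candidates = []
for chunk in doc.noun_chunks:
    chunk_text = chunk.text.strip()
    if len(chunk_text) > 3 and not any(j in chunk_text.split() for j in junk_words) and not chunk_text.isnumeric():
        candidates.append(chunk_text)

# Score each candidate by its best cosine similarity to any user skill.
cand_emb = model.encode(candidates, convert_to_tensor=True)
user_emb = model.encode(user_skills, convert_to_tensor=True)
cos_scores = util.cos_sim(cand_emb, user_emb)   # shape: (n_candidates, n_user_skills)
top_scores, _ = torch.max(cos_scores, dim=1)    # best match per candidate

ranked = sorted(zip(candidates, top_scores.tolist()), key=lambda x: x[1], reverse=True)
print([c for c, s in ranked if s > 0.2][:7])

The 0.2 threshold keeps loosely related candidates while discarding noun chunks unrelated to the user's profile; app.py additionally deduplicates candidates by collecting them into a set before ranking.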
@@ -157,8 +157,7 @@ def initialize_data_and_model():
         AI_VALIDATED_SKILLS = set(json.load(f))
         print(f"--- Loaded {len(AI_VALIDATED_SKILLS)} AI-validated skills ---")
     except FileNotFoundError:
-        print("🚨 …
-        # Don't fail completely, allow the fallback to work
+        print("🚨 WARNING: validated_skills.json not found. Skill extraction will rely on fallback method.")
         AI_VALIDATED_SKILLS = set()

     print("--- Loading Datasets ---")
@@ -166,9 +165,8 @@ def initialize_data_and_model():
     original_df = ds["original"].to_pandas()
     augmented_df = ds["augmented"].to_pandas()

-    print("--- Mapping skills to each job description …
-
-    original_df['Skills'] = original_df.apply(get_skills_from_text, axis=1)
+    print("--- Mapping skills to each job description (initial pass) ---")
+    original_df['Skills'] = original_df.apply(lambda row: get_skills_from_text(row, user_skills=[]), axis=1)

     original_df['job_id'] = original_df.index
     max_id = len(original_df) - 1
@@ -213,6 +211,10 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
 def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
     if df_to_rank is None or df_to_rank.empty: return pd.DataFrame()
     ranked_df = df_to_rank.copy()
+
+    # Re-extract skills for the ranked DF using the user's context for better fallback results
+    ranked_df['Skills'] = ranked_df.apply(lambda row: get_skills_from_text(row, user_skills=user_tokens), axis=1)
+
     if 'Skills' not in ranked_df.columns: return ranked_df.sort_values(by='Similarity Score', ascending=False)
     def calculate_match(row, user_tokens):
         job_skills = row.get('Skills', [])
@@ -275,12 +277,16 @@ def find_matches_and_rank_anyway(dream_job, top_n, skills_text):
 def on_select_job(job_id, skills_text):
     if job_id is None:
         return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
+
     row = original_df.loc[job_id]
     details = f"### {row.get('job_title', '')} — {row.get('company', '')}"
     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
-    …
+
+    # Re-run skill extraction with user context to ensure the learning plan is relevant
+    job_skills = get_skills_from_text(row, user_skills)
+
     if not job_skills:
-        plan = "<p><i>No specific skills were extracted for this job. …
+        plan = "<p><i>No specific skills were extracted for this job.</i></p>"
         return details, row.get('Duties', ''), row.get('qualifications', ''), row.get('Description', ''), plan, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

     missing = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=str.lower)
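End to end, the user's skills now influence extraction in three places: the startup pass (empty context), per-query re-ranking in score_jobs_by_skills, and the learning plan in on_select_job. A hypothetical smoke test of the new signature, assuming app.py's globals (nlp, matcher, model, AI_VALIDATED_SKILLS) are already initialized; the row content is invented but uses the column names app.py reads:

import pandas as pd

row = pd.Series({
    "qualifications": "Experience with Python and statistical modeling.",
    "Duties": "Build and maintain data pipelines.",
    "Description": "Join our analytics team.",
})

# Startup-style call: no user context, so the fallback returns an alphabetical top-N.
print(get_skills_from_text(row, user_skills=[]))

# Per-query call: fallback keywords are re-ranked against this user's skills.
print(get_skills_from_text(row, user_skills=["python", "data engineering"]))

Because extraction now depends on the query, score_jobs_by_skills recomputes the Skills column on every request rather than reusing the one built at startup.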