zlf18 commited on
Commit
a4e6efa
·
verified ·
1 Parent(s): 1d05fa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -138
app.py CHANGED
@@ -26,23 +26,29 @@ for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
26
  STOPWORDS = set(stopwords.words('english'))
27
  stemmer = PorterStemmer()
28
 
29
- # --- NEW: Curated Skill Whitelist for NLTK Fallback Accuracy ---
30
  SKILL_WHITELIST = {
 
31
  'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
32
  'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
33
  'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
34
  'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
35
- 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics',
36
  'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
37
  'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
38
- 'network security', 'cryptography', 'blockchain', 'agile', 'scrum', 'project management', 'product management',
 
 
 
 
 
 
 
39
  'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
40
- 'critical thinking', 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks',
41
- 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'api design', 'rest apis',
42
- 'graphql', 'microservices', 'serverless', 'system design', 'saas', 'sales', 'marketing', 'seo', 'sem', 'content writing',
43
- 'customer support', 'technical writing', 'sap', 'oracle', 'financial analysis', 'budgeting', 'mentoring', 'supervising'
44
  }
45
- # -----------------------------------------------------------------
46
 
47
  # --- GLOBAL STATE & DATA ---
48
  original_df = None
@@ -63,22 +69,6 @@ def _norm_skill_token(s: str) -> str:
63
  s = re.sub(r'\s+', ' ', s)
64
  return s
65
 
66
- def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
67
- t1 = _norm_skill_token(token1)
68
- t2 = _norm_skill_token(token2)
69
- if t1 == t2 or t1 in t2 or t2 in t1:
70
- return True
71
- try:
72
- if len(t1) > 2 and len(t2) > 2:
73
- vectorizer = TfidfVectorizer().fit([t1, t2])
74
- vectors = vectorizer.transform([t1, t2])
75
- similarity = cosine_similarity(vectors)[0, 1]
76
- if similarity >= threshold:
77
- return True
78
- except:
79
- pass
80
- return False
81
-
82
  def build_known_vocabulary(df: pd.DataFrame):
83
  global KNOWN_WORDS
84
  english_words = set(w.lower() for w in words.words())
@@ -147,56 +137,83 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
147
  final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
148
  return final_results_df
149
 
150
- def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
151
- if df_to_rank is None or df_to_rank.empty: return pd.DataFrame()
 
 
 
152
  ranked_df = df_to_rank.copy()
153
- if 'Skills' not in ranked_df.columns: return ranked_df.sort_values(by='Similarity Score', ascending=False)
154
- def calculate_match(row, user_tokens):
155
- job_skills = row.get('Skills', [])
156
- if not isinstance(job_skills, list): return [], 0, 0.0
157
- matched_skills = [s for s in job_skills if any(_skill_match(ut, s) for ut in user_tokens)]
158
- total_required_count = len(job_skills)
159
- match_score = len(matched_skills) / total_required_count if total_required_count > 0 else 0.0
160
- return matched_skills, len(matched_skills), match_score
161
- results = ranked_df.apply(lambda row: calculate_match(row, user_tokens), axis=1, result_type='expand')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
 
 
163
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
164
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
 
165
 
166
  def initialize_data_and_model():
167
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
168
-
169
  PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
170
 
171
  print("--- Initializing LLM Client ---")
172
- if not initialize_llm_client():
173
- print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
174
 
175
  if os.path.exists(PROCESSED_DATA_PATH):
176
  print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
177
  original_df = pd.read_parquet(PROCESSED_DATA_PATH)
178
  else:
179
  print("--- No pre-processed data found. Starting one-time processing... ---")
180
-
181
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
182
  original_df = ds["original"].to_pandas()
183
 
184
- # --- Method 1: LLM-based extraction with FEW-SHOT PROMPT ---
185
  def extract_skills_llm(text: str) -> list[str]:
186
- if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
187
- return []
188
-
189
  prompt = f"""
190
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
191
-
192
  [Example 1]
193
  Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
194
  Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
195
-
196
  [Example 2]
197
  Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
198
  Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
199
-
200
  [Actual Task]
201
  Text: "{text}"
202
  Extracted Skills:
@@ -207,10 +224,8 @@ Extracted Skills:
207
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
208
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
209
  return list(dict.fromkeys(s.lower() for s in skills))
210
- except Exception:
211
- return []
212
 
213
- # --- Method 2: NLTK fallback with SKILL WHITELIST validation ---
214
  def extract_skills_nltk(text: str) -> list[str]:
215
  if not isinstance(text, str): return []
216
  text_lower = text.lower()
@@ -219,23 +234,18 @@ Extracted Skills:
219
  tokens = nltk.word_tokenize(text_lower)
220
  tagged_tokens = nltk.pos_tag(tokens)
221
  chunked_text = chunk_parser.parse(tagged_tokens)
222
-
223
  potential_skills = set()
224
  for subtree in chunked_text.subtrees():
225
  if subtree.label() == 'NP':
226
  phrase = " ".join(word for word, tag in subtree.leaves())
227
  normalized_phrase = _norm_skill_token(phrase)
228
- # The key change: only add the phrase if it's in our known skill list
229
  if normalized_phrase in SKILL_WHITELIST:
230
  potential_skills.add(normalized_phrase)
231
  return sorted(list(potential_skills))
232
 
233
- # --- Hybrid Orchestrator: MERGE LLM and NLTK results for best coverage ---
234
  def extract_skills_hybrid(text: str) -> list[str]:
235
  llm_skills = extract_skills_llm(text)
236
  nltk_skills = extract_skills_nltk(text)
237
-
238
- # Combine the results and remove duplicates
239
  combined_skills = set(llm_skills) | set(nltk_skills)
240
  return sorted(list(combined_skills))
241
 
@@ -243,7 +253,6 @@ Extracted Skills:
243
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
244
 
245
  original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
246
-
247
  print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
248
  original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
249
  original_df = original_df.drop(columns=['text_for_skills'])
@@ -251,7 +260,6 @@ Extracted Skills:
251
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
252
  original_df.to_parquet(PROCESSED_DATA_PATH)
253
 
254
- # --- Continue with the rest of the data processing ---
255
  original_df['job_id'] = original_df.index
256
  def create_full_text(row):
257
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
@@ -280,7 +288,6 @@ def _course_links_for(skill: str) -> str:
280
  links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
281
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
282
 
283
- # --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
284
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
285
  status = "Searching using hybrid model..."
286
  expanded_desc = llm_expand_query(dream_job)
@@ -291,29 +298,22 @@ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
291
  display_df = score_jobs_by_skills(user_skills, emb_matches)
292
  else:
293
  display_df = emb_matches
294
-
295
  display_df = display_df.head(top_n)
296
-
297
  if user_skills:
298
  status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
299
  else:
300
  status = f"Found {len(display_df)} top matches using semantic search."
301
-
302
  table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
303
  if 'Skill Match Score' in display_df.columns:
304
- table_to_show['Skill Match Score'] = display_df['Skill Match Score']
305
-
306
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
307
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
308
-
309
  return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
310
 
311
  def rerank_current_results(initial_matches_df, skills_text, top_n):
312
  if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
313
  return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
314
-
315
  initial_matches_df = pd.DataFrame(initial_matches_df)
316
-
317
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
318
  if not user_skills:
319
  status = "Skills cleared. Showing original semantic search results."
@@ -324,7 +324,7 @@ def rerank_current_results(initial_matches_df, skills_text, top_n):
324
  status = f"Results **re-ranked** based on your {len(user_skills)} skills."
325
  display_df = ranked_df.head(top_n)
326
  table_to_show = display_df[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]
327
-
328
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
329
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
330
  return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)
@@ -337,7 +337,6 @@ def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: st
337
  word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
338
  alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
339
  return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
340
-
341
  status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
342
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
343
 
@@ -346,78 +345,61 @@ def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
346
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
347
 
348
  def on_select_job(job_id, skills_text):
349
- if job_id is None:
350
- return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
351
-
352
  row = original_df.loc[job_id]
353
  title, company = str(row.get("job_title", "")), str(row.get("company", ""))
354
  job_details_markdown = f"### {title} — {company}"
355
  duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
356
-
357
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
358
  job_skills = row.get("Skills", [])
359
-
360
  if not job_skills:
361
  learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
362
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
363
-
364
- all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
365
-
366
  if not all_missing_skills:
367
  learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
368
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
369
-
370
  if user_skills:
371
- score_val = (len(job_skills) - len(all_missing_skills)) / len(job_skills)
 
372
  job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
373
  headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
374
  learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
375
  skills_to_display = all_missing_skills[:5]
376
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
377
  learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
378
-
379
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
380
-
381
  else:
382
  headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
383
- skills_to_display = all_missing_skills[:5]
384
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
385
  learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
386
-
387
- full_skill_list_for_state = all_missing_skills
388
  new_offset = len(skills_to_display)
389
- should_button_be_visible = len(all_missing_skills) > 5
390
-
391
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
392
 
393
  def load_more_skills(full_skills_list, current_offset):
394
  SKILLS_INCREMENT = 5
395
  new_offset = current_offset + SKILLS_INCREMENT
396
  skills_to_display = full_skills_list[:new_offset]
397
-
398
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
399
  learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
400
-
401
  should_button_be_visible = new_offset < len(full_skills_list)
402
-
403
  return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
404
 
405
  def on_reset():
406
  return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))
407
 
408
- # --- Run Initialization ---
409
  print("Starting application initialization...")
410
  initialization_status = initialize_data_and_model()
411
  print(initialization_status)
412
 
413
- # --- Gradio Interface Definition ---
414
  with gr.Blocks(theme=gr.themes.Soft()) as ui:
415
  gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
416
-
417
  initial_matches_state = gr.State()
418
  missing_skills_state = gr.State([])
419
  skills_offset_state = gr.State(0)
420
-
421
  with gr.Row():
422
  with gr.Column(scale=3):
423
  dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
@@ -429,64 +411,27 @@ with gr.Blocks(theme=gr.themes.Soft()) as ui:
429
  topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
430
  search_btn = gr.Button("Find Matches", variant="primary")
431
  reset_btn = gr.Button("Reset All")
432
-
433
  status_text = gr.Markdown("Status: Ready.")
434
  spelling_alert = gr.Markdown(visible=False)
435
  with gr.Row(visible=False) as spelling_row:
436
  search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
437
  retype_btn = gr.Button("Let Me Fix It", variant="stop")
438
-
439
  df_output = gr.DataFrame(label="Job Matches", interactive=False)
440
  job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
441
-
442
  with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
443
  job_details_markdown = gr.Markdown()
444
-
445
  with gr.Tabs():
446
- with gr.TabItem("Duties"):
447
- duties_markdown = gr.Markdown()
448
- with gr.TabItem("Qualifications"):
449
- qualifications_markdown = gr.Markdown()
450
- with gr.TabItem("Full Description"):
451
- description_markdown = gr.Markdown()
452
-
453
  learning_plan_output = gr.HTML(label="Learning Plan")
454
  load_more_btn = gr.Button("Load More Skills", visible=False)
455
-
456
- # --- Event Handlers ---
457
- search_btn.click(
458
- fn=find_matches_and_rank_with_check,
459
- inputs=[dream_text, topk_slider, skills_text],
460
- outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
461
- )
462
- search_anyway_btn.click(
463
- fn=find_matches_and_rank_anyway,
464
- inputs=[dream_text, topk_slider, skills_text],
465
- outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
466
- )
467
- retype_btn.click(
468
- lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)),
469
- outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
470
- )
471
- reset_btn.click(
472
- fn=on_reset,
473
- outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn],
474
- queue=False
475
- )
476
- rerank_btn.click(
477
- fn=rerank_current_results,
478
- inputs=[initial_matches_state, skills_text, topk_slider],
479
- outputs=[status_text, df_output, job_selector]
480
- )
481
- job_selector.change(
482
- fn=on_select_job,
483
- inputs=[job_selector, skills_text],
484
- outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn]
485
- )
486
- load_more_btn.click(
487
- fn=load_more_skills,
488
- inputs=[missing_skills_state, skills_offset_state],
489
- outputs=[learning_plan_output, skills_offset_state, load_more_btn]
490
- )
491
 
492
  ui.launch()
 
26
  STOPWORDS = set(stopwords.words('english'))
27
  stemmer = PorterStemmer()
28
 
29
+ # --- EXPANDED: Skill Whitelist with more business, finance, and consulting terms ---
30
  SKILL_WHITELIST = {
31
+ # Technical & Data
32
  'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
33
  'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
34
  'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
35
  'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
36
+ 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
37
  'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
38
  'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
39
+ 'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
40
+ 'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
41
+ # Business & Consulting
42
+ 'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
43
+ 'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
44
+ 'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
45
+ 'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
46
+ # Soft & Other
47
  'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
48
+ 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
49
+ 'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
 
 
50
  }
51
+ # --------------------------------------------------------------------------------
52
 
53
  # --- GLOBAL STATE & DATA ---
54
  original_df = None
 
69
  s = re.sub(r'\s+', ' ', s)
70
  return s
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def build_known_vocabulary(df: pd.DataFrame):
73
  global KNOWN_WORDS
74
  english_words = set(w.lower() for w in words.words())
 
137
  final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
138
  return final_results_df
139
 
140
+ # --- REWRITTEN: Skill scoring function using semantic similarity ---
141
+ def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
142
+ if df_to_rank is None or df_to_rank.empty or not user_skills:
143
+ return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
144
+
145
  ranked_df = df_to_rank.copy()
146
+ if 'Skills' not in ranked_df.columns:
147
+ return ranked_df.sort_values(by='Similarity Score', ascending=False)
148
+
149
+ # 1. Encode all user skills and all unique job skills across the dataframe ONCE for efficiency
150
+ user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
151
+ all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
152
+
153
+ if not all_job_skills: # No skills to compare against
154
+ ranked_df['Skill Match Score'] = 0.0
155
+ return ranked_df
156
+
157
+ job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
158
+
159
+ # 2. Calculate the similarity matrix between every user skill and every job skill
160
+ similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
161
+
162
+ # 3. Define the new scoring function
163
+ def calculate_semantic_match(row, threshold=0.55):
164
+ job_skills_list = row.get('Skills', [])
165
+ if not job_skills_list:
166
+ return [], 0, 0.0
167
+
168
+ matched_skills_in_job = set()
169
+ for job_skill in job_skills_list:
170
+ try:
171
+ # Find which column in the matrix corresponds to the current job skill
172
+ job_skill_idx = all_job_skills.index(job_skill)
173
+ # Check if ANY of the user's skills meet the similarity threshold for this job skill
174
+ if torch.any(similarity_matrix[:, job_skill_idx] > threshold):
175
+ matched_skills_in_job.add(job_skill)
176
+ except (ValueError, IndexError):
177
+ continue
178
+
179
+ total_required = len(job_skills_list)
180
+ match_score = len(matched_skills_in_job) / total_required if total_required > 0 else 0.0
181
+ return list(matched_skills_in_job), len(matched_skills_in_job), match_score
182
+
183
+ # 4. Apply the new scoring function to each row
184
+ results = ranked_df.apply(lambda row: calculate_semantic_match(row), axis=1, result_type='expand')
185
  ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
186
+
187
+ # 5. Sort by the new graded score
188
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
189
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
190
+ # ----------------------------------------------------------------------
191
 
192
  def initialize_data_and_model():
193
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
 
194
  PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
195
 
196
  print("--- Initializing LLM Client ---")
197
+ if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
 
198
 
199
  if os.path.exists(PROCESSED_DATA_PATH):
200
  print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
201
  original_df = pd.read_parquet(PROCESSED_DATA_PATH)
202
  else:
203
  print("--- No pre-processed data found. Starting one-time processing... ---")
 
204
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
205
  original_df = ds["original"].to_pandas()
206
 
 
207
  def extract_skills_llm(text: str) -> list[str]:
208
+ if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
 
 
209
  prompt = f"""
210
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
 
211
  [Example 1]
212
  Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
213
  Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
 
214
  [Example 2]
215
  Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
216
  Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
 
217
  [Actual Task]
218
  Text: "{text}"
219
  Extracted Skills:
 
224
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
225
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
226
  return list(dict.fromkeys(s.lower() for s in skills))
227
+ except Exception: return []
 
228
 
 
229
  def extract_skills_nltk(text: str) -> list[str]:
230
  if not isinstance(text, str): return []
231
  text_lower = text.lower()
 
234
  tokens = nltk.word_tokenize(text_lower)
235
  tagged_tokens = nltk.pos_tag(tokens)
236
  chunked_text = chunk_parser.parse(tagged_tokens)
 
237
  potential_skills = set()
238
  for subtree in chunked_text.subtrees():
239
  if subtree.label() == 'NP':
240
  phrase = " ".join(word for word, tag in subtree.leaves())
241
  normalized_phrase = _norm_skill_token(phrase)
 
242
  if normalized_phrase in SKILL_WHITELIST:
243
  potential_skills.add(normalized_phrase)
244
  return sorted(list(potential_skills))
245
 
 
246
  def extract_skills_hybrid(text: str) -> list[str]:
247
  llm_skills = extract_skills_llm(text)
248
  nltk_skills = extract_skills_nltk(text)
 
 
249
  combined_skills = set(llm_skills) | set(nltk_skills)
250
  return sorted(list(combined_skills))
251
 
 
253
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
254
 
255
  original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
 
256
  print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
257
  original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
258
  original_df = original_df.drop(columns=['text_for_skills'])
 
260
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
261
  original_df.to_parquet(PROCESSED_DATA_PATH)
262
 
 
263
  original_df['job_id'] = original_df.index
264
  def create_full_text(row):
265
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
 
288
  links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
289
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
290
 
 
291
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
292
  status = "Searching using hybrid model..."
293
  expanded_desc = llm_expand_query(dream_job)
 
298
  display_df = score_jobs_by_skills(user_skills, emb_matches)
299
  else:
300
  display_df = emb_matches
 
301
  display_df = display_df.head(top_n)
 
302
  if user_skills:
303
  status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
304
  else:
305
  status = f"Found {len(display_df)} top matches using semantic search."
 
306
  table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
307
  if 'Skill Match Score' in display_df.columns:
308
+ table_to_show['Skill Match Score'] = display_df['Skill Match Score'].map('{:.2%}'.format)
 
309
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
310
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
 
311
  return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
312
 
313
  def rerank_current_results(initial_matches_df, skills_text, top_n):
314
  if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
315
  return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
 
316
  initial_matches_df = pd.DataFrame(initial_matches_df)
 
317
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
318
  if not user_skills:
319
  status = "Skills cleared. Showing original semantic search results."
 
324
  status = f"Results **re-ranked** based on your {len(user_skills)} skills."
325
  display_df = ranked_df.head(top_n)
326
  table_to_show = display_df[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]
327
+ table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
328
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
329
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
330
  return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)
 
337
  word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
338
  alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
339
  return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
 
340
  status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
341
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
342
 
 
345
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
346
 
347
def on_select_job(job_id, skills_text):
    """Build the detail panes and the learning plan for the selected job.

    Parameters:
        job_id: index label into the global ``original_df`` (the dropdown's
            value), or ``None`` when nothing is selected.
        skills_text: raw comma-separated skills string typed by the user.

    Returns a 9-tuple wired to the Gradio outputs:
        (job details markdown, duties text, qualifications text, description
        text, learning-plan HTML, details accordion, full-skill-list state,
        skills-offset state, "Load More" button).

    Fix vs. previous revision: user-skill embeddings were re-encoded inside
    the inner ``any(...)`` for every (user skill, job skill) pair, i.e.
    O(U*S) ``model.encode`` calls. They are now encoded once each, and each
    job skill is encoded once; results are unchanged.
    """
    if job_id is None:
        return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)

    row = original_df.loc[job_id]
    title, company = str(row.get("job_title", "")), str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))

    # Normalize the user's comma-separated skills; drop tokens that normalize to empty.
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
    job_skills = row.get("Skills", [])

    if not job_skills:
        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if user_skills:
        # Encode each user skill exactly once (previously re-encoded once per
        # job skill). Each job skill is then encoded a single time inside
        # _is_covered.
        user_skill_embeddings = [model.encode(ut) for ut in user_skills]

        def _is_covered(skill):
            # A job skill counts as covered when its cosine similarity to at
            # least one user skill exceeds 0.55 (same threshold as before).
            skill_embedding = model.encode(skill)
            return any(util.cos_sim(ue, skill_embedding)[0][0] > 0.55 for ue in user_skill_embeddings)

        all_missing_skills = sorted([s for s in job_skills if not _is_covered(s)], key=lambda x: x.lower())
    else:
        # No user skills entered: nothing is covered, so every job skill is
        # "missing". No embedding work is done in this branch (as before —
        # the original's inner generator never evaluated model.encode here).
        all_missing_skills = sorted(job_skills, key=lambda x: x.lower())

    if not all_missing_skills:
        learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if user_skills:
        # Skill-match score = fraction of the job's skills the user already covers.
        match_count = len(job_skills) - len(all_missing_skills)
        score_val = match_count / len(job_skills) if len(job_skills) > 0 else 0
        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
        headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
        learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
        skills_to_display = all_missing_skills[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        # NOTE(review): this branch deliberately hides "Load More" and passes
        # an empty skill-list state — kept identical to prior behavior.
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
    else:
        # No user skills: page through the job's full skill list (original
        # order), five at a time, via load_more_skills and the state outputs.
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        skills_to_display = job_skills[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        full_skill_list_for_state = job_skills
        new_offset = len(skills_to_display)
        should_button_be_visible = len(full_skill_list_for_state) > 5
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
381
 
382
def load_more_skills(full_skills_list, current_offset):
    """Reveal the next batch of five skills in the learning plan.

    Parameters:
        full_skills_list: the complete list of skill names held in state.
        current_offset: how many skills are currently shown.

    Returns (learning-plan HTML, new offset, "Load More" button update);
    the button stays visible only while more skills remain.
    """
    step = 5
    next_offset = current_offset + step
    visible_skills = full_skills_list[:next_offset]

    entries = []
    for skill in visible_skills:
        entries.append(f"<li><b>{skill}</b><br>• Learn: {_course_links_for(skill)}</li>")

    plan_html = (
        "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(entries)}</ul>"
    )

    more_remaining = next_offset < len(full_skills_list)
    return plan_html, next_offset, gr.Button(visible=more_remaining)
390
 
391
def on_reset():
    """Restore every UI control to its initial state.

    Returns one value per output wired to the Reset button, in order:
    dream-job textbox, match-count slider (default 3), skills textbox,
    results table, stored-matches state, job dropdown (hidden), details
    accordion (hidden), status text, the four detail panes, spelling alert
    (hidden), spelling button row (hidden), skill-list state, skill offset
    state, and the "Load More" button (hidden).
    """
    blank = ""
    return (
        blank,                           # dream_text
        3,                               # topk_slider default
        blank,                           # skills_text
        pd.DataFrame(),                  # df_output
        None,                            # initial_matches_state
        gr.Dropdown(visible=False),      # job_selector
        gr.Accordion(visible=False),     # details_accordion
        "Status: Ready.",                # status_text
        blank, blank, blank, blank,      # job details / duties / quals / description
        gr.Markdown(visible=False),      # spelling_alert
        gr.Row(visible=False),           # spelling_row
        [],                              # missing_skills_state
        0,                               # skills_offset_state
        gr.Button(visible=False),        # load_more_btn
    )
393
 
 
394
# --- Application startup -----------------------------------------------------
# Run data/model initialization once at import time so the Gradio UI defined
# below can reference the loaded globals.
print("Starting application initialization...")
# initialize_data_and_model() presumably returns a status message — it is
# only printed here, never inspected. TODO(review): confirm failure behavior.
initialization_status = initialize_data_and_model()
print(initialization_status)
397
 
 
398
  with gr.Blocks(theme=gr.themes.Soft()) as ui:
399
  gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
 
400
  initial_matches_state = gr.State()
401
  missing_skills_state = gr.State([])
402
  skills_offset_state = gr.State(0)
 
403
  with gr.Row():
404
  with gr.Column(scale=3):
405
  dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
 
411
  topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
412
  search_btn = gr.Button("Find Matches", variant="primary")
413
  reset_btn = gr.Button("Reset All")
 
414
  status_text = gr.Markdown("Status: Ready.")
415
  spelling_alert = gr.Markdown(visible=False)
416
  with gr.Row(visible=False) as spelling_row:
417
  search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
418
  retype_btn = gr.Button("Let Me Fix It", variant="stop")
 
419
  df_output = gr.DataFrame(label="Job Matches", interactive=False)
420
  job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
 
421
  with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
422
  job_details_markdown = gr.Markdown()
 
423
  with gr.Tabs():
424
+ with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
425
+ with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
426
+ with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
 
 
 
 
427
  learning_plan_output = gr.HTML(label="Learning Plan")
428
  load_more_btn = gr.Button("Load More Skills", visible=False)
429
+ search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
430
+ search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
431
+ retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
432
+ reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
433
+ rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
434
+ job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
435
+ load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Start the Gradio app (serves the Blocks UI built above).
ui.launch()