Update app.py
Browse files
app.py
CHANGED
|
@@ -11,23 +11,38 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
from nltk.stem import PorterStemmer
|
| 13 |
import gradio as gr
|
| 14 |
-
import os
|
| 15 |
-
from tqdm import tqdm
|
| 16 |
|
| 17 |
-
# Initialize tqdm for pandas
|
| 18 |
tqdm.pandas()
|
| 19 |
|
| 20 |
-
# ---
|
| 21 |
-
for package in ['words', 'stopwords', 'punkt']:
|
| 22 |
try:
|
| 23 |
-
nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'tokenizers/{package}')
|
| 24 |
except LookupError:
|
| 25 |
nltk.download(package)
|
| 26 |
-
# ------------------------------------------------
|
| 27 |
|
| 28 |
STOPWORDS = set(stopwords.words('english'))
|
| 29 |
stemmer = PorterStemmer()
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# --- GLOBAL STATE & DATA ---
|
| 33 |
original_df = None
|
|
@@ -88,9 +103,7 @@ def initialize_llm_client():
|
|
| 88 |
model_llm = AutoModelForCausalLM.from_pretrained(
|
| 89 |
LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
|
| 90 |
)
|
| 91 |
-
LLM_PIPELINE = pipeline(
|
| 92 |
-
"text-generation", model=model_llm, tokenizer=tokenizer
|
| 93 |
-
)
|
| 94 |
return True
|
| 95 |
except Exception as e:
|
| 96 |
print(f"🚨 ERROR initializing local LLM: {e}")
|
|
@@ -150,7 +163,6 @@ def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd
|
|
| 150 |
ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
|
| 151 |
return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
| 152 |
|
| 153 |
-
# --- COMPLETELY REWRITTEN INITIALIZATION FUNCTION ---
|
| 154 |
def initialize_data_and_model():
|
| 155 |
global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
|
| 156 |
|
|
@@ -158,35 +170,35 @@ def initialize_data_and_model():
|
|
| 158 |
|
| 159 |
print("--- Initializing LLM Client ---")
|
| 160 |
if not initialize_llm_client():
|
| 161 |
-
print("Warning: LLM Client failed to initialize.
|
| 162 |
|
| 163 |
-
# --- Caching Logic: Check for pre-processed file ---
|
| 164 |
if os.path.exists(PROCESSED_DATA_PATH):
|
| 165 |
print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
|
| 166 |
original_df = pd.read_parquet(PROCESSED_DATA_PATH)
|
| 167 |
else:
|
| 168 |
print("--- No pre-processed data found. Starting one-time processing... ---")
|
| 169 |
-
print("--- This will be slow on the first run but fast on subsequent runs. ---")
|
| 170 |
|
| 171 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 172 |
original_df = ds["original"].to_pandas()
|
| 173 |
|
| 174 |
-
# ---
|
| 175 |
-
def
|
| 176 |
-
if not isinstance(text, str) or len(text.strip()) < 20:
|
| 177 |
-
return []
|
| 178 |
-
if not LLM_PIPELINE:
|
| 179 |
return []
|
| 180 |
-
|
| 181 |
prompt = f"""
|
| 182 |
-
Instruct: You are an expert technical recruiter
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
"
|
|
|
|
| 189 |
|
|
|
|
|
|
|
| 190 |
Extracted Skills:
|
| 191 |
"""
|
| 192 |
try:
|
|
@@ -194,25 +206,52 @@ Extracted Skills:
|
|
| 194 |
generated_text = response[0]['generated_text']
|
| 195 |
skills_part = generated_text.split("Extracted Skills:")[-1].strip()
|
| 196 |
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
|
| 197 |
-
return list(dict.fromkeys(s.lower() for s in skills))
|
| 198 |
-
except Exception
|
| 199 |
-
print(f"Error during LLM skill extraction: {e}")
|
| 200 |
return []
|
| 201 |
-
|
| 202 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
def create_text_for_skills(row):
|
| 204 |
return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
|
| 205 |
|
| 206 |
original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
|
| 207 |
|
| 208 |
-
print("--- Extracting skills
|
| 209 |
-
original_df['Skills'] = original_df['text_for_skills'].progress_apply(
|
| 210 |
original_df = original_df.drop(columns=['text_for_skills'])
|
| 211 |
|
| 212 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 213 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 214 |
|
| 215 |
-
# --- Continue with the rest of the data processing
|
| 216 |
original_df['job_id'] = original_df.index
|
| 217 |
def create_full_text(row):
|
| 218 |
return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
|
|
@@ -242,7 +281,6 @@ def _course_links_for(skill: str) -> str:
|
|
| 242 |
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
|
| 243 |
|
| 244 |
# --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
|
| 245 |
-
|
| 246 |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
| 247 |
status = "Searching using hybrid model..."
|
| 248 |
expanded_desc = llm_expand_query(dream_job)
|
|
@@ -320,7 +358,7 @@ def on_select_job(job_id, skills_text):
|
|
| 320 |
job_skills = row.get("Skills", [])
|
| 321 |
|
| 322 |
if not job_skills:
|
| 323 |
-
learning_plan_html = "<p><i>No specific skills
|
| 324 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 325 |
|
| 326 |
all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
|
|
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
from nltk.stem import PorterStemmer
|
| 13 |
import gradio as gr
|
| 14 |
+
import os
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
|
|
|
|
| 17 |
tqdm.pandas()
|
| 18 |
|
| 19 |
+
# --- NLTK Data Download ---
# Ensure each required NLTK resource is present, downloading it on first run.
# Each package lives under a specific resource category in nltk's data tree.
for category, package in [
    ('corpora', 'words'),
    ('corpora', 'stopwords'),
    ('taggers', 'averaged_perceptron_tagger'),
    ('tokenizers', 'punkt'),
]:
    try:
        nltk.data.find(f'{category}/{package}')
    except LookupError:
        nltk.download(package)
|
|
|
|
| 25 |
|
| 26 |
STOPWORDS = set(stopwords.words('english'))
|
| 27 |
stemmer = PorterStemmer()
|
| 28 |
+
|
| 29 |
+
# --- NEW: Curated Skill Whitelist for NLTK Fallback Accuracy ---
# Lowercased, pre-normalized skill names. Membership in this set decides
# whether an NLTK-chunked noun phrase counts as a real skill.
SKILL_WHITELIST = {
    # Programming languages & web frameworks
    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
    # Machine learning & data science
    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
    'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics',
    # Cloud, DevOps & security
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
    'network security', 'cryptography', 'blockchain', 'agile', 'scrum', 'project management', 'product management',
    # Soft skills & design tools
    'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
    'critical thinking', 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks',
    # Databases, APIs & business
    'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'api design', 'rest apis',
    'graphql', 'microservices', 'serverless', 'system design', 'saas', 'sales', 'marketing', 'seo', 'sem', 'content writing',
    'customer support', 'technical writing', 'sap', 'oracle', 'financial analysis', 'budgeting', 'mentoring', 'supervising',
}
# -----------------------------------------------------------------
|
| 46 |
|
| 47 |
# --- GLOBAL STATE & DATA ---
|
| 48 |
original_df = None
|
|
|
|
| 103 |
model_llm = AutoModelForCausalLM.from_pretrained(
|
| 104 |
LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
|
| 105 |
)
|
| 106 |
+
LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
|
|
|
|
|
|
|
| 107 |
return True
|
| 108 |
except Exception as e:
|
| 109 |
print(f"🚨 ERROR initializing local LLM: {e}")
|
|
|
|
| 163 |
ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
|
| 164 |
return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
| 165 |
|
|
|
|
| 166 |
def initialize_data_and_model():
|
| 167 |
global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
|
| 168 |
|
|
|
|
| 170 |
|
| 171 |
print("--- Initializing LLM Client ---")
|
| 172 |
if not initialize_llm_client():
|
| 173 |
+
print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
|
| 174 |
|
|
|
|
| 175 |
if os.path.exists(PROCESSED_DATA_PATH):
|
| 176 |
print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
|
| 177 |
original_df = pd.read_parquet(PROCESSED_DATA_PATH)
|
| 178 |
else:
|
| 179 |
print("--- No pre-processed data found. Starting one-time processing... ---")
|
|
|
|
| 180 |
|
| 181 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 182 |
original_df = ds["original"].to_pandas()
|
| 183 |
|
| 184 |
+
# --- Method 1: LLM-based extraction with FEW-SHOT PROMPT ---
|
| 185 |
+
def extract_skills_llm(text: str) -> list[str]:
|
| 186 |
+
if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
|
|
|
|
|
|
|
| 187 |
return []
|
| 188 |
+
|
| 189 |
prompt = f"""
|
| 190 |
+
Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
|
| 191 |
+
|
| 192 |
+
[Example 1]
|
| 193 |
+
Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
|
| 194 |
+
Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
|
| 195 |
|
| 196 |
+
[Example 2]
|
| 197 |
+
Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
|
| 198 |
+
Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
|
| 199 |
|
| 200 |
+
[Actual Task]
|
| 201 |
+
Text: "{text}"
|
| 202 |
Extracted Skills:
|
| 203 |
"""
|
| 204 |
try:
|
|
|
|
| 206 |
generated_text = response[0]['generated_text']
|
| 207 |
skills_part = generated_text.split("Extracted Skills:")[-1].strip()
|
| 208 |
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
|
| 209 |
+
return list(dict.fromkeys(s.lower() for s in skills))
|
| 210 |
+
except Exception:
|
|
|
|
| 211 |
return []
|
| 212 |
+
|
| 213 |
+
# --- Method 2: NLTK fallback with SKILL WHITELIST validation ---
def extract_skills_nltk(text: str) -> list[str]:
    """Extract known skills from free text via NLTK noun-phrase chunking.

    POS-tags the lowercased text, chunks adjective+noun phrases, normalizes
    each phrase with _norm_skill_token, and keeps only phrases present in
    SKILL_WHITELIST — the whitelist filter is what keeps this fallback precise.

    Args:
        text: Raw job-description text; non-string input yields an empty list.

    Returns:
        Sorted, deduplicated list of whitelisted skill names.
    """
    if not isinstance(text, str):
        return []
    # NP = any run of adjectives followed by one or more nouns.
    chunker = nltk.RegexpParser("NP: {<JJ.*>*<NN.*>+}")
    tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    tree = chunker.parse(tagged)

    found: set[str] = set()
    for subtree in tree.subtrees():
        if subtree.label() != 'NP':
            continue
        # Rebuild the surface phrase from the chunk's (word, tag) leaves.
        phrase = " ".join(word for word, _tag in subtree.leaves())
        normalized = _norm_skill_token(phrase)
        # Only accept phrases we positively recognize as skills.
        if normalized in SKILL_WHITELIST:
            found.add(normalized)
    return sorted(found)
|
| 232 |
+
|
| 233 |
+
# --- Hybrid Orchestrator: MERGE LLM and NLTK results for best coverage ---
def extract_skills_hybrid(text: str) -> list[str]:
    """Return the union of LLM- and NLTK-extracted skills, sorted and deduplicated."""
    merged = set(extract_skills_llm(text))
    merged.update(extract_skills_nltk(text))
    return sorted(merged)
|
| 241 |
+
|
| 242 |
def create_text_for_skills(row):
    """Concatenate the skill-relevant text fields of a job row into one string.

    Fields that are missing or NaN (per pd.notna) are skipped entirely.
    """
    parts = []
    for field in ("Job title", "Duties", "qualifications", "Description"):
        value = row.get(field)
        if pd.notna(value):
            parts.append(str(value))
    return " ".join(parts)
|
| 244 |
|
| 245 |
original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
|
| 246 |
|
| 247 |
+
print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
|
| 248 |
+
original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
|
| 249 |
original_df = original_df.drop(columns=['text_for_skills'])
|
| 250 |
|
| 251 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 252 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 253 |
|
| 254 |
+
# --- Continue with the rest of the data processing ---
|
| 255 |
original_df['job_id'] = original_df.index
|
| 256 |
def create_full_text(row):
    """Join the five searchable text columns of a job row into a single string.

    NOTE(review): no NaN filtering happens here, so absent fields appear as
    the literal text "None"/"nan" — presumably acceptable for embedding input;
    confirm if that is intentional.
    """
    columns = ("Job title", "Company", "Duties", "qualifications", "Description")
    return " ".join(str(row.get(column)) for column in columns)
|
|
|
|
| 281 |
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
|
| 282 |
|
| 283 |
# --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
|
|
|
|
| 284 |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
| 285 |
status = "Searching using hybrid model..."
|
| 286 |
expanded_desc = llm_expand_query(dream_job)
|
|
|
|
| 358 |
job_skills = row.get("Skills", [])
|
| 359 |
|
| 360 |
if not job_skills:
|
| 361 |
+
learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
|
| 362 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 363 |
|
| 364 |
all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
|