Update app.py
Browse files
app.py
CHANGED
|
@@ -11,23 +11,38 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
from nltk.stem import PorterStemmer
|
| 13 |
import gradio as gr
|
| 14 |
-
import os
|
| 15 |
-
from tqdm import tqdm
|
| 16 |
|
| 17 |
-
# Initialize tqdm for pandas
|
| 18 |
tqdm.pandas()
|
| 19 |
|
| 20 |
-
# ---
|
| 21 |
-
for package in ['words', 'stopwords', 'punkt']:
|
| 22 |
try:
|
| 23 |
-
nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'tokenizers/{package}')
|
| 24 |
except LookupError:
|
| 25 |
nltk.download(package)
|
| 26 |
-
# ------------------------------------------------
|
| 27 |
|
| 28 |
STOPWORDS = set(stopwords.words('english'))
|
| 29 |
stemmer = PorterStemmer()
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# --- GLOBAL STATE & DATA ---
|
| 33 |
original_df = None
|
|
@@ -88,9 +103,7 @@ def initialize_llm_client():
|
|
| 88 |
model_llm = AutoModelForCausalLM.from_pretrained(
|
| 89 |
LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
|
| 90 |
)
|
| 91 |
-
LLM_PIPELINE = pipeline(
|
| 92 |
-
"text-generation", model=model_llm, tokenizer=tokenizer
|
| 93 |
-
)
|
| 94 |
return True
|
| 95 |
except Exception as e:
|
| 96 |
print(f"🚨 ERROR initializing local LLM: {e}")
|
|
@@ -150,7 +163,6 @@ def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd
|
|
| 150 |
ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
|
| 151 |
return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
| 152 |
|
| 153 |
-
# --- COMPLETELY REWRITTEN INITIALIZATION FUNCTION ---
|
| 154 |
def initialize_data_and_model():
|
| 155 |
global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
|
| 156 |
|
|
@@ -158,35 +170,35 @@ def initialize_data_and_model():
|
|
| 158 |
|
| 159 |
print("--- Initializing LLM Client ---")
|
| 160 |
if not initialize_llm_client():
|
| 161 |
-
print("Warning: LLM Client failed to initialize.
|
| 162 |
|
| 163 |
-
# --- Caching Logic: Check for pre-processed file ---
|
| 164 |
if os.path.exists(PROCESSED_DATA_PATH):
|
| 165 |
print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
|
| 166 |
original_df = pd.read_parquet(PROCESSED_DATA_PATH)
|
| 167 |
else:
|
| 168 |
print("--- No pre-processed data found. Starting one-time processing... ---")
|
| 169 |
-
print("--- This will be slow on the first run but fast on subsequent runs. ---")
|
| 170 |
|
| 171 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 172 |
original_df = ds["original"].to_pandas()
|
| 173 |
|
| 174 |
-
# ---
|
| 175 |
-
def
|
| 176 |
-
if not isinstance(text, str) or len(text.strip()) < 20:
|
| 177 |
-
return []
|
| 178 |
-
if not LLM_PIPELINE:
|
| 179 |
return []
|
| 180 |
-
|
| 181 |
prompt = f"""
|
| 182 |
-
Instruct: You are an expert technical recruiter
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
"
|
|
|
|
| 189 |
|
|
|
|
|
|
|
| 190 |
Extracted Skills:
|
| 191 |
"""
|
| 192 |
try:
|
|
@@ -194,25 +206,52 @@ Extracted Skills:
|
|
| 194 |
generated_text = response[0]['generated_text']
|
| 195 |
skills_part = generated_text.split("Extracted Skills:")[-1].strip()
|
| 196 |
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
|
| 197 |
-
return list(dict.fromkeys(s.lower() for s in skills))
|
| 198 |
-
except Exception
|
| 199 |
-
print(f"Error during LLM skill extraction: {e}")
|
| 200 |
return []
|
| 201 |
-
|
| 202 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
def create_text_for_skills(row):
|
| 204 |
return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
|
| 205 |
|
| 206 |
original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
|
| 207 |
|
| 208 |
-
print("--- Extracting skills
|
| 209 |
-
original_df['Skills'] = original_df['text_for_skills'].progress_apply(
|
| 210 |
original_df = original_df.drop(columns=['text_for_skills'])
|
| 211 |
|
| 212 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 213 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 214 |
|
| 215 |
-
# --- Continue with the rest of the data processing
|
| 216 |
original_df['job_id'] = original_df.index
|
| 217 |
def create_full_text(row):
|
| 218 |
return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
|
|
@@ -242,7 +281,6 @@ def _course_links_for(skill: str) -> str:
|
|
| 242 |
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
|
| 243 |
|
| 244 |
# --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
|
| 245 |
-
|
| 246 |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
| 247 |
status = "Searching using hybrid model..."
|
| 248 |
expanded_desc = llm_expand_query(dream_job)
|
|
@@ -320,7 +358,7 @@ def on_select_job(job_id, skills_text):
|
|
| 320 |
job_skills = row.get("Skills", [])
|
| 321 |
|
| 322 |
if not job_skills:
|
| 323 |
-
learning_plan_html = "<p><i>No specific skills
|
| 324 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 325 |
|
| 326 |
all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
|
|
|
|
| 11 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
from nltk.stem import PorterStemmer
|
| 13 |
import gradio as gr
|
| 14 |
+
import os
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
|
|
|
|
| 17 |
tqdm.pandas()
|
| 18 |
|
| 19 |
+
# --- NLTK Data Download ---
# Ensure each required NLTK resource is present, downloading it on first run.
# Each package lives under a specific resource category in nltk's data tree.
for category, package in [
    ('corpora', 'words'),
    ('corpora', 'stopwords'),
    ('taggers', 'averaged_perceptron_tagger'),
    ('tokenizers', 'punkt'),
]:
    try:
        nltk.data.find(f'{category}/{package}')
    except LookupError:
        nltk.download(package)
|
|
|
|
| 25 |
|
| 26 |
STOPWORDS = set(stopwords.words('english'))
|
| 27 |
stemmer = PorterStemmer()
|
| 28 |
+
|
| 29 |
+
# --- NEW: Curated Skill Whitelist for NLTK Fallback Accuracy ---
# Lowercased, pre-normalized skill names. Membership in this set decides
# whether an NLTK-chunked noun phrase counts as a real skill.
SKILL_WHITELIST = {
    # Programming languages & web frameworks
    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
    # Machine learning & data science
    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
    'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics',
    # Cloud, DevOps & security
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
    'network security', 'cryptography', 'blockchain', 'agile', 'scrum', 'project management', 'product management',
    # Soft skills & design tools
    'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
    'critical thinking', 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks',
    # Databases, APIs & business
    'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'api design', 'rest apis',
    'graphql', 'microservices', 'serverless', 'system design', 'saas', 'sales', 'marketing', 'seo', 'sem', 'content writing',
    'customer support', 'technical writing', 'sap', 'oracle', 'financial analysis', 'budgeting', 'mentoring', 'supervising',
}
# -----------------------------------------------------------------
|
| 46 |
|
| 47 |
# --- GLOBAL STATE & DATA ---
|
| 48 |
original_df = None
|
|
|
|
| 103 |
model_llm = AutoModelForCausalLM.from_pretrained(
|
| 104 |
LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
|
| 105 |
)
|
| 106 |
+
LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
|
|
|
|
|
|
|
| 107 |
return True
|
| 108 |
except Exception as e:
|
| 109 |
print(f"🚨 ERROR initializing local LLM: {e}")
|
|
|
|
| 163 |
ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
|
| 164 |
return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
| 165 |
|
|
|
|
| 166 |
def initialize_data_and_model():
|
| 167 |
global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
|
| 168 |
|
|
|
|
| 170 |
|
| 171 |
print("--- Initializing LLM Client ---")
|
| 172 |
if not initialize_llm_client():
|
| 173 |
+
print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
|
| 174 |
|
|
|
|
| 175 |
if os.path.exists(PROCESSED_DATA_PATH):
|
| 176 |
print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
|
| 177 |
original_df = pd.read_parquet(PROCESSED_DATA_PATH)
|
| 178 |
else:
|
| 179 |
print("--- No pre-processed data found. Starting one-time processing... ---")
|
|
|
|
| 180 |
|
| 181 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 182 |
original_df = ds["original"].to_pandas()
|
| 183 |
|
| 184 |
+
# --- Method 1: LLM-based extraction with FEW-SHOT PROMPT ---
|
| 185 |
+
def extract_skills_llm(text: str) -> list[str]:
|
| 186 |
+
if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
|
|
|
|
|
|
|
| 187 |
return []
|
| 188 |
+
|
| 189 |
prompt = f"""
|
| 190 |
+
Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
|
| 191 |
+
|
| 192 |
+
[Example 1]
|
| 193 |
+
Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
|
| 194 |
+
Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
|
| 195 |
|
| 196 |
+
[Example 2]
|
| 197 |
+
Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
|
| 198 |
+
Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
|
| 199 |
|
| 200 |
+
[Actual Task]
|
| 201 |
+
Text: "{text}"
|
| 202 |
Extracted Skills:
|
| 203 |
"""
|
| 204 |
try:
|
|
|
|
| 206 |
generated_text = response[0]['generated_text']
|
| 207 |
skills_part = generated_text.split("Extracted Skills:")[-1].strip()
|
| 208 |
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
|
| 209 |
+
return list(dict.fromkeys(s.lower() for s in skills))
|
| 210 |
+
except Exception:
|
|
|
|
| 211 |
return []
|
| 212 |
+
|
| 213 |
+
# --- Method 2: NLTK fallback with SKILL WHITELIST validation ---
def extract_skills_nltk(text: str) -> list[str]:
    """Extract known skills from free text via NLTK noun-phrase chunking.

    POS-tags the lowercased text, chunks adjective+noun phrases, normalizes
    each phrase with _norm_skill_token, and keeps only phrases present in
    SKILL_WHITELIST — the whitelist filter is what keeps this fallback precise.

    Args:
        text: Raw job-description text; non-string input yields an empty list.

    Returns:
        Sorted, deduplicated list of whitelisted skill names.
    """
    if not isinstance(text, str):
        return []
    # NP = any run of adjectives followed by one or more nouns.
    chunker = nltk.RegexpParser("NP: {<JJ.*>*<NN.*>+}")
    tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    tree = chunker.parse(tagged)

    found: set[str] = set()
    for subtree in tree.subtrees():
        if subtree.label() != 'NP':
            continue
        # Rebuild the surface phrase from the chunk's (word, tag) leaves.
        phrase = " ".join(word for word, _tag in subtree.leaves())
        normalized = _norm_skill_token(phrase)
        # Only accept phrases we positively recognize as skills.
        if normalized in SKILL_WHITELIST:
            found.add(normalized)
    return sorted(found)
|
| 232 |
+
|
| 233 |
+
# --- Hybrid Orchestrator: MERGE LLM and NLTK results for best coverage ---
def extract_skills_hybrid(text: str) -> list[str]:
    """Return the union of LLM- and NLTK-extracted skills, sorted and deduplicated."""
    merged = set(extract_skills_llm(text))
    merged.update(extract_skills_nltk(text))
    return sorted(merged)
|
| 241 |
+
|
| 242 |
def create_text_for_skills(row):
    """Concatenate the skill-relevant text fields of a job row into one string.

    Fields that are missing or NaN (per pd.notna) are skipped entirely.
    """
    parts = []
    for field in ("Job title", "Duties", "qualifications", "Description"):
        value = row.get(field)
        if pd.notna(value):
            parts.append(str(value))
    return " ".join(parts)
|
| 244 |
|
| 245 |
original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
|
| 246 |
|
| 247 |
+
print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
|
| 248 |
+
original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
|
| 249 |
original_df = original_df.drop(columns=['text_for_skills'])
|
| 250 |
|
| 251 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 252 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 253 |
|
| 254 |
+
# --- Continue with the rest of the data processing ---
|
| 255 |
original_df['job_id'] = original_df.index
|
| 256 |
def create_full_text(row):
    """Join the five searchable text columns of a job row into a single string.

    NOTE(review): no NaN filtering happens here, so absent fields appear as
    the literal text "None"/"nan" — presumably acceptable for embedding input;
    confirm if that is intentional.
    """
    columns = ("Job title", "Company", "Duties", "qualifications", "Description")
    return " ".join(str(row.get(column)) for column in columns)
|
|
|
|
| 281 |
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
|
| 282 |
|
| 283 |
# --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
|
|
|
|
| 284 |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
| 285 |
status = "Searching using hybrid model..."
|
| 286 |
expanded_desc = llm_expand_query(dream_job)
|
|
|
|
| 358 |
job_skills = row.get("Skills", [])
|
| 359 |
|
| 360 |
if not job_skills:
|
| 361 |
+
learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
|
| 362 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 363 |
|
| 364 |
all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
|