aaronjosephd committed on
Commit a2c267d · 0 parents

Initial BERT backend upload
.gitattributes ADDED
@@ -0,0 +1,41 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ner_model/transformer/model filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ job_embeddings.pt filter=lfs diff=lfs merge=lfs -text
+ ner_model/ner/model filter=lfs diff=lfs merge=lfs -text
+ ner_model/tokenizer filter=lfs diff=lfs merge=lfs -text
+ ner_model/vocab/lookups.bin filter=lfs diff=lfs merge=lfs -text
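Every large artifact in this commit is tracked with Git LFS, so a clone without `git lfs pull` contains small pointer stubs instead of real files. A minimal Python check (a sketch; the path is just one example from this repo):

```python
# Detect whether a file is still a Git LFS pointer stub rather than the real bytes.
from pathlib import Path

LFS_MAGIC = b"version https://git-lfs.github.com/spec/v1"

def is_lfs_pointer(path: str) -> bool:
    return Path(path).read_bytes().startswith(LFS_MAGIC)

print(is_lfs_pointer("job_embeddings.pt"))  # True on a clone without `git lfs pull`
```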
.gitignore ADDED
@@ -0,0 +1,2 @@
+ cached_models/
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.11-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Install system dependencies required for building some Python packages
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy the requirements files into the container
+ COPY requirements.txt similarity_requirements.txt ./
+
+ # --- Create and populate the main virtual environment ---
+ RUN python3 -m venv env
+ RUN ./env/bin/pip install --no-cache-dir -r requirements.txt
+
+ # --- Create and populate the similarity worker virtual environment ---
+ RUN python3 -m venv similarity_env
+ RUN ./similarity_env/bin/pip install --no-cache-dir -r similarity_requirements.txt
+
+ # Copy the rest of the backend application files into the container
+ COPY . .
+
+ # Expose the port the app runs on (standard for Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Define the command to run the application
+ # Use the Python from the main virtual environment to run uvicorn
+ CMD ["./env/bin/python3", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Skill Gap Bert Backend
+ emoji: 🏃
+ colorFrom: purple
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
final_prototype_postings.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b253b04a9032662b233233674118a1f5306f55e2e8e7edb38ae6362eb2a3830
+ size 26993136
job_embeddings.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6da951796f43a827628e5266f8a0f761f346efb56c1e97de445c3bec01f671b7
+ size 36259418
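job_embeddings.pt is loaded by similarity_worker.py and indexed row-for-row against final_prototype_postings.csv, so the two must stay aligned. The generation script is not part of this commit; a hypothetical sketch of how such a tensor could be rebuilt (the `description` column name is an assumption, only `title`, `cmo_role_match`, and `job_url` are confirmed by the worker):

```python
# Hypothetical regeneration sketch: one JobBERT-v2 embedding per CSV row,
# in CSV order, so job_embeddings[i] corresponds to market_data.iloc[i].
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

market_data = pd.read_csv("final_prototype_postings.csv")
model = SentenceTransformer("TechWolf/JobBERT-v2", device="cpu")

texts = market_data["description"].fillna("").tolist()  # column name is an assumption
embeddings = model.encode(texts, convert_to_tensor=True, show_progress_bar=True)
torch.save(embeddings, "job_embeddings.pt")
```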
main.py ADDED
@@ -0,0 +1,475 @@
+ import pandas as pd
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import List, Dict, Any, Optional
+ import os
+ import fitz  # PyMuPDF
+ import torch
+ import spacy
+ import re
+ from bs4 import BeautifulSoup
+ import emoji
+ import subprocess
+ import json
+ import sys
+ import pathlib
+ import uuid
+ import time
+
+ # --- Text Cleaning Functions ---
+
+ def old_refined_text_cleaning(text: str) -> str:
+     """The OLD cleaning function used for the annotation phase. Removes #, +, / etc."""
+     if not isinstance(text, str):
+         return ""
+     text = BeautifulSoup(text, "html.parser").get_text()
+     url_pattern = r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?'
+     text = re.sub(url_pattern, '', text)
+     text = re.sub(r'\S+@\S+\s?', '', text)
+     text = emoji.demojize(text)
+     text = re.sub(r':[a-zA-Z_]+:', '', text)
+     text = text.replace('\\', ' ')
+     text = re.sub(r'[#*•]', ' ', text)
+     text = re.sub(r'\{.*?\}', ' ', text)
+     text = re.sub(r'[^a-zA-Z0-9\s.,!?-]', ' ', text)
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'\s([,.!?-])', r'\1', text)
+     text = text.strip()
+     text = text.lower()
+     return text
+
+ def new_refined_text_cleaning(text: str) -> str:
+     """The NEW, improved cleaning function. Keeps technical symbols."""
+     if not isinstance(text, str):
+         return ""
+     text = BeautifulSoup(text, "html.parser").get_text()
+     url_pattern = r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?'
+     text = re.sub(url_pattern, '', text)
+     text = re.sub(r'\S+@\S+\s?', '', text)
+     text = emoji.demojize(text)
+     text = re.sub(r':[a-zA-Z_]+:', '', text)
+     text = text.replace('\\', ' ')
+     text = re.sub(r'[*•]', ' ', text)  # '#' is dropped from the old removal set r'[#*•]' so terms like C# survive
+     text = re.sub(r'\{.*?\}', ' ', text)
+     # Keep '#', '+', '/', '()', and '_' to preserve technical terms.
+     text = re.sub(r'[^a-zA-Z0-9_#+()/\s.,!?-]', ' ', text)
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'\s([,.!?-])', r'\1', text)
+     text = text.strip()
+     text = text.lower()
+     return text
+
+
+ # --- Pydantic Models for API Response Structure ---
+
+ class SkillCount(BaseModel):
+     skill: str
+     count: int
+
+ class ToolCount(BaseModel):
+     tool: str
+     count: int
+
+ class RoleSkill(BaseModel):
+     cmo_role_match: str
+     skill: str
+     count: int
+
+ class RoleTool(BaseModel):
+     cmo_role_match: str
+     tool: str
+     count: int
+
+ class ExperienceDistribution(BaseModel):
+     year: int
+     count: int
+
+ class SkillCooccurrence(BaseModel):
+     skill_A: str
+     skill_B: str
+     count: int
+
+ class ToolCooccurrence(BaseModel):
+     tool_A: str
+     tool_B: str
+     count: int
+
+ class JobRoleDistribution(BaseModel):
+     cmo_role_match: str
+     count: int
+
+ class RoleInsightsResponse(BaseModel):
+     top_skills: List[RoleSkill]
+     total_skills: int
+     top_tools: List[RoleTool]
+     total_tools: int
+     average_experience: Optional[float] = None
+     experience_distribution: List[ExperienceDistribution]
+     total_experience_distribution: int
+     skill_co_occurrence: List[SkillCooccurrence]
+     total_skill_co_occurrence: int
+     tool_co_occurrence: List[ToolCooccurrence]
+     total_tool_co_occurrence: int
+
+ class MarketInsightsResponse(BaseModel):
+     top_overall_skills: List[SkillCount]
+     total_overall_skills: int
+     top_overall_tools: List[ToolCount]
+     total_overall_tools: int
+     experience_distribution: List[ExperienceDistribution]
+     total_experience_distribution: int
+     skill_co_occurrence: List[SkillCooccurrence]
+     total_skill_co_occurrence: int
+     tool_co_occurrence: List[ToolCooccurrence]
+     total_tool_co_occurrence: int
+     average_experience: Optional[float] = None
+
+ class SimilarJob(BaseModel):
+     job_title: str
+     similarity_score: float
+     cmo_role_match: str
+     url: Optional[str] = None
+
+ class SkillDetail(BaseModel):
+     name: str
+     count: int
+
+ class GapAnalysis(BaseModel):
+     user_skills: List[SkillDetail]
+     user_tools: List[SkillDetail]
+     missing_skills: List[SkillDetail]
+     matching_skills: List[SkillDetail]
+     missing_tools: List[SkillDetail]
+     matching_tools: List[SkillDetail]
+     total_user_skills: int
+     total_user_tools: int
+     total_missing_skills: int
+     total_matching_skills: int
+     total_missing_tools: int
+     total_matching_tools: int
+
+ class AnalysisResult(BaseModel):
+     similar_jobs: List[SimilarJob]
+     total_similar_jobs: int
+     gap_analysis: GapAnalysis
+     recommendations: Dict[str, Any]
+     session_id: str
+
+ # --- App instantiation ---
+ app = FastAPI(
+     title="Skill Gap Analyzer API",
+     description="API for market insights and resume analysis.",
+     version="1.3.0",  # Version bump
+ )
+
+ # --- CORS Middleware ---
+ origins = [
+     "http://localhost:5173",
+     "http://127.0.0.1:5173",
+     "http://localhost:5174",
+ ]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # --- In-memory storage for models and data ---
+ DB = {}
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Load models and data into memory on application startup."""
+     DB['similarity_cache'] = {}
+     print("INFO: Loading models and data...")
+
+     backend_dir = os.path.dirname(os.path.abspath(__file__))
+     model_path = os.path.join(backend_dir, "ner_model")
+
+     # --- Load Pre-computed Insights ---
+     insights_path = os.path.join(backend_dir, 'market_insights.json')
+     with open(insights_path, 'r') as f:
+         DB['insights'] = json.load(f)
+     print("INFO: Market insights loaded successfully.")
+
+     # --- Load other necessary data ---
+     # This is still needed for the similarity worker and gap analysis source
+     DB['market_data'] = pd.read_csv(os.path.join(backend_dir, 'final_prototype_postings.csv'))
+
+     # --- Load Models ---
+     print(f"INFO: Loading NER model from {model_path}...")
+     DB['ner_model'] = spacy.load(model_path)
+     print("INFO: NER model loaded successfully.")
+
+     print("INFO: Models and data loaded successfully.")
+
+
+ @app.get("/", tags=["General"])
+ async def read_root():
+     return {"message": "Welcome to the Skill Gap Analyzer API v1.3"}
+
+ @app.get("/roles", response_model=List[str], tags=["Market Insights"])
+ async def get_roles():
+     roles = sorted(DB['insights']['by_role'].keys())
+     return ["Overall Market"] + roles
+
+ @app.get("/job_roles_distribution", response_model=List[JobRoleDistribution], tags=["Market Insights"])
+ async def get_job_roles_distribution():
+     return DB['insights']['job_role_distribution']
+
+
+ @app.get("/market_insights", response_model=MarketInsightsResponse, tags=["Market Insights"])
+ async def get_market_insights(page: int = 1, limit: int = 20):
+     start = (page - 1) * limit
+     end = page * limit
+
+     overall_data = DB['insights']['overall_market']
+
+     top_skills = overall_data.get('top_skills', [])
+     top_tools = overall_data.get('top_tools', [])
+     exp_dist = overall_data.get('experience_distribution', [])
+     skill_co = overall_data.get('skill_co_occurrence', [])
+     tool_co = overall_data.get('tool_co_occurrence', [])
+     avg_exp = overall_data.get('average_experience')
+
+     return {
+         "top_overall_skills": top_skills[start:end],
+         "total_overall_skills": len(top_skills),
+         "top_overall_tools": top_tools[start:end],
+         "total_overall_tools": len(top_tools),
+         "experience_distribution": exp_dist[start:end],
+         "total_experience_distribution": len(exp_dist),
+         "skill_co_occurrence": skill_co[start:end],
+         "total_skill_co_occurrence": len(skill_co),
+         "tool_co_occurrence": tool_co[start:end],
+         "total_tool_co_occurrence": len(tool_co),
+         "average_experience": avg_exp,
+     }
+
+ @app.get("/market_insights/{role:path}", response_model=RoleInsightsResponse, tags=["Market Insights"])
+ async def get_role_insights(role: str, page: int = 1, limit: int = 10):
+     start = (page - 1) * limit
+     end = page * limit
+
+     role_data = DB['insights']['by_role'].get(role)
+     if not role_data:
+         raise HTTPException(status_code=404, detail="Role not found")
+
+     top_skills = role_data.get('top_skills', [])
+     top_tools = role_data.get('top_tools', [])
+     exp_dist = role_data.get('experience_distribution', [])
+     skill_co = role_data.get('skill_co_occurrence', [])
+     tool_co = role_data.get('tool_co_occurrence', [])
+     avg_exp = role_data.get('average_experience')
+
+     return {
+         "top_skills": top_skills[start:end],
+         "total_skills": len(top_skills),
+         "top_tools": top_tools[start:end],
+         "total_tools": len(top_tools),
+         "average_experience": avg_exp,
+         "experience_distribution": exp_dist[start:end],
+         "total_experience_distribution": len(exp_dist),
+         "skill_co_occurrence": skill_co[start:end],
+         "total_skill_co_occurrence": len(skill_co),
+         "tool_co_occurrence": tool_co[start:end],
+         "total_tool_co_occurrence": len(tool_co),
+     }
+
+ @app.post("/analyze_resume", response_model=AnalysisResult, tags=["Resume Analysis"])
+ async def analyze_resume(
+     resume_file: UploadFile = File(...),
+     target_role: Optional[str] = Form(None),
+     limit: Optional[int] = Form(10)  # This limit is now for the initial page load
+ ):
+     # --- PDF Processing ---
+     if not resume_file or not resume_file.filename.lower().endswith('.pdf'):
+         raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
+
+     pdf_bytes = await resume_file.read()
+
+     MAX_FILE_SIZE = 1 * 1024 * 1024  # 1MB
+     if len(pdf_bytes) > MAX_FILE_SIZE:
+         raise HTTPException(
+             status_code=413,
+             detail="File is too large. Please upload a PDF under 1MB."
+         )
+
+     resume_text = ""
+     try:
+         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+             for page in doc:
+                 resume_text += page.get_text()
+     except Exception as e:
+         raise HTTPException(status_code=422, detail=f"Failed to process PDF file: {e}")
+
+     if not resume_text or resume_text.isspace():
+         raise HTTPException(
+             status_code=422,
+             detail="Could not extract any text from the provided PDF. The document may be empty, image-based, or corrupted."
+         )
+
+     # --- Text Cleaning ---
+     ner_cleaned_text = old_refined_text_cleaning(resume_text)
+     similarity_cleaned_text = new_refined_text_cleaning(resume_text)
+
+     # --- NER Processing ---
+     start_time = time.time()
+     doc = DB['ner_model'](ner_cleaned_text)
+     end_time = time.time()
+     print(f"--- BERT NER INFERENCE TIME: {end_time - start_time:.4f} seconds ---")
+     user_skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
+     user_tools = [ent.text for ent in doc.ents if ent.label_ == "TOOL"]
+
+     # --- Similarity Search (via Isolated Subprocess) ---
+     # Fetch a large number of jobs to cache for pagination
+     all_similar_jobs = []
+     total_similar_jobs = 0
+     try:
+         backend_dir = pathlib.Path(__file__).parent.resolve()
+         worker_path = backend_dir / "similarity_worker.py"
+         worker_python_executable = "/app/similarity_env/bin/python3"
+
+         process = subprocess.run(
+             [
+                 str(worker_python_executable),
+                 str(worker_path),
+                 "--target_role",
+                 target_role or "Overall Market",
+                 "--limit",
+                 "200"  # Fetch a large batch for caching
+             ],
+             input=similarity_cleaned_text,
+             capture_output=True,
+             text=True,
+             check=True,
+         )
+
+         worker_output = json.loads(process.stdout)
+         all_similar_jobs = worker_output.get("similar_jobs", [])
+         total_similar_jobs = worker_output.get("total_jobs", 0)
+
+     except (subprocess.CalledProcessError, json.JSONDecodeError, FileNotFoundError) as e:
+         print(f"Similarity worker failed: {e}", file=sys.stderr)
+         if isinstance(e, subprocess.CalledProcessError):
+             print(f"Worker stderr: {e.stderr}", file=sys.stderr)
+         all_similar_jobs = []
+         total_similar_jobs = 0
+
+     # --- Cache the full results ---
+     session_id = str(uuid.uuid4())
+
+     # Simple cache eviction: Keep cache size under a limit (e.g., 50)
+     if len(DB['similarity_cache']) > 50:
+         try:
+             oldest_key = next(iter(DB['similarity_cache']))
+             del DB['similarity_cache'][oldest_key]
+         except (StopIteration, KeyError):
+             # Handle edge cases where cache might be empty or key is gone
+             pass
+
+     DB['similarity_cache'][session_id] = all_similar_jobs
+
+     # --- Gap Analysis (remains the same) ---
+     if target_role and target_role != "Overall Market":
+         role_data = DB['insights']['by_role'].get(target_role, {})
+         market_skills_list = role_data.get('top_skills', [])
+         market_tools_list = role_data.get('top_tools', [])
+     else:
+         overall_data = DB['insights']['overall_market']
+         market_skills_list = overall_data.get('top_skills', [])
+         market_tools_list = overall_data.get('top_tools', [])
+
+     market_skill_freq = {s['skill'].lower(): s['count'] for s in market_skills_list}
+     market_tool_freq = {t['tool'].lower(): t['count'] for t in market_tools_list}
+     user_skills_lower = {s.lower() for s in user_skills}
+     user_tools_lower = {t.lower() for t in user_tools}
+     missing_skills = [{"name": s['skill'], "count": s['count']} for s in market_skills_list if s['skill'].lower() not in user_skills_lower]
+     matching_skills = [{"name": s['skill'], "count": s['count']} for s in market_skills_list if s['skill'].lower() in user_skills_lower]
+     missing_tools = [{"name": t['tool'], "count": t['count']} for t in market_tools_list if t['tool'].lower() not in user_tools_lower]
+     matching_tools = [{"name": t['tool'], "count": t['count']} for t in market_tools_list if t['tool'].lower() in user_tools_lower]
+     user_skills_with_freq = [{"name": s, "count": market_skill_freq.get(s.lower(), 0)} for s in user_skills]
+     user_tools_with_freq = [{"name": t, "count": market_tool_freq.get(t.lower(), 0)} for t in user_tools]
+     gap_analysis = {
+         "user_skills": user_skills_with_freq,
+         "user_tools": user_tools_with_freq,
+         "missing_skills": missing_skills,
+         "matching_skills": matching_skills,
+         "missing_tools": missing_tools,
+         "matching_tools": matching_tools,
+         "total_user_skills": len(user_skills),
+         "total_user_tools": len(user_tools),
+         "total_missing_skills": len(missing_skills),
+         "total_matching_skills": len(matching_skills),
+         "total_missing_tools": len(missing_tools),
+         "total_matching_tools": len(matching_tools),
+     }
+
+     # --- Recommendation Generation (remains the same) ---
+     all_user_entities = user_skills_lower.union(user_tools_lower)
+     recommendations = {
+         "message": "Based on your resume, focusing on these skills and tools could improve your market alignment. We also recommend looking at co-occurring skills for your existing strengths.",
+         "skills_to_learn": missing_skills[:5],
+         "tools_to_learn": missing_tools[:5],
+         "based_on_your_strengths": {}
+     }
+     skill_co_data = []
+     tool_co_data = []
+     if target_role and target_role != "Overall Market":
+         role_data = DB['insights']['by_role'].get(target_role, {})
+         skill_co_data = role_data.get('skill_co_occurrence', [])
+         tool_co_data = role_data.get('tool_co_occurrence', [])
+     else:
+         overall_data = DB['insights']['overall_market']
+         skill_co_data = overall_data.get('skill_co_occurrence', [])
+         tool_co_data = overall_data.get('tool_co_occurrence', [])
+     df_list = []
+     if skill_co_data:
+         skills_df = pd.DataFrame(skill_co_data)
+         if 'skill_A' in skills_df.columns and 'skill_B' in skills_df.columns:
+             skills_df = skills_df.rename(columns={'skill_A': 'entity_A', 'skill_B': 'entity_B'})
+             df_list.append(skills_df)
+     if tool_co_data:
+         tools_df = pd.DataFrame(tool_co_data)
+         if 'tool_A' in tools_df.columns and 'tool_B' in tools_df.columns:
+             tools_df = tools_df.rename(columns={'tool_A': 'entity_A', 'tool_B': 'entity_B'})
+             df_list.append(tools_df)
+     if df_list:
+         co_occurrence_df = pd.concat(df_list, ignore_index=True)
+         if 'entity_A' in co_occurrence_df.columns and 'entity_B' in co_occurrence_df.columns:
+             for entity in all_user_entities:
+                 related_A = co_occurrence_df[co_occurrence_df['entity_B'].str.lower() == entity]['entity_A'].tolist()
+                 related_B = co_occurrence_df[co_occurrence_df['entity_A'].str.lower() == entity]['entity_B'].tolist()
+                 related_entities = related_A + related_B
+                 recommended = [s for s in related_entities if s.lower() not in all_user_entities]
+                 if recommended:
+                     unique_recommended = list(dict.fromkeys(recommended))
+                     recommendations["based_on_your_strengths"][entity] = unique_recommended[:3]
+
+     # --- Final Response ---
+     return {
+         "similar_jobs": all_similar_jobs[:limit],  # Return only the first page
+         "total_similar_jobs": total_similar_jobs,
+         "gap_analysis": gap_analysis,
+         "recommendations": recommendations,
+         "session_id": session_id,
+     }
+
+ @app.get("/similar_jobs/{session_id}", response_model=List[SimilarJob], tags=["Resume Analysis"])
+ async def get_more_similar_jobs(session_id: str, page: int = 1, limit: int = 10):
+     """
+     Gets a paginated list of similar jobs from the cache.
+     """
+     if session_id not in DB['similarity_cache']:
+         raise HTTPException(status_code=404, detail="Session not found or expired.")
+
+     full_job_list = DB['similarity_cache'][session_id]
+
+     start_index = (page - 1) * limit
+     end_index = page * limit
+
+     return full_job_list[start_index:end_index]
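For reference, a minimal client sketch against the two resume endpoints (assumes the server on localhost:7860 and the `requests` package, which is not in requirements.txt):

```python
import requests

BASE = "http://localhost:7860"

# Upload a resume PDF (field names match the /analyze_resume form parameters).
with open("resume.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE}/analyze_resume",
        files={"resume_file": ("resume.pdf", f, "application/pdf")},
        data={"target_role": "Overall Market", "limit": 10},
    )
resp.raise_for_status()
result = resp.json()
print(result["gap_analysis"]["total_missing_skills"])

# Page through the cached similar jobs using the returned session_id.
page2 = requests.get(f"{BASE}/similar_jobs/{result['session_id']}", params={"page": 2, "limit": 10})
print(page2.json())
```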
market_data_with_entities.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d9544e829d6e258b26907a303488bdaf6e5ea2767dfd918d90ac3d90edeeb45
+ size 27560352
market_insights.json ADDED
The diff for this file is too large to render.
 
ner_model/config.cfg ADDED
@@ -0,0 +1,148 @@
+ [paths]
+ train = "assets/train.spacy"
+ dev = "assets/dev.spacy"
+ vectors = null
+ init_tok2vec = null
+
+ [system]
+ seed = 0
+ gpu_allocator = "pytorch"
+
+ [nlp]
+ lang = "en"
+ pipeline = ["transformer","ner"]
+ batch_size = 128
+ disabled = []
+ before_creation = null
+ after_creation = null
+ after_pipeline_creation = null
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+ vectors = {"@vectors":"spacy.Vectors.v1"}
+
+ [components]
+
+ [components.ner]
+ factory = "ner"
+ incorrect_spans_key = null
+ moves = null
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
+ update_with_oracle_cut_size = 100
+
+ [components.ner.model]
+ @architectures = "spacy.TransitionBasedParser.v2"
+ state_type = "ner"
+ extra_state_tokens = false
+ hidden_width = 64
+ maxout_pieces = 2
+ use_upper = true
+ nO = null
+
+ [components.ner.model.tok2vec]
+ @architectures = "spacy-transformers.TransformerListener.v1"
+ pooling = {"@layers":"reduce_mean.v1"}
+ grad_factor = 1.0
+ upstream = "*"
+
+ [components.transformer]
+ factory = "transformer"
+ max_batch_items = 174000
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+ [components.transformer.model]
+ @architectures = "spacy-transformers.TransformerModel.v3"
+ name = "bert-base-uncased"
+ mixed_precision = true
+
+ [components.transformer.model.get_spans]
+ @span_getters = "spacy-transformers.strided_spans.v1"
+ window = 128
+ stride = 96
+
+ [components.transformer.model.grad_scaler_config]
+
+ [components.transformer.model.tokenizer_config]
+ use_fast = true
+
+ [components.transformer.model.transformer_config]
+
+ [corpora]
+
+ [corpora.dev]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.dev}
+ gold_preproc = false
+ max_length = 0
+ limit = 0
+ augmenter = null
+
+ [corpora.train]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.train}
+ gold_preproc = false
+ max_length = 0
+ limit = 0
+ augmenter = null
+
+ [training]
+ seed = ${system.seed}
+ gpu_allocator = ${system.gpu_allocator}
+ dropout = 0.1
+ accumulate_gradient = 1
+ max_steps = 20000
+ eval_frequency = 200
+ frozen_components = []
+ annotating_components = []
+ dev_corpus = "corpora.dev"
+ train_corpus = "corpora.train"
+ patience = 3200
+ max_epochs = 0
+ before_to_disk = null
+ before_update = null
+
+ [training.batcher]
+ @batchers = "spacy.batch_by_words.v1"
+ discard_oversize = true
+ tolerance = 0.2
+ get_length = null
+
+ [training.batcher.size]
+ @schedules = "compounding.v1"
+ start = 100
+ stop = 1000
+ compound = 1.001
+ t = 0.0
+
+ [training.logger]
+ @loggers = "spacy.ConsoleLogger.v1"
+ progress_bar = true
+
+ [training.optimizer]
+ @optimizers = "Adam.v1"
+ learn_rate = 0.00001
+ beta1 = 0.9
+ beta2 = 0.999
+ L2_is_weight_decay = true
+ L2 = 0.01
+ grad_clip = 1.0
+ use_averages = false
+ eps = 0.00000001
+
+ [training.score_weights]
+ ents_f = 1.0
+ ents_p = 0.0
+ ents_r = 0.0
+ ents_per_type = null
+
+ [pretraining]
+
+ [initialize]
+ vectors = ${paths.vectors}
+ init_tok2vec = ${paths.init_tok2vec}
+ vocab_data = null
+ lookups = null
+ before_init = null
+ after_init = null
+
+ [initialize.components]
+
+ [initialize.tokenizer]
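The `${...}` references in this config interpolate at load time; a small sketch of reading it programmatically (spacy.util.load_config is the standard helper, values shown in comments follow from the file above):

```python
import spacy

config = spacy.util.load_config("ner_model/config.cfg", interpolate=True)
print(config["nlp"]["pipeline"])                              # ['transformer', 'ner']
print(config["components"]["transformer"]["model"]["name"])   # 'bert-base-uncased'
print(config["training"]["optimizer"]["learn_rate"])          # 1e-05
```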
ner_model/meta.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "lang":"en",
+   "name":"pipeline",
+   "version":"0.0.0",
+   "spacy_version":">=3.7.4,<3.8.0",
+   "description":"",
+   "author":"",
+   "email":"",
+   "url":"",
+   "license":"",
+   "spacy_git_version":"bff8725f4",
+   "vectors":{
+     "width":0,
+     "vectors":0,
+     "keys":0,
+     "name":null,
+     "mode":"default"
+   },
+   "labels":{
+     "transformer":[
+
+     ],
+     "ner":[
+       "EXPERIENCE",
+       "SKILL",
+       "TOOL"
+     ]
+   },
+   "pipeline":[
+     "transformer",
+     "ner"
+   ],
+   "components":[
+     "transformer",
+     "ner"
+   ],
+   "disabled":[
+
+   ],
+   "performance":{
+     "ents_f":0.6260185445,
+     "ents_p":0.6503210741,
+     "ents_r":0.6034669556,
+     "ents_per_type":{
+       "EXPERIENCE":{
+         "p":0.8803418803,
+         "r":0.9537037037,
+         "f":0.9155555556
+       },
+       "SKILL":{
+         "p":0.6098981077,
+         "r":0.4515086207,
+         "f":0.5188854489
+       },
+       "TOOL":{
+         "p":0.6174242424,
+         "r":0.6965811966,
+         "f":0.6546184739
+       }
+     },
+     "transformer_loss":150.5976690537,
+     "ner_loss":2037.8725703364
+   }
+ }
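The meta above reports strong EXPERIENCE scores next to modest SKILL recall (about 0.45). A quick smoke test of the packaged pipeline, mirroring how main.py consumes it (entity output naturally depends on the trained weights):

```python
import spacy

nlp = spacy.load("ner_model")  # transformer + ner; labels: EXPERIENCE, SKILL, TOOL
doc = nlp("5 years of experience with python, docker and aws.")
for ent in doc.ents:
    print(ent.label_, ent.text)
```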
ner_model/ner/cfg.txt ADDED
@@ -0,0 +1,13 @@
+ {
+   "moves":null,
+   "update_with_oracle_cut_size":100,
+   "multitasks":[
+
+   ],
+   "min_action_freq":1,
+   "learn_tokens":false,
+   "beam_width":1,
+   "beam_density":0.0,
+   "beam_update_prob":0.0,
+   "incorrect_spans_key":null
+ }
ner_model/ner/model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e716df2fd6faab2abcb5035e526c58e9be2515448a6d9b576366febb0881d5e4
+ size 301831
ner_model/ner/moves ADDED
@@ -0,0 +1 @@
+ ��moves��{"0":{},"1":{"SKILL":8953,"TOOL":8067,"EXPERIENCE":4012},"2":{"SKILL":8953,"TOOL":8067,"EXPERIENCE":4012},"3":{"SKILL":8953,"TOOL":8067,"EXPERIENCE":4012},"4":{"SKILL":8953,"TOOL":8067,"EXPERIENCE":4012,"":1},"5":{"":1}}�cfg��neg_key�
ner_model/tokenizer ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b014e8bba4958b120af2d0c1c63eabb7c00379f2bacaf10df7c5325efd2ea467
+ size 77066
ner_model/transformer/cfg.txt ADDED
@@ -0,0 +1,3 @@
+ {
+   "max_batch_items":174000
+ }
ner_model/transformer/model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50fe4b8034556a21fe59b1d5e0e1d710c77f7b65dd8870b8ffc67c367f00e628
+ size 438953871
ner_model/vocab/key2row ADDED
@@ -0,0 +1 @@
+
ner_model/vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+ size 1
ner_model/vocab/strings.json ADDED
The diff for this file is too large to render.
 
ner_model/vocab/vectors ADDED
Binary file (128 Bytes).
 
ner_model/vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "mode":"default"
+ }
precompute_insights.py ADDED
@@ -0,0 +1,168 @@
+
+ import pandas as pd
+ import ast
+ import re
+ import itertools
+ from collections import Counter
+ import json
+ import os
+
+ # --- Configuration ---
+ INPUT_FILE = 'market_data_with_entities.csv'
+ OUTPUT_FILE = 'market_insights.json'
+ OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
+ INPUT_PATH = os.path.join(OUTPUT_DIR, INPUT_FILE)
+ OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
+
+ def safe_literal_eval(s):
+     try:
+         if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
+             return ast.literal_eval(s)
+     except (ValueError, SyntaxError):
+         pass
+     return []
+
+ def get_top_items(series):
+     """Calculates value counts for an exploded series."""
+     all_items = series.explode().dropna()
+     all_items = all_items.str.lower().str.strip()
+     counts = all_items.value_counts().reset_index()
+     counts.columns = ['item', 'count']
+     return counts
+
+ def get_co_occurrence(df, column, top_n=100):
+     """Calculates co-occurrence for a given column."""
+     co_occurrence_df = df[df[column].apply(lambda x: len(set(x))) >= 2].copy()
+     co_occurrence_df[f'{column}_normalized'] = co_occurrence_df[column].apply(
+         lambda items: sorted(list(set([i.lower().strip() for i in items])))
+     )
+     pairs = co_occurrence_df[f'{column}_normalized'].apply(lambda x: list(itertools.combinations(x, 2)))
+     pair_counts = Counter(pairs.explode().dropna())
+     most_common_pairs = pair_counts.most_common(top_n)
+
+     results = pd.DataFrame(most_common_pairs, columns=['pair', 'count'])
+     if results.empty:
+         # Guard: a subset with no co-occurring pairs would otherwise crash the column split below.
+         return pd.DataFrame(columns=['item1', 'item2', 'count'])
+     results[['item1', 'item2']] = pd.DataFrame(results['pair'].tolist(), index=results.index)
+     return results[['item1', 'item2', 'count']]
+
+ def parse_experience(exp_list):
+     """Parses experience strings to find years."""
+     if not isinstance(exp_list, list) or not exp_list:
+         return None
+     for exp_string in exp_list:
+         exp_string = str(exp_string).lower()
+         numbers = re.findall(r'\d+\.?\d*', exp_string)
+         if not numbers:
+             continue
+         val = float(numbers[0])
+         return val / 12.0 if 'month' in exp_string else val
+     return None
+
+ def main():
+     print("--- Starting Market Insight Pre-computation ---")
+
+     # --- Load and Prepare Data ---
+     print(f"Loading data from {INPUT_PATH}...")
+     if not os.path.exists(INPUT_PATH):
+         print(f"ERROR: Input file not found at {INPUT_PATH}")
+         return
+
+     df = pd.read_csv(INPUT_PATH)
+     print(f"Data loaded. Found {len(df)} records.")
+
+     print("Converting stringified lists to actual lists...")
+     for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
+         df[col] = df[col].apply(safe_literal_eval)
+
+     # --- Master Data Structure ---
+     insights = {
+         "overall_market": {},
+         "by_role": {}
+     }
+
+     # --- Overall Market Analysis ---
+     print("Analyzing overall market...")
+     # Skills
+     overall_skills = get_top_items(df['extracted_skills'])
+     insights["overall_market"]["top_skills"] = overall_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
+
+     # Tools
+     overall_tools = get_top_items(df['extracted_tools'])
+     insights["overall_market"]["top_tools"] = overall_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')
+
+     # Skill Co-occurrence
+     overall_skill_co = get_co_occurrence(df, 'extracted_skills')
+     insights["overall_market"]["skill_co_occurrence"] = overall_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')
+
+     # Tool Co-occurrence
+     overall_tool_co = get_co_occurrence(df, 'extracted_tools')
+     insights["overall_market"]["tool_co_occurrence"] = overall_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')
+
+     # Experience
+     df['min_years'] = df['extracted_experience'].apply(parse_experience)
+     exp_df = df.dropna(subset=['min_years'])
+     exp_df_filtered = exp_df[exp_df['min_years'] >= 1]
+
+     exp_dist = exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
+     exp_dist.columns = ['year', 'count']
+     insights["overall_market"]["experience_distribution"] = exp_dist.to_dict(orient='records')
+     insights["overall_market"]["average_experience"] = exp_df['min_years'].mean()
+
+     # Job Role Distribution
+     role_counts = df['cmo_role_match'].value_counts().reset_index()
+     role_counts.columns = ['cmo_role_match', 'count']
+     insights["job_role_distribution"] = role_counts.to_dict(orient='records')
+
+     # --- Per Role Analysis ---
+     print("Analyzing data for each role...")
+     roles = df['cmo_role_match'].unique()
+     for role in roles:
+         print(f"- Processing {role}...")
+         role_df = df[df['cmo_role_match'] == role].copy()
+         insights["by_role"][role] = {}
+
+         # Skills
+         role_skills = get_top_items(role_df['extracted_skills'])
+         if not role_skills.empty:
+             role_skills['cmo_role_match'] = role
+         insights["by_role"][role]["top_skills"] = role_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
+
+         # Tools
+         role_tools = get_top_items(role_df['extracted_tools'])
+         if not role_tools.empty:
+             role_tools['cmo_role_match'] = role
+         insights["by_role"][role]["top_tools"] = role_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')
+
+         # Skill Co-occurrence
+         role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
+         insights["by_role"][role]["skill_co_occurrence"] = role_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')
+
+         # Tool Co-occurrence
+         role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
+         insights["by_role"][role]["tool_co_occurrence"] = role_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')
+
+         # Experience
+         role_exp_df = role_df.dropna(subset=['min_years'])
+         role_exp_df_filtered = role_exp_df[role_exp_df['min_years'] >= 1]
+
+         if not role_exp_df.empty:
+             insights["by_role"][role]["average_experience"] = role_exp_df['min_years'].mean()
+
+             role_exp_dist = role_exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
+             role_exp_dist.columns = ['year', 'count']
+             insights["by_role"][role]["experience_distribution"] = role_exp_dist.to_dict(orient='records')
+         else:
+             insights["by_role"][role]["average_experience"] = None
+             insights["by_role"][role]["experience_distribution"] = []
+
+
+     # --- Save to JSON ---
+     print(f"Saving aggregated insights to {OUTPUT_PATH}...")
+     with open(OUTPUT_PATH, 'w') as f:
+         json.dump(insights, f, indent=4)
+
+     print("--- Pre-computation Finished Successfully! ---")
+
+ if __name__ == "__main__":
+     main()
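The resulting market_insights.json is the only aggregate artifact main.py reads at startup; a short sketch of inspecting its shape:

```python
import json

with open("market_insights.json") as f:
    insights = json.load(f)

# Top-level keys written by precompute_insights.py.
print(sorted(insights))  # ['by_role', 'job_role_distribution', 'overall_market']
print(insights["overall_market"]["top_skills"][:3])  # [{'skill': ..., 'count': ...}, ...]
print(list(insights["by_role"])[:5])                 # role names from cmo_role_match
```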
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ # --- Core Application Dependencies ---
+ fastapi==0.117.1
+ uvicorn==0.37.0
+ pandas==2.3.2
+ PyMuPDF==1.26.4
+ python-multipart==0.0.20
+ beautifulsoup4==4.13.5
+ emoji==2.15.0
+
+ # --- ML/NLP Dependencies (Pinned for Stability & BERT Model Compatibility) ---
+ # Pinned to match the BERT model's training environment
+ spacy[transformers]==3.7.4
+
+ # Pinned from training to resolve environment conflicts
+ click<8.0.0
+ typer<0.5.0
+ numpy==1.26.4
+ catalogue==2.0.10
+
+ # Pinned to fix build failures on hosting platforms like Render
+ blis==0.7.11
+ thinc==8.2.3
+
+ # PyTorch - Compatible with spaCy 3.7.4 and Python 3.11/3.12
+ torch==2.2.0
+
+ # Other dependencies from model training
+ scikit-learn==1.4.2
+ tqdm==4.66.2
similarity_requirements.txt ADDED
@@ -0,0 +1,5 @@
+ sentence-transformers==5.1.1
+ torch==2.8.0
+ pandas==2.3.2
+ beautifulsoup4==4.13.5
+ emoji==2.15.0
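Note the two requirement sets pin conflicting torch versions (2.2.0 for the spaCy env vs 2.8.0 here), which is why the Dockerfile builds two separate venvs. A quick sketch to confirm which environment an interpreter is actually running in:

```python
# Sanity-check the pinned packages of the current interpreter's environment.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("torch", "spacy", "sentence-transformers"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed in this env")
```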
similarity_worker.py ADDED
@@ -0,0 +1,114 @@
+
+ import sys
+ import json
+ import pandas as pd
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ import argparse
+ import pathlib
+ import re
+ from bs4 import BeautifulSoup
+ import emoji
+
+ # --- Text Cleaning Function (copied from main.py) ---
+ def new_refined_text_cleaning(text: str) -> str:
+     """The NEW, improved cleaning function. Keeps technical symbols."""
+     if not isinstance(text, str):
+         return ""
+     text = BeautifulSoup(text, "html.parser").get_text()
+     url_pattern = r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?'
+     text = re.sub(url_pattern, '', text)
+     text = re.sub(r'\S+@\S+\s?', '', text)
+     text = emoji.demojize(text)
+     text = re.sub(r':[a-zA-Z_]+:', '', text)
+     text = text.replace('\\', ' ')
+     text = re.sub(r'[*•]', ' ', text)
+     text = re.sub(r'\{.*?\}', ' ', text)
+     text = re.sub(r'[^a-zA-Z0-9_#+()/\s.,!?-]', ' ', text)
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'\s([,.!?-])', r'\1', text)
+     text = text.strip()
+     text = text.lower()
+     return text
+
+ def main():
+     """
+     Main function to perform similarity search.
+     Reads resume text from stdin and target role from args.
+     Prints a JSON list of similar jobs to stdout.
+     """
+     try:
+         # 1. Setup paths
+         backend_dir = pathlib.Path(__file__).parent.resolve()
+
+         # 2. Parse arguments
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--target_role", type=str, required=True)
+         parser.add_argument("--limit", type=int, default=10)
+         args = parser.parse_args()
+         target_role = args.target_role
+
+         # 3. Read resume text from stdin
+         resume_text = sys.stdin.read()
+         if not resume_text:
+             print(json.dumps([]))
+             return
+
+         # 4. Load models and data
+         model = SentenceTransformer(
+             'TechWolf/JobBERT-v2',
+             cache_folder=str(backend_dir / "cached_models"),
+             device="cpu"
+         )
+         job_embeddings = torch.load(backend_dir / "job_embeddings.pt", map_location="cpu")
+         market_data = pd.read_csv(backend_dir / "final_prototype_postings.csv")
+
+         # 5. Filter data based on target_role
+         if target_role != "Overall Market":
+             role_specific_data = market_data[market_data["cmo_role_match"] == target_role]
+             if not role_specific_data.empty:
+                 role_indices = role_specific_data.index.tolist()
+                 embeddings_tensor = job_embeddings[role_indices]
+                 filtered_market_data = role_specific_data
+             else:
+                 embeddings_tensor = job_embeddings
+                 filtered_market_data = market_data
+         else:
+             embeddings_tensor = job_embeddings
+             filtered_market_data = market_data
+
+         # 6. Perform similarity search
+         cleaned_resume_text = new_refined_text_cleaning(resume_text)
+         resume_embedding = model.encode(
+             cleaned_resume_text, convert_to_tensor=True, device="cpu"
+         )
+
+         cosine_scores = util.cos_sim(resume_embedding, embeddings_tensor)[0]
+         top_results = torch.topk(cosine_scores, k=min(args.limit, len(filtered_market_data)))
+
+         # 7. Prepare and print results
+         similar_jobs = []
+         for score, idx in zip(top_results[0], top_results[1]):
+             job = filtered_market_data.iloc[idx.item()]
+             similar_jobs.append({
+                 "job_title": job["title"],
+                 "cmo_role_match": job["cmo_role_match"],
+                 "url": job["job_url"],
+                 "similarity_score": score.item(),
+             })
+
+         output = {
+             "total_jobs": len(filtered_market_data),
+             "similar_jobs": similar_jobs
+         }
+         print(json.dumps(output))
+
+     except Exception as e:
+         # Log any errors to stderr to be captured by the main process
+         print(f"Similarity worker error: {e}", file=sys.stderr)
+         # Output an empty list to stdout to prevent downstream JSON errors
+         print(json.dumps([]))
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
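The worker is driven over stdin/stdout exactly as main.py does it; a local-dev sketch of the same subprocess contract (using the current interpreter instead of the container path /app/similarity_env/bin/python3, and placeholder text standing in for a cleaned resume):

```python
import json
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "similarity_worker.py", "--target_role", "Overall Market", "--limit", "5"],
    input="python sql aws data pipelines",  # stands in for cleaned resume text
    capture_output=True,
    text=True,
    check=True,
)
for job in json.loads(proc.stdout)["similar_jobs"]:
    print(round(job["similarity_score"], 3), job["job_title"])
```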