michaelozon committed on
Commit
5dfa886
·
verified ·
1 Parent(s): 6f3365f

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +85 -17
pipeline.py CHANGED
@@ -1,24 +1,20 @@
1
  """
2
- Part 4: Input → Output Pipeline (Resume-Job Matching) - CLEAN (SPACE-READY)
3
- ==========================================================================
4
 
5
  ✅ Implements the core IO Pipeline: User Input → Embedding → Similarity → Top-K
6
  ✅ Loads precomputed embeddings from Part 3
7
  ✅ Uses the same job text construction logic as Part 3
8
  ✅ SAFE for HuggingFace Spaces: does NOT run demos on import
9
  ✅ Adds robust embedding normalization (so cosine similarity is correct)
10
-
11
- What changed vs your version:
12
- 1) All loading prints are removed from import-time (Spaces-friendly).
13
- 2) Heavy work is done lazily via init_pipeline().
14
- 3) Demo code runs ONLY if you run: python pipeline.py (not when Gradio imports it).
15
- 4) Ensures resume embeddings are normalized (even if Part 3 saved them non-normalized).
16
  """
17
 
18
  import os
19
  import json
20
  import ast
21
- from typing import List, Optional, Dict, Any, Tuple
22
 
23
  import numpy as np
24
  import pandas as pd
@@ -29,14 +25,16 @@ from sentence_transformers import SentenceTransformer
29
  # CONFIG
30
  # =========================
31
  DATASET_REPO = "michaelozon/candidate-matching-synthetic"
32
- MODEL_NAME = "intfloat/e5-small-v2"
33
 
34
  # Where embeddings are in your Space repo
 
35
  CANDIDATE_DIRS = ["./embeddings", "./embeddings_out", "./"]
36
 
37
  # Filenames you uploaded (based on your screenshot)
38
  RESUME_EMB_FILE = "intfloat__e5-small-v2_resumes.npy"
39
  RESUME_IDS_FILE = "intfloat__e5-small-v2_resume_ids.json"
 
40
 
41
  DEFAULT_TOP_K = 10
42
 
@@ -136,6 +134,40 @@ def _normalize_rows(mat: np.ndarray) -> np.ndarray:
136
  return mat / norms
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  # =========================
140
  # LAZY-LOADED GLOBALS (Spaces-friendly)
141
  # =========================
@@ -146,12 +178,18 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
146
  """
147
  Load everything once and keep it in memory.
148
  Call this from app.py before using rank_candidates_for_new_job().
 
 
 
149
  """
150
  global _PIPELINE
151
  if _PIPELINE and not force_reload:
152
  return _PIPELINE
153
 
 
 
154
  # ---- Load resumes DF ----
 
155
  df_resumes = load_dataset(
156
  DATASET_REPO,
157
  data_files="resumes/*.parquet",
@@ -161,8 +199,11 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
161
  df_resumes["skills"] = df_resumes["skills"].apply(to_list)
162
  df_resumes["experience_bullets"] = df_resumes["experience_bullets"].apply(to_list)
163
  df_resumes["resume_id"] = df_resumes["resume_id"].astype(str)
 
 
164
 
165
  # ---- Load embeddings + ids ----
 
166
  emb_path = find_existing_path(RESUME_EMB_FILE)
167
  ids_path = find_existing_path(RESUME_IDS_FILE)
168
 
@@ -174,6 +215,9 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
174
  "Tip: In your Space repo, put them under /embeddings/ (recommended)."
175
  )
176
 
 
 
 
177
  resume_emb = np.load(emb_path).astype(np.float32)
178
  with open(ids_path, "r", encoding="utf-8") as f:
179
  resume_ids = [str(x) for x in json.load(f)]
@@ -185,12 +229,19 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
185
 
186
  # Ensure embeddings normalized (cosine)
187
  resume_emb = _normalize_rows(resume_emb)
 
 
188
 
189
  # Fast lookup resume_id -> df row index
190
  df_index_by_id = {rid: i for i, rid in enumerate(df_resumes["resume_id"].tolist())}
191
 
192
  # ---- Load model (for query embedding) ----
193
- model = SentenceTransformer(MODEL_NAME, device="cpu")
 
 
 
 
 
194
 
195
  _PIPELINE = {
196
  "df_resumes": df_resumes,
@@ -198,7 +249,10 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
198
  "resume_ids": resume_ids,
199
  "df_index_by_id": df_index_by_id,
200
  "model": model,
 
201
  }
 
 
202
  return _PIPELINE
203
 
204
 
@@ -325,10 +379,15 @@ def rank_candidates_for_new_job(
325
  # DEMO (RUNS ONLY IF YOU EXECUTE THIS FILE DIRECTLY)
326
  # =========================
327
  if __name__ == "__main__":
328
- print("Initializing pipeline...")
 
 
 
329
  init_pipeline()
330
 
331
- print("\nDEMO 1: Senior Data Scientist in FinTech")
 
 
332
  demo1 = rank_candidates_for_new_job(
333
  job_title="Senior Data Scientist",
334
  seniority="Senior",
@@ -341,8 +400,11 @@ if __name__ == "__main__":
341
  top_k=10,
342
  )
343
  print(demo1.to_string(index=False))
 
344
 
345
- print("\nDEMO 2: UX Designer (role filter)")
 
 
346
  demo2 = rank_candidates_for_new_job(
347
  job_title="UX Designer",
348
  seniority="Mid-Level",
@@ -352,7 +414,7 @@ if __name__ == "__main__":
352
  filter_by_role=True,
353
  )
354
  if len(demo2) == 0:
355
- print("No results with role filter; showing without filter:")
356
  demo2 = rank_candidates_for_new_job(
357
  job_title="UX Designer",
358
  seniority="Mid-Level",
@@ -363,7 +425,9 @@ if __name__ == "__main__":
363
  )
364
  print(demo2.to_string(index=False))
365
 
366
- print("\nDEMO 3: Product Manager (E-commerce only)")
 
 
367
  demo3 = rank_candidates_for_new_job(
368
  job_title="Product Manager",
369
  seniority="Mid-Level",
@@ -372,4 +436,8 @@ if __name__ == "__main__":
372
  top_k=10,
373
  filter_by_industry=True,
374
  )
375
- print(demo3.to_string(index=False))
 
 
 
 
 
1
  """
2
+ Part 4: Input → Output Pipeline (Resume-Job Matching) - FINAL VERSION
3
+ =====================================================================
4
 
5
  ✅ Implements the core IO Pipeline: User Input → Embedding → Similarity → Top-K
6
  ✅ Loads precomputed embeddings from Part 3
7
  ✅ Uses the same job text construction logic as Part 3
8
  ✅ SAFE for HuggingFace Spaces: does NOT run demos on import
9
  ✅ Adds robust embedding normalization (so cosine similarity is correct)
10
+ ✅ Reads winning model from optimal_model.json (with fallback)
11
+ ✅ Corrected directory search order (embeddings/ first)
 
 
 
 
12
  """
13
 
14
  import os
15
  import json
16
  import ast
17
+ from typing import List, Optional, Dict, Any
18
 
19
  import numpy as np
20
  import pandas as pd
 
25
  # CONFIG
26
  # =========================
27
  DATASET_REPO = "michaelozon/candidate-matching-synthetic"
28
+ MODEL_NAME_DEFAULT = "intfloat/e5-small-v2" # Fallback if optimal_model.json not found
29
 
30
  # Where embeddings are in your Space repo
31
+ # Search order: ./embeddings is checked first, then ./embeddings_out, then the repo root
32
  CANDIDATE_DIRS = ["./embeddings", "./embeddings_out", "./"]
33
 
34
  # Filenames you uploaded (based on your screenshot)
35
  RESUME_EMB_FILE = "intfloat__e5-small-v2_resumes.npy"
36
  RESUME_IDS_FILE = "intfloat__e5-small-v2_resume_ids.json"
37
+ OPTIMAL_MODEL_FILE = "optimal_model.json" # NEW: Model selection file
38
 
39
  DEFAULT_TOP_K = 10
40
 
 
134
  return mat / norms
135
 
136
 
137
+ def _load_optimal_model_name() -> str:
138
+ """
139
+ NEW: Load the winning model name from optimal_model.json
140
+
141
+ This implements the Part 5 requirement:
142
+ "Read the winning Embedding model directly from HF model repo"
143
+
144
+ Returns:
145
+ model_name: The model name to use (from JSON or fallback)
146
+ """
147
+ optimal_model_path = find_existing_path(OPTIMAL_MODEL_FILE)
148
+
149
+ if optimal_model_path:
150
+ try:
151
+ with open(optimal_model_path, "r", encoding="utf-8") as f:
152
+ optimal_data = json.load(f)
153
+
154
+ # Extract model_name from JSON
155
+ model_name = optimal_data.get("model_name") or optimal_data.get("model")
156
+
157
+ if model_name:
158
+ print(f"βœ… Using model from {OPTIMAL_MODEL_FILE}: {model_name}")
159
+ return model_name
160
+ else:
161
+ print(f"⚠️ No 'model_name' field in {OPTIMAL_MODEL_FILE}")
162
+
163
+ except Exception as e:
164
+ print(f"⚠️ Could not read {OPTIMAL_MODEL_FILE}: {e}")
165
+
166
+ # Fallback to default
167
+ print(f"ℹ️ Using default model: {MODEL_NAME_DEFAULT}")
168
+ return MODEL_NAME_DEFAULT
169
+
170
+
171
  # =========================
172
  # LAZY-LOADED GLOBALS (Spaces-friendly)
173
  # =========================
 
178
  """
179
  Load everything once and keep it in memory.
180
  Call this from app.py before using rank_candidates_for_new_job().
181
+
182
+ FIXED: Now loads model name from optimal_model.json (with fallback)
183
+ FIXED: Corrected directory search order
184
  """
185
  global _PIPELINE
186
  if _PIPELINE and not force_reload:
187
  return _PIPELINE
188
 
189
+ print("πŸ”„ Initializing pipeline...")
190
+
191
  # ---- Load resumes DF ----
192
+ print(f"πŸ“₯ Loading dataset from {DATASET_REPO}...")
193
  df_resumes = load_dataset(
194
  DATASET_REPO,
195
  data_files="resumes/*.parquet",
 
199
  df_resumes["skills"] = df_resumes["skills"].apply(to_list)
200
  df_resumes["experience_bullets"] = df_resumes["experience_bullets"].apply(to_list)
201
  df_resumes["resume_id"] = df_resumes["resume_id"].astype(str)
202
+
203
+ print(f"βœ… Loaded {len(df_resumes):,} resumes")
204
 
205
  # ---- Load embeddings + ids ----
206
+ print(f"πŸ“¦ Loading embeddings from {CANDIDATE_DIRS}...")
207
  emb_path = find_existing_path(RESUME_EMB_FILE)
208
  ids_path = find_existing_path(RESUME_IDS_FILE)
209
 
 
215
  "Tip: In your Space repo, put them under /embeddings/ (recommended)."
216
  )
217
 
218
+ print(f" Found embeddings at: {emb_path}")
219
+ print(f" Found IDs at: {ids_path}")
220
+
221
  resume_emb = np.load(emb_path).astype(np.float32)
222
  with open(ids_path, "r", encoding="utf-8") as f:
223
  resume_ids = [str(x) for x in json.load(f)]
 
229
 
230
  # Ensure embeddings normalized (cosine)
231
  resume_emb = _normalize_rows(resume_emb)
232
+
233
+ print(f"βœ… Loaded embeddings: {resume_emb.shape}")
234
 
235
  # Fast lookup resume_id -> df row index
236
  df_index_by_id = {rid: i for i, rid in enumerate(df_resumes["resume_id"].tolist())}
237
 
238
  # ---- Load model (for query embedding) ----
239
+ # NEW: Read model name from optimal_model.json with fallback
240
+ model_name = _load_optimal_model_name()
241
+
242
+ print(f"πŸ€– Loading model: {model_name}...")
243
+ model = SentenceTransformer(model_name, device="cpu")
244
+ print(f"βœ… Model loaded successfully")
245
 
246
  _PIPELINE = {
247
  "df_resumes": df_resumes,
 
249
  "resume_ids": resume_ids,
250
  "df_index_by_id": df_index_by_id,
251
  "model": model,
252
+ "model_name": model_name, # Store for reference
253
  }
254
+
255
+ print("βœ… Pipeline initialization complete!\n")
256
  return _PIPELINE
257
 
258
 
 
379
  # DEMO (RUNS ONLY IF YOU EXECUTE THIS FILE DIRECTLY)
380
  # =========================
381
  if __name__ == "__main__":
382
+ print("="*80)
383
+ print("PART 4: Pipeline Demo")
384
+ print("="*80 + "\n")
385
+
386
  init_pipeline()
387
 
388
+ print("\n" + "="*80)
389
+ print("DEMO 1: Senior Data Scientist in FinTech")
390
+ print("="*80)
391
  demo1 = rank_candidates_for_new_job(
392
  job_title="Senior Data Scientist",
393
  seniority="Senior",
 
400
  top_k=10,
401
  )
402
  print(demo1.to_string(index=False))
403
+ print(f"\nScore range: [{demo1['similarity_score'].min():.4f}, {demo1['similarity_score'].max():.4f}]")
404
 
405
+ print("\n" + "="*80)
406
+ print("DEMO 2: UX Designer (with role filter)")
407
+ print("="*80)
408
  demo2 = rank_candidates_for_new_job(
409
  job_title="UX Designer",
410
  seniority="Mid-Level",
 
414
  filter_by_role=True,
415
  )
416
  if len(demo2) == 0:
417
+ print("⚠️ No results with role filter; showing without filter:")
418
  demo2 = rank_candidates_for_new_job(
419
  job_title="UX Designer",
420
  seniority="Mid-Level",
 
425
  )
426
  print(demo2.to_string(index=False))
427
 
428
+ print("\n" + "="*80)
429
+ print("DEMO 3: Product Manager (E-commerce only)")
430
+ print("="*80)
431
  demo3 = rank_candidates_for_new_job(
432
  job_title="Product Manager",
433
  seniority="Mid-Level",
 
436
  top_k=10,
437
  filter_by_industry=True,
438
  )
439
+ print(demo3.to_string(index=False))
440
+
441
+ print("\n" + "="*80)
442
+ print("βœ… All demos completed successfully!")
443
+ print("="*80)