Spaces:

michaelozon
/

Resume_job_Matching_System_new

Sleeping

App Files Files Community

michaelozon committed on Jan 22

Commit

5dfa886

verified ·

1 Parent(s): 6f3365f

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +85 -17

pipeline.py CHANGED Viewed

@@ -1,24 +1,20 @@
 """
-Part 4: Input → Output Pipeline (Resume-Job Matching) - CLEAN (SPACE-READY)
-==========================================================================
 ✅ Implements the core IO Pipeline: User Input → Embedding → Similarity → Top-K
 ✅ Loads precomputed embeddings from Part 3
 ✅ Uses the same job text construction logic as Part 3
 ✅ SAFE for HuggingFace Spaces: does NOT run demos on import
 ✅ Adds robust embedding normalization (so cosine similarity is correct)
-What changed vs your version:
-1) All loading prints are removed from import-time (Spaces-friendly).
-2) Heavy work is done lazily via init_pipeline().
-3) Demo code runs ONLY if you run: python pipeline.py (not when Gradio imports it).
-4) Ensures resume embeddings are normalized (even if Part 3 saved them non-normalized).
 """
 import os
 import json
 import ast
-from typing import List, Optional, Dict, Any, Tuple
 import numpy as np
 import pandas as pd
@@ -29,14 +25,16 @@ from sentence_transformers import SentenceTransformer
 # CONFIG
 # =========================
 DATASET_REPO = "michaelozon/candidate-matching-synthetic"
-MODEL_NAME = "intfloat/e5-small-v2"
 # Where embeddings are in your Space repo
 CANDIDATE_DIRS = ["./embeddings", "./embeddings_out", "./"]
 # Filenames you uploaded (based on your screenshot)
 RESUME_EMB_FILE = "intfloat__e5-small-v2_resumes.npy"
 RESUME_IDS_FILE = "intfloat__e5-small-v2_resume_ids.json"
 DEFAULT_TOP_K = 10
@@ -136,6 +134,40 @@ def _normalize_rows(mat: np.ndarray) -> np.ndarray:
     return mat / norms
 # =========================
 # LAZY-LOADED GLOBALS (Spaces-friendly)
 # =========================
@@ -146,12 +178,18 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
     """
     Load everything once and keep it in memory.
     Call this from app.py before using rank_candidates_for_new_job().
     """
     global _PIPELINE
     if _PIPELINE and not force_reload:
         return _PIPELINE
     # ---- Load resumes DF ----
     df_resumes = load_dataset(
         DATASET_REPO,
         data_files="resumes/*.parquet",
@@ -161,8 +199,11 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
     df_resumes["skills"] = df_resumes["skills"].apply(to_list)
     df_resumes["experience_bullets"] = df_resumes["experience_bullets"].apply(to_list)
     df_resumes["resume_id"] = df_resumes["resume_id"].astype(str)
     # ---- Load embeddings + ids ----
     emb_path = find_existing_path(RESUME_EMB_FILE)
     ids_path = find_existing_path(RESUME_IDS_FILE)
@@ -174,6 +215,9 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
             "Tip: In your Space repo, put them under /embeddings/ (recommended)."
         )
     resume_emb = np.load(emb_path).astype(np.float32)
     with open(ids_path, "r", encoding="utf-8") as f:
         resume_ids = [str(x) for x in json.load(f)]
@@ -185,12 +229,19 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
     # Ensure embeddings normalized (cosine)
     resume_emb = _normalize_rows(resume_emb)
     # Fast lookup resume_id -> df row index
     df_index_by_id = {rid: i for i, rid in enumerate(df_resumes["resume_id"].tolist())}
     # ---- Load model (for query embedding) ----
-    model = SentenceTransformer(MODEL_NAME, device="cpu")
     _PIPELINE = {
         "df_resumes": df_resumes,
@@ -198,7 +249,10 @@ def init_pipeline(force_reload: bool = False) -> Dict[str, Any]:
         "resume_ids": resume_ids,
         "df_index_by_id": df_index_by_id,
         "model": model,
     }
     return _PIPELINE
@@ -325,10 +379,15 @@ def rank_candidates_for_new_job(
 # DEMO (RUNS ONLY IF YOU EXECUTE THIS FILE DIRECTLY)
 # =========================
 if __name__ == "__main__":
-    print("Initializing pipeline...")
     init_pipeline()
-    print("\nDEMO 1: Senior Data Scientist in FinTech")
     demo1 = rank_candidates_for_new_job(
         job_title="Senior Data Scientist",
         seniority="Senior",
@@ -341,8 +400,11 @@ if __name__ == "__main__":
         top_k=10,
     )
     print(demo1.to_string(index=False))
-    print("\nDEMO 2: UX Designer (role filter)")
     demo2 = rank_candidates_for_new_job(
         job_title="UX Designer",
         seniority="Mid-Level",
@@ -352,7 +414,7 @@ if __name__ == "__main__":
         filter_by_role=True,
     )
     if len(demo2) == 0:
-        print("No results with role filter; showing without filter:")
         demo2 = rank_candidates_for_new_job(
             job_title="UX Designer",
             seniority="Mid-Level",
@@ -363,7 +425,9 @@ if __name__ == "__main__":
         )
     print(demo2.to_string(index=False))
-    print("\nDEMO 3: Product Manager (E-commerce only)")
     demo3 = rank_candidates_for_new_job(
         job_title="Product Manager",
         seniority="Mid-Level",
@@ -372,4 +436,8 @@ if __name__ == "__main__":
         top_k=10,
         filter_by_industry=True,
     )
-    print(demo3.to_string(index=False))

 """
+Part 4: Input → Output Pipeline (Resume-Job Matching) - FINAL VERSION
+=====================================================================
 ✅ Implements the core IO Pipeline: User Input → Embedding → Similarity → Top-K
 ✅ Loads precomputed embeddings from Part 3
 ✅ Uses the same job text construction logic as Part 3
 ✅ SAFE for HuggingFace Spaces: does NOT run demos on import
 ✅ Adds robust embedding normalization (so cosine similarity is correct)
+✅ Reads winning model from optimal_model.json (with fallback)
+✅ Corrected directory search order (embeddings/ first)
 """
 import os
 import json
 import ast
+from typing import List, Optional, Dict, Any
 import numpy as np
 import pandas as pd
 # CONFIG
 # =========================
 DATASET_REPO = "michaelozon/candidate-matching-synthetic"
+MODEL_NAME_DEFAULT = "intfloat/e5-small-v2"  # Fallback if optimal_model.json not found
 # Where embeddings are in your Space repo
+# Search order: ./embeddings is tried first, then ./embeddings_out, then the repo root.
 CANDIDATE_DIRS = ["./embeddings", "./embeddings_out", "./"]
 # Filenames you uploaded (based on your screenshot)
 RESUME_EMB_FILE = "intfloat__e5-small-v2_resumes.npy"
 RESUME_IDS_FILE = "intfloat__e5-small-v2_resume_ids.json"
+OPTIMAL_MODEL_FILE = "optimal_model.json"  # NEW: Model selection file
 DEFAULT_TOP_K = 10
     return mat / norms
+def _load_optimal_model_name() -> str:
+    """
+    NEW: Load the winning model name from optimal_model.json
+    This implements the Part 5 requirement:
+    "Read the winning Embedding model directly from HF model repo"
+    Returns:
+        model_name: The model name to use (from JSON or fallback)
+    """
+    optimal_model_path = find_existing_path(OPTIMAL_MODEL_FILE)
+    if optimal_model_path:
+        try:
+            with open(optimal_model_path, "r", encoding="utf-8") as f:
+                optimal_data = json.load(f)
+            # Extract model_name from JSON
+            model_name = optimal_data.get("model_name") or optimal_data.get("model")
+            if model_name:
+                print(f"✅ Using model from {OPTIMAL_MODEL_FILE}: {model_name}")
+                return model_name
+            else:
+                print(f"⚠️  No 'model_name' field in {OPTIMAL_MODEL_FILE}")
+        except Exception as e:
+            print(f"⚠️  Could not read {OPTIMAL_MODEL_FILE}: {e}")
+    # Fallback to default
+    print(f"ℹ️  Using default model: {MODEL_NAME_DEFAULT}")
+    return MODEL_NAME_DEFAULT
 # =========================
 # LAZY-LOADED GLOBALS (Spaces-friendly)
 # =========================
     """
     Load everything once and keep it in memory.
     Call this from app.py before using rank_candidates_for_new_job().
+    FIXED: Now loads model name from optimal_model.json (with fallback)
+    FIXED: Corrected directory search order
     """
     global _PIPELINE
     if _PIPELINE and not force_reload:
         return _PIPELINE
+    print("🔄 Initializing pipeline...")
     # ---- Load resumes DF ----
+    print(f"📥 Loading dataset from {DATASET_REPO}...")
     df_resumes = load_dataset(
         DATASET_REPO,
         data_files="resumes/*.parquet",
     df_resumes["skills"] = df_resumes["skills"].apply(to_list)
     df_resumes["experience_bullets"] = df_resumes["experience_bullets"].apply(to_list)
     df_resumes["resume_id"] = df_resumes["resume_id"].astype(str)
+    print(f"✅ Loaded {len(df_resumes):,} resumes")
     # ---- Load embeddings + ids ----
+    print(f"📦 Loading embeddings from {CANDIDATE_DIRS}...")
     emb_path = find_existing_path(RESUME_EMB_FILE)
     ids_path = find_existing_path(RESUME_IDS_FILE)
             "Tip: In your Space repo, put them under /embeddings/ (recommended)."
         )
+    print(f"   Found embeddings at: {emb_path}")
+    print(f"   Found IDs at: {ids_path}")
     resume_emb = np.load(emb_path).astype(np.float32)
     with open(ids_path, "r", encoding="utf-8") as f:
         resume_ids = [str(x) for x in json.load(f)]
     # Ensure embeddings normalized (cosine)
     resume_emb = _normalize_rows(resume_emb)
+    print(f"✅ Loaded embeddings: {resume_emb.shape}")
     # Fast lookup resume_id -> df row index
     df_index_by_id = {rid: i for i, rid in enumerate(df_resumes["resume_id"].tolist())}
     # ---- Load model (for query embedding) ----
+    # NEW: Read model name from optimal_model.json with fallback
+    model_name = _load_optimal_model_name()
+    print(f"🤖 Loading model: {model_name}...")
+    model = SentenceTransformer(model_name, device="cpu")
+    print(f"✅ Model loaded successfully")
     _PIPELINE = {
         "df_resumes": df_resumes,
         "resume_ids": resume_ids,
         "df_index_by_id": df_index_by_id,
         "model": model,
+        "model_name": model_name,  # Store for reference
     }
+    print("✅ Pipeline initialization complete!\n")
     return _PIPELINE
 # DEMO (RUNS ONLY IF YOU EXECUTE THIS FILE DIRECTLY)
 # =========================
 if __name__ == "__main__":
+    print("="*80)
+    print("PART 4: Pipeline Demo")
+    print("="*80 + "\n")
     init_pipeline()
+    print("\n" + "="*80)
+    print("DEMO 1: Senior Data Scientist in FinTech")
+    print("="*80)
     demo1 = rank_candidates_for_new_job(
         job_title="Senior Data Scientist",
         seniority="Senior",
         top_k=10,
     )
     print(demo1.to_string(index=False))
+    print(f"\nScore range: [{demo1['similarity_score'].min():.4f}, {demo1['similarity_score'].max():.4f}]")
+    print("\n" + "="*80)
+    print("DEMO 2: UX Designer (with role filter)")
+    print("="*80)
     demo2 = rank_candidates_for_new_job(
         job_title="UX Designer",
         seniority="Mid-Level",
         filter_by_role=True,
     )
     if len(demo2) == 0:
+        print("⚠️  No results with role filter; showing without filter:")
         demo2 = rank_candidates_for_new_job(
             job_title="UX Designer",
             seniority="Mid-Level",
         )
     print(demo2.to_string(index=False))
+    print("\n" + "="*80)
+    print("DEMO 3: Product Manager (E-commerce only)")
+    print("="*80)
     demo3 = rank_candidates_for_new_job(
         job_title="Product Manager",
         seniority="Mid-Level",
         top_k=10,
         filter_by_industry=True,
     )
+    print(demo3.to_string(index=False))
+    print("\n" + "="*80)
+    print("✅ All demos completed successfully!")
+    print("="*80)