Spaces:
Running
Running
| """ | |
| Data loader for candidate data. | |
| AUTO-GENERATED by scripts/generate_hf.sh - DO NOT EDIT DIRECTLY | |
| Edit data_loader.py in main repo and regenerate. | |
| """ | |
| from pathlib import Path | |
| from typing import Optional | |
| import pandas as pd | |
class DataLoader:
    """Singleton that loads candidate data from a parquet file and caches it.

    Every call to ``DataLoader()`` returns the same instance. The parquet
    file is read on first construction and kept in memory; a derived
    ``skills_parsed`` column (a list per row) is added next to the raw
    ``skills`` column.
    """

    _instance: Optional['DataLoader'] = None
    _data: Optional[pd.DataFrame] = None

    def __new__(cls):
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Load the data unless it has already been loaded.

        ``__init__`` runs on every ``DataLoader()`` call even though
        ``__new__`` hands back the same object, hence the guard.
        """
        if self._data is None:
            self._load_data()

    def _load_data(self):
        """Locate and read the parquet file, then derive ``skills_parsed``.

        Raises:
            FileNotFoundError: If none of the candidate locations exists.
        """
        # Find data file - try multiple locations for different deployments
        possible_paths = [
            # Main repo: bpo_benchmark/api/data_loader.py -> data/
            Path(__file__).parent.parent.parent / "data" / "candidate_data.parquet",
            # HF/CUGA: data_loader.py in same dir as data/
            Path(__file__).parent / "data" / "candidate_data.parquet",
            # Current working directory
            Path("./data/candidate_data.parquet"),
            # HuggingFace Spaces default path
            Path("/home/user/app/data/candidate_data.parquet"),
        ]

        # First existing location wins.
        data_file = next((path for path in possible_paths if path.exists()), None)
        if data_file is None:
            raise FileNotFoundError(
                f"Data file not found. Searched paths: {[str(p) for p in possible_paths]}"
            )

        frame = pd.read_parquet(data_file)
        frame['skills_parsed'] = self._parse_skills_column(frame['skills'])
        # Publish only once the frame is complete, so a failure above cannot
        # leave a cached frame that lacks the derived column.
        self._data = frame

    @staticmethod
    def _parse_skills_column(skills: pd.Series) -> pd.Series:
        """Convert each raw ``skills`` value into a list.

        Lists are returned as-is and numpy arrays are converted to lists.
        Strings are parsed with ``ast.literal_eval`` and kept only when they
        evaluate to a list or tuple. Everything else (``None``, NaN, empty or
        unparsable strings, literals of another type) becomes ``[]``.
        """
        import ast
        import numpy as np

        def parse_skills(x):
            if isinstance(x, np.ndarray):
                return list(x)
            if isinstance(x, list):
                return x
            if isinstance(x, str) and x != '':
                try:
                    parsed = ast.literal_eval(x)
                except (ValueError, TypeError, SyntaxError,
                        MemoryError, RecursionError):
                    # Not a valid Python literal: treat as "no skills".
                    return []
                # A literal such as "5" or "{'a': 1}" is valid Python but
                # not a skills list; never leak it into a list column.
                if isinstance(parsed, (list, tuple)):
                    return list(parsed)
                return []
            # None, NaN, empty string and any other scalar.
            return []

        return skills.apply(parse_skills)

    @property
    def data(self) -> pd.DataFrame:
        """The cached candidate DataFrame, loaded on first access if needed.

        Exposed as a property because the query methods of this class read
        it as an attribute (``self.data[...]``).
        """
        if self._data is None:
            self._load_data()
        return self._data

    def get_by_requisition(self, requisition_id: str) -> pd.DataFrame:
        """Return a copy of all candidate rows for ``requisition_id``."""
        return self.data[self.data['requisition_id'] == requisition_id].copy()

    def get_similar_requisitions(self, requisition_id: str) -> pd.DataFrame:
        """Return a copy of the candidate rows from similar requisitions.

        Similarity is determined by matching requisition metadata:
        - Primary: requisition_template_id (most specific)
        - Fallback: department + seniority_level (broader matching)

        The fallback is used when the template match is empty or spans
        fewer than two distinct requisitions. An unknown ``requisition_id``
        yields an empty DataFrame with the same columns as the data.
        """
        ref_rows = self.data[self.data['requisition_id'] == requisition_id]
        if ref_rows.empty:
            # Unknown requisition - return empty DataFrame
            return pd.DataFrame(columns=self.data.columns)

        # The first row is used as the source of the requisition's metadata.
        ref_row = ref_rows.iloc[0]
        ref_template_id = ref_row.get('requisition_template_id')

        # Primary: match by template ID if present and non-blank
        if pd.notna(ref_template_id) and str(ref_template_id).strip():
            similar = self.data[
                self.data['requisition_template_id'] == ref_template_id
            ]
        else:
            similar = pd.DataFrame(columns=self.data.columns)

        # Fallback: if template match is missing/too small, match by dept + seniority
        if similar.empty or similar['requisition_id'].nunique() < 2:
            ref_department = ref_row.get('department')
            ref_seniority = ref_row.get('seniority_level')
            similar_mask = (
                (self.data['department'] == ref_department)
                & (self.data['seniority_level'] == ref_seniority)
            )
            similar = self.data[similar_mask]

        return similar.copy()

    def is_valid_requisition(self, requisition_id: str) -> bool:
        """Return True if ``requisition_id`` occurs in the data."""
        return requisition_id in self.data['requisition_id'].values

    def get_suggested_requisitions(self, invalid_id: str, limit: int = 4) -> list:
        """Suggest up to ``limit`` valid requisition IDs for an invalid one.

        Uses RapidFuzz (``fuzz.WRatio``) to pick the closest-matching IDs.
        If RapidFuzz cannot be imported, or matching fails for any reason,
        the first ``limit`` valid IDs are returned instead.
        """
        valid_ids = list(self.data['requisition_id'].unique())
        try:
            from rapidfuzz import process, fuzz
            matches = process.extract(
                invalid_id,
                valid_ids,
                scorer=fuzz.WRatio,
                limit=limit,
            )
            return [match[0] for match in matches]
        except Exception:
            # Deliberately broad: suggestions are best-effort, so a missing
            # or failing RapidFuzz must not break the caller.
            return valid_ids[:limit]
# Shared module-level instance; constructing it here loads the data at import time.
_loader = DataLoader()
def get_data_loader() -> DataLoader:
    """Return the module-level ``DataLoader`` instance created at import."""
    return _loader