"""Text clean-up helpers for degree labels and manual-reference maintenance.

The module groups three concerns:

* formatting / normalizing free-text degree labels,
* pruning manually maintained reference values that duplicate official ones,
* pushing the manual references file to the hosting Hugging Face Space.
"""

import re
import unicodedata
from pathlib import Path
from typing import Any, Iterator

from src.config import HF_TOKEN, SPACE_ID

# Degree keywords removed by ``strip_degrees_for_search``.  The "PhD" variant
# carries its own terminator: a shared trailing ``\b`` can never match between
# a final "." and a following space, which used to leave a stray dot behind
# ("Finance Ph.D. Program" -> "Finance . Program").
_DEGREE_RE = re.compile(
    r"\b(?:"
    r"Ph\.?D(?:\.(?!\w)|\.?\b)"
    r"|(?:MSc|MBA|BBA|BSc|BA|MA|BS|MS|EMBA|Master|Bachelor|Masters|Bachelors|Licence)\b"
    r")",
    re.IGNORECASE,
)
_WHITESPACE_RE = re.compile(r"\s+")

# Title-cased tokens that must be rendered fully upper-case.
_ACRONYMS = (
    "Ma", "Ba", "Mba", "Bba", "Hr", "It", "Bs", "Ms",
    "Phd", "Bsc", "Msc", "Llm", "Pge", "Cems",
)
_ACRONYM_RE = re.compile(r"\b(?:" + "|".join(_ACRONYMS) + r")\b")

# ``str.title()`` upper-cases the letter following an apostrophe
# ("master's" -> "Master'S"); this matches that possessive so it can be undone.
_POSSESSIVE_RE = re.compile(r"(?<=[^\W\d_])(['\u2019])S\b")

# French elisions: "L'Entreprise" -> "L'entreprise".
_ELISION_RE = re.compile(r"\b(L|D|Qu)'([A-Z])")

_AND_RE = re.compile(r"\band\b", re.IGNORECASE)
_ET_RE = re.compile(r"\bet\b", re.IGNORECASE)
_DISALLOWED_CHARS_RE = re.compile(r"[^\w\s\-&\+\']")


def strip_degrees_for_search(text: Any) -> Any:
    """Remove degree keywords (MSc, MBA, Ph.D., Bachelor, ...) from *text*.

    Non-string input is returned unchanged.  If stripping the keywords leaves
    nothing, the original text (whitespace-trimmed) is returned instead so the
    caller never ends up with an empty search string.
    """
    if not isinstance(text, str):
        return text

    cleaned = _DEGREE_RE.sub("", text)
    cleaned = _WHITESPACE_RE.sub(" ", cleaned)
    # Drop separators that are left dangling once a keyword is removed.
    cleaned = cleaned.strip(" -.,&/|")
    return cleaned if cleaned else text.strip()


def smart_format(text: Any) -> Any:
    """Title-case *text* while keeping known acronyms and elisions readable.

    Non-string input is returned unchanged.  Known acronyms are upper-cased
    (with the mixed-case spellings ``PhD``, ``BSc`` and ``MSc``), the letter
    after a French elision (``L'``, ``D'``, ``Qu'``) is lower-cased, and a
    possessive ``'s`` is kept lower-case.
    """
    if not isinstance(text, str):
        return text

    res = text.title()
    # Undo title()'s capitalisation of possessives: "Master'S" -> "Master's".
    res = _POSSESSIVE_RE.sub(lambda m: m.group(1) + "s", res)
    res = _ACRONYM_RE.sub(lambda m: m.group(0).upper(), res)
    res = res.replace("PHD", "PhD").replace("BSC", "BSc").replace("MSC", "MSc")
    res = _ELISION_RE.sub(lambda m: f"{m.group(1)}'{m.group(2).lower()}", res)
    return res.strip()


def clean_degree_text(text: Any) -> str:
    """Normalize a raw degree label and return it formatted by ``smart_format``.

    "and" / "et" become "&", every character other than word characters,
    whitespace, ``-``, ``&``, ``+`` and ``'`` is replaced by a space, and runs
    of whitespace are collapsed.  Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    text = _AND_RE.sub("&", text)
    text = _ET_RE.sub("&", text)
    text = _DISALLOWED_CHARS_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text).strip()
    return smart_format(text)


def normalize_text(text: Any) -> str:
    """Return *text* lower-cased, trimmed and without combining accents.

    Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    # NFD splits accented letters into base letter + combining mark ("Mn"),
    # so dropping the marks removes the accents.
    decomposed = unicodedata.normalize("NFD", text)
    normalized = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
    return normalized.strip().lower()


def normalize_ref(value: Any) -> str:
    """Return the comparison key for a reference value of any type."""
    return normalize_text(str(value))


def iter_ref_values(ref_data: Any) -> Iterator[str]:
    """Yield every string found in a reference bucket.

    For a dict, string keys are yielded first, then string values.  For a
    list, its string items are yielded.  Any other type yields nothing.
    """
    if isinstance(ref_data, dict):
        yield from (item for item in ref_data.keys() if isinstance(item, str))
        yield from (item for item in ref_data.values() if isinstance(item, str))
    elif isinstance(ref_data, list):
        yield from (item for item in ref_data if isinstance(item, str))


def ref_contains(ref_data: Any, value: Any) -> bool:
    """Return True if *value* matches any string in *ref_data* after normalization."""
    needle = normalize_ref(value)
    return any(normalize_ref(item) == needle for item in iter_ref_values(ref_data))


def prune_manual_refs_against_official(manual_refs, official_refs) -> int:
    """Drop manual reference entries that are invalid, duplicated or official.

    *manual_refs* is modified in place: each list or dict bucket is replaced
    by a pruned copy (buckets of any other type are left untouched).  An entry
    is removed when its normalized key is empty, has already been seen in the
    same bucket, or is present in the matching *official_refs* bucket.  In list
    buckets non-string items are removed as well.  In dict buckets the compared
    value is the mapped value when it is a string, otherwise the alias, and
    surviving aliases are stored under their normalized form.

    Returns:
        The number of entries removed across all buckets.
    """
    removed_count = 0

    # Snapshot the items because buckets are reassigned while looping.
    for column_name, manual_bucket in list(manual_refs.items()):
        official_bucket = official_refs.get(column_name, [])
        # Normalize the official bucket once per column rather than once per
        # manual value.
        official_keys = {
            normalize_ref(item) for item in iter_ref_values(official_bucket)
        }

        if isinstance(manual_bucket, list):
            kept_values = []
            seen = set()
            for value in manual_bucket:
                if not isinstance(value, str):
                    removed_count += 1
                    continue
                key = normalize_ref(value)
                if not key or key in seen or key in official_keys:
                    removed_count += 1
                    continue
                seen.add(key)
                kept_values.append(value)
            manual_refs[column_name] = kept_values

        elif isinstance(manual_bucket, dict):
            kept_aliases = {}
            seen_values = set()
            for alias, value in manual_bucket.items():
                candidate = value if isinstance(value, str) else alias
                key = normalize_ref(candidate)
                if not key or key in seen_values or key in official_keys:
                    removed_count += 1
                    continue
                seen_values.add(key)
                kept_aliases[normalize_ref(alias)] = value
            manual_refs[column_name] = kept_aliases

    return removed_count


MANUAL_REFERENCES_REPO_PATH = "refdata/manual_references.json"


def reference_sync_status() -> dict:
    """Report whether the manual references can be synced to the Space.

    Returns:
        A dict with ``enabled`` (bool), ``space_id`` (str) and ``reason``
        (str, empty when sync is enabled).  Sync needs both ``SPACE_ID`` and
        ``HF_TOKEN`` to be set.
    """
    if not SPACE_ID:
        return {
            "enabled": False,
            "space_id": "",
            "reason": "Reference sync is only available on Hugging Face Spaces.",
        }
    if not HF_TOKEN:
        return {
            "enabled": False,
            "space_id": SPACE_ID,
            "reason": "HF_TOKEN secret is missing from this Space.",
        }
    return {
        "enabled": True,
        "space_id": SPACE_ID,
        "reason": "",
    }


def save_manual_references_to_hub(app_root: Path) -> dict:
    """Upload the manual references file to the Space repository.

    Args:
        app_root: Directory that contains ``MANUAL_REFERENCES_REPO_PATH``.

    Returns:
        A dict with ``space_id``, ``path`` (path inside the repository) and
        ``commit_url`` (string form of the value returned by ``upload_file``).

    Raises:
        RuntimeError: If sync is disabled (see ``reference_sync_status``) or
            ``huggingface_hub`` is not installed.
        FileNotFoundError: If the manual references file does not exist.
    """
    status = reference_sync_status()
    if not status["enabled"]:
        raise RuntimeError(status["reason"])

    manual_refs_path = app_root / MANUAL_REFERENCES_REPO_PATH
    if not manual_refs_path.is_file():
        raise FileNotFoundError(f"Manual references file not found: {manual_refs_path}")

    # Imported lazily so the module can be loaded without huggingface_hub.
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise RuntimeError("huggingface_hub is not installed.") from exc

    api = HfApi(token=HF_TOKEN)
    commit_info = api.upload_file(
        path_or_fileobj=str(manual_refs_path),
        path_in_repo=MANUAL_REFERENCES_REPO_PATH,
        repo_id=status["space_id"],
        repo_type="space",
        commit_message="Update manual references",
    )
    return {
        "space_id": status["space_id"],
        "path": MANUAL_REFERENCES_REPO_PATH,
        "commit_url": str(commit_info),
    }