Spaces:
Running
Running
| from pathlib import Path | |
| import re | |
| import unicodedata | |
| from src.config import HF_TOKEN, SPACE_ID | |
def strip_degrees_for_search(text):
    """Remove degree keywords (MSc, MBA, PhD, ...) from *text* for searching.

    Args:
        text: The label to clean. Non-string values are returned unchanged.

    Returns:
        The text without degree keywords, with runs of whitespace collapsed
        and leading/trailing separator characters (space, ``-``, ``.``,
        ``,``, ``&``, ``/``, ``|``) stripped. When nothing is left after
        cleaning, the original text is returned with surrounding whitespace
        stripped instead of an empty string.
    """
    if not isinstance(text, str):
        return text
    # "Ph.D." is matched together with its trailing dot: ending that
    # alternative on a word boundary alone never consumes the final "." and
    # leaves it stranded in the middle of the result.
    degree_pattern = (
        r'\b(?:'
        r'(?:MSc|MBA|BBA|BSc|BA|MA|BS|MS|EMBA|Master|Bachelor|Masters'
        r'|Bachelors|Licence)\b'
        r'|Ph\.?D(?:\.|\b)'
        r')'
    )
    cleaned = re.sub(degree_pattern, '', text, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip(' -.,&/|')
    if not cleaned:
        # The text consisted only of degree keywords; keep it searchable.
        return text.strip()
    return cleaned
def smart_format(text):
    """Title-case *text* while keeping acronyms, elisions and possessives tidy.

    Args:
        text: The label to format. Non-string values are returned unchanged.

    Returns:
        The title-cased text, stripped of surrounding whitespace, in which:

        * known acronyms are upper-cased (``Mba`` -> ``MBA``), with the
          mixed-case spellings ``PhD``, ``BSc`` and ``MSc`` restored;
        * the letter following an ``L'``, ``D'`` or ``Qu'`` elision is
          lower-cased again, including accented letters;
        * a possessive ``'S`` produced by ``str.title()`` becomes ``'s``.
    """
    if not isinstance(text, str):
        return text
    res = text.title()
    acronyms = ['Ma', 'Ba', 'Mba', 'Bba', 'Hr', 'It', 'Bs', 'Ms', 'Phd', 'Bsc', 'Msc', 'Llm', 'Pge', 'Cems']
    # One pass over all acronyms; each alternative is a whole word.
    res = re.sub(
        rf"\b(?:{'|'.join(acronyms)})\b",
        lambda m: m.group(0).upper(),
        res,
    )
    res = res.replace("PHD", "PhD").replace("BSC", "BSc").replace("MSC", "MSc")
    # str.title() capitalises the letter after an apostrophe. Use \w rather
    # than [A-Z] so accented capitals (e.g. "É") are lowered as well.
    res = re.sub(
        r"\b(L|D|Qu)'(\w)",
        lambda m: f"{m.group(1)}'{m.group(2).lower()}",
        res,
    )
    # Same title() artefact for possessives: "Master'S" -> "Master's".
    res = re.sub(r"(?<=\w)'S\b", "'s", res)
    return res.strip()
def clean_degree_text(text):
    """Return a cleaned, display-formatted version of a degree label.

    The words "and" / "et" become "&", every character other than word
    characters, whitespace, ``-``, ``&``, ``+`` and ``'`` is replaced by a
    space, whitespace is collapsed, and the result is passed through
    ``smart_format``. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text
    # English and French conjunctions are both written as an ampersand.
    for conjunction in (r'\band\b', r'\bet\b'):
        cleaned = re.sub(conjunction, '&', cleaned, flags=re.IGNORECASE)

    # Blank out punctuation that is not part of the allowed character set.
    cleaned = re.sub(r'[^\w\s\-&\+\']', ' ', cleaned)
    collapsed = re.sub(r'\s+', ' ', cleaned).strip()
    return smart_format(collapsed)
def normalize_text(text):
    """Return *text* without combining marks, stripped and lower-cased.

    The text is decomposed with NFD and every character of Unicode category
    ``Mn`` (non-spacing mark) is dropped, which removes accents from
    decomposable letters. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip().lower()
def normalize_ref(value):
    """Return the comparison key for a reference value.

    The value is converted with ``str()`` and then passed through
    ``normalize_text``.
    """
    as_text = str(value)
    return normalize_text(as_text)
def iter_ref_values(ref_data):
    """Yield every string held by a reference bucket.

    For a dict, all string keys are yielded first, followed by all string
    values. For a list, its string items are yielded in order. Any other
    input yields nothing.
    """
    if isinstance(ref_data, dict):
        for key in ref_data.keys():
            if isinstance(key, str):
                yield key
        for entry in ref_data.values():
            if isinstance(entry, str):
                yield entry
    elif isinstance(ref_data, list):
        for entry in ref_data:
            if isinstance(entry, str):
                yield entry
def ref_contains(ref_data, value):
    """Return True when *value* matches a string in *ref_data*.

    Both sides are compared through ``normalize_ref``; the candidate strings
    come from ``iter_ref_values``.
    """
    target = normalize_ref(value)
    for candidate in iter_ref_values(ref_data):
        if normalize_ref(candidate) == target:
            return True
    return False
def prune_manual_refs_against_official(manual_refs, official_refs):
    """Drop manual references that are invalid, duplicated or already official.

    ``manual_refs`` is modified in place, one column at a time:

    * list buckets lose non-string items, items whose normalised form is
      empty, repeated items, and items present in the official bucket;
    * dict buckets are filtered the same way, using the value as the
      candidate when it is a string and the alias (key) otherwise; kept
      entries are stored under the normalised alias;
    * buckets of any other type are left untouched.

    Args:
        manual_refs: Mapping of column name to a list or dict bucket.
        official_refs: Mapping of column name to the official bucket; a
            missing column is treated as empty.

    Returns:
        The number of entries removed across all buckets.
    """
    removed_count = 0
    for column_name, manual_bucket in list(manual_refs.items()):
        # Normalise the official bucket once per column so every membership
        # test below is a set lookup instead of a rescan of the bucket.
        official_keys = {
            normalize_ref(item)
            for item in iter_ref_values(official_refs.get(column_name, []))
        }

        if isinstance(manual_bucket, list):
            kept = []
            seen = set()
            for value in manual_bucket:
                # Non-strings get an empty key and are dropped like blanks.
                key = normalize_ref(value) if isinstance(value, str) else ""
                if not key or key in seen or key in official_keys:
                    removed_count += 1
                    continue
                seen.add(key)
                kept.append(value)
            manual_refs[column_name] = kept

        elif isinstance(manual_bucket, dict):
            kept = {}
            seen_values = set()
            for alias, value in manual_bucket.items():
                candidate = value if isinstance(value, str) else alias
                key = normalize_ref(candidate)
                if not key or key in seen_values or key in official_keys:
                    removed_count += 1
                    continue
                seen_values.add(key)
                kept[normalize_ref(alias)] = value
            manual_refs[column_name] = kept

    return removed_count
# Relative path of the manual references file; it is joined onto the app
# root to find the local copy and reused as the path inside the Space repo.
MANUAL_REFERENCES_REPO_PATH = "refdata/manual_references.json"
def reference_sync_status():
    """Report whether manual references can be synced to the Space repo.

    Returns:
        A dict with three keys: ``enabled`` (bool), ``space_id`` (the
        configured ``SPACE_ID``, or ``""`` when it is not set) and
        ``reason`` (why syncing is disabled, or ``""`` when it is enabled).
    """
    if not SPACE_ID:
        space, reason = "", "Reference sync is only available on Hugging Face Spaces."
    elif not HF_TOKEN:
        space, reason = SPACE_ID, "HF_TOKEN secret is missing from this Space."
    else:
        space, reason = SPACE_ID, ""

    # Syncing is enabled exactly when no blocking reason was found.
    return {
        "enabled": not reason,
        "space_id": space,
        "reason": reason,
    }
def save_manual_references_to_hub(app_root: Path):
    """Upload the local manual references file to the Space repository.

    Args:
        app_root: Directory that contains ``MANUAL_REFERENCES_REPO_PATH``.

    Returns:
        A dict with ``space_id``, ``path`` (the path inside the repo) and
        ``commit_url`` (``str()`` of the value returned by ``upload_file``).

    Raises:
        RuntimeError: If syncing is disabled (the message is the reason
            reported by ``reference_sync_status``) or ``huggingface_hub``
            cannot be imported.
        FileNotFoundError: If the manual references file does not exist.
    """
    sync = reference_sync_status()
    if not sync["enabled"]:
        raise RuntimeError(sync["reason"])

    local_file = app_root / MANUAL_REFERENCES_REPO_PATH
    if not local_file.is_file():
        raise FileNotFoundError(f"Manual references file not found: {local_file}")

    # Imported lazily so the module loads even without huggingface_hub.
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise RuntimeError("huggingface_hub is not installed.") from exc

    target_space = sync["space_id"]
    hub_client = HfApi(token=HF_TOKEN)
    upload_result = hub_client.upload_file(
        path_or_fileobj=str(local_file),
        path_in_repo=MANUAL_REFERENCES_REPO_PATH,
        repo_id=target_space,
        repo_type="space",
        commit_message="Update manual references",
    )
    return {
        "space_id": target_space,
        "path": MANUAL_REFERENCES_REPO_PATH,
        "commit_url": str(upload_result),
    }