Spaces:

MasterMap
/

mastermap-cleaner

Sleeping

File size: 5,129 Bytes

from pathlib import Path
import re
import unicodedata

from src.config import HF_TOKEN, SPACE_ID

def strip_degrees_for_search(text):
    """Remove degree keywords from *text* to get a leaner search string.

    Non-string input is returned unchanged. Degree tokens (MSc, MBA, PhD,
    Master, Licence, ...) are removed as whole words regardless of case,
    whitespace runs are collapsed to single spaces, and separator characters
    are trimmed from both ends. When nothing remains, the whitespace-stripped
    original text is returned instead of an empty string.
    """
    if not isinstance(text, str):
        return text

    degree_regex = re.compile(
        r'\b(MSc|MBA|BBA|BSc|Ph\.?D\.?|BA|MA|BS|MS|EMBA|Master|Bachelor|Masters|Bachelors|Licence)\b',
        re.IGNORECASE,
    )
    without_degrees = degree_regex.sub('', text)
    single_spaced = re.sub(r'\s+', ' ', without_degrees)
    trimmed = single_spaced.strip(' -.,&/|')

    # Fall back to the original when the text consisted only of degree words.
    return trimmed if trimmed else text.strip()

def smart_format(text):
    """Title-case *text* while keeping acronyms, possessives and elisions tidy.

    Non-string input is returned unchanged. After ``str.title()``:

    * a possessive suffix that ``title()`` upper-cased ("Master'S") is
      restored to lowercase ("Master's");
    * known acronyms are upper-cased as whole words (``Mba`` -> ``MBA``), with
      mixed-case spellings for ``PhD``, ``BSc`` and ``MSc``;
    * the letter following the elisions ``L'``, ``D'`` and ``Qu'`` is
      lower-cased (``L'Art`` -> ``L'art``), including accented letters.

    The result is stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        return text

    res = text.title()

    # str.title() treats an apostrophe as a word boundary, so "master's"
    # comes back as "Master'S"; put the possessive suffix back in lowercase.
    res = re.sub(r"(?<=\w)'S\b", "'s", res)

    acronyms = ['Ma', 'Ba', 'Mba', 'Bba', 'Hr', 'It', 'Bs', 'Ms', 'Phd', 'Bsc', 'Msc', 'Llm', 'Pge', 'Cems']
    # One whole-word alternation instead of one regex pass per acronym; the
    # trailing \b forces backtracking to the longest entry ("Msc" over "Ms").
    acronym_pattern = r'\b(?:' + '|'.join(acronyms) + r')\b'
    res = re.sub(acronym_pattern, lambda m: m.group(0).upper(), res)
    res = res.replace("PHD", "PhD").replace("BSC", "BSc").replace("MSC", "MSc")

    # Elisions: lowercase the letter after L'/D'/Qu'. `\w` (rather than
    # [A-Z]) also covers accented capitals such as "D'Été"; lower() is a
    # no-op on anything that is not an uppercase letter.
    res = re.sub(r"\b(L|D|Qu)'(\w)", lambda m: f"{m.group(1)}'{m.group(2).lower()}", res)
    return res.strip()

def clean_degree_text(text):
    """Normalise a free-text degree label and return it formatted.

    Non-string input yields an empty string. Otherwise the whole words
    "and" and "et" (any case) become "&", every character other than word
    characters, whitespace, ``-``, ``&``, ``+`` and ``'`` is replaced by a
    space, whitespace is collapsed and trimmed, and the result is passed
    through ``smart_format``.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r'\band\b', '&', re.IGNORECASE),
        (r'\bet\b', '&', re.IGNORECASE),
        (r'[^\w\s\-&\+\']', ' ', 0),
        (r'\s+', ' ', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return smart_format(cleaned.strip())

def normalize_text(text):
    """Return *text* without combining marks, trimmed and lower-cased.

    Non-string input yields an empty string. The text is decomposed with
    Unicode NFD so that accents become separate code points, and every
    code point in category ``Mn`` (non-spacing mark) is dropped.
    """
    if not isinstance(text, str):
        return ""

    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip().lower()

def normalize_ref(value):
    """Return the normalised comparison key for a reference *value*.

    The value is first converted with ``str()`` and then passed through
    ``normalize_text``.
    """
    as_text = str(value)
    return normalize_text(as_text)

def iter_ref_values(ref_data):
    """Yield every string held by a reference bucket.

    For a dict, all string keys are yielded first, followed by all string
    values. For a list, its string items are yielded in order. Any other
    type of *ref_data* yields nothing.
    """
    if isinstance(ref_data, dict):
        for key in ref_data.keys():
            if isinstance(key, str):
                yield key
        for val in ref_data.values():
            if isinstance(val, str):
                yield val
        return

    if isinstance(ref_data, list):
        for entry in ref_data:
            if isinstance(entry, str):
                yield entry

def ref_contains(ref_data, value):
    """Return True when *value* matches any string in *ref_data*.

    Both sides are compared through ``normalize_ref``; the strings to
    compare against come from ``iter_ref_values``.
    """
    target = normalize_ref(value)
    for candidate in iter_ref_values(ref_data):
        if normalize_ref(candidate) == target:
            return True
    return False

def _prune_list_bucket(bucket, official_keys):
    """Return ``(kept, removed)`` for a list bucket of manual references."""
    kept = []
    seen = set()
    removed = 0
    for value in bucket:
        if not isinstance(value, str):
            removed += 1
            continue

        key = normalize_ref(value)
        # Drop blanks, repeats of an earlier entry, and official entries.
        if not key or key in seen or key in official_keys:
            removed += 1
            continue

        seen.add(key)
        kept.append(value)
    return kept, removed


def _prune_dict_bucket(bucket, official_keys):
    """Return ``(kept, removed)`` for an alias -> value bucket of manual references."""
    kept = {}
    seen_values = set()
    removed = 0
    for alias, value in bucket.items():
        # Entries are compared by their value; a non-string value falls back
        # to the alias itself.
        candidate = value if isinstance(value, str) else alias
        key = normalize_ref(candidate)
        # NOTE(review): de-duplication is per value, so only the first alias
        # pointing at a given value survives - confirm this is intended.
        if not key or key in seen_values or key in official_keys:
            removed += 1
            continue

        seen_values.add(key)
        # Surviving aliases are stored under their normalised form.
        kept[normalize_ref(alias)] = value
    return kept, removed


def prune_manual_refs_against_official(manual_refs, official_refs):
    """Remove manual references that are invalid, duplicated or already official.

    *manual_refs* is modified in place: every list or dict bucket is replaced
    by its pruned version, and buckets of any other type are left untouched.
    An entry is removed when it is a non-string list item, normalises to an
    empty key, repeats an earlier entry of the same bucket, or matches (after
    normalisation) a string found in the official bucket of the same column.

    Args:
        manual_refs: Mapping of column name to a list or dict bucket.
        official_refs: Mapping of column name to the official bucket; a
            missing column is treated as having no official entries.

    Returns:
        The total number of removed entries.
    """
    removed_count = 0

    for column_name, manual_bucket in list(manual_refs.items()):
        if isinstance(manual_bucket, list):
            prune_bucket = _prune_list_bucket
        elif isinstance(manual_bucket, dict):
            prune_bucket = _prune_dict_bucket
        else:
            continue

        # Normalise the official bucket once per column so each membership
        # test is a set lookup instead of a rescan of the whole bucket.
        official_bucket = official_refs.get(column_name, [])
        official_keys = {normalize_ref(item) for item in iter_ref_values(official_bucket)}

        kept, removed = prune_bucket(manual_bucket, official_keys)
        manual_refs[column_name] = kept
        removed_count += removed

    return removed_count

# Relative path of the manual references JSON file. It is resolved against the
# application root to find the local file and is also used as the destination
# path inside the Space repository when uploading.
MANUAL_REFERENCES_REPO_PATH = "refdata/manual_references.json"

def reference_sync_status():
    """Report whether manual references can be synced to the Space repo.

    Returns a dict with the keys ``enabled``, ``space_id`` and ``reason``.
    Sync is enabled only when both ``SPACE_ID`` and ``HF_TOKEN`` are set;
    otherwise ``reason`` names the missing piece.
    """
    if SPACE_ID and HF_TOKEN:
        return {
            "enabled": True,
            "space_id": SPACE_ID,
            "reason": "",
        }

    if SPACE_ID:
        # On a Space, but without credentials to push to it.
        space_id = SPACE_ID
        reason = "HF_TOKEN secret is missing from this Space."
    else:
        space_id = ""
        reason = "Reference sync is only available on Hugging Face Spaces."

    return {
        "enabled": False,
        "space_id": space_id,
        "reason": reason,
    }

def save_manual_references_to_hub(app_root: Path):
    """Upload the local manual references file to the Space repository.

    Args:
        app_root: Directory that ``MANUAL_REFERENCES_REPO_PATH`` is resolved
            against to find the local file.

    Returns:
        A dict with ``space_id``, ``path`` (path inside the repo) and
        ``commit_url``.

    Raises:
        RuntimeError: If reference sync is disabled (see
            ``reference_sync_status``) or ``huggingface_hub`` is not installed.
        FileNotFoundError: If the manual references file does not exist.
    """
    status = reference_sync_status()
    if not status["enabled"]:
        raise RuntimeError(status["reason"])

    manual_refs_path = app_root / MANUAL_REFERENCES_REPO_PATH
    if not manual_refs_path.is_file():
        raise FileNotFoundError(f"Manual references file not found: {manual_refs_path}")

    # Imported lazily so the module still loads when the package is absent.
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise RuntimeError("huggingface_hub is not installed.") from exc

    api = HfApi(token=HF_TOKEN)
    commit_info = api.upload_file(
        path_or_fileobj=str(manual_refs_path),
        path_in_repo=MANUAL_REFERENCES_REPO_PATH,
        repo_id=status["space_id"],
        repo_type="space",
        commit_message="Update manual references",
    )

    # Recent huggingface_hub versions return a CommitInfo object whose str()
    # form is a legacy compatibility value rather than the commit URL, so
    # read the dedicated attribute. Fall back to str() for older versions
    # that returned a plain URL string.
    commit_url = getattr(commit_info, "commit_url", None) or str(commit_info)

    return {
        "space_id": status["space_id"],
        "path": MANUAL_REFERENCES_REPO_PATH,
        "commit_url": commit_url,
    }