File size: 4,619 Bytes
c2b7a7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""Helper utilities for normalising and searching catalog data."""

from __future__ import annotations

import re
from typing import Iterable, List


def canonicalize_name(raw: str | None) -> str:
    """Lowercase and strip punctuation/spaces for stable matching."""
    if not raw:
        return ""
    # Keep only letters, digits, and whitespace from the lowercased input.
    kept: list[str] = []
    for ch in raw.lower():
        if ch.isalnum() or ch.isspace():
            kept.append(ch)
    # split()/join collapses runs of whitespace and trims the ends.
    return " ".join("".join(kept).split())


def tokenize_name(raw: str | None) -> set[str]:
    """Break names into normalized token sets for fuzzy comparisons."""
    if not raw:
        return set()
    # ASCII alphanumeric runs only; punctuation acts as a separator.
    return {match.group(0) for match in re.finditer(r"[a-z0-9]+", raw.lower())}


def generate_name_variants(raw: str | None) -> Iterable[str]:
    """Yield common name permutations used across CSV sources.

    For "First Last" input, also yields "Last First", "Last, First", and
    "Last,First". For "Last, First" input, the same permutations are rebuilt
    from the split halves. The original (stripped) string is always yielded
    first. Duplicates may occur (e.g. "Last, First" input re-yields itself).

    Yields nothing for ``None``/empty input.
    """
    if not raw:
        # This is a generator: a bare ``return`` ends iteration. (The old
        # ``return []`` only set StopIteration.value, which callers discard.)
        return
    cleaned = raw.strip()
    yield cleaned
    if "," in cleaned:
        # "Last, First" style: split on the FIRST comma only.
        last, _, first = cleaned.partition(",")
        first = first.strip()
        last = last.strip()
        if first and last:
            yield f"{first} {last}"
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
    else:
        parts = cleaned.split()
        if len(parts) >= 2:
            # Everything except the final word is treated as the first name.
            first = " ".join(parts[:-1])
            last = parts[-1]
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"


def extract_leadership_names(raw: str | None) -> List[str]:
    """
    Parse leadership strings from centers.csv and extract individual names.

    Examples:
    - "Director: Kristian Hammond"
    - "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky"
    - "Director: Diego Klabjan; Associate Director: Lauren Smith"

    Returns a (possibly empty) list of cleaned person names.
    """
    if not raw:
        return []

    # Replace non-breaking spaces so downstream whitespace handling works.
    text = raw.replace("\xa0", " ").strip()
    # Discard the leading role label (e.g., "Director:", "Co-directors:")
    if ":" in text:
        _, _, text = text.partition(":")
    # Treat " and " as just another list separator
    text = text.replace(" and ", ",")
    # Remove role labels repeated later in the string
    text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE)

    # Remove parentheses content (e.g., titles)
    text = re.sub(r"\([^)]*\)", "", text)

    names: List[str] = []
    for chunk in re.split(r"[,/;]+", text):
        cleaned = chunk.strip()
        # BUG FIX: secondary role labels ("...; Associate Director: Name")
        # leave a dangling ":" after the role words are removed above, which
        # previously produced names like ": Lauren Smith". Strip it here.
        cleaned = cleaned.lstrip(":").strip()
        if not cleaned:
            continue
        # Strip lingering prefixes like "Co-" or trailing descriptors
        cleaned = re.sub(r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE).strip()
        # Collapse internal multiple spaces
        cleaned = " ".join(cleaned.split())
        if cleaned:
            names.append(cleaned)
    return names


def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Find centers led by the faculty member described in ``source_row``."""
    name = source_row.get("Name")
    if not name:
        return []
    target = canonicalize_name(name)
    led: List[dict] = []
    for center in centers:
        leader_names = extract_leadership_names(center.get("Leadership"))
        # any() short-circuits on the first matching leader, mirroring the
        # original break-on-match loop; an empty leader list never matches.
        if any(canonicalize_name(leader) == target for leader in leader_names):
            led.append(center)
    return led


def extract_advisor_names(raw: str | None) -> List[str]:
    """Normalize advisor lists from students.csv."""
    if not raw:
        return []
    text = raw.strip()
    if not text or text.lower() in {"none", "na", "n/a"}:
        return []

    collected: List[str] = []

    # Pass 1: capture "Last, First" pairs and rebuild them as "First Last",
    # then blank those spans out so pass 2 does not count them again.
    pair_pattern = re.compile(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)")
    for last, first in pair_pattern.findall(text):
        last = last.strip()
        first = first.strip()
        if last and first:
            collected.append(f"{first} {last}")
    text = pair_pattern.sub("", text)

    # Pass 2: treat " and " as a separator and split on list punctuation.
    remainder = text.replace(" and ", ",")
    for piece in re.split(r"[,/;]+", remainder):
        piece = piece.strip().strip('"').strip()
        if not piece:
            continue
        # Drop a trailing parenthetical (e.g., a department annotation).
        piece = re.sub(r"\(.*?\)$", "", piece).strip()
        if piece:
            collected.append(piece)
    return [entry for entry in collected if entry]