"""Helper utilities for normalising and searching catalog data."""

from __future__ import annotations

import re
from typing import Iterable, List

# A "Last, First" pair: two runs of letters, dots, apostrophes, hyphens or
# spaces separated by a comma.  Shared by the find and the remove step in
# ``extract_advisor_names`` so both always agree on what was matched.
_COMMA_NAME_RE = re.compile(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)")


def canonicalize_name(raw: str | None) -> str:
    """Return a lowercase, punctuation-free form of ``raw`` for stable matching.

    Characters that are neither alphanumeric nor whitespace are dropped (not
    replaced by a space), so ``"O'Brien"`` becomes ``"obrien"``.  Runs of
    whitespace collapse to a single space.  ``None`` or an empty string
    yields ``""``.
    """
    if not raw:
        return ""
    lowered = raw.strip().lower()
    kept = "".join(ch for ch in lowered if ch.isalnum() or ch.isspace())
    # split() without arguments never produces empty pieces, so joining its
    # result both trims the ends and collapses repeated whitespace.
    return " ".join(kept.split())


def tokenize_name(raw: str | None) -> set[str]:
    """Return the set of lowercase ASCII alphanumeric tokens found in ``raw``.

    Anything outside ``a-z`` / ``0-9`` (after lowercasing) acts as a
    separator.  ``None`` or an empty string yields an empty set.
    """
    if not raw:
        return set()
    return set(re.findall(r"[a-z0-9]+", raw.lower()))


def generate_name_variants(raw: str | None) -> Iterable[str]:
    """Yield common orderings of a personal name.

    The stripped input is always yielded first.  After that:

    * ``"Last, First"`` input (contains a comma) yields ``"First Last"``,
      ``"Last First"``, ``"Last, First"`` and ``"Last,First"`` when both
      sides of the first comma are non-empty.
    * Input without a comma and with at least two whitespace-separated parts
      treats the final part as the last name and yields ``"Last First"``,
      ``"Last, First"`` and ``"Last,First"``.

    Variants are not de-duplicated, so the stripped input may appear twice.
    ``None``, empty and whitespace-only input yield nothing.
    """
    if not raw:
        return
    cleaned = raw.strip()
    if not cleaned:
        # Whitespace-only input carries no name; do not emit an empty variant.
        return
    yield cleaned

    if "," in cleaned:
        last, _, first = cleaned.partition(",")
        first = first.strip()
        last = last.strip()
        if first and last:
            yield f"{first} {last}"
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
    else:
        parts = cleaned.split()
        if len(parts) >= 2:
            first = " ".join(parts[:-1])
            last = parts[-1]
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"


def extract_leadership_names(raw: str | None) -> List[str]:
    """Parse a leadership string and return the individual names it contains.

    Examples of accepted input:

    - "Director: Kristian Hammond"
    - "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky"
    - "Director: Diego Klabjan; Associate Director: Lauren Smith"

    Role labels (with their colon), the words "Associate" and "Co-PI", and
    parenthesised text are removed.  Names are separated on ``,``, ``/``,
    ``;`` and the word "and".  ``None`` or an empty string yields ``[]``.
    """
    if not raw:
        return []

    # Non-breaking spaces would otherwise defeat the " and " replacement and
    # the whitespace-based clean-up below.
    text = raw.replace("\xa0", " ").strip()

    # Discard the leading role label (e.g. "Director:", "Co-directors:").
    if ":" in text:
        _, _, text = text.partition(":")

    # Treat "A and B" the same as "A, B".
    text = text.replace(" and ", ",")

    # Remove role words that appear later in the string.
    # NOTE(review): the greedy "[A-Za-z ]*" prefix also swallows any letters
    # and spaces directly in front of "Director" within the same segment.
    text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE)

    # Remove parenthesised content (e.g. titles).
    text = re.sub(r"\([^)]*\)", "", text)

    names: List[str] = []
    for chunk in re.split(r"[,/;]+", text):
        # A later role label such as "Associate Director:" or "Co-Director:"
        # leaves its colon (and possibly a "Co-" stub) behind once the role
        # word is removed above; keep only what follows the last colon.
        cleaned = chunk.rpartition(":")[2].strip()
        if not cleaned:
            continue
        # Strip a lingering "director"/"co-director" prefix.
        cleaned = re.sub(
            r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE
        ).strip()
        # Collapse internal runs of whitespace.
        cleaned = " ".join(cleaned.split())
        if cleaned:
            names.append(cleaned)
    return names


def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Find centers led by the faculty member described in ``source_row``.

    The row's ``"Name"`` value is compared, after ``canonicalize_name``, with
    every name parsed from each center's ``"Leadership"`` value.  Matching
    centers are returned in input order, each at most once.  A missing or
    empty name, or one that canonicalizes to an empty string, yields ``[]``.
    """
    name = source_row.get("Name")
    if not name:
        return []
    lookup = canonicalize_name(name)
    if not lookup:
        # A punctuation-only name must not match leaders that also happen to
        # canonicalize to the empty string.
        return []

    matches: List[dict] = []
    for center in centers:
        leaders = extract_leadership_names(center.get("Leadership"))
        if any(canonicalize_name(leader) == lookup for leader in leaders):
            matches.append(center)
    return matches


def extract_advisor_names(raw: str | None) -> List[str]:
    """Normalize an advisor list into a list of individual names.

    ``None``, empty input and the placeholders "none", "na" and "n/a" (any
    case) yield ``[]``.  "Last, First" pairs are rebuilt as "First Last" and
    come first in the result; whatever text remains is split on ``,``, ``/``,
    ``;`` and the word "and", with surrounding double quotes and a trailing
    parenthesised note removed from each piece.
    """
    if not raw:
        return []
    text = raw.strip()
    if not text or text.lower() in {"none", "na", "n/a"}:
        return []

    names: List[str] = []

    # First capture "Last, First" patterns so full names can be rebuilt.
    # NOTE(review): any comma between two name-like runs is read this way,
    # so "Jane Doe, John Roe" is treated as a single "Last, First" pair.
    for last, first in _COMMA_NAME_RE.findall(text):
        first = first.strip()
        last = last.strip()
        if first and last:
            names.append(f"{first} {last}")

    # Remove the matched pairs to avoid double counting when splitting below.
    text = _COMMA_NAME_RE.sub("", text)
    text = text.replace(" and ", ",")

    for segment in re.split(r"[,/;]+", text):
        cleaned = segment.strip().strip('"').strip()
        if not cleaned:
            continue
        # Drop a trailing parenthesised note such as "(co-advisor)".
        cleaned = re.sub(r"\(.*?\)$", "", cleaned).strip()
        if cleaned:
            names.append(cleaned)

    return names