Spaces:
Sleeping
Sleeping
| """Helper utilities for normalising and searching catalog data.""" | |
| from __future__ import annotations | |
| import re | |
| from typing import Iterable, List | |
| def canonicalize_name(raw: str | None) -> str: | |
| """Lowercase and strip punctuation/spaces for stable matching.""" | |
| if not raw: | |
| return "" | |
| lowered = raw.strip().lower() | |
| cleaned = "".join(ch for ch in lowered if ch.isalnum() or ch.isspace()) | |
| # Collapse duplicate spaces | |
| return " ".join(part for part in cleaned.split() if part) | |
| def tokenize_name(raw: str | None) -> set[str]: | |
| """Break names into normalized token sets for fuzzy comparisons.""" | |
| if not raw: | |
| return set() | |
| lowered = raw.lower() | |
| return set(re.findall(r"[a-z0-9]+", lowered)) | |
| def generate_name_variants(raw: str | None) -> Iterable[str]: | |
| """Yield common name permutations used across CSV sources.""" | |
| if not raw: | |
| return [] | |
| cleaned = raw.strip() | |
| yield cleaned | |
| if "," in cleaned: | |
| last, _, first = cleaned.partition(",") | |
| first = first.strip() | |
| last = last.strip() | |
| if first and last: | |
| yield f"{first} {last}" | |
| yield f"{last} {first}" | |
| yield f"{last}, {first}" | |
| yield f"{last},{first}" | |
| else: | |
| parts = cleaned.split() | |
| if len(parts) >= 2: | |
| first = " ".join(parts[:-1]) | |
| last = parts[-1] | |
| yield f"{last} {first}" | |
| yield f"{last}, {first}" | |
| yield f"{last},{first}" | |
| def extract_leadership_names(raw: str | None) -> List[str]: | |
| """ | |
| Parse leadership strings from centers.csv and extract individual names. | |
| Examples: | |
| - "Director: Kristian Hammond" | |
| - "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky" | |
| - "Director: Diego Klabjan; Associate Director: Lauren Smith" | |
| """ | |
| if not raw: | |
| return [] | |
| text = raw.replace("\xa0", " ").strip() | |
| # Discard role labels (e.g., "Director:", "Co-directors:") | |
| if ":" in text: | |
| _, _, text = text.partition(":") | |
| # Normalize coordinators | |
| text = text.replace(" and ", ",") | |
| # Remove role labels repeated later in the string | |
| text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE) | |
| text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE) | |
| text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE) | |
| # Remove parentheses content (e.g., titles) | |
| text = re.sub(r"\([^)]*\)", "", text) | |
| names = [] | |
| for chunk in re.split(r"[,/;]+", text): | |
| cleaned = chunk.strip() | |
| if not cleaned: | |
| continue | |
| # Strip lingering prefixes like "Co-" or trailing descriptors | |
| cleaned = re.sub(r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE).strip() | |
| # Collapse internal multiple spaces | |
| cleaned = " ".join(cleaned.split()) | |
| if cleaned: | |
| names.append(cleaned) | |
| return names | |
def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Return the centers whose leadership lists the person in ``source_row``.

    The person's name is read from ``source_row["Name"]``; each center's
    leaders are parsed from its ``"Leadership"`` value. Names are compared
    after ``canonicalize_name``. A center appears at most once in the
    result, in input order. A missing or empty name gives an empty list.
    """
    faculty_name = source_row.get("Name")
    if not faculty_name:
        return []

    target = canonicalize_name(faculty_name)
    return [
        center
        for center in centers
        # any() stops at the first matching leader, so each center is
        # included at most once.
        if any(
            canonicalize_name(leader) == target
            for leader in extract_leadership_names(center.get("Leadership"))
        )
    ]
| def extract_advisor_names(raw: str | None) -> List[str]: | |
| """Normalize advisor lists from students.csv.""" | |
| if not raw: | |
| return [] | |
| text = raw.strip() | |
| if not text or text.lower() in {"none", "na", "n/a"}: | |
| return [] | |
| names: List[str] = [] | |
| # First capture "Last, First" patterns so we can rebuild full names. | |
| comma_matches = re.findall(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)", text) | |
| for last, first in comma_matches: | |
| first = first.strip() | |
| last = last.strip() | |
| if first and last: | |
| names.append(f"{first} {last}") | |
| # Remove the matched patterns to avoid double counting when splitting later. | |
| text = re.sub(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)", "", text) | |
| text = text.replace(" and ", ",") | |
| segments = re.split(r"[,/;]+", text) | |
| for segment in segments: | |
| cleaned = segment.strip().strip('"').strip() | |
| if not cleaned: | |
| continue | |
| cleaned = re.sub(r"\(.*?\)$", "", cleaned).strip() | |
| if cleaned: | |
| names.append(cleaned) | |
| return [name for name in names if name] | |