"""Helper utilities for normalising and searching catalog data."""
from __future__ import annotations
import re
from typing import Iterable, List
def canonicalize_name(raw: str | None) -> str:
"""Lowercase and strip punctuation/spaces for stable matching."""
if not raw:
return ""
lowered = raw.strip().lower()
cleaned = "".join(ch for ch in lowered if ch.isalnum() or ch.isspace())
# Collapse duplicate spaces
return " ".join(part for part in cleaned.split() if part)
def tokenize_name(raw: str | None) -> set[str]:
"""Break names into normalized token sets for fuzzy comparisons."""
if not raw:
return set()
lowered = raw.lower()
return set(re.findall(r"[a-z0-9]+", lowered))
def generate_name_variants(raw: str | None) -> Iterable[str]:
"""Yield common name permutations used across CSV sources."""
if not raw:
return []
cleaned = raw.strip()
yield cleaned
if "," in cleaned:
last, _, first = cleaned.partition(",")
first = first.strip()
last = last.strip()
if first and last:
yield f"{first} {last}"
yield f"{last} {first}"
yield f"{last}, {first}"
yield f"{last},{first}"
else:
parts = cleaned.split()
if len(parts) >= 2:
first = " ".join(parts[:-1])
last = parts[-1]
yield f"{last} {first}"
yield f"{last}, {first}"
yield f"{last},{first}"
def extract_leadership_names(raw: str | None) -> List[str]:
"""
Parse leadership strings from centers.csv and extract individual names.
Examples:
- "Director: Kristian Hammond"
- "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky"
- "Director: Diego Klabjan; Associate Director: Lauren Smith"
"""
if not raw:
return []
text = raw.replace("\xa0", " ").strip()
# Discard role labels (e.g., "Director:", "Co-directors:")
if ":" in text:
_, _, text = text.partition(":")
# Normalize coordinators
text = text.replace(" and ", ",")
# Remove role labels repeated later in the string
text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE)
text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE)
text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE)
# Remove parentheses content (e.g., titles)
text = re.sub(r"\([^)]*\)", "", text)
names = []
for chunk in re.split(r"[,/;]+", text):
cleaned = chunk.strip()
if not cleaned:
continue
# Strip lingering prefixes like "Co-" or trailing descriptors
cleaned = re.sub(r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE).strip()
# Collapse internal multiple spaces
cleaned = " ".join(cleaned.split())
if cleaned:
names.append(cleaned)
return names
def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Find centers led by the faculty member described in ``source_row``."""
    raw_name = source_row.get("Name")
    if not raw_name:
        return []
    target = canonicalize_name(raw_name)
    led: List[dict] = []
    for center in centers:
        leader_names = extract_leadership_names(center.get("Leadership"))
        # A center matches when any of its leaders canonicalizes to the
        # faculty member's name; each center is appended at most once.
        if any(canonicalize_name(person) == target for person in leader_names):
            led.append(center)
    return led
def extract_advisor_names(raw: str | None) -> List[str]:
"""Normalize advisor lists from students.csv."""
if not raw:
return []
text = raw.strip()
if not text or text.lower() in {"none", "na", "n/a"}:
return []
names: List[str] = []
# First capture "Last, First" patterns so we can rebuild full names.
comma_matches = re.findall(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)", text)
for last, first in comma_matches:
first = first.strip()
last = last.strip()
if first and last:
names.append(f"{first} {last}")
# Remove the matched patterns to avoid double counting when splitting later.
text = re.sub(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)", "", text)
text = text.replace(" and ", ",")
segments = re.split(r"[,/;]+", text)
for segment in segments:
cleaned = segment.strip().strip('"').strip()
if not cleaned:
continue
cleaned = re.sub(r"\(.*?\)$", "", cleaned).strip()
if cleaned:
names.append(cleaned)
return [name for name in names if name]