File size: 4,619 Bytes
c2b7a7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""Helper utilities for normalising and searching catalog data."""

from __future__ import annotations

import re
from typing import Iterable, List


def canonicalize_name(raw: str | None) -> str:
    """Lowercase and strip punctuation/spaces for stable matching."""
    if not raw:
        return ""
    # Keep only letters, digits, and whitespace from the lowercased input.
    kept: list[str] = []
    for ch in raw.lower():
        if ch.isalnum() or ch.isspace():
            kept.append(ch)
    # split()/join collapses runs of whitespace and trims the ends.
    return " ".join("".join(kept).split())


def tokenize_name(raw: str | None) -> set[str]:
    """Break names into normalized token sets for fuzzy comparisons."""
    if not raw:
        return set()
    # ASCII alphanumeric runs only; punctuation acts as a separator.
    return {match.group(0) for match in re.finditer(r"[a-z0-9]+", raw.lower())}


def generate_name_variants(raw: str | None) -> Iterable[str]:
    """Yield common name permutations used across CSV sources.

    For "First Last" input, also yields "Last First", "Last, First", and
    "Last,First". For "Last, First" input, the same permutations are rebuilt
    from the split halves. The original (stripped) string is always yielded
    first. Duplicates may occur (e.g. "Last, First" input re-yields itself).

    Yields nothing for ``None``/empty input.
    """
    if not raw:
        # This is a generator: a bare ``return`` ends iteration. (The old
        # ``return []`` only set StopIteration.value, which callers discard.)
        return
    cleaned = raw.strip()
    yield cleaned
    if "," in cleaned:
        # "Last, First" style: split on the FIRST comma only.
        last, _, first = cleaned.partition(",")
        first = first.strip()
        last = last.strip()
        if first and last:
            yield f"{first} {last}"
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
    else:
        parts = cleaned.split()
        if len(parts) >= 2:
            # Everything except the final word is treated as the first name.
            first = " ".join(parts[:-1])
            last = parts[-1]
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"


def extract_leadership_names(raw: str | None) -> List[str]:
    """
    Parse leadership strings from centers.csv and extract individual names.

    Examples:
    - "Director: Kristian Hammond"
    - "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky"
    - "Director: Diego Klabjan; Associate Director: Lauren Smith"

    Returns a (possibly empty) list of cleaned person names.
    """
    if not raw:
        return []

    # Replace non-breaking spaces so downstream whitespace handling works.
    text = raw.replace("\xa0", " ").strip()
    # Discard the leading role label (e.g., "Director:", "Co-directors:")
    if ":" in text:
        _, _, text = text.partition(":")
    # Treat " and " as just another list separator
    text = text.replace(" and ", ",")
    # Remove role labels repeated later in the string
    text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE)

    # Remove parentheses content (e.g., titles)
    text = re.sub(r"\([^)]*\)", "", text)

    names: List[str] = []
    for chunk in re.split(r"[,/;]+", text):
        cleaned = chunk.strip()
        # BUG FIX: secondary role labels ("...; Associate Director: Name")
        # leave a dangling ":" after the role words are removed above, which
        # previously produced names like ": Lauren Smith". Strip it here.
        cleaned = cleaned.lstrip(":").strip()
        if not cleaned:
            continue
        # Strip lingering prefixes like "Co-" or trailing descriptors
        cleaned = re.sub(r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE).strip()
        # Collapse internal multiple spaces
        cleaned = " ".join(cleaned.split())
        if cleaned:
            names.append(cleaned)
    return names


def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Find centers led by the faculty member described in ``source_row``."""
    name = source_row.get("Name")
    if not name:
        return []
    target = canonicalize_name(name)
    led: List[dict] = []
    for center in centers:
        leader_names = extract_leadership_names(center.get("Leadership"))
        # any() short-circuits on the first matching leader, mirroring the
        # original break-on-match loop; an empty leader list never matches.
        if any(canonicalize_name(leader) == target for leader in leader_names):
            led.append(center)
    return led


def extract_advisor_names(raw: str | None) -> List[str]:
    """Normalize advisor lists from students.csv."""
    if not raw:
        return []
    text = raw.strip()
    if not text or text.lower() in {"none", "na", "n/a"}:
        return []

    collected: List[str] = []

    # Pass 1: capture "Last, First" pairs and rebuild them as "First Last",
    # then blank those spans out so pass 2 does not count them again.
    pair_pattern = re.compile(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)")
    for last, first in pair_pattern.findall(text):
        last = last.strip()
        first = first.strip()
        if last and first:
            collected.append(f"{first} {last}")
    text = pair_pattern.sub("", text)

    # Pass 2: treat " and " as a separator and split on list punctuation.
    remainder = text.replace(" and ", ",")
    for piece in re.split(r"[,/;]+", remainder):
        piece = piece.strip().strip('"').strip()
        if not piece:
            continue
        # Drop a trailing parenthetical (e.g., a department annotation).
        piece = re.sub(r"\(.*?\)$", "", piece).strip()
        if piece:
            collected.append(piece)
    return [entry for entry in collected if entry]