# NU-KIOSK-API / backend/data/utils.py
# Author: Monish BV
# Commit: Add kiosk-api: stripped backend for speech integration (c2b7a7b)
"""Helper utilities for normalising and searching catalog data."""
from __future__ import annotations
import re
from typing import Iterable, List
def canonicalize_name(raw: str | None) -> str:
    """Return a lowercase, punctuation-free form of *raw* for stable matching.

    None or empty input maps to the empty string.
    """
    if not raw:
        return ""
    # Keep only alphanumerics and whitespace, dropping punctuation entirely.
    kept = "".join(ch for ch in raw.strip().lower() if ch.isalnum() or ch.isspace())
    # str.split() with no argument discards runs of whitespace, so joining
    # with a single space also collapses duplicate spaces.
    return " ".join(kept.split())
def tokenize_name(raw: str | None) -> set[str]:
    """Split *raw* into a set of lowercase alphanumeric tokens.

    Anything that is not [a-z0-9] after lowercasing acts as a separator,
    so hyphens/apostrophes break names into multiple tokens. Returns an
    empty set for None or empty input.
    """
    if not raw:
        return set()
    return set(re.findall(r"[a-z0-9]+", raw.lower()))
def generate_name_variants(raw: str | None) -> Iterable[str]:
    """Yield common name permutations used across CSV sources.

    For "Last, First" input, yields the original plus "First Last",
    "Last First", "Last, First", and "Last,First". For multi-word
    "First ... Last" input, yields the original plus the three
    last-name-first forms. Yields nothing for None/empty input.
    """
    if not raw:
        # Bare return ends the generator. The previous `return []` only set
        # the (never inspected) StopIteration value and read as if a list
        # were being produced.
        return
    cleaned = raw.strip()
    yield cleaned
    if "," in cleaned:
        # "Last, First" form: rebuild both orderings from the first comma.
        last, _, first = cleaned.partition(",")
        first = first.strip()
        last = last.strip()
        if first and last:
            yield f"{first} {last}"
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
    else:
        # "First [Middle...] Last" form: treat the final word as the surname.
        parts = cleaned.split()
        if len(parts) >= 2:
            first = " ".join(parts[:-1])
            last = parts[-1]
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
def extract_leadership_names(raw: str | None) -> List[str]:
    """
    Parse leadership strings from centers.csv and extract individual names.

    Examples:
    - "Director: Kristian Hammond"
    - "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky"
    - "Director: Diego Klabjan; Associate Director: Lauren Smith"

    Returns:
        A list of person names with role labels and parenthesized titles
        removed; [] for None/empty input.
    """
    if not raw:
        return []
    text = raw.replace("\xa0", " ").strip()
    # Discard the leading role label (e.g., "Director:", "Co-directors:").
    # Only the FIRST colon is handled here; later role labels are stripped
    # per-chunk below.
    if ":" in text:
        _, _, text = text.partition(":")
    # Normalize "and"-joined lists into comma-separated ones.
    text = text.replace(" and ", ",")
    # Remove role labels repeated later in the string.
    text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE)
    # Remove parentheses content (e.g., titles).
    text = re.sub(r"\([^)]*\)", "", text)
    names = []
    for chunk in re.split(r"[,/;]+", text):
        # FIX: also strip colons left behind when a second role label (e.g.
        # "; Associate Director: Lauren Smith") was removed above — previously
        # this produced names like ": Lauren Smith".
        cleaned = chunk.strip(" :")
        if not cleaned:
            continue
        # Strip lingering prefixes like "Co-" or trailing descriptors.
        cleaned = re.sub(r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE).strip()
        # Collapse internal multiple spaces.
        cleaned = " ".join(cleaned.split())
        if cleaned:
            names.append(cleaned)
    return names
def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Return every center led by the faculty member described in ``source_row``.

    Matching compares the canonical form of the row's "Name" field against
    each name parsed from the center's "Leadership" field. Returns [] when
    the row has no name.
    """
    name = source_row.get("Name")
    if not name:
        return []
    target = canonicalize_name(name)
    led: List[dict] = []
    for center in centers:
        leader_names = extract_leadership_names(center.get("Leadership"))
        # any() over an empty leader list is False, so centers with no
        # parseable leadership are skipped naturally.
        if any(canonicalize_name(person) == target for person in leader_names):
            led.append(center)
    return led
def extract_advisor_names(raw: str | None) -> List[str]:
    """Normalize advisor lists from students.csv into individual full names.

    Handles "Last, First" entries (rebuilt as "First Last"), "and"-joined
    lists, and ``,``/``;``/``/``-separated lists. Placeholder values
    ("none", "na", "n/a") and empty input yield [].
    """
    if not raw:
        return []
    text = raw.strip()
    if not text or text.lower() in {"none", "na", "n/a"}:
        return []
    # FIX: turn " and " into a separator BEFORE the "Last, First" pass, and
    # use ";" rather than "," so a plain "A and B" list is not misread as a
    # "Last, First" pair. Previously "Smith, John and Brown, Mary" collapsed
    # into the single bogus name "John and Brown Smith" because the greedy
    # first-name group swallowed " John and Brown".
    text = text.replace(" and ", ";")
    # Compile once; the same pattern is used for capture and for removal.
    last_first = re.compile(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)")
    names: List[str] = []
    # Capture "Last, First" patterns so we can rebuild full names.
    for last, first in last_first.findall(text):
        first = first.strip()
        last = last.strip()
        if first and last:
            names.append(f"{first} {last}")
    # Remove the matched patterns to avoid double counting when splitting later.
    text = last_first.sub("", text)
    for segment in re.split(r"[,/;]+", text):
        cleaned = segment.strip().strip('"').strip()
        if not cleaned:
            continue
        # Drop a trailing parenthesized descriptor (e.g., a department).
        cleaned = re.sub(r"\(.*?\)$", "", cleaned).strip()
        if cleaned:
            names.append(cleaned)
    return [name for name in names if name]