# NU-KIOSK-API / backend/data/utils.py
# Author: Monish BV
# Commit: Add kiosk-api: stripped backend for speech integration (c2b7a7b)
"""Helper utilities for normalising and searching catalog data."""
from __future__ import annotations
import re
from typing import Iterable, List
def canonicalize_name(raw: str | None) -> str:
    """Return a lowercase, punctuation-free form of *raw* for stable matching.

    None or empty input maps to the empty string.
    """
    if not raw:
        return ""
    # Keep only alphanumerics and whitespace, dropping punctuation entirely.
    kept = "".join(ch for ch in raw.strip().lower() if ch.isalnum() or ch.isspace())
    # str.split() with no argument discards runs of whitespace, so joining
    # with a single space also collapses duplicate spaces.
    return " ".join(kept.split())
def tokenize_name(raw: str | None) -> set[str]:
    """Split *raw* into a set of lowercase alphanumeric tokens.

    Anything that is not [a-z0-9] after lowercasing acts as a separator,
    so hyphens/apostrophes break names into multiple tokens. Returns an
    empty set for None or empty input.
    """
    if not raw:
        return set()
    return set(re.findall(r"[a-z0-9]+", raw.lower()))
def generate_name_variants(raw: str | None) -> Iterable[str]:
    """Yield common name permutations used across CSV sources.

    For "Last, First" input, yields the original plus "First Last",
    "Last First", "Last, First", and "Last,First". For multi-word
    "First ... Last" input, yields the original plus the three
    last-name-first forms. Yields nothing for None/empty input.
    """
    if not raw:
        # Bare return ends the generator. The previous `return []` only set
        # the (never inspected) StopIteration value and read as if a list
        # were being produced.
        return
    cleaned = raw.strip()
    yield cleaned
    if "," in cleaned:
        # "Last, First" form: rebuild both orderings from the first comma.
        last, _, first = cleaned.partition(",")
        first = first.strip()
        last = last.strip()
        if first and last:
            yield f"{first} {last}"
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
    else:
        # "First [Middle...] Last" form: treat the final word as the surname.
        parts = cleaned.split()
        if len(parts) >= 2:
            first = " ".join(parts[:-1])
            last = parts[-1]
            yield f"{last} {first}"
            yield f"{last}, {first}"
            yield f"{last},{first}"
def extract_leadership_names(raw: str | None) -> List[str]:
    """
    Parse leadership strings from centers.csv and extract individual names.

    Examples:
    - "Director: Kristian Hammond"
    - "Co-directors: Michael Horn, Chris Riesbeck, Uri Wilensky"
    - "Director: Diego Klabjan; Associate Director: Lauren Smith"

    Returns:
        A list of person names with role labels and parenthesized titles
        removed; [] for None/empty input.
    """
    if not raw:
        return []
    text = raw.replace("\xa0", " ").strip()
    # Discard the leading role label (e.g., "Director:", "Co-directors:").
    # Only the FIRST colon is handled here; later role labels are stripped
    # per-chunk below.
    if ":" in text:
        _, _, text = text.partition(":")
    # Normalize "and"-joined lists into comma-separated ones.
    text = text.replace(" and ", ",")
    # Remove role labels repeated later in the string.
    text = re.sub(r"\b[A-Za-z ]*Director[s]?\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bCo-PI\b", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bAssociate\b", "", text, flags=re.IGNORECASE)
    # Remove parentheses content (e.g., titles).
    text = re.sub(r"\([^)]*\)", "", text)
    names = []
    for chunk in re.split(r"[,/;]+", text):
        # FIX: also strip colons left behind when a second role label (e.g.
        # "; Associate Director: Lauren Smith") was removed above — previously
        # this produced names like ": Lauren Smith".
        cleaned = chunk.strip(" :")
        if not cleaned:
            continue
        # Strip lingering prefixes like "Co-" or trailing descriptors.
        cleaned = re.sub(r"^(co-)?director(s)?\b", "", cleaned, flags=re.IGNORECASE).strip()
        # Collapse internal multiple spaces.
        cleaned = " ".join(cleaned.split())
        if cleaned:
            names.append(cleaned)
    return names
def centers_for_faculty(source_row: dict, centers: List[dict]) -> List[dict]:
    """Return every center led by the faculty member described in ``source_row``.

    Matching compares the canonical form of the row's "Name" field against
    each name parsed from the center's "Leadership" field. Returns [] when
    the row has no name.
    """
    name = source_row.get("Name")
    if not name:
        return []
    target = canonicalize_name(name)
    led: List[dict] = []
    for center in centers:
        leader_names = extract_leadership_names(center.get("Leadership"))
        # any() over an empty leader list is False, so centers with no
        # parseable leadership are skipped naturally.
        if any(canonicalize_name(person) == target for person in leader_names):
            led.append(center)
    return led
def extract_advisor_names(raw: str | None) -> List[str]:
    """Normalize advisor lists from students.csv into individual full names.

    Handles "Last, First" entries (rebuilt as "First Last"), "and"-joined
    lists, and ``,``/``;``/``/``-separated lists. Placeholder values
    ("none", "na", "n/a") and empty input yield [].
    """
    if not raw:
        return []
    text = raw.strip()
    if not text or text.lower() in {"none", "na", "n/a"}:
        return []
    # FIX: turn " and " into a separator BEFORE the "Last, First" pass, and
    # use ";" rather than "," so a plain "A and B" list is not misread as a
    # "Last, First" pair. Previously "Smith, John and Brown, Mary" collapsed
    # into the single bogus name "John and Brown Smith" because the greedy
    # first-name group swallowed " John and Brown".
    text = text.replace(" and ", ";")
    # Compile once; the same pattern is used for capture and for removal.
    last_first = re.compile(r"([A-Za-z.'\- ]+),\s*([A-Za-z.'\- ]+)")
    names: List[str] = []
    # Capture "Last, First" patterns so we can rebuild full names.
    for last, first in last_first.findall(text):
        first = first.strip()
        last = last.strip()
        if first and last:
            names.append(f"{first} {last}")
    # Remove the matched patterns to avoid double counting when splitting later.
    text = last_first.sub("", text)
    for segment in re.split(r"[,/;]+", text):
        cleaned = segment.strip().strip('"').strip()
        if not cleaned:
            continue
        # Drop a trailing parenthesized descriptor (e.g., a department).
        cleaned = re.sub(r"\(.*?\)$", "", cleaned).strip()
        if cleaned:
            names.append(cleaned)
    return [name for name in names if name]