# NU-KIOSK-API / backend/data/sources.py
# Author: Monish BV — "Add kiosk-api: stripped backend for speech integration" (commit c2b7a7b)
"""Source adapters that load structured data into the kiosk catalog."""
from __future__ import annotations
import csv
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional
from .utils import canonicalize_name
@dataclass
class EntityDefinition:
    """Specification for registering an entity in the data catalog."""
    # Catalog name under which the records are registered (e.g. "faculty").
    name: str
    # Raw records, one dict per row of the underlying source.
    records: List[Dict[str, Any]]
    # Column that identifies a record within this entity; None when the
    # catalog presumably needs no keyed lookup — confirm against the consumer.
    key_field: Optional[str] = None
    # Provenance marker; CSVSource stores the source file path here.
    origin: Optional[str] = None
    # Optional callable that canonicalizes key values (e.g. names) before matching.
    normalizer: Optional[Callable[[str], str]] = None
@dataclass
class SourceResult:
    """Payload returned by a data source."""
    # Entity definitions to register; empty when the source had nothing to load
    # (e.g. its backing file does not exist).
    entities: List[EntityDefinition] = field(default_factory=list)
    # Free-form metadata keyed by source-specific names (e.g. feed URL lists).
    metadata: Dict[str, Any] = field(default_factory=dict)
class DataSource(ABC):
    """Abstract base for adapters that ingest structured data into the catalog.

    Concrete sources implement :meth:`load` and report their findings as a
    ``SourceResult``; a source that has nothing to contribute returns an
    empty result rather than raising.
    """

    def __init__(self, name: str) -> None:
        # Identifier for this source instance (used by callers to tell sources apart).
        self.name = name

    @abstractmethod
    def load(self) -> SourceResult:
        """Read the underlying data and return it as a ``SourceResult``."""
        raise NotImplementedError
class CSVSource(DataSource):
    """Loads a CSV file into an entity definition."""

    def __init__(
        self,
        name: str,
        path: Path,
        entity_name: str,
        *,
        key_field: Optional[str] = None,
        normalizer: Optional[Callable[[str], str]] = None,
    ) -> None:
        super().__init__(name)
        self.path = path
        self.entity_name = entity_name
        self.key_field = key_field
        self.normalizer = normalizer

    def load(self) -> SourceResult:
        """Parse the CSV file into a single entity definition.

        An absent file is treated as "nothing to load" and yields an
        empty ``SourceResult`` rather than an error.
        """
        if not self.path.exists():
            return SourceResult()
        entity = EntityDefinition(
            name=self.entity_name,
            records=self._read_csv(self.path),
            key_field=self.key_field,
            # Record where the rows came from, for debugging/auditing.
            origin=str(self.path),
            normalizer=self.normalizer,
        )
        return SourceResult(entities=[entity])

    @staticmethod
    def _read_csv(path: Path) -> List[Dict[str, Any]]:
        """Return every row of *path* as a plain header-keyed dict."""
        # utf-8-sig transparently strips a BOM if the file was exported from Excel.
        with path.open(newline="", encoding="utf-8-sig") as fh:
            return [dict(row) for row in csv.DictReader(fh)]
class FeedListSource(DataSource):
    """Loads newline-delimited feed URLs into catalog metadata."""

    def __init__(self, name: str, path: Path, metadata_key: str) -> None:
        super().__init__(name)
        self.path = path
        self.metadata_key = metadata_key

    def load(self) -> SourceResult:
        """Read the URL list; an absent file yields an empty result."""
        if not self.path.exists():
            return SourceResult()
        urls: List[str] = []
        for raw_line in self.path.read_text(encoding="utf-8").splitlines():
            url = raw_line.strip()
            # Blank (or whitespace-only) lines are separators, not URLs.
            if url:
                urls.append(url)
        return SourceResult(metadata={self.metadata_key: {"urls": urls}})
def default_sources(base_dir: Path, *, name_normalizer: Optional[Callable[[str], str]] = None) -> List[DataSource]:
    """
    Produce the default set of data sources used by the backend.
    Additional sources (e.g., TA office hours) can be appended to this list
    without modifying the rest of the pipeline.
    """
    base_dir = base_dir.resolve()
    # Fall back to the shared canonicalizer when the caller supplies none.
    normalizer = name_normalizer or canonicalize_name
    # (source name, csv filename, entity name, key column) — order matters:
    # it is the registration order of the resulting sources.
    csv_specs = [
        ("faculty_roster", "faculty_2.csv", "faculty", "Name"),
        ("faculty_offices", "Faculty.csv", "faculty_offices", "Assignee Name"),
        ("staff_roster", "staff.csv", "staff", "Name"),
        ("students_roster", "students.csv", "students", "Name"),
        ("office_hours", "CS Office Hours Room Reservations.csv", "office_hours", "Course Name"),
        ("centers_catalog", "centers.csv", "centers", "Name"),
        ("mudd_seating", "Mudd Seating Sample.csv", "mudd_seating", "Student/Visitor"),
    ]
    sources: List[DataSource] = [
        CSVSource(
            name=source_name,
            path=base_dir / filename,
            entity_name=entity_name,
            key_field=key_column,
            normalizer=normalizer,
        )
        for source_name, filename, entity_name, key_column in csv_specs
    ]
    sources.append(
        FeedListSource(
            name="event_feeds",
            path=base_dir / "feed.txt",
            metadata_key="event_feeds",
        )
    )
    return sources