"""Dataset loader for CSV tables and metadata. Provides a single abstraction for loading any CSV table or the dataset metadata JSON from the configured dataset directory. All path resolution is relative to the dataset_dir parameter — no hardcoded paths. """ import json import logging from pathlib import Path import pandas as pd from app.core.exceptions import DatasetError logger = logging.getLogger(__name__) class DatasetLoader: """Loads CSV tables and metadata from the dataset directory. All file paths are resolved relative to the dataset_dir provided at construction time. Table names are accepted with or without the .csv extension. No columns are renamed, dropped, or aliased during loading. """ def __init__(self, dataset_dir: str | Path) -> None: self._dataset_dir = Path(dataset_dir).resolve() @property def dataset_dir(self) -> Path: """The resolved dataset directory path.""" return self._dataset_dir def load_table(self, table_name: str) -> pd.DataFrame: """Load a CSV table by name and return it as a DataFrame. Accepts table names with or without the .csv extension (e.g., "learning_outcomes" or "learning_outcomes.csv"). Args: table_name: Name of the CSV table to load. Returns: A pandas DataFrame with the table contents. Raises: DatasetError: If the CSV file does not exist at the resolved path. """ path = self.get_table_path(table_name) if not path.exists(): raise DatasetError( f"Table file not found: {path}" ) try: df = pd.read_csv(path) except Exception as exc: raise DatasetError( f"Failed to read table '{table_name}' at {path}: {exc}" ) from exc logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns)) return df def load_metadata(self) -> dict: """Load and parse dataset_metadata.json from the dataset directory. Returns: A dictionary with the parsed JSON content. Raises: DatasetError: If the metadata file does not exist or cannot be parsed. """ path = self._dataset_dir / "dataset_metadata.json" if not path.exists(): raise DatasetError( f"Metadata file not found: {path}" ) try: with open(path, encoding="utf-8") as f: metadata = json.load(f) except json.JSONDecodeError as exc: raise DatasetError( f"Failed to parse metadata JSON at {path}: {exc}" ) from exc except Exception as exc: raise DatasetError( f"Failed to read metadata file at {path}: {exc}" ) from exc logger.info("Loaded dataset metadata from %s", path) return metadata def list_tables(self) -> list[str]: """Return a list of available CSV table names without extension. Scans the dataset directory for .csv files and returns their stem names (e.g., "learning_outcomes", "student_profiles"). Returns: A sorted list of table name strings without the .csv extension. """ csv_files = sorted(self._dataset_dir.glob("*.csv")) return [f.stem for f in csv_files] def get_table_path(self, table_name: str) -> Path: """Resolve the full path for a table name. Handles table names with or without the .csv extension. Does NOT verify that the file exists. Args: table_name: Name of the table (with or without .csv extension). Returns: The resolved Path to the CSV file. """ if not table_name.endswith(".csv"): table_name = f"{table_name}.csv" return self._dataset_dir / table_name