Spaces:
Sleeping
Sleeping
| """Dataset loader for CSV tables and metadata. | |
| Provides a single abstraction for loading any CSV table or the dataset | |
| metadata JSON from the configured dataset directory. All path resolution | |
| is relative to the dataset_dir parameter — no hardcoded paths. | |
| """ | |
| import json | |
| import logging | |
| from pathlib import Path | |
| import pandas as pd | |
| from app.core.exceptions import DatasetError | |
| logger = logging.getLogger(__name__) | |
| class DatasetLoader: | |
| """Loads CSV tables and metadata from the dataset directory. | |
| All file paths are resolved relative to the dataset_dir provided at | |
| construction time. Table names are accepted with or without the .csv | |
| extension. No columns are renamed, dropped, or aliased during loading. | |
| """ | |
| def __init__(self, dataset_dir: str | Path) -> None: | |
| self._dataset_dir = Path(dataset_dir).resolve() | |
| def dataset_dir(self) -> Path: | |
| """The resolved dataset directory path.""" | |
| return self._dataset_dir | |
| def load_table(self, table_name: str) -> pd.DataFrame: | |
| """Load a CSV table by name and return it as a DataFrame. | |
| Accepts table names with or without the .csv extension | |
| (e.g., "learning_outcomes" or "learning_outcomes.csv"). | |
| Args: | |
| table_name: Name of the CSV table to load. | |
| Returns: | |
| A pandas DataFrame with the table contents. | |
| Raises: | |
| DatasetError: If the CSV file does not exist at the resolved path. | |
| """ | |
| path = self.get_table_path(table_name) | |
| if not path.exists(): | |
| raise DatasetError( | |
| f"Table file not found: {path}" | |
| ) | |
| try: | |
| df = pd.read_csv(path) | |
| except Exception as exc: | |
| raise DatasetError( | |
| f"Failed to read table '{table_name}' at {path}: {exc}" | |
| ) from exc | |
| logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns)) | |
| return df | |
| def load_metadata(self) -> dict: | |
| """Load and parse dataset_metadata.json from the dataset directory. | |
| Returns: | |
| A dictionary with the parsed JSON content. | |
| Raises: | |
| DatasetError: If the metadata file does not exist or cannot be parsed. | |
| """ | |
| path = self._dataset_dir / "dataset_metadata.json" | |
| if not path.exists(): | |
| raise DatasetError( | |
| f"Metadata file not found: {path}" | |
| ) | |
| try: | |
| with open(path, encoding="utf-8") as f: | |
| metadata = json.load(f) | |
| except json.JSONDecodeError as exc: | |
| raise DatasetError( | |
| f"Failed to parse metadata JSON at {path}: {exc}" | |
| ) from exc | |
| except Exception as exc: | |
| raise DatasetError( | |
| f"Failed to read metadata file at {path}: {exc}" | |
| ) from exc | |
| logger.info("Loaded dataset metadata from %s", path) | |
| return metadata | |
| def list_tables(self) -> list[str]: | |
| """Return a list of available CSV table names without extension. | |
| Scans the dataset directory for .csv files and returns their | |
| stem names (e.g., "learning_outcomes", "student_profiles"). | |
| Returns: | |
| A sorted list of table name strings without the .csv extension. | |
| """ | |
| csv_files = sorted(self._dataset_dir.glob("*.csv")) | |
| return [f.stem for f in csv_files] | |
| def get_table_path(self, table_name: str) -> Path: | |
| """Resolve the full path for a table name. | |
| Handles table names with or without the .csv extension. | |
| Does NOT verify that the file exists. | |
| Args: | |
| table_name: Name of the table (with or without .csv extension). | |
| Returns: | |
| The resolved Path to the CSV file. | |
| """ | |
| if not table_name.endswith(".csv"): | |
| table_name = f"{table_name}.csv" | |
| return self._dataset_dir / table_name | |