File size: 3,971 Bytes
70ea7be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Dataset loader for CSV tables and metadata.

Provides a single abstraction for loading any CSV table or the dataset
metadata JSON from the configured dataset directory. All path resolution
is relative to the dataset_dir parameter — no hardcoded paths.
"""

import json
import logging
from pathlib import Path

import pandas as pd

from app.core.exceptions import DatasetError

logger = logging.getLogger(__name__)


class DatasetLoader:
    """Loads CSV tables and metadata from the dataset directory.

    All file paths are resolved relative to the dataset_dir provided at
    construction time. Table names are accepted with or without the .csv
    extension. No columns are renamed, dropped, or aliased during loading.
    """

    def __init__(self, dataset_dir: str | Path) -> None:
        self._dataset_dir = Path(dataset_dir).resolve()

    @property
    def dataset_dir(self) -> Path:
        """The resolved dataset directory path."""
        return self._dataset_dir

    def load_table(self, table_name: str) -> pd.DataFrame:
        """Load a CSV table by name and return it as a DataFrame.

        Accepts table names with or without the .csv extension
        (e.g., "learning_outcomes" or "learning_outcomes.csv").

        Args:
            table_name: Name of the CSV table to load.

        Returns:
            A pandas DataFrame with the table contents.

        Raises:
            DatasetError: If the CSV file does not exist at the resolved path.
        """
        path = self.get_table_path(table_name)

        if not path.exists():
            raise DatasetError(
                f"Table file not found: {path}"
            )

        try:
            df = pd.read_csv(path)
        except Exception as exc:
            raise DatasetError(
                f"Failed to read table '{table_name}' at {path}: {exc}"
            ) from exc

        logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns))
        return df

    def load_metadata(self) -> dict:
        """Load and parse dataset_metadata.json from the dataset directory.

        Returns:
            A dictionary with the parsed JSON content.

        Raises:
            DatasetError: If the metadata file does not exist or cannot be parsed.
        """
        path = self._dataset_dir / "dataset_metadata.json"

        if not path.exists():
            raise DatasetError(
                f"Metadata file not found: {path}"
            )

        try:
            with open(path, encoding="utf-8") as f:
                metadata = json.load(f)
        except json.JSONDecodeError as exc:
            raise DatasetError(
                f"Failed to parse metadata JSON at {path}: {exc}"
            ) from exc
        except Exception as exc:
            raise DatasetError(
                f"Failed to read metadata file at {path}: {exc}"
            ) from exc

        logger.info("Loaded dataset metadata from %s", path)
        return metadata

    def list_tables(self) -> list[str]:
        """Return a list of available CSV table names without extension.

        Scans the dataset directory for .csv files and returns their
        stem names (e.g., "learning_outcomes", "student_profiles").

        Returns:
            A sorted list of table name strings without the .csv extension.
        """
        csv_files = sorted(self._dataset_dir.glob("*.csv"))
        return [f.stem for f in csv_files]

    def get_table_path(self, table_name: str) -> Path:
        """Resolve the full path for a table name.

        Handles table names with or without the .csv extension.
        Does NOT verify that the file exists.

        Args:
            table_name: Name of the table (with or without .csv extension).

        Returns:
            The resolved Path to the CSV file.
        """
        if not table_name.endswith(".csv"):
            table_name = f"{table_name}.csv"
        return self._dataset_dir / table_name