File size: 5,481 Bytes
3795605
 
 
 
 
 
 
 
 
b7766ce
3795605
 
 
b7766ce
3795605
c327792
3795605
 
 
 
 
 
 
 
 
 
 
 
 
 
b7766ce
c327792
b7766ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3795605
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""Dataset loader for CSV tables and metadata.

Provides a single abstraction for loading any CSV table or the dataset
metadata JSON from the configured dataset directory. All path resolution
is relative to the dataset_dir parameter — no hardcoded paths.
"""

import json
import logging
import os
from pathlib import Path

import pandas as pd
from huggingface_hub import snapshot_download

from app.core.config import settings
from app.core.exceptions import DatasetError

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class DatasetLoader:
    """Loads CSV tables and metadata from the dataset directory.

    All file paths are resolved relative to the dataset_dir provided at
    construction time. Table names are accepted with or without the .csv
    extension. No columns are renamed, dropped, or aliased during loading.
    """

    def __init__(self, dataset_dir: str | Path) -> None:
        self._requested_dataset_dir = Path(dataset_dir)
        self._dataset_repo_id = os.getenv("DATASET_REPO_ID") or settings.dataset_repo_id
        self._dataset_dir = self._resolve_dataset_dir()

    def _resolve_dataset_dir(self) -> Path:
        resolved_dataset_dir = self._requested_dataset_dir.resolve()

        if resolved_dataset_dir.exists() and (resolved_dataset_dir / "dataset_metadata.json").exists():
            return resolved_dataset_dir

        if not self._dataset_repo_id:
            raise DatasetError(
                f"Dataset directory not found: {resolved_dataset_dir}. "
                "Set DATASET_REPO_ID to a valid Hugging Face dataset repo."
            )

        try:
            downloaded_path = snapshot_download(
                repo_id=self._dataset_repo_id,
                repo_type="dataset",
            )
            resolved_path = Path(downloaded_path).resolve()
        except Exception as exc:
            raise DatasetError(
                f"Failed to download dataset from Hugging Face repo '{self._dataset_repo_id}': {exc}"
            ) from exc

        if not (resolved_path / "dataset_metadata.json").exists():
            raise DatasetError(
                f"Downloaded dataset from HF repo '{self._dataset_repo_id}' is missing dataset_metadata.json"
            )

        logger.info("Downloaded dataset from Hugging Face repo '%s' to %s", self._dataset_repo_id, resolved_path)
        return resolved_path

    @property
    def dataset_dir(self) -> Path:
        """The resolved dataset directory path."""
        return self._dataset_dir

    def load_table(self, table_name: str) -> pd.DataFrame:
        """Load a CSV table by name and return it as a DataFrame.

        Accepts table names with or without the .csv extension
        (e.g., "learning_outcomes" or "learning_outcomes.csv").

        Args:
            table_name: Name of the CSV table to load.

        Returns:
            A pandas DataFrame with the table contents.

        Raises:
            DatasetError: If the CSV file does not exist at the resolved path.
        """
        path = self.get_table_path(table_name)

        if not path.exists():
            raise DatasetError(
                f"Table file not found: {path}"
            )

        try:
            df = pd.read_csv(path)
        except Exception as exc:
            raise DatasetError(
                f"Failed to read table '{table_name}' at {path}: {exc}"
            ) from exc

        logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns))
        return df

    def load_metadata(self) -> dict:
        """Load and parse dataset_metadata.json from the dataset directory.

        Returns:
            A dictionary with the parsed JSON content.

        Raises:
            DatasetError: If the metadata file does not exist or cannot be parsed.
        """
        path = self._dataset_dir / "dataset_metadata.json"

        if not path.exists():
            raise DatasetError(
                f"Metadata file not found: {path}"
            )

        try:
            with open(path, encoding="utf-8") as f:
                metadata = json.load(f)
        except json.JSONDecodeError as exc:
            raise DatasetError(
                f"Failed to parse metadata JSON at {path}: {exc}"
            ) from exc
        except Exception as exc:
            raise DatasetError(
                f"Failed to read metadata file at {path}: {exc}"
            ) from exc

        logger.info("Loaded dataset metadata from %s", path)
        return metadata

    def list_tables(self) -> list[str]:
        """Return a list of available CSV table names without extension.

        Scans the dataset directory for .csv files and returns their
        stem names (e.g., "learning_outcomes", "student_profiles").

        Returns:
            A sorted list of table name strings without the .csv extension.
        """
        csv_files = sorted(self._dataset_dir.glob("*.csv"))
        return [f.stem for f in csv_files]

    def get_table_path(self, table_name: str) -> Path:
        """Resolve the full path for a table name.

        Handles table names with or without the .csv extension.
        Does NOT verify that the file exists.

        Args:
            table_name: Name of the table (with or without .csv extension).

        Returns:
            The resolved Path to the CSV file.
        """
        if not table_name.endswith(".csv"):
            table_name = f"{table_name}.csv"
        return self._dataset_dir / table_name