Spaces:
Sleeping
Sleeping
File size: 5,481 Bytes
"""Dataset loader for CSV tables and metadata.
Provides a single abstraction for loading any CSV table or the dataset
metadata JSON from the configured dataset directory. All path resolution
is relative to the dataset_dir parameter — no hardcoded paths.
"""
import json
import logging
import os
from pathlib import Path
import pandas as pd
from huggingface_hub import snapshot_download
from app.core.config import settings
from app.core.exceptions import DatasetError
logger = logging.getLogger(__name__)
class DatasetLoader:
"""Loads CSV tables and metadata from the dataset directory.
All file paths are resolved relative to the dataset_dir provided at
construction time. Table names are accepted with or without the .csv
extension. No columns are renamed, dropped, or aliased during loading.
"""
def __init__(self, dataset_dir: str | Path) -> None:
self._requested_dataset_dir = Path(dataset_dir)
self._dataset_repo_id = os.getenv("DATASET_REPO_ID") or settings.dataset_repo_id
self._dataset_dir = self._resolve_dataset_dir()
def _resolve_dataset_dir(self) -> Path:
resolved_dataset_dir = self._requested_dataset_dir.resolve()
if resolved_dataset_dir.exists() and (resolved_dataset_dir / "dataset_metadata.json").exists():
return resolved_dataset_dir
if not self._dataset_repo_id:
raise DatasetError(
f"Dataset directory not found: {resolved_dataset_dir}. "
"Set DATASET_REPO_ID to a valid Hugging Face dataset repo."
)
try:
downloaded_path = snapshot_download(
repo_id=self._dataset_repo_id,
repo_type="dataset",
)
resolved_path = Path(downloaded_path).resolve()
except Exception as exc:
raise DatasetError(
f"Failed to download dataset from Hugging Face repo '{self._dataset_repo_id}': {exc}"
) from exc
if not (resolved_path / "dataset_metadata.json").exists():
raise DatasetError(
f"Downloaded dataset from HF repo '{self._dataset_repo_id}' is missing dataset_metadata.json"
)
logger.info("Downloaded dataset from Hugging Face repo '%s' to %s", self._dataset_repo_id, resolved_path)
return resolved_path
@property
def dataset_dir(self) -> Path:
"""The resolved dataset directory path."""
return self._dataset_dir
def load_table(self, table_name: str) -> pd.DataFrame:
"""Load a CSV table by name and return it as a DataFrame.
Accepts table names with or without the .csv extension
(e.g., "learning_outcomes" or "learning_outcomes.csv").
Args:
table_name: Name of the CSV table to load.
Returns:
A pandas DataFrame with the table contents.
Raises:
DatasetError: If the CSV file does not exist at the resolved path.
"""
path = self.get_table_path(table_name)
if not path.exists():
raise DatasetError(
f"Table file not found: {path}"
)
try:
df = pd.read_csv(path)
except Exception as exc:
raise DatasetError(
f"Failed to read table '{table_name}' at {path}: {exc}"
) from exc
logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns))
return df
def load_metadata(self) -> dict:
"""Load and parse dataset_metadata.json from the dataset directory.
Returns:
A dictionary with the parsed JSON content.
Raises:
DatasetError: If the metadata file does not exist or cannot be parsed.
"""
path = self._dataset_dir / "dataset_metadata.json"
if not path.exists():
raise DatasetError(
f"Metadata file not found: {path}"
)
try:
with open(path, encoding="utf-8") as f:
metadata = json.load(f)
except json.JSONDecodeError as exc:
raise DatasetError(
f"Failed to parse metadata JSON at {path}: {exc}"
) from exc
except Exception as exc:
raise DatasetError(
f"Failed to read metadata file at {path}: {exc}"
) from exc
logger.info("Loaded dataset metadata from %s", path)
return metadata
def list_tables(self) -> list[str]:
"""Return a list of available CSV table names without extension.
Scans the dataset directory for .csv files and returns their
stem names (e.g., "learning_outcomes", "student_profiles").
Returns:
A sorted list of table name strings without the .csv extension.
"""
csv_files = sorted(self._dataset_dir.glob("*.csv"))
return [f.stem for f in csv_files]
def get_table_path(self, table_name: str) -> Path:
"""Resolve the full path for a table name.
Handles table names with or without the .csv extension.
Does NOT verify that the file exists.
Args:
table_name: Name of the table (with or without .csv extension).
Returns:
The resolved Path to the CSV file.
"""
if not table_name.endswith(".csv"):
table_name = f"{table_name}.csv"
return self._dataset_dir / table_name
|