# aaa / app /data /loader.py
# work-sejal
# Deploy AI service with FastAPI
# 70ea7be
# Raw
# History Blame Contribute Delete
# 3.97 kB
"""Dataset loader for CSV tables and metadata.
Provides a single abstraction for loading any CSV table or the dataset
metadata JSON from the configured dataset directory. All path resolution
is relative to the dataset_dir parameter — no hardcoded paths.
"""
import json
import logging
from pathlib import Path
import pandas as pd
from app.core.exceptions import DatasetError
# Module-level logger named after this module; DatasetLoader reports each
# successful table/metadata load through it at INFO level.
logger = logging.getLogger(__name__)
class DatasetLoader:
"""Loads CSV tables and metadata from the dataset directory.
All file paths are resolved relative to the dataset_dir provided at
construction time. Table names are accepted with or without the .csv
extension. No columns are renamed, dropped, or aliased during loading.
"""
def __init__(self, dataset_dir: str | Path) -> None:
self._dataset_dir = Path(dataset_dir).resolve()
@property
def dataset_dir(self) -> Path:
"""The resolved dataset directory path."""
return self._dataset_dir
def load_table(self, table_name: str) -> pd.DataFrame:
"""Load a CSV table by name and return it as a DataFrame.
Accepts table names with or without the .csv extension
(e.g., "learning_outcomes" or "learning_outcomes.csv").
Args:
table_name: Name of the CSV table to load.
Returns:
A pandas DataFrame with the table contents.
Raises:
DatasetError: If the CSV file does not exist at the resolved path.
"""
path = self.get_table_path(table_name)
if not path.exists():
raise DatasetError(
f"Table file not found: {path}"
)
try:
df = pd.read_csv(path)
except Exception as exc:
raise DatasetError(
f"Failed to read table '{table_name}' at {path}: {exc}"
) from exc
logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns))
return df
def load_metadata(self) -> dict:
"""Load and parse dataset_metadata.json from the dataset directory.
Returns:
A dictionary with the parsed JSON content.
Raises:
DatasetError: If the metadata file does not exist or cannot be parsed.
"""
path = self._dataset_dir / "dataset_metadata.json"
if not path.exists():
raise DatasetError(
f"Metadata file not found: {path}"
)
try:
with open(path, encoding="utf-8") as f:
metadata = json.load(f)
except json.JSONDecodeError as exc:
raise DatasetError(
f"Failed to parse metadata JSON at {path}: {exc}"
) from exc
except Exception as exc:
raise DatasetError(
f"Failed to read metadata file at {path}: {exc}"
) from exc
logger.info("Loaded dataset metadata from %s", path)
return metadata
def list_tables(self) -> list[str]:
"""Return a list of available CSV table names without extension.
Scans the dataset directory for .csv files and returns their
stem names (e.g., "learning_outcomes", "student_profiles").
Returns:
A sorted list of table name strings without the .csv extension.
"""
csv_files = sorted(self._dataset_dir.glob("*.csv"))
return [f.stem for f in csv_files]
def get_table_path(self, table_name: str) -> Path:
"""Resolve the full path for a table name.
Handles table names with or without the .csv extension.
Does NOT verify that the file exists.
Args:
table_name: Name of the table (with or without .csv extension).
Returns:
The resolved Path to the CSV file.
"""
if not table_name.endswith(".csv"):
table_name = f"{table_name}.csv"
return self._dataset_dir / table_name