Spaces:

orderlymirror
/

lov2

Sleeping

lov2 / app /data /loader.py

work-sejal

Use remote HF dataset repo orderlymirror/lo and align HF Space env paths

c327792 about 1 month ago

5.48 kB

	"""Dataset loader for CSV tables and metadata.

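	Provides a single abstraction for loading any CSV table or the dataset
	metadata JSON from the configured dataset directory. Paths are resolved
	relative to the dataset_dir parameter or, when that directory is missing
	or lacks dataset_metadata.json, relative to a snapshot downloaded from the
	configured Hugging Face dataset repo — no hardcoded paths.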
	"""

	import json
	import logging
	import os
	from pathlib import Path

	import pandas as pd
	from huggingface_hub import snapshot_download

	from app.core.config import settings
	from app.core.exceptions import DatasetError

	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)


	class DatasetLoader:
	"""Loads CSV tables and metadata from the dataset directory.

	All file paths are resolved relative to the dataset_dir provided at
	construction time. Table names are accepted with or without the .csv
	extension. No columns are renamed, dropped, or aliased during loading.
	"""

	def __init__(self, dataset_dir: str \| Path) -> None:
	self._requested_dataset_dir = Path(dataset_dir)
	self._dataset_repo_id = os.getenv("DATASET_REPO_ID") or settings.dataset_repo_id
	self._dataset_dir = self._resolve_dataset_dir()

	def _resolve_dataset_dir(self) -> Path:
	resolved_dataset_dir = self._requested_dataset_dir.resolve()

	if resolved_dataset_dir.exists() and (resolved_dataset_dir / "dataset_metadata.json").exists():
	return resolved_dataset_dir

	if not self._dataset_repo_id:
	raise DatasetError(
	f"Dataset directory not found: {resolved_dataset_dir}. "
	"Set DATASET_REPO_ID to a valid Hugging Face dataset repo."
	)

	try:
	downloaded_path = snapshot_download(
	repo_id=self._dataset_repo_id,
	repo_type="dataset",
	)
	resolved_path = Path(downloaded_path).resolve()
	except Exception as exc:
	raise DatasetError(
	f"Failed to download dataset from Hugging Face repo '{self._dataset_repo_id}': {exc}"
	) from exc

	if not (resolved_path / "dataset_metadata.json").exists():
	raise DatasetError(
	f"Downloaded dataset from HF repo '{self._dataset_repo_id}' is missing dataset_metadata.json"
	)

	logger.info("Downloaded dataset from Hugging Face repo '%s' to %s", self._dataset_repo_id, resolved_path)
	return resolved_path

	@property
	def dataset_dir(self) -> Path:
	"""The resolved dataset directory path."""
	return self._dataset_dir

	def load_table(self, table_name: str) -> pd.DataFrame:
	"""Load a CSV table by name and return it as a DataFrame.

	Accepts table names with or without the .csv extension
	(e.g., "learning_outcomes" or "learning_outcomes.csv").

	Args:
	table_name: Name of the CSV table to load.

	Returns:
	A pandas DataFrame with the table contents.

	Raises:
	DatasetError: If the CSV file does not exist at the resolved path.
	"""
	path = self.get_table_path(table_name)

	if not path.exists():
	raise DatasetError(
	f"Table file not found: {path}"
	)

	try:
	df = pd.read_csv(path)
	except Exception as exc:
	raise DatasetError(
	f"Failed to read table '{table_name}' at {path}: {exc}"
	) from exc

	logger.info("Loaded table '%s' — %d rows, %d columns", table_name, len(df), len(df.columns))
	return df

	def load_metadata(self) -> dict:
	"""Load and parse dataset_metadata.json from the dataset directory.

	Returns:
	A dictionary with the parsed JSON content.

	Raises:
	DatasetError: If the metadata file does not exist or cannot be parsed.
	"""
	path = self._dataset_dir / "dataset_metadata.json"

	if not path.exists():
	raise DatasetError(
	f"Metadata file not found: {path}"
	)

	try:
	with open(path, encoding="utf-8") as f:
	metadata = json.load(f)
	except json.JSONDecodeError as exc:
	raise DatasetError(
	f"Failed to parse metadata JSON at {path}: {exc}"
	) from exc
	except Exception as exc:
	raise DatasetError(
	f"Failed to read metadata file at {path}: {exc}"
	) from exc

	logger.info("Loaded dataset metadata from %s", path)
	return metadata

	def list_tables(self) -> list[str]:
	"""Return a list of available CSV table names without extension.

	Scans the dataset directory for .csv files and returns their
	stem names (e.g., "learning_outcomes", "student_profiles").

	Returns:
	A sorted list of table name strings without the .csv extension.
	"""
	csv_files = sorted(self._dataset_dir.glob("*.csv"))
	return [f.stem for f in csv_files]

	def get_table_path(self, table_name: str) -> Path:
	"""Resolve the full path for a table name.

	Handles table names with or without the .csv extension.
	Does NOT verify that the file exists.

	Args:
	table_name: Name of the table (with or without .csv extension).

	Returns:
	The resolved Path to the CSV file.
	"""
	if not table_name.endswith(".csv"):
	table_name = f"{table_name}.csv"
	return self._dataset_dir / table_name