| """Parse MovieLens dataset files into raw DataFrames. Variant-aware. |
| |
| ml-1m: legacy `.dat` files with `::` separator, latin-1 encoding, has user demographics. |
| ml-32m (and other modern releases): `.csv` files with headers, comma separator, |
| UTF-8 encoding, no user demographics. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Final |
|
|
| import pandas as pd |
|
|
| from ..logging_utils import get_logger |
|
|
| _logger = get_logger(__name__) |
|
|
| |
# --- ml-1m (`.dat` files) ---------------------------------------------------
# Field delimiter and text encoding used when reading the ml-1m `.dat` files.
_ML1M_SEP: Final[str] = "::"
_ML1M_ENCODING: Final[str] = "latin-1"

# The `.dat` files are read without a header row, so the column names for each
# table are supplied positionally from these tuples.
_ML1M_RATINGS_COLS: Final[tuple[str, ...]] = (
    "user_id",
    "movie_id",
    "rating",
    "timestamp",
)
_ML1M_USERS_COLS: Final[tuple[str, ...]] = (
    "user_id",
    "gender",
    "age",
    "occupation",
    "zip",
)
_ML1M_MOVIES_COLS: Final[tuple[str, ...]] = (
    "movie_id",
    "title",
    "genres",
)

# --- CSV variants -----------------------------------------------------------
# Header renames applied to the CSV tables: camelCase on disk -> snake_case.
_CSV_RATINGS_RENAME: Final[dict[str, str]] = {
    "userId": "user_id",
    "movieId": "movie_id",
}
_CSV_MOVIES_RENAME: Final[dict[str, str]] = {
    "movieId": "movie_id",
}
|
|
|
|
@dataclass(frozen=True)
class RawFrames:
    """Immutable bundle of the MovieLens tables exactly as read from disk.

    `ratings` and `movies` are always present. `users` defaults to None and
    stays None for dataset variants that ship no user-demographics table.
    """

    # One row per rating event.
    ratings: pd.DataFrame
    # One row per movie.
    movies: pd.DataFrame
    # User demographics; None when the variant does not provide them.
    users: pd.DataFrame | None = None
|
|
|
|
def load_raw(dataset_dir: Path | str, variant: str) -> RawFrames:
    """Load the raw tables for `variant` from `dataset_dir`.

    "ml-1m" is read from the legacy `.dat` files; "ml-25m", "ml-32m",
    "ml-latest" and "ml-latest-small" are read from headered CSV files. Both
    loaders return tables with the same snake_case column names.

    Raises:
        ValueError: if `variant` is not one of the names listed above.
    """
    root = Path(dataset_dir)
    if variant == "ml-1m":
        loader = _load_ml1m
    elif variant in {"ml-25m", "ml-32m", "ml-latest", "ml-latest-small"}:
        loader = _load_csv_variant
    else:
        raise ValueError(f"unsupported dataset variant: {variant!r}")
    return loader(root)
|
|
|
|
| |
|
|
def _load_ml1m(dataset_dir: Path) -> RawFrames:
    """Load the ml-1m `ratings.dat`, `users.dat` and `movies.dat` tables.

    Column dtypes are fixed while parsing instead of being cast afterwards.
    Casting after the fact is too late for text-like columns: a `zip` column
    whose values all happen to be digits would first be inferred as integer,
    turning "02460" into 2460 before it is converted to a string.

    Raises:
        FileNotFoundError: if any of the three `.dat` files is missing.
    """
    ratings = _read_dat(
        dataset_dir / "ratings.dat",
        _ML1M_RATINGS_COLS,
        dtype={
            "user_id": "int64",
            "movie_id": "int64",
            "rating": "int64",
            "timestamp": "int64",
        },
    )
    # `gender` is intentionally not listed: it keeps whatever dtype pandas
    # infers, exactly as it did before dtypes were moved to parse time.
    users = _read_dat(
        dataset_dir / "users.dat",
        _ML1M_USERS_COLS,
        dtype={
            "user_id": "int64",
            "age": "int64",
            "occupation": "int64",
            "zip": "string",
        },
    )
    movies = _read_dat(
        dataset_dir / "movies.dat",
        _ML1M_MOVIES_COLS,
        dtype={"movie_id": "int64", "title": "string", "genres": "string"},
    )

    _logger.info(
        "Loaded ml-1m: %d ratings, %d users, %d movies",
        len(ratings), len(users), len(movies),
    )
    return RawFrames(ratings=ratings, users=users, movies=movies)


def _read_dat(
    path: Path,
    columns: tuple[str, ...],
    dtype: dict[str, str] | None = None,
) -> pd.DataFrame:
    """Read one headerless ml-1m `.dat` file into a DataFrame.

    Args:
        path: location of the `.dat` file.
        columns: column names, in file order (the file has no header row).
        dtype: optional column-name -> dtype mapping applied while parsing.
            Columns not listed, or all columns when None, use pandas'
            type inference.

    Raises:
        FileNotFoundError: if `path` is not an existing regular file.
    """
    if not path.is_file():
        raise FileNotFoundError(f"expected dataset file missing: {path}")
    return pd.read_csv(
        path,
        sep=_ML1M_SEP,
        names=list(columns),
        # The multi-character separator is not supported by the C parser.
        engine="python",
        encoding=_ML1M_ENCODING,
        header=None,
        dtype=dtype,
    )
|
|
|
|
| |
|
|
def _load_csv_variant(dataset_dir: Path) -> RawFrames:
    """Load `ratings.csv` and `movies.csv` for the CSV-based variants.

    Column dtypes are fixed while parsing rather than by a later `astype`, so
    the ratings table is never materialised a second time just to change
    dtypes. `users` is always None in the result.

    Raises:
        FileNotFoundError: if `ratings.csv` or `movies.csv` is missing.
        KeyError: if a file lacks one of the expected columns.
    """
    ratings_path = dataset_dir / "ratings.csv"
    movies_path = dataset_dir / "movies.csv"
    # Check both files before the (potentially long) ratings parse starts.
    for required in (ratings_path, movies_path):
        if not required.is_file():
            raise FileNotFoundError(f"expected dataset file missing: {required}")

    ratings = _read_typed_csv(
        ratings_path,
        _CSV_RATINGS_RENAME,
        {
            "user_id": "int64",
            "movie_id": "int64",
            "rating": "float32",
            "timestamp": "int64",
        },
    )
    movies = _read_typed_csv(
        movies_path,
        _CSV_MOVIES_RENAME,
        {"movie_id": "int64", "title": "string", "genres": "string"},
    )

    _logger.info(
        "Loaded csv variant: %d ratings, %d movies (no user demographics)",
        len(ratings), len(movies),
    )
    return RawFrames(ratings=ratings, movies=movies, users=None)


def _read_typed_csv(
    path: Path,
    rename: dict[str, str],
    dtypes: dict[str, str],
) -> pd.DataFrame:
    """Read a headered CSV, rename its columns and enforce `dtypes`.

    `dtypes` is keyed by the post-rename column names. Each dtype is handed to
    the parser under both the on-disk and the renamed spelling, so a file
    whose header already uses the renamed names is typed the same way.

    Raises:
        KeyError: if a column named in `dtypes` is absent after renaming.
    """
    parse_dtypes = dict(dtypes)
    for on_disk, internal in rename.items():
        if internal in dtypes:
            parse_dtypes[on_disk] = dtypes[internal]

    frame = pd.read_csv(path, dtype=parse_dtypes).rename(columns=rename)

    # The parser ignores dtype entries for columns that are not in the file,
    # so a missing column has to be reported explicitly.
    missing = [name for name in dtypes if name not in frame.columns]
    if missing:
        raise KeyError(f"{path} is missing expected column(s): {missing}")
    return frame
|
|