rc-docker / src /recsys /data /loader.py
moecr7
Dockerize rc-ranked: FastAPI service for HF Spaces
188f0cf
"""Parse MovieLens dataset files into raw DataFrames. Variant-aware.
ml-1m: legacy `.dat` files with `::` separator, latin-1 encoding, has user demographics.
ml-32m (and other modern releases): `.csv` files with headers, comma separator,
UTF-8 encoding, no user demographics.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Final
import pandas as pd
from ..logging_utils import get_logger
_logger = get_logger(__name__)
# ---------- ml-1m schema (legacy `.dat`) ----------
_ML1M_SEP: Final[str] = "::"
_ML1M_ENCODING: Final[str] = "latin-1"
_ML1M_RATINGS_COLS: Final[tuple[str, ...]] = ("user_id", "movie_id", "rating", "timestamp")
_ML1M_USERS_COLS: Final[tuple[str, ...]] = ("user_id", "gender", "age", "occupation", "zip")
_ML1M_MOVIES_COLS: Final[tuple[str, ...]] = ("movie_id", "title", "genres")
# ---------- ml-25m / ml-32m schema (modern `.csv`) ----------
_CSV_RATINGS_RENAME: Final[dict[str, str]] = {"userId": "user_id", "movieId": "movie_id"}
_CSV_MOVIES_RENAME: Final[dict[str, str]] = {"movieId": "movie_id"}
@dataclass(frozen=True)
class RawFrames:
"""Raw MovieLens tables as parsed from disk, with no transformations applied.
`users` is None on variants that don't ship user demographics (ml-25m, ml-32m).
"""
ratings: pd.DataFrame
movies: pd.DataFrame
users: pd.DataFrame | None = None
def load_raw(dataset_dir: Path | str, variant: str) -> RawFrames:
"""Dispatch on `variant`. Returns frames in a uniform internal schema."""
dataset_dir = Path(dataset_dir)
if variant == "ml-1m":
return _load_ml1m(dataset_dir)
if variant in {"ml-25m", "ml-32m", "ml-latest", "ml-latest-small"}:
return _load_csv_variant(dataset_dir)
raise ValueError(f"unsupported dataset variant: {variant!r}")
# ---------- ml-1m loader ----------
def _load_ml1m(dataset_dir: Path) -> RawFrames:
ratings = _read_dat(dataset_dir / "ratings.dat", _ML1M_RATINGS_COLS)
users = _read_dat(dataset_dir / "users.dat", _ML1M_USERS_COLS)
movies = _read_dat(dataset_dir / "movies.dat", _ML1M_MOVIES_COLS)
ratings = ratings.astype(
{"user_id": "int64", "movie_id": "int64", "rating": "int64", "timestamp": "int64"}
)
users = users.astype(
{"user_id": "int64", "age": "int64", "occupation": "int64", "zip": "string"}
)
movies = movies.astype({"movie_id": "int64", "title": "string", "genres": "string"})
_logger.info(
"Loaded ml-1m: %d ratings, %d users, %d movies",
len(ratings), len(users), len(movies),
)
return RawFrames(ratings=ratings, users=users, movies=movies)
def _read_dat(path: Path, columns: tuple[str, ...]) -> pd.DataFrame:
if not path.is_file():
raise FileNotFoundError(f"expected dataset file missing: {path}")
return pd.read_csv(
path,
sep=_ML1M_SEP,
names=list(columns),
engine="python",
encoding=_ML1M_ENCODING,
header=None,
)
# ---------- ml-25m / ml-32m / ml-latest loader ----------
def _load_csv_variant(dataset_dir: Path) -> RawFrames:
"""ml-25m, ml-32m and ml-latest all share the same CSV schema."""
ratings_path = dataset_dir / "ratings.csv"
movies_path = dataset_dir / "movies.csv"
if not ratings_path.is_file():
raise FileNotFoundError(f"expected dataset file missing: {ratings_path}")
if not movies_path.is_file():
raise FileNotFoundError(f"expected dataset file missing: {movies_path}")
# Modern csv variants are UTF-8 with header rows. Titles can contain commas
# (escaped via double-quotes) — the default csv parser handles that.
ratings = pd.read_csv(ratings_path).rename(columns=_CSV_RATINGS_RENAME)
movies = pd.read_csv(movies_path).rename(columns=_CSV_MOVIES_RENAME)
# Pin dtypes. Note: ratings can be half-stars (0.5–5.0) in modern variants,
# so `rating` is float, not int.
ratings = ratings.astype(
{"user_id": "int64", "movie_id": "int64", "rating": "float32", "timestamp": "int64"}
)
movies = movies.astype({"movie_id": "int64", "title": "string", "genres": "string"})
_logger.info(
"Loaded csv variant: %d ratings, %d movies (no user demographics)",
len(ratings), len(movies),
)
return RawFrames(ratings=ratings, movies=movies, users=None)