| | """Utilities for loading raw Amazon electronics review data.""" |
| |
|
| | from __future__ import annotations |
| |
|
| | from dataclasses import dataclass |
| | from pathlib import Path |
| | from typing import Iterable, Optional |
| |
|
| | import pandas as pd |
| |
|
| | from src.utils.exception import CustomException |
| |
|
| |
|
| | @dataclass |
| | class ReviewDatasetLoader: |
| | """Load JSON-lines review dumps with optional sampling.""" |
| |
|
| | data_path: Path |
| | sample_size: Optional[int] = None |
| | random_state: int = 42 |
| |
|
| | def _read_jsonl(self) -> Iterable[dict]: |
| | if not self.data_path.exists(): |
| | raise CustomException(f"Dataset not found at {self.data_path}") |
| | import json |
| |
|
| | with self.data_path.open("r", encoding="utf-8") as handle: |
| | for line in handle: |
| | line = line.strip() |
| | if line: |
| | yield json.loads(line) |
| |
|
| | def load(self) -> pd.DataFrame: |
| | records = list(self._read_jsonl()) |
| | if not records: |
| | raise CustomException("Dataset file is empty") |
| | df = pd.DataFrame(records) |
| | df = df.dropna(subset=["reviewText"]).reset_index(drop=True) |
| | if self.sample_size and len(df) > self.sample_size: |
| | df = df.sample(self.sample_size, random_state=self.random_state) |
| | df["reviewText"] = df["reviewText"].astype(str) |
| | return df |
| |
|
| |
|
| | __all__ = ["ReviewDatasetLoader"] |
| |
|