File size: 1,357 Bytes
0116d50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Utilities for loading raw Amazon electronics review data."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional

import pandas as pd

from src.utils.exception import CustomException


@dataclass
class ReviewDatasetLoader:
    """Load a JSON-lines review dump into a DataFrame, optionally sampled.

    Attributes:
        data_path: Location of the JSON-lines file (one JSON value per
            non-blank line). A ``str`` or other path-like value is accepted
            and converted to ``Path`` on construction.
        sample_size: Upper bound on the number of rows returned by ``load``.
            ``None`` or ``0`` disables sampling; sampling is also skipped
            when the cleaned data has no more than ``sample_size`` rows.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated
            loads pick the same rows.
    """

    data_path: Path
    sample_size: Optional[int] = None
    random_state: int = 42

    def __post_init__(self) -> None:
        # The methods below call Path.exists() / Path.open(), so normalise
        # str / os.PathLike inputs instead of failing with AttributeError.
        self.data_path = Path(self.data_path)

    def _read_jsonl(self) -> Iterable[dict]:
        """Yield one parsed JSON value per non-blank line of ``data_path``.

        This is a generator, so the existence check and the file open only
        happen once iteration starts.

        Raises:
            CustomException: If ``data_path`` does not exist.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        if not self.data_path.exists():
            raise CustomException(f"Dataset not found at {self.data_path}")
        import json

        with self.data_path.open("r", encoding="utf-8") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                # Blank / whitespace-only lines are skipped, not parsed.
                if stripped:
                    yield json.loads(stripped)

    def load(self) -> pd.DataFrame:
        """Read the dataset and return it as a cleaned DataFrame.

        Rows whose ``reviewText`` is missing are dropped, the result is
        optionally down-sampled to ``sample_size`` rows, and ``reviewText``
        is cast to ``str``. The returned frame always has a fresh
        ``0..n-1`` index, whether or not sampling took place.

        Returns:
            The cleaned (and possibly sampled) reviews.

        Raises:
            CustomException: If the file is missing or yields no records.
            KeyError: If the records have no ``reviewText`` column
                (raised by ``DataFrame.dropna``).
        """
        records = list(self._read_jsonl())
        if not records:
            raise CustomException("Dataset file is empty")

        df = pd.DataFrame(records).dropna(subset=["reviewText"])
        if self.sample_size and len(df) > self.sample_size:
            df = df.sample(self.sample_size, random_state=self.random_state)
        # Reset AFTER sampling: sample() keeps the original row labels, which
        # would otherwise leave a shuffled, non-contiguous index on the
        # sampled path only.
        df = df.reset_index(drop=True)
        df["reviewText"] = df["reviewText"].astype(str)
        return df


# Public API: only the loader class is exported by ``from <module> import *``.
__all__ = ["ReviewDatasetLoader"]