Anshrathore01's picture
Implement core pipelines and web UI
0116d50
"""Utilities for loading raw Amazon electronics review data."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional
import pandas as pd
from src.utils.exception import CustomException
@dataclass
class ReviewDatasetLoader:
    """Load a JSON-lines review dump into a DataFrame, with optional sampling.

    Attributes:
        data_path: Path to the JSON-lines file (one JSON document per line).
        sample_size: Upper bound on the number of rows returned by ``load``.
            A falsy value (``None`` or ``0``) disables sampling.
        random_state: Seed handed to ``DataFrame.sample`` so that repeated
            loads of the same file return the same sample.
    """

    data_path: Path
    sample_size: Optional[int] = None
    random_state: int = 42

    def _read_jsonl(self) -> Iterable[dict]:
        """Lazily yield one parsed JSON document per non-blank line.

        This is a generator, so the existence check below runs when iteration
        starts rather than when the method is called.

        Raises:
            CustomException: If ``data_path`` does not exist.
        """
        if not self.data_path.exists():
            raise CustomException(f"Dataset not found at {self.data_path}")

        import json

        with self.data_path.open("r", encoding="utf-8") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                # Blank lines (including a trailing newline at EOF) are skipped.
                if stripped:
                    yield json.loads(stripped)

    def load(self) -> pd.DataFrame:
        """Read the dataset, drop rows without review text, and sample.

        Returns:
            A DataFrame with a contiguous ``0..n-1`` index whose
            ``reviewText`` column has been cast to ``str``. When
            ``sample_size`` is set and smaller than the number of usable
            rows, only that many randomly chosen rows are returned.

        Raises:
            CustomException: If the file is missing or yields no records.
        """
        records = list(self._read_jsonl())
        if not records:
            raise CustomException("Dataset file is empty")

        df = pd.DataFrame(records).dropna(subset=["reviewText"])
        if self.sample_size and len(df) > self.sample_size:
            df = df.sample(self.sample_size, random_state=self.random_state)

        # Reset the index only after sampling: ``sample`` keeps the labels of
        # the rows it picked, which would otherwise leave the caller with a
        # scattered, non-contiguous index in the sampled case.
        df = df.reset_index(drop=True)
        df["reviewText"] = df["reviewText"].astype(str)
        return df
# Names exported by ``from <module> import *``; only the loader class is public.
__all__ = ["ReviewDatasetLoader"]