import os

import pandas as pd


class DataLoader:
    """Load a book CSV and build one combined text record per book.

    The loader reads three columns from the CSV: ``book_name``,
    ``summaries`` and ``categories``.
    """

    # Index distance between the two rows that are treated as a duplicate
    # pair in ``load_data``.
    # NOTE(review): presumably reflects how the source dataset was assembled
    # (repeated entries landing 9 rows apart) -- confirm with the data owner.
    _DUPLICATE_OFFSET = 9

    def __init__(self, data_path: str):
        """Remember the CSV location; nothing is read until ``load_data``."""
        self.data_path = data_path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV and drop unusable and duplicated rows.

        Rows without a summary are dropped and the index is renumbered.
        Then, inside each ``book_name`` group, two rows whose index labels
        differ by exactly ``_DUPLICATE_OFFSET`` and whose ``categories`` and
        ``summaries`` are equal are both removed, provided the book has at
        least one row besides that pair.

        Returns:
            The filtered frame with a fresh 0..n-1 index.

        Raises:
            FileNotFoundError: if ``data_path`` does not exist.
        """
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Dataset not found at {self.data_path}")

        df = pd.read_csv(self.data_path)
        df = df.dropna(subset=["summaries"]).reset_index(drop=True)

        to_remove = set()
        for _, group in df.groupby("book_name"):
            # A pair is only removed when the book keeps some other row, i.e.
            # when the group holds at least three rows. Smaller groups can
            # therefore be skipped without inspecting them.
            if len(group) < 3:
                continue
            to_remove.update(self._duplicate_pair_labels(group))

        return df.drop(index=sorted(to_remove)).reset_index(drop=True)

    @classmethod
    def _duplicate_pair_labels(cls, group: pd.DataFrame) -> set:
        """Return index labels of rows in *group* that form duplicate pairs."""
        labels = set(group.index)
        matched = set()
        for first in group.index:
            # Look the partner up directly instead of scanning every row pair.
            second = first + cls._DUPLICATE_OFFSET
            if second not in labels:
                continue
            # Missing values never compare equal, so rows with a NaN category
            # are never treated as duplicates.
            if (
                group.at[first, "categories"] == group.at[second, "categories"]
                and group.at[first, "summaries"] == group.at[second, "summaries"]
            ):
                matched.update((first, second))
        return matched

    @staticmethod
    def _join_unique_categories(values: pd.Series) -> str:
        """Join the distinct non-null categories in first-seen order."""
        present = values.dropna().astype(str)
        # dict.fromkeys de-duplicates while keeping insertion order, which
        # makes the result reproducible (a plain set is ordered by string
        # hash, and that changes between interpreter runs).
        return ", ".join(dict.fromkeys(present))

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Collapse *df* to one row per book and add a ``combined_text`` column.

        For each ``book_name`` the first non-null summary is kept and the
        distinct categories are joined with ``", "`` in order of first
        appearance; missing categories are ignored. The input frame is not
        modified.

        Returns:
            A new frame with ``book_name``, ``summaries``, ``categories`` and
            ``combined_text`` columns, one row per book.
        """
        per_book = (
            df.groupby("book_name")
            .agg(
                {
                    "summaries": "first",
                    "categories": self._join_unique_categories,
                }
            )
            .reset_index()
        )
        per_book["combined_text"] = (
            "Summary of the book: "
            + per_book["summaries"].fillna("")
            + " "
            + "Categories/Genre of the book: "
            + per_book["categories"].fillna("")
        )
        return per_book