| import pandas as pd |
| import os |
|
|
class DataLoader:
    """Load a book-summary CSV and prepare one text record per book.

    The CSV is expected to provide at least the columns ``book_name``,
    ``summaries`` and ``categories``; those are the only columns this class
    reads.
    """

    # Index distance between two rows that are treated as a duplicated pair.
    # NOTE(review): the value 9 is dataset-specific -- confirm where it comes
    # from before reusing this loader on another export.
    _DUPLICATE_INDEX_GAP: int = 9

    def __init__(self, data_path: str):
        """Remember the location of the CSV file; nothing is read yet.

        Args:
            data_path: Path of the CSV file to load.
        """
        self.data_path = data_path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV, drop rows without a summary and prune duplicate pairs.

        Two rows of the same book form a duplicate pair when their index
        labels (after the missing-summary rows were dropped and the index was
        reset) differ by exactly ``_DUPLICATE_INDEX_GAP`` and both their
        ``categories`` and ``summaries`` are equal. Both rows of such a pair
        are removed, but only when the book keeps at least one other row.

        Returns:
            The cleaned frame with a fresh 0..n-1 index.

        Raises:
            FileNotFoundError: If ``data_path`` does not exist.
        """
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Dataset not found at {self.data_path}")

        df = pd.read_csv(self.data_path)
        df = df.dropna(subset=["summaries"]).reset_index(drop=True)

        gap = self._DUPLICATE_INDEX_GAP
        to_remove = set()
        for _, group in df.groupby("book_name"):
            # A pair is only removed when another row of the same book stays
            # behind, i.e. the book needs at least three rows.
            if len(group) < 3:
                continue

            labels = set(group.index)
            categories = group["categories"]
            summaries = group["summaries"]
            for i in group.index:
                j = i + gap
                # Index labels are unique after reset_index, so the only
                # possible partner of row i is the row labelled i + gap.
                if j not in labels:
                    continue
                # NaN != NaN, so rows with missing categories never match.
                if (
                    categories.at[i] == categories.at[j]
                    and summaries.at[i] == summaries.at[j]
                ):
                    to_remove.update((i, j))

        return df.drop(index=sorted(to_remove)).reset_index(drop=True)

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Collapse the frame to one row per book and build ``combined_text``.

        For every ``book_name`` the first non-missing summary is kept and the
        distinct categories are joined with ``", "`` in order of first
        appearance. Missing categories are ignored, so a book without any
        category gets an empty string.

        Args:
            df: Frame with ``book_name``, ``summaries`` and ``categories``.

        Returns:
            A new frame with the columns ``book_name``, ``summaries``,
            ``categories`` and ``combined_text``.
        """

        def join_categories(values: pd.Series) -> str:
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # which keeps the output reproducible; dropna() avoids feeding a
            # float NaN to str.join.
            return ", ".join(dict.fromkeys(str(v) for v in values.dropna()))

        df = (
            df.groupby("book_name")
            .agg({"summaries": "first", "categories": join_categories})
            .reset_index()
        )

        df["combined_text"] = (
            "Summary of the book: " + df["summaries"].fillna("") + " " +
            "Categories/Genre of the book: " + df["categories"].fillna("")
        )

        return df
|
|