File size: 1,691 Bytes
8807f0d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | import pandas as pd
import os
class DataLoader:
    """Load a book dataset from CSV and build one text record per book.

    The CSV is expected to provide the columns ``book_name``, ``summaries``
    and ``categories``; those are the only columns this class reads.
    """

    # Index distance at which two rows of the same book are treated as a
    # repeated pair by ``load_data``.
    # NOTE(review): the reason for a gap of exactly 9 is not visible in this
    # file; presumably an artifact of how the dataset was assembled - confirm.
    DUPLICATE_INDEX_GAP = 9

    def __init__(self, data_path: str):
        """Remember the path of the CSV file; nothing is read yet."""
        self.data_path = data_path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV, drop rows without a summary, and prune repeated pairs.

        After rows with a missing ``summaries`` value are dropped and the
        index is renumbered, two rows of the same book are removed together
        when their index labels differ by exactly ``DUPLICATE_INDEX_GAP``,
        their ``categories`` and ``summaries`` are equal, and the book still
        has at least one other row left.

        Returns:
            The cleaned frame with a fresh 0..n-1 index.

        Raises:
            FileNotFoundError: if ``data_path`` does not exist.
        """
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Dataset not found at {self.data_path}")

        df = pd.read_csv(self.data_path)
        df = df.dropna(subset=["summaries"]).reset_index(drop=True)

        gap = self.DUPLICATE_INDEX_GAP
        to_remove = set()
        for _, group in df.groupby("book_name"):
            # A pair is only removed when the book keeps at least one other
            # row, i.e. when the group holds three or more rows.
            if len(group) < 3:
                continue
            labels = set(group.index)
            for i in group.index:
                j = i + gap
                # Direct lookup of the partner label replaces scanning every
                # pair of rows in the group.
                if j not in labels:
                    continue
                row_i, row_j = group.loc[i], group.loc[j]
                # NaN never compares equal, so rows with missing categories
                # are never treated as a repeated pair.
                if (
                    row_i["categories"] == row_j["categories"]
                    and row_i["summaries"] == row_j["summaries"]
                ):
                    to_remove.update((i, j))

        return df.drop(index=sorted(to_remove)).reset_index(drop=True)

    @staticmethod
    def _join_unique_categories(values: pd.Series) -> str:
        """Join the distinct non-missing values with ', ' in first-seen order."""
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # result does not depend on string hash randomization.
        return ", ".join(dict.fromkeys(values.dropna().astype(str)))

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Collapse the frame to one row per book and add ``combined_text``.

        For each ``book_name`` the first non-missing summary is kept and the
        distinct categories are joined into a single comma-separated string.
        Missing category values are ignored.

        Returns:
            A frame with columns ``book_name``, ``summaries``, ``categories``
            and ``combined_text``.
        """
        df = df.groupby("book_name").agg({
            "summaries": "first",
            "categories": self._join_unique_categories,
        }).reset_index()
        df["combined_text"] = (
            "Summary of the book: " + df["summaries"].fillna("") + " " +
            "Categories/Genre of the book: " + df["categories"].fillna("")
        )
        return df
|