File size: 1,691 Bytes
8807f0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import os

class DataLoader:
    """Load a book dataset from CSV and prepare one text record per book.

    The CSV is expected to provide the columns ``book_name``, ``summaries``
    and ``categories``; those are the only columns this class reads.
    """

    # Row-label distance at which two rows of the same book are treated as
    # duplicates of each other.
    # NOTE(review): presumably an artifact of how the dataset was collected;
    # confirm the meaning of 9 with the data owner.
    _DUPLICATE_INDEX_GAP = 9

    def __init__(self, data_path: str):
        """Store the path of the CSV file; nothing is read until ``load_data``."""
        self.data_path = data_path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV, drop rows without a summary and remove offset duplicates.

        Two rows form an "offset duplicate" when they belong to the same
        book, sit exactly ``_DUPLICATE_INDEX_GAP`` rows apart (counted after
        rows without a summary were dropped) and have equal ``categories``
        and ``summaries``. Both rows of such a pair are removed, but only if
        the book keeps at least one other row.

        Returns:
            The cleaned frame with a fresh 0..n-1 index.

        Raises:
            FileNotFoundError: If ``data_path`` does not exist.
        """
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Dataset not found at {self.data_path}")

        df = pd.read_csv(self.data_path)
        df = df.dropna(subset=["summaries"]).reset_index(drop=True)

        to_remove = self._find_offset_duplicates(df)
        return df.drop(index=sorted(to_remove)).reset_index(drop=True)

    def _find_offset_duplicates(self, df: pd.DataFrame) -> set:
        """Return the index labels of all rows that are part of an offset duplicate pair."""
        gap = self._DUPLICATE_INDEX_GAP
        to_remove = set()

        for _, group in df.groupby("book_name"):
            # A pair is only removed when the book still has another row
            # left afterwards, so books with fewer than three rows can never
            # lose anything.
            if len(group) < 3:
                continue

            labels = set(group.index)
            for i in group.index:
                # Only the row exactly `gap` labels further down can pair
                # with row i, so look it up directly instead of scanning
                # every pair of rows in the group.
                j = i + gap
                if j not in labels:
                    continue
                if (
                    df.at[i, "categories"] == df.at[j, "categories"]
                    and df.at[i, "summaries"] == df.at[j, "summaries"]
                ):
                    to_remove.update((i, j))

        return to_remove

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Collapse the frame to one row per book and build ``combined_text``.

        For each ``book_name`` the first non-null summary is kept and the
        distinct categories are joined with ``", "``. Categories are sorted
        so the result does not depend on set iteration order, and missing
        categories are skipped instead of breaking the join.

        Returns:
            A frame with the columns ``book_name``, ``summaries``,
            ``categories`` and ``combined_text``.
        """
        df = df.groupby("book_name").agg({
            "summaries": "first",
            "categories": self._join_categories,
        }).reset_index()

        df["combined_text"] = (
            "Summary of the book: " + df["summaries"].fillna("") + " " +
            "Categories/Genre of the book: " + df["categories"].fillna("")
        )

        return df

    @staticmethod
    def _join_categories(values: pd.Series) -> str:
        """Join the distinct non-null values of *values* in sorted order."""
        return ", ".join(sorted({str(value) for value in values.dropna()}))