File size: 1,213 Bytes
ab62db9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import pandas as pd
from utils import clean_text


def _join_unique_tags(tags):
    """Join the distinct non-null values of *tags* with ", " in first-seen order."""
    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # make the joined string depend on hash ordering and differ between runs.
    # astype(str) keeps str.join from failing on non-string category values.
    unique_tags = dict.fromkeys(tags.dropna().astype(str))
    return ", ".join(unique_tags)


def load_and_clean_data(data_path, cleaned_data_path) -> pd.DataFrame:
    """
    Load the dataset, merge categories per book, and build a cleaned text field.

    Rows sharing the same ('book_name', 'summaries') pair are collapsed into a
    single row whose 'categories' value is the ", "-joined list of that pair's
    distinct, non-null categories (in order of first appearance). A
    'combined_text' column is then built from 'summaries' and 'categories' and
    passed through ``clean_text``.

    Args:
        data_path: Location of the input CSV, passed to ``pd.read_csv``. The
            file must provide 'book_name', 'summaries' and 'categories'
            columns.
        cleaned_data_path: Destination passed to ``DataFrame.to_csv`` for the
            processed frame (written without the index).

    Returns:
        The aggregated DataFrame with columns 'book_name', 'summaries',
        'categories' and 'combined_text'.
    """
    # Load the dataset
    books_df = pd.read_csv(data_path)
    print(f"Original dataset shape: {books_df.shape}")

    # Group by 'book_name' and 'summaries', merging each group's 'categories'.
    # NOTE: with pandas' default groupby(dropna=True), rows whose 'book_name'
    # or 'summaries' is NaN are left out of the result.
    books_df = books_df.groupby(["book_name", "summaries"], as_index=False).agg(
        {"categories": _join_unique_tags}
    )

    # The group keys are unique after the aggregation, so no separate
    # drop_duplicates pass over ('book_name', 'summaries') is needed.
    print(f"After aggregating categories and removing duplicates: {books_df.shape}")

    # Combine 'summaries' and 'categories' into a single text field
    # (fillna is a defensive guard so a missing value cannot turn the whole
    # concatenation into NaN).
    books_df["combined_text"] = (
        books_df["summaries"].fillna("") + " " + books_df["categories"].fillna("")
    )

    # Clean the combined text
    books_df["combined_text"] = books_df["combined_text"].apply(clean_text)

    # Save the cleaned dataset
    books_df.to_csv(cleaned_data_path, index=False)
    print(f"Cleaned dataset saved to: {cleaned_data_path}")

    return books_df