File size: 2,434 Bytes
cdb73a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import logging
import os
import sys
import pandas as pd

# Configure logging at import time so progress messages carry a timestamp and level.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def prepare_100k_data(
    input_path: str = "data/raw/GoodReads_100k_books.csv",
    output_path: str = "data/raw/books_prepared.csv",
) -> None:
    """
    Convert GoodReads_100k_books.csv format to the format expected by data_processor.

    Input Columns: author, bookformat, desc, genre, img, isbn, isbn13, link, pages, rating, reviews, title, totalratings
    Target Columns: title, authors, genres, description, tags, rating, cover_image_url

    The ``tags`` column is a copy of ``genres`` with missing values replaced by
    empty strings. Rows without a title are dropped; missing values in the
    remaining text columns are replaced by empty strings. Any target column
    absent from the source is created as an empty-string column and a warning
    is logged.

    Args:
        input_path: CSV file to read. Defaults to the raw GoodReads 100k file.
        output_path: CSV file to write. Its parent directory is created when
            it does not exist yet.

    Returns:
        None. When ``input_path`` does not exist an error is logged and the
        function returns without writing anything.

    Raises:
        Exception: Any error raised while reading, transforming or writing the
            data is logged and re-raised unchanged.
    """
    logger.info(f"Loading new 100k dataset from {input_path}...")

    if not os.path.exists(input_path):
        logger.error(f"Input file not found: {input_path}")
        return

    try:
        # Load data
        df = pd.read_csv(input_path)
        logger.info(f"Loaded {len(df)} books.")

        # Rename columns ("rating" and "title" already carry their target names)
        logger.info("Mapping columns...")
        df = df.rename(columns={
            "author": "authors",
            "desc": "description",
            "genre": "genres",
            "img": "cover_image_url",
        })

        target_cols = ["title", "authors", "genres", "description", "tags", "rating", "cover_image_url"]

        # Ensure every source-derived column exists BEFORE deriving "tags":
        # reading df["genres"] on a file without a genre column would raise
        # KeyError and defeat this fallback.
        for col in target_cols:
            if col != "tags" and col not in df.columns:
                df[col] = ""
                logger.warning(f"Column {col} missing in source, filled with empty strings.")

        # Create tags column (using genres as base if available, else empty)
        df["tags"] = df["genres"].fillna("")

        # Select and reorder
        df = df[target_cols]

        # Clean up
        logger.info("Cleaning data...")
        # Remove rows with no title
        df = df.dropna(subset=["title"])
        # Fill NaNs in text columns; the dict form returns a new frame, so no
        # assignment into a possibly-shared slice is needed.
        text_cols = ["authors", "genres", "description", "cover_image_url"]
        df = df.fillna({col: "" for col in text_cols})

        logger.info(f"Saving prepared data to {output_path}...")
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(output_path, index=False)
        logger.info(f"Successfully prepared {len(df)} books.")

    except Exception as e:
        logger.error(f"Error processing data: {e}")
        raise

# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    prepare_100k_data()