File size: 3,632 Bytes
cdb73a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import logging
import os
import sys

import pandas as pd

# Prepend the parent of this file's directory to the module search path.
# NOTE(review): presumably so project modules resolve when this file is run
# directly as a script; no import in the visible code depends on it — confirm.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

# Configure root logging at INFO level with a "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Module-level logger named after this module; used by the functions below.
logger = logging.getLogger(__name__)


# Goodreads export column name -> column name written to the prepared CSV.
_GOODREADS_COLUMN_MAP = {
    "Book": "title",
    "Author": "authors",
    "Avg_Rating": "rating",
    "Description": "description",
    "Genres": "genres",
}

# Columns (and their order) written to the prepared CSV.
_PREPARED_COLUMNS = ["title", "authors", "genres", "description", "tags", "rating"]


def _reshape_goodreads_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Rename Goodreads columns, add an empty 'tags' column and keep the prepared columns.

    Raises:
        ValueError: if any of the prepared columns is missing after renaming.
    """
    # Any unnamed column is a leftover index written by a previous to_csv call.
    index_like = [col for col in df.columns if "Unnamed" in str(col) or col == ""]
    if index_like:
        df = df.drop(columns=index_like)
        logger.info("Dropped unnamed index column")

    logger.info("Renaming columns...")
    df = df.rename(columns=_GOODREADS_COLUMN_MAP)

    df["tags"] = ""
    logger.info("Created 'tags' column (empty - can be populated later)")

    missing_cols = [col for col in _PREPARED_COLUMNS if col not in df.columns]
    if missing_cols:
        logger.error(f"Missing columns after mapping: {missing_cols}")
        logger.error(f"Available columns: {df.columns.tolist()}")
        raise ValueError(f"Column mapping failed. Missing: {missing_cols}")

    # Selecting by list also drops every column not listed (Num_Ratings, URL, ...).
    return df[_PREPARED_COLUMNS]


def _log_dataset_summary(df: pd.DataFrame) -> None:
    """Log row count, rating coverage, description coverage and the mean rating."""
    # Coerce so a stray non-numeric rating cannot crash the summary after the
    # file has already been written; such values count as missing here.
    ratings = pd.to_numeric(df["rating"], errors="coerce")

    # Empty CSV cells are read as NaN, and NaN != "" is True, so a plain
    # inequality would count missing descriptions as present.
    descriptions = df["description"]
    has_description = descriptions.notna() & descriptions.astype(str).str.strip().ne("")

    mean_rating = ratings.mean() if ratings.notna().any() else float("nan")

    logger.info("\n Dataset Summary:")
    logger.info(f"  Total books: {len(df)}")
    logger.info(f"  Books with ratings: {int(ratings.notna().sum())}")
    logger.info(f"  Books with descriptions: {int(has_description.sum())}")
    logger.info(f"  Average rating: {mean_rating:.2f}")


def prepare_goodreads_data(
    input_path: str = "data/raw/goodreads_data.csv",
    output_path: str = "data/raw/books_prepared.csv",
) -> pd.DataFrame:
    """
    Convert a Goodreads CSV to the column layout written to the prepared CSV.

    Maps:
    - Book → title
    - Author → authors
    - Avg_Rating → rating
    - Description → description
    - Genres → genres
    - Adds an empty 'tags' column (not derived from genres)
    - Drops every other column (e.g. Unnamed: 0, Num_Ratings, URL)

    Args:
        input_path: Path of the Goodreads CSV to read.
        output_path: Path of the prepared CSV to write; its parent directory
            is created when it does not exist.

    Returns:
        The prepared DataFrame (the same data that was written to output_path).

    Raises:
        FileNotFoundError: if input_path does not exist.
        ValueError: if an expected Goodreads column is missing.
    """
    logger.info(f"Loading Goodreads data from {input_path}...")

    # Reading is isolated so that the "file not found" hint can only refer to
    # the input file, never to a failure while writing the output.
    try:
        df = pd.read_csv(input_path)
    except FileNotFoundError:
        logger.error(f"File not found: {input_path}")
        logger.error(
            f"Make sure {os.path.basename(input_path)} exists in "
            f"{os.path.dirname(input_path) or '.'}/"
        )
        raise
    except Exception as e:
        logger.error(f"Error processing data: {e}")
        raise

    try:
        logger.info(f"Loaded {len(df)} books")
        logger.info(f"Original columns: {df.columns.tolist()}")

        df = _reshape_goodreads_frame(df)

        logger.info(f"Final shape: {df.shape}")
        logger.info(f"Final columns: {df.columns.tolist()}")
        # A header-only CSV has no first row to show.
        if not df.empty:
            logger.info(f"Sample row:\n{df.iloc[0]}")

        # 'tags' is always "" (never null), so it must be left out of the check,
        # otherwise no row could ever be reported as completely null.
        source_columns = [col for col in _PREPARED_COLUMNS if col != "tags"]
        null_rows = int(df[source_columns].isnull().all(axis=1).sum())
        if null_rows > 0:
            logger.warning(f"Found {null_rows} completely null rows - will be removed by processor")

        logger.info(f"Saving prepared data to {output_path}...")
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_path, index=False)
        logger.info(f" Successfully prepared {len(df)} books")

        _log_dataset_summary(df)
    except Exception as e:
        logger.error(f"Error processing data: {e}")
        raise

    print("\n Data preparation complete!")
    print(f"   Input:  {input_path}")
    print(f"   Output: {output_path}")
    print("\n Next steps:")
    print("   1. Update src/config.py line 17:")
    print(f"      RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, '{os.path.basename(output_path)}')")
    print("   2. Run: python src/data_processor.py")
    print("   3. Run: python src/embedder.py")
    print("   4. Run: streamlit run app.py")

    return df


# Run the Goodreads preparation step when this file is executed directly (not on import).
if __name__ == "__main__":
    prepare_goodreads_data()