Spaces:
Running
Running
File size: 3,632 Bytes
cdb73a8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | import logging
import os
import sys
import pandas as pd
# Put the parent of this script's directory on the import path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def prepare_goodreads_data(
    input_path: str = "data/raw/goodreads_data.csv",
    output_path: str = "data/raw/books_prepared.csv",
) -> pd.DataFrame:
    """Convert a Goodreads CSV export into the prepared books CSV.

    Column mapping:
        - Book        -> title
        - Author      -> authors
        - Avg_Rating  -> rating
        - Description -> description
        - Genres      -> genres

    An empty ``tags`` column is added (it is not derived from genres), and
    every column other than title, authors, genres, description, tags and
    rating is dropped (e.g. the unnamed index column, Num_Ratings, URL).

    Args:
        input_path: Path of the Goodreads CSV to read.
        output_path: Path the prepared CSV is written to. Its parent
            directory is created when it does not exist yet.

    Returns:
        The prepared DataFrame, exactly as written to ``output_path``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If a required column is missing after renaming.
    """
    logger.info(f"Loading Goodreads data from {input_path}...")

    try:
        df = pd.read_csv(input_path)
        logger.info(f"Loaded {len(df)} books")
        logger.info(f"Original columns: {df.columns.tolist()}")

        # Index columns saved without a header are read back as "Unnamed: N".
        unnamed_cols = [col for col in df.columns if "Unnamed" in str(col) or col == ""]
        if unnamed_cols:
            df = df.drop(columns=unnamed_cols)
            logger.info("Dropped unnamed index column")

        logger.info("Renaming columns...")
        df = df.rename(
            columns={
                "Book": "title",
                "Author": "authors",
                "Avg_Rating": "rating",
                "Description": "description",
                "Genres": "genres",
            }
        )

        df["tags"] = ""
        logger.info("Created 'tags' column (empty - can be populated later)")

        columns_to_keep = ["title", "authors", "genres", "description", "tags", "rating"]
        missing_cols = [col for col in columns_to_keep if col not in df.columns]
        if missing_cols:
            logger.error(f"Missing columns after mapping: {missing_cols}")
            logger.error(f"Available columns: {df.columns.tolist()}")
            raise ValueError(f"Column mapping failed. Missing: {missing_cols}")

        df = df[columns_to_keep]
        logger.info(f"Final shape: {df.shape}")
        logger.info(f"Final columns: {df.columns.tolist()}")

        # A header-only input has no first row; iloc[0] would raise IndexError.
        if df.empty:
            logger.warning("Input contains no rows - no sample row to show")
        else:
            logger.info(f"Sample row:\n{df.iloc[0]}")

        null_rows = df.isnull().all(axis=1).sum()
        if null_rows > 0:
            logger.warning(f"Found {null_rows} completely null rows - will be removed by processor")

        logger.info(f"Saving prepared data to {output_path}...")
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_path, index=False)
        logger.info(f" Successfully prepared {len(df)} books")

        # The summary is diagnostic only and runs after the file is saved,
        # so it must not fail on missing or non-numeric values.
        descriptions = df["description"]
        # NaN != "" is True, so missing values must be excluded explicitly.
        described = (descriptions.notna() & (descriptions != "")).sum()
        ratings = pd.to_numeric(df["rating"], errors="coerce")
        average_rating = ratings.mean() if ratings.notna().any() else float("nan")

        logger.info("\n Dataset Summary:")
        logger.info(f" Total books: {len(df)}")
        logger.info(f" Books with ratings: {df['rating'].notna().sum()}")
        logger.info(f" Books with descriptions: {described}")
        logger.info(f" Average rating: {average_rating:.2f}")

        print("\n Data preparation complete!")
        print(f" Input: {input_path}")
        print(f" Output: {output_path}")
        print("\n Next steps:")
        print(" 1. Update src/config.py line 17:")
        print(" RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, 'books_prepared.csv')")
        print(" 2. Run: python src/data_processor.py")
        print(" 3. Run: python src/embedder.py")
        print(" 4. Run: streamlit run app.py")

        return df

    except FileNotFoundError:
        logger.error(f"File not found: {input_path}")
        logger.error(
            f"Make sure {os.path.basename(input_path)} exists in "
            f"{os.path.dirname(input_path) or '.'}/"
        )
        raise
    except Exception as e:
        logger.error(f"Error processing data: {e}")
        raise
# Run the conversion only when this file is executed directly, not on import.
if __name__ == "__main__":
    prepare_goodreads_data()
|