# Prepare a Goodreads CSV export for the data-processing pipeline.
import logging
import os
import sys

import pandas as pd

# Make the project root importable when this script is run directly
# (e.g. `python scripts/prepare_goodreads.py`).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def prepare_goodreads_data(
    input_path="data/raw/goodreads_data.csv",
    output_path="data/raw/books_prepared.csv",
):
    """
    Convert Goodreads CSV format to the format expected by data_processor.

    Maps:
        - Book → title
        - Author → authors
        - Avg_Rating → rating
        - Description → description
        - Genres → genres
        - Creates 'tags' from genres
        - Drops: Unnamed: 0, Num_Ratings, URL

    Args:
        input_path: CSV file exported from Goodreads.
        output_path: Where the prepared CSV is written.

    Returns:
        pandas.DataFrame: The prepared data (also saved to ``output_path``).

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If expected columns are missing after renaming.
    """
    # Same logger instance as the module-level one; bound locally so the
    # function is self-contained.
    log = logging.getLogger(__name__)
    log.info("Loading Goodreads data from %s...", input_path)
    try:
        df = pd.read_csv(input_path)
        log.info("Loaded %d books", len(df))
        log.info("Original columns: %s", df.columns.tolist())

        # Drop the exported index column, whether pandas named it
        # "Unnamed: 0" or it came through blank.
        if "Unnamed: 0" in df.columns or "" in df.columns:
            df = df.drop(
                columns=[col for col in df.columns if "Unnamed" in str(col) or col == ""]
            )
            log.info("Dropped unnamed index column")

        log.info("Renaming columns...")
        df = df.rename(
            columns={
                "Book": "title",
                "Author": "authors",
                "Avg_Rating": "rating",
                "Description": "description",
                "Genres": "genres",
            }
        )

        # Placeholder so downstream code always sees a 'tags' column.
        df["tags"] = ""
        log.info("Created 'tags' column (empty - can be populated later)")

        columns_to_keep = ["title", "authors", "genres", "description", "tags", "rating"]
        missing_cols = [col for col in columns_to_keep if col not in df.columns]
        if missing_cols:
            log.error("Missing columns after mapping: %s", missing_cols)
            log.error("Available columns: %s", df.columns.tolist())
            raise ValueError(f"Column mapping failed. Missing: {missing_cols}")
        df = df[columns_to_keep]

        log.info("Final shape: %s", df.shape)
        log.info("Final columns: %s", df.columns.tolist())
        # Guard: df.iloc[0] raises IndexError on an empty file.
        if not df.empty:
            log.info("Sample row:\n%s", df.iloc[0])

        null_rows = df.isnull().all(axis=1).sum()
        if null_rows > 0:
            log.warning(
                "Found %d completely null rows - will be removed by processor", null_rows
            )

        log.info("Saving prepared data to %s...", output_path)
        df.to_csv(output_path, index=False)
        log.info(" Successfully prepared %d books", len(df))

        log.info("\n Dataset Summary:")
        log.info(" Total books: %d", len(df))
        log.info(" Books with ratings: %d", df["rating"].notna().sum())
        # BUGFIX: in pandas, NaN != '' evaluates True, so the previous
        # (df['description'] != '').sum() counted missing descriptions too.
        # Treat NaN as "no description".
        log.info(
            " Books with descriptions: %d", (df["description"].fillna("") != "").sum()
        )
        log.info(" Average rating: %.2f", df["rating"].mean())

        print("\n Data preparation complete!")
        print(f" Input: {input_path}")
        print(f" Output: {output_path}")
        print("\n Next steps:")
        print(" 1. Update src/config.py line 17:")
        print(" RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, 'books_prepared.csv')")
        print(" 2. Run: python src/data_processor.py")
        print(" 3. Run: python src/embedder.py")
        print(" 4. Run: streamlit run app.py")
        return df
    except FileNotFoundError:
        log.error("File not found: %s", input_path)
        log.error("Make sure goodreads_data.csv exists in data/raw/")
        raise
    except Exception:
        # Log the full traceback before propagating to the caller.
        log.exception("Error processing data")
        raise
# Script entry point: run the preparation step when executed directly.
if __name__ == "__main__":
    prepare_goodreads_data()