Spaces:
Sleeping
Sleeping
| import csv | |
| import logging | |
| from pathlib import Path | |
| import pandas as pd | |
logger = logging.getLogger(__name__)


def run(
    books_path: Path = Path("data/books_data.csv"),
    ratings_path: Path = Path("data/Books_rating.csv"),
    output_path: Path = Path("data/books_basic_info.csv"),
) -> None:
    """Build the books basic-info CSV from the raw books and ratings CSVs.

    Reads both inputs, keeps the book metadata columns, attaches one ``Id``
    and the mean ``review/score`` per title from the ratings file, renames the
    columns to the output schema and writes the result. Callable from Pipeline.

    Args:
        books_path: CSV containing at least the columns ``Title``,
            ``description``, ``authors``, ``image``, ``publisher``,
            ``publishedDate`` and ``categories``.
        ratings_path: CSV containing at least the columns ``Title``, ``Id``
            and ``review/score``; it may hold several rows per title.
        output_path: Destination CSV. Missing parent directories are created.

    Raises:
        KeyError: If a required column is missing from either input file.
    """
    # Both inputs are parsed with the same settings; malformed rows are
    # skipped instead of aborting the whole run.
    read_options = {
        "engine": "python",
        "quotechar": '"',
        "escapechar": "\\",
        "on_bad_lines": "skip",
    }
    books_data = pd.read_csv(str(books_path), **read_options)
    ratings = pd.read_csv(str(ratings_path), **read_options)

    books_cols = [
        "Title",
        "description",
        "authors",
        "image",
        "publisher",
        "publishedDate",
        "categories",
    ]
    books_data = books_data[books_cols]
    ratings = ratings[["Title", "Id", "review/score"]]

    # The output column is called "average_rating", so average every score
    # recorded for a title rather than publishing whichever row came first.
    # Non-numeric scores are coerced to NaN and ignored by the mean.
    scores = pd.to_numeric(ratings["review/score"], errors="coerce")
    mean_score_by_title = scores.groupby(ratings["Title"]).mean()

    # One row per title; the Id is taken from the first row seen for it.
    first_ids = ratings[["Title", "Id"]].drop_duplicates(subset=["Title"])
    ratings = first_ids.assign(
        **{"review/score": first_ids["Title"].map(mean_score_by_title)}
    )

    merged = books_data.merge(ratings, on="Title", how="left")
    # Only the columns whose names actually change are listed here.
    merged = merged.rename(
        columns={
            "Id": "isbn10",
            "Title": "title",
            "review/score": "average_rating",
        }
    )
    # No ISBN-13 is available in either input; the column is emitted empty.
    merged["isbn13"] = None

    # Writing into a directory that does not exist yet would fail otherwise.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(
        str(destination),
        index=False,
        quoting=csv.QUOTE_ALL,
        quotechar='"',
        escapechar="\\",
    )
    logger.info("Saved %s", output_path)
# Script entry point: turn on INFO-level logging so the "Saved ..." message is
# visible, then build the CSV using the default input and output paths.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run()