Spaces:
Sleeping
Sleeping
File size: 1,545 Bytes
fe617ac 3f281f1 fe617ac 3f281f1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | import csv
import logging
from pathlib import Path
import pandas as pd
logger = logging.getLogger(__name__)


def run(
    books_path: Path = Path("data/books_data.csv"),
    ratings_path: Path = Path("data/Books_rating.csv"),
    output_path: Path = Path("data/books_basic_info.csv"),
) -> None:
    """Build the books basic-info CSV from the raw books and ratings files.

    Callable from Pipeline.

    The books file supplies the descriptive columns; the ratings file
    supplies, per title, the ``Id`` of the first rating row (exported as
    ``isbn10``) and the mean of all ``review/score`` values for that title
    (exported as ``average_rating``). Books without any rating row keep
    empty ``isbn10`` / ``average_rating`` cells. An ``isbn13`` column is
    added and left empty.

    Args:
        books_path: CSV with at least the columns ``Title``, ``description``,
            ``authors``, ``image``, ``publisher``, ``publishedDate`` and
            ``categories``.
        ratings_path: CSV with at least the columns ``Title``, ``Id`` and
            ``review/score``.
        output_path: Destination CSV. Missing parent directories are created.

    Raises:
        KeyError: If a required column is missing from either input file.
    """
    # Both inputs are parsed leniently: malformed lines are skipped rather
    # than aborting the whole run.
    read_options = {
        "engine": "python",
        "quotechar": '"',
        "escapechar": "\\",
        "on_bad_lines": "skip",
    }
    books_data = pd.read_csv(str(books_path), **read_options)
    ratings = pd.read_csv(str(ratings_path), **read_options)

    books_cols = [
        "Title",
        "description",
        "authors",
        "image",
        "publisher",
        "publishedDate",
        "categories",
    ]
    books_data = books_data[books_cols]

    ratings = ratings[["Title", "Id", "review/score"]].copy()
    # Non-numeric scores become NaN so they are ignored by the mean instead
    # of making the aggregation fail.
    ratings["review/score"] = pd.to_numeric(ratings["review/score"], errors="coerce")
    # Average over *all* reviews of a title. Keeping only the first review's
    # score (as a plain drop_duplicates would) is not an average rating.
    mean_score_by_title = ratings.groupby("Title")["review/score"].mean()
    # One row per title; the first row decides which Id is kept.
    ratings = ratings.drop_duplicates(subset=["Title"]).copy()
    ratings["review/score"] = ratings["Title"].map(mean_score_by_title)

    merged = books_data.merge(ratings, on="Title", how="left")
    merged = merged.rename(
        columns={
            "Id": "isbn10",
            "Title": "title",
            "review/score": "average_rating",
        }
    )
    merged["isbn13"] = None

    # Make sure the destination directory exists before writing.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(
        str(output_path),
        index=False,
        quoting=csv.QUOTE_ALL,
        quotechar='"',
        escapechar="\\",
    )
    logger.info("Saved %s", output_path)
if __name__ == "__main__":
    # Script entry point: show INFO-level log output, then build the CSV
    # using the default input and output paths.
    logging.basicConfig(level=logging.INFO)
    run()
|