File size: 1,545 Bytes
fe617ac
3f281f1
 
 
 
fe617ac
3f281f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import csv
import logging
from pathlib import Path

import pandas as pd

logger = logging.getLogger(__name__)


def run(
    books_path: Path = Path("data/books_data.csv"),
    ratings_path: Path = Path("data/Books_rating.csv"),
    output_path: Path = Path("data/books_basic_info.csv"),
) -> None:
    """Build the books basic-info CSV from the raw books and ratings CSVs.

    Reads both inputs, keeps the descriptive book columns, attaches one
    identifier and the mean review score per title, and writes the result
    with every field quoted. Callable from Pipeline.

    Args:
        books_path: CSV with at least the columns Title, description,
            authors, image, publisher, publishedDate and categories.
        ratings_path: CSV with at least the columns Title, Id and
            review/score; it may hold several rows per title.
        output_path: Destination CSV. Missing parent directories are created.

    Raises:
        KeyError: If a required column is absent from either input.
    """
    # Both inputs are parsed with the same tolerant settings: malformed rows
    # are skipped instead of aborting the whole run.
    read_opts = {
        "engine": "python",
        "quotechar": '"',
        "escapechar": "\\",
        "on_bad_lines": "skip",
    }
    books_data = pd.read_csv(str(books_path), **read_opts)
    ratings = pd.read_csv(str(ratings_path), **read_opts)

    books_cols = [
        "Title",
        "description",
        "authors",
        "image",
        "publisher",
        "publishedDate",
        "categories",
    ]
    books_data = books_data[books_cols]

    # Collapse the ratings to one row per title. The score is averaged over
    # all rows of that title (the output column is named average_rating, so
    # keeping only the first row's score would be wrong); unparseable scores
    # become NaN and are left out of the mean. The Id is the first non-null
    # one seen for the title. groupby drops rows whose Title is missing, so
    # they can no longer be matched against books that also lack a title.
    ratings = ratings[["Title", "Id", "review/score"]].copy()
    ratings["review/score"] = pd.to_numeric(ratings["review/score"], errors="coerce")
    ratings = ratings.groupby("Title", as_index=False, sort=False).agg(
        {"Id": "first", "review/score": "mean"}
    )

    # Left join: every book row is kept, with or without ratings.
    merged = books_data.merge(ratings, on="Title", how="left")
    merged = merged.rename(
        columns={
            "Title": "title",
            "Id": "isbn10",
            "review/score": "average_rating",
        }
    )
    # Neither input provides an ISBN-13; the column is emitted empty.
    merged["isbn13"] = None

    # Make sure the destination directory exists before writing.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(
        str(output_path),
        index=False,
        quoting=csv.QUOTE_ALL,
        quotechar='"',
        escapechar="\\",
    )
    logger.info("Saved %s", output_path)


if __name__ == "__main__":
    # Script entry point: enable INFO-level logging so the "Saved ..." message
    # emitted by run() is shown, then build the output using run()'s default
    # paths.
    logging.basicConfig(level=logging.INFO)
    run()