book-rec-with-LLMs / scripts /data /build_books_basic_info.py
ymlin105's picture
chore: remove legacy files and scripts no longer part of the main architecture
3f281f1
import csv
import logging
from pathlib import Path
import pandas as pd
logger = logging.getLogger(__name__)


def run(
    books_path: Path = Path("data/books_data.csv"),
    ratings_path: Path = Path("data/Books_rating.csv"),
    output_path: Path = Path("data/books_basic_info.csv"),
) -> None:
    """Build the books basic-info CSV from the raw books and ratings files.

    Callable from Pipeline.

    The books file supplies the descriptive columns; the ratings file supplies
    an ``Id`` (written out as ``isbn10``) and ``review/score`` values. Scores
    are averaged per title so that ``average_rating`` is a real mean rather
    than the score of whichever ratings row happened to come first.

    Args:
        books_path: CSV containing at least the columns ``Title``,
            ``description``, ``authors``, ``image``, ``publisher``,
            ``publishedDate`` and ``categories``.
        ratings_path: CSV containing at least the columns ``Title``, ``Id``
            and ``review/score``; it may hold several rows per title.
        output_path: Destination CSV. Missing parent directories are created.

    Raises:
        KeyError: If a required column is missing from either input file.
    """
    # Both raw files are read with the same tolerant settings; malformed lines
    # are skipped instead of aborting the whole build.
    read_opts = {
        "engine": "python",
        "quotechar": '"',
        "escapechar": "\\",
        "on_bad_lines": "skip",
    }
    books_data = pd.read_csv(str(books_path), **read_opts)
    ratings = pd.read_csv(str(ratings_path), **read_opts)

    books_cols = [
        "Title",
        "description",
        "authors",
        "image",
        "publisher",
        "publishedDate",
        "categories",
    ]
    books_data = books_data[books_cols]

    ratings = ratings[["Title", "Id", "review/score"]].copy()
    # Coerce unparseable scores to NaN so they are left out of the mean
    # instead of breaking the aggregation.
    ratings["review/score"] = pd.to_numeric(ratings["review/score"], errors="coerce")
    # One row per title: first available Id, mean of all scores. Ratings rows
    # whose Title is missing are excluded by groupby's default behaviour.
    per_title = ratings.groupby("Title", sort=False, as_index=False).agg(
        {"Id": "first", "review/score": "mean"}
    )

    # Left join keeps every book, including those without any rating.
    merged = books_data.merge(per_title, on="Title", how="left")
    merged = merged.rename(
        columns={
            "Id": "isbn10",
            "Title": "title",
            "review/score": "average_rating",
        }
    )
    # Neither input provides an ISBN-13; the column is emitted empty.
    merged["isbn13"] = None

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(
        str(output_path),
        index=False,
        quoting=csv.QUOTE_ALL,
        quotechar='"',
        escapechar='\\',
    )
    logger.info("Saved %s", output_path)
if __name__ == "__main__":
    # Run as a script: configure root logging at INFO level, then build the
    # output file using run()'s default input and output paths.
    logging.basicConfig(level=logging.INFO)
    run()