# Recommender / preprocessing / preprocess_books_50000.py
# (Hugging Face page header below kept as comments — it was scrape residue
#  and is not valid Python.)
# badminton001's picture — Update preprocessing/preprocess_books_50000.py — 5e5ed91 verified
"""
preprocessing/preprocess_books_50000.py
Preprocesses Open Library annotated book data:
- Tokenizes titles and uses subjects as a proxy for overview text (word_tokenize)
- Converts text to lowercase
- Removes English stopwords
- Keeps only alphabetic-only tokens
- Normalizes various tag fields (genres, mood, target_audience, era, decade, language)
Input:
- data/book/annotated/books_annotated_50000.json
Outputs:
- data/book/preprocessed/books_preprocessed_50000.json
- data/book/preprocessed/books_preprocessed_50000.csv
"""
import os
import json
import csv
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pathlib import Path
from typing import List, Dict, Any, Optional
# --- NLTK Downloads (uncomment on first run) ---
# NOTE(review): nltk.data.find raises LookupError when a resource is
# missing — nltk.downloader.DownloadError would never be raised here, so
# the guard below is corrected to catch LookupError.
# try:
#     nltk.data.find('tokenizers/punkt')
# except LookupError:
#     nltk.download('punkt')
# try:
#     nltk.data.find('corpora/stopwords')
# except LookupError:
#     nltk.download('stopwords')
# --- Global Stopword Set ---
# Built once at import time; raises LookupError if the NLTK 'stopwords'
# corpus has not been downloaded yet (see guard above).
STOPWORDS = set(stopwords.words("english"))
# Optional: Add custom stopwords relevant to book data if needed
# CUSTOM_STOPWORDS = {"book", "novel", "story", "chapter"}
# STOPWORDS.update(CUSTOM_STOPWORDS)
# --- File Paths Configuration ---
# All paths are resolved relative to the repository root (the parent of
# this script's directory), so the script works from any working directory.
# Input annotated books JSON file (updated for 50,000 records)
ANNOTATED_PATH = Path(__file__).parent.parent / "data" / "book" / "annotated" / "books_annotated_50000.json"
# Output preprocessed books JSON file (updated for 50,000 records)
OUT_JSON = Path(__file__).parent.parent / "data" / "book" / "preprocessed" / "books_preprocessed_50000.json"
# Output preprocessed books CSV file (for inspection, updated for 50,000 records)
OUT_CSV = Path(__file__).parent.parent / "data" / "book" / "preprocessed" / "books_preprocessed_50000.csv"
# Ensure output directory exists (JSON and CSV share the same parent dir)
OUT_JSON.parent.mkdir(parents=True, exist_ok=True)
# --- Text Cleaning Function ---
def clean_text(text: Optional[str]) -> List[str]:
    """
    Normalize raw text into a list of lowercase content words.

    Steps:
    - Treats None as the empty string
    - Lowercases the input
    - Blanks out URLs, HTML tags, and any character outside [a-z]/whitespace
    - Tokenizes with NLTK's word_tokenize
    - Drops tokens that are non-alphabetic or English stopwords
    """
    cleaned = (text or "").lower()
    # Apply the scrubbing passes in the same order as before: URLs first,
    # then HTML tags, then everything that is not a lowercase letter or space.
    for pattern in (r"http\S+", r"<[^>]+>", r"[^a-z\s]"):
        cleaned = re.sub(pattern, " ", cleaned)
    words = word_tokenize(cleaned)
    return [w for w in words if w.isalpha() and w not in STOPWORDS]
# --- List Normalization Function ---
def normalize_list_tags(tags: Optional[List[Any]]) -> List[str]:
    """
    Normalize a list of tag values (e.g. genres, moods) into clean strings.

    - Returns [] when *tags* is None
    - Keeps only string elements (non-string entries, including None, are dropped)
    - Strips surrounding whitespace and lowercases each tag
    - Filters out tags that are empty after stripping

    Args:
        tags: Raw tag list, possibly None or containing non-string items.

    Returns:
        List of non-empty, lowercased, stripped tag strings.
    """
    if tags is None:
        return []
    # isinstance(t, str) already excludes None, and a str needs no str()
    # conversion — the original's redundant checks are removed.
    return [t.strip().lower() for t in tags if isinstance(t, str) and t.strip()]
# --- Main Preprocessing Function ---
def main():
"""
Main function to load annotated book data, preprocess it,
and save the results to JSON and CSV files.
"""
if not ANNOTATED_PATH.exists():
raise FileNotFoundError(f"Cannot find input file: {ANNOTATED_PATH}. Please ensure annotation is complete.")
print(f"Loading annotated book data from: {ANNOTATED_PATH}")
try:
with open(ANNOTATED_PATH, "r", encoding="utf-8") as f:
books_annotated = json.load(f)
except json.JSONDecodeError as e:
print(f"Error decoding JSON from {ANNOTATED_PATH}: {e}")
return
processed_books = []
print(f"Preprocessing {len(books_annotated)} book records...")
for rec in books_annotated:
processed_rec = dict(rec)
processed_rec['title_tokens'] = clean_text(rec.get("title"))
processed_rec['overview_tokens'] = clean_text(" ".join(rec.get("subjects", [])))
processed_rec['genres'] = normalize_list_tags(rec.get("genres"))
processed_rec['mood'] = normalize_list_tags(rec.get("mood"))
processed_rec['authors'] = normalize_list_tags(rec.get('authors'))
processed_rec['target_audience'] = rec.get("target_audience", "").lower().replace(" ", "_")
processed_rec['era'] = rec.get("era", "").lower().replace(" ", "_")
processed_rec['decade'] = rec.get("decade", "").lower()
processed_rec['language'] = rec.get("language", "").lower()
processed_books.append(processed_rec)
print(f"Saving preprocessed data to: {OUT_JSON}")
with open(OUT_JSON, "w", encoding="utf-8") as f:
json.dump(processed_books, f, ensure_ascii=False, indent=2)
fieldnames = [
"title", "title_tokens", "overview_tokens", "genres", "mood",
"target_audience", "era", "decade", "language", "authors"
]
print(f"Saving preprocessed data to: {OUT_CSV}")
with open(OUT_CSV, "w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for rec in processed_books:
row = {k: rec.get(k, "") for k in fieldnames}
row["title_tokens"] = " ".join(row["title_tokens"])
row["overview_tokens"] = " ".join(row["overview_tokens"])
row["genres"] = ";".join(row["genres"])
row["mood"] = ";".join(row["mood"])
row["authors"] = ";".join(row["authors"])
writer.writerow(row)
print(f"✅ Book preprocessing complete. Processed {len(processed_books)} records.")
print(f"Outputs:\n - {OUT_JSON}\n - {OUT_CSV}")
if __name__ == "__main__":
main()