Spaces:
Sleeping
Sleeping
| """ | |
| preprocessing/preprocess_books_50000.py | |
| Preprocesses Open Library annotated book data: | |
| - Tokenizes titles and uses subjects as a proxy for overview text (word_tokenize) | |
| - Converts text to lowercase | |
| - Removes English stopwords | |
| - Keeps only alphabetic-only tokens | |
| - Normalizes various tag fields (genres, mood, target_audience, era, decade, language) | |
| Input: | |
| - data/book/annotated/books_annotated_50000.json | |
| Outputs: | |
| - data/book/preprocessed/books_preprocessed_50000.json | |
| - data/book/preprocessed/books_preprocessed_50000.csv | |
| """ | |
| import os | |
| import json | |
| import csv | |
| import re | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import stopwords | |
| from pathlib import Path | |
| from typing import List, Dict, Any, Optional | |
# --- NLTK resource downloads (uncomment on first run) ---
# try:
#     nltk.data.find('tokenizers/punkt')
# except nltk.downloader.DownloadError:
#     nltk.download('punkt')
# try:
#     nltk.data.find('corpora/stopwords')
# except nltk.downloader.DownloadError:
#     nltk.download('stopwords')

# English stopword set shared by every clean_text() call.
STOPWORDS = set(stopwords.words("english"))
# Optional: Add custom stopwords relevant to book data if needed
# CUSTOM_STOPWORDS = {"book", "novel", "story", "chapter"}
# STOPWORDS.update(CUSTOM_STOPWORDS)

# --- File Paths Configuration ---
# All book data lives under <repo root>/data/book; compute the base once.
_BOOK_DATA_DIR = Path(__file__).parent.parent / "data" / "book"
# Input annotated books JSON file (50,000 records)
ANNOTATED_PATH = _BOOK_DATA_DIR / "annotated" / "books_annotated_50000.json"
# Output preprocessed books JSON file
OUT_JSON = _BOOK_DATA_DIR / "preprocessed" / "books_preprocessed_50000.json"
# Output preprocessed books CSV file (for manual inspection)
OUT_CSV = _BOOK_DATA_DIR / "preprocessed" / "books_preprocessed_50000.csv"

# Ensure the (shared) output directory exists before main() writes to it.
OUT_JSON.parent.mkdir(parents=True, exist_ok=True)
# --- Text Cleaning Function ---
def clean_text(text: Optional[str]) -> List[str]:
    """
    Clean and tokenize free text into a list of lowercase word tokens.

    Steps:
    - Treat None as the empty string.
    - Lowercase the input.
    - Blank out URLs, HTML tags, and any character outside [a-z] / whitespace.
    - Tokenize with NLTK's word_tokenize.
    - Drop non-alphabetic tokens and English stopwords.
    """
    lowered = (text or "").lower()
    # Apply the substitution passes in the same order as before:
    # URLs first, then tag markup, then everything non-alphabetic.
    for pattern in (r"http\S+", r"<[^>]+>", r"[^a-z\s]"):
        lowered = re.sub(pattern, " ", lowered)
    return [
        tok
        for tok in word_tokenize(lowered)
        if tok.isalpha() and tok not in STOPWORDS
    ]
# --- List Normalization Function ---
def normalize_list_tags(tags: Optional[List[Any]]) -> List[str]:
    """
    Normalize a list of string tags (e.g., genres, moods, authors).

    - Returns [] for None input.
    - Keeps only string elements; non-string entries (None, ints, ...) are
      dropped, matching the original behavior.
    - Strips surrounding whitespace, lowercases, and discards entries that
      are empty or whitespace-only after stripping.
    """
    if tags is None:
        return []
    # isinstance(t, str) already excludes None, and a value proven to be a
    # str needs no str() conversion — the previous redundant checks
    # (`t is not None` and `str(t)`) are removed with identical results.
    return [t.strip().lower() for t in tags if isinstance(t, str) and t.strip()]
# --- Main Preprocessing Function ---
def _preprocess_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *rec* with tokenized/normalized derived fields added."""
    out = dict(rec)
    out['title_tokens'] = clean_text(rec.get("title"))
    # Subjects serve as a proxy for overview text; `or []` guards against an
    # explicit null value, which the old .get(..., []) default did not cover.
    out['overview_tokens'] = clean_text(" ".join(rec.get("subjects") or []))
    out['genres'] = normalize_list_tags(rec.get("genres"))
    out['mood'] = normalize_list_tags(rec.get("mood"))
    out['authors'] = normalize_list_tags(rec.get('authors'))
    # `or ""` guards against explicit nulls in the annotated JSON: dict.get
    # defaults only apply when the key is ABSENT, so a present-but-null value
    # would previously crash with AttributeError on .lower().
    out['target_audience'] = (rec.get("target_audience") or "").lower().replace(" ", "_")
    out['era'] = (rec.get("era") or "").lower().replace(" ", "_")
    out['decade'] = (rec.get("decade") or "").lower()
    out['language'] = (rec.get("language") or "").lower()
    return out


def _write_csv(processed_books: List[Dict[str, Any]]) -> None:
    """Write a flattened, human-inspectable CSV view of the processed records."""
    fieldnames = [
        "title", "title_tokens", "overview_tokens", "genres", "mood",
        "target_audience", "era", "decade", "language", "authors"
    ]
    with open(OUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for rec in processed_books:
            row = {k: rec.get(k, "") for k in fieldnames}
            # Token lists become space-joined strings; tag lists use ";".
            row["title_tokens"] = " ".join(row["title_tokens"])
            row["overview_tokens"] = " ".join(row["overview_tokens"])
            row["genres"] = ";".join(row["genres"])
            row["mood"] = ";".join(row["mood"])
            row["authors"] = ";".join(row["authors"])
            writer.writerow(row)


def main():
    """
    Load annotated book data, preprocess every record, and save the
    results to JSON (full fidelity) and CSV (for inspection).

    Raises:
        FileNotFoundError: if the annotated input file does not exist.
    """
    if not ANNOTATED_PATH.exists():
        raise FileNotFoundError(f"Cannot find input file: {ANNOTATED_PATH}. Please ensure annotation is complete.")
    print(f"Loading annotated book data from: {ANNOTATED_PATH}")
    try:
        with open(ANNOTATED_PATH, "r", encoding="utf-8") as f:
            books_annotated = json.load(f)
    except json.JSONDecodeError as e:
        # A malformed input file is reported rather than crashing the script.
        print(f"Error decoding JSON from {ANNOTATED_PATH}: {e}")
        return

    print(f"Preprocessing {len(books_annotated)} book records...")
    processed_books = [_preprocess_record(rec) for rec in books_annotated]

    print(f"Saving preprocessed data to: {OUT_JSON}")
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(processed_books, f, ensure_ascii=False, indent=2)

    print(f"Saving preprocessed data to: {OUT_CSV}")
    _write_csv(processed_books)

    print(f"✅ Book preprocessing complete. Processed {len(processed_books)} records.")
    print(f"Outputs:\n - {OUT_JSON}\n - {OUT_CSV}")
# Script entry point: run the preprocessing pipeline only when this file
# is executed directly (not when imported as a module).
if __name__ == "__main__":
    main()