# Recommender / preprocessing / preprocess_books_50000.py
# (Hugging Face page header below kept as comments — it was scrape residue
#  and is not valid Python.)
# badminton001's picture — Update preprocessing/preprocess_books_50000.py — 5e5ed91 verified
"""
preprocessing/preprocess_books_50000.py
Preprocesses Open Library annotated book data:
- Tokenizes titles and uses subjects as a proxy for overview text (word_tokenize)
- Converts text to lowercase
- Removes English stopwords
- Keeps only alphabetic-only tokens
- Normalizes various tag fields (genres, mood, target_audience, era, decade, language)
Input:
- data/book/annotated/books_annotated_50000.json
Outputs:
- data/book/preprocessed/books_preprocessed_50000.json
- data/book/preprocessed/books_preprocessed_50000.csv
"""
import os
import json
import csv
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pathlib import Path
from typing import List, Dict, Any, Optional
# --- NLTK Downloads (uncomment on first run) ---
# NOTE(review): nltk.data.find raises LookupError when a resource is
# missing — nltk.downloader.DownloadError would never be raised here, so
# the guard below is corrected to catch LookupError.
# try:
#     nltk.data.find('tokenizers/punkt')
# except LookupError:
#     nltk.download('punkt')
# try:
#     nltk.data.find('corpora/stopwords')
# except LookupError:
#     nltk.download('stopwords')
# --- Global Stopword Set ---
# Built once at import time; raises LookupError if the NLTK 'stopwords'
# corpus has not been downloaded yet (see guard above).
STOPWORDS = set(stopwords.words("english"))
# Optional: Add custom stopwords relevant to book data if needed
# CUSTOM_STOPWORDS = {"book", "novel", "story", "chapter"}
# STOPWORDS.update(CUSTOM_STOPWORDS)
# --- File Paths Configuration ---
# All paths are resolved relative to the repository root (the parent of
# this script's directory), so the script works from any working directory.
# Input annotated books JSON file (updated for 50,000 records)
ANNOTATED_PATH = Path(__file__).parent.parent / "data" / "book" / "annotated" / "books_annotated_50000.json"
# Output preprocessed books JSON file (updated for 50,000 records)
OUT_JSON = Path(__file__).parent.parent / "data" / "book" / "preprocessed" / "books_preprocessed_50000.json"
# Output preprocessed books CSV file (for inspection, updated for 50,000 records)
OUT_CSV = Path(__file__).parent.parent / "data" / "book" / "preprocessed" / "books_preprocessed_50000.csv"
# Ensure output directory exists (JSON and CSV share the same parent dir)
OUT_JSON.parent.mkdir(parents=True, exist_ok=True)
# --- Text Cleaning Function ---
def clean_text(text: Optional[str]) -> List[str]:
    """
    Normalize raw text into a list of lowercase content words.

    Steps:
    - Treats None as the empty string
    - Lowercases the input
    - Blanks out URLs, HTML tags, and any character outside [a-z]/whitespace
    - Tokenizes with NLTK's word_tokenize
    - Drops tokens that are non-alphabetic or English stopwords
    """
    cleaned = (text or "").lower()
    # Apply the scrubbing passes in the same order as before: URLs first,
    # then HTML tags, then everything that is not a lowercase letter or space.
    for pattern in (r"http\S+", r"<[^>]+>", r"[^a-z\s]"):
        cleaned = re.sub(pattern, " ", cleaned)
    words = word_tokenize(cleaned)
    return [w for w in words if w.isalpha() and w not in STOPWORDS]
# --- List Normalization Function ---
def normalize_list_tags(tags: Optional[List[Any]]) -> List[str]:
    """
    Normalize a list of tag values (e.g. genres, moods) into clean strings.

    - Returns [] when *tags* is None
    - Keeps only string elements (non-string entries, including None, are dropped)
    - Strips surrounding whitespace and lowercases each tag
    - Filters out tags that are empty after stripping

    Args:
        tags: Raw tag list, possibly None or containing non-string items.

    Returns:
        List of non-empty, lowercased, stripped tag strings.
    """
    if tags is None:
        return []
    # isinstance(t, str) already excludes None, and a str needs no str()
    # conversion — the original's redundant checks are removed.
    return [t.strip().lower() for t in tags if isinstance(t, str) and t.strip()]
# --- Main Preprocessing Function ---
def main():
"""
Main function to load annotated book data, preprocess it,
and save the results to JSON and CSV files.
"""
if not ANNOTATED_PATH.exists():
raise FileNotFoundError(f"Cannot find input file: {ANNOTATED_PATH}. Please ensure annotation is complete.")
print(f"Loading annotated book data from: {ANNOTATED_PATH}")
try:
with open(ANNOTATED_PATH, "r", encoding="utf-8") as f:
books_annotated = json.load(f)
except json.JSONDecodeError as e:
print(f"Error decoding JSON from {ANNOTATED_PATH}: {e}")
return
processed_books = []
print(f"Preprocessing {len(books_annotated)} book records...")
for rec in books_annotated:
processed_rec = dict(rec)
processed_rec['title_tokens'] = clean_text(rec.get("title"))
processed_rec['overview_tokens'] = clean_text(" ".join(rec.get("subjects", [])))
processed_rec['genres'] = normalize_list_tags(rec.get("genres"))
processed_rec['mood'] = normalize_list_tags(rec.get("mood"))
processed_rec['authors'] = normalize_list_tags(rec.get('authors'))
processed_rec['target_audience'] = rec.get("target_audience", "").lower().replace(" ", "_")
processed_rec['era'] = rec.get("era", "").lower().replace(" ", "_")
processed_rec['decade'] = rec.get("decade", "").lower()
processed_rec['language'] = rec.get("language", "").lower()
processed_books.append(processed_rec)
print(f"Saving preprocessed data to: {OUT_JSON}")
with open(OUT_JSON, "w", encoding="utf-8") as f:
json.dump(processed_books, f, ensure_ascii=False, indent=2)
fieldnames = [
"title", "title_tokens", "overview_tokens", "genres", "mood",
"target_audience", "era", "decade", "language", "authors"
]
print(f"Saving preprocessed data to: {OUT_CSV}")
with open(OUT_CSV, "w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for rec in processed_books:
row = {k: rec.get(k, "") for k in fieldnames}
row["title_tokens"] = " ".join(row["title_tokens"])
row["overview_tokens"] = " ".join(row["overview_tokens"])
row["genres"] = ";".join(row["genres"])
row["mood"] = ";".join(row["mood"])
row["authors"] = ";".join(row["authors"])
writer.writerow(row)
print(f"✅ Book preprocessing complete. Processed {len(processed_books)} records.")
print(f"Outputs:\n - {OUT_JSON}\n - {OUT_CSV}")
if __name__ == "__main__":
main()