Spaces:
Sleeping
Sleeping
"""
vectorization/vectorize_books_50000.py

Vectorizes preprocessed book data using TF-IDF and SBERT.

Input:
    - data/book/preprocessed/books_preprocessed_50000.json

Outputs:
    - data/book/vectorized/books_tfidf_vectorizer_50000.pkl
    - data/book/vectorized/books_tfidf_matrix_50000.npz
    - data/book/vectorized/books_sbert_embeddings_50000.pkl
    - data/book/vectorized/books_sbert_model_50000.txt
"""
| import os | |
| import json | |
| import pickle | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sentence_transformers import SentenceTransformer | |
| from scipy import sparse | |
| # --- Path Configurations --- | |
| # Define the base directory for book data | |
| DATA_DIR = Path(__file__).parent.parent / "data" / "book" | |
| # Path to the input preprocessed JSON file (updated for 50,000 records) | |
| INPUT_PATH = DATA_DIR / "preprocessed" / "books_preprocessed_50000.json" | |
| # Directory to save vectorized outputs | |
| OUT_DIR = DATA_DIR / "vectorized" | |
| # Ensure the output directory exists | |
| OUT_DIR.mkdir(parents=True, exist_ok=True) | |
| # --- Constants --- | |
| # Default SBERT model name to use for embeddings | |
| DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2" | |
| # Maximum number of features for TF-IDF vectorizer. Increased for larger dataset. | |
| TFIDF_MAX_FEATURES = 10000 # Adjusted from 5000 to 10000 for a larger corpus | |
| # Prefix for output files | |
| OUTPUT_PREFIX = "books" | |
| # --- Data Loading Function --- | |
| def load_records(path: Path) -> List[Dict[str, Any]]: | |
| """Loads preprocessed book records from a JSON file.""" | |
| try: | |
| with open(path, "r", encoding="utf-8") as f: | |
| return json.load(f) | |
| except FileNotFoundError: | |
| print(f"Error: Input file not found at {path}") | |
| return [] | |
| except json.JSONDecodeError: | |
| print(f"Error: Could not decode JSON from {path}. Check file format.") | |
| return [] | |
| # --- Corpus Building Function --- | |
| def build_corpus(records: List[Dict[str, Any]]) -> List[str]: | |
| """ | |
| Constructs a corpus of text from book records. | |
| Each document in the corpus is a concatenation of title and overview tokens. | |
| """ | |
| corpus = [] | |
| for r in records: | |
| tokens = r.get("title_tokens", []) + r.get("overview_tokens", []) | |
| corpus.append(" ".join(tokens)) | |
| return corpus | |
| # --- TF-IDF Vectorization --- | |
| def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str): | |
| """ | |
| Performs TF-IDF vectorization on the given corpus. | |
| Saves the trained TF-IDF vectorizer and the resulting sparse matrix. | |
| """ | |
| print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...") | |
| vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES) | |
| tfidf_matrix = vectorizer.fit_transform(corpus) | |
| # Save the fitted TF-IDF vectorizer (updated filename) | |
| with open(output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl", "wb") as f: | |
| pickle.dump(vectorizer, f) | |
| # Save the TF-IDF matrix in sparse NPZ format (updated filename) | |
| sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", tfidf_matrix) | |
| print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.") | |
| print(f" TF-IDF matrix shape: {tfidf_matrix.shape}") | |
| # --- SBERT Vectorization --- | |
| def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL): | |
| """ | |
| Performs SBERT embedding generation on the given corpus. | |
| Saves the generated embeddings and the name of the SBERT model used. | |
| """ | |
| print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...") | |
| try: | |
| model = SentenceTransformer(model_name) | |
| embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32) | |
| # Save the generated embeddings (updated filename) | |
| with open(output_dir / f"{prefix}_sbert_embeddings_50000.pkl", "wb") as f: | |
| pickle.dump(embeddings, f) | |
| # Save the name of the SBERT model used (updated filename) | |
| with open(output_dir / f"{prefix}_sbert_model_50000.txt", "w", encoding="utf-8") as f: | |
| f.write(model_name) | |
| print(f"✅ [SBERT] Embeddings and model name saved to {output_dir}.") | |
| print(f" SBERT embeddings shape: {embeddings.shape}") | |
| except Exception as e: | |
| print(f"❌ [SBERT] Error during SBERT embedding generation: {e}") | |
| # --- Main Execution --- | |
| def main(): | |
| """Main function to orchestrate the book vectorization process.""" | |
| if not INPUT_PATH.exists(): | |
| raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.") | |
| records = load_records(INPUT_PATH) | |
| if not records: | |
| print("No records loaded. Exiting vectorization process.") | |
| return | |
| corpus = build_corpus(records) | |
| print(f"📚 Loaded {len(records)} book records and built corpus of {len(corpus)} documents.") | |
| print("Starting vectorization process...") | |
| vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX) | |
| vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL) | |
| print("✨ Book vectorization complete!") | |
| if __name__ == "__main__": | |
| main() |