# Source: Recommender / vectorization/vectorize_books_50000.py
# Last update by badminton001, commit 8b87258 (verified).
"""
vectorization/vectorize_books_50000.py
Vectorizes preprocessed book data using TF-IDF and SBERT.
Input:
- data/book/preprocessed/books_preprocessed_50000.json
Outputs:
- data/book/vectorized/books_tfidf_vectorizer_50000.pkl
- data/book/vectorized/books_tfidf_matrix_50000.npz
- data/book/vectorized/books_sbert_embeddings_50000.pkl
- data/book/vectorized/books_sbert_model_50000.txt
"""
import json
import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional

from scipy import sparse
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# --- Path Configurations ---
# Base directory for book data, resolved relative to this file's parent
# directory (i.e. <repo>/data/book when this script lives in <repo>/vectorization).
DATA_DIR = Path(__file__).parent.parent / "data" / "book"
# Input: preprocessed records produced by the preprocessing step (50,000-record run).
INPUT_PATH = DATA_DIR / "preprocessed" / "books_preprocessed_50000.json"
# Output directory for all vectorized artifacts (pickles, npz, model-name txt).
OUT_DIR = DATA_DIR / "vectorized"
# NOTE: this runs at import time — importing the module creates the directory
# as a side effect (idempotent thanks to exist_ok=True).
OUT_DIR.mkdir(parents=True, exist_ok=True)
# --- Constants ---
# Name of the SBERT model passed to SentenceTransformer for embeddings.
DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
# Cap on the TF-IDF vocabulary size (sklearn `max_features`).
TFIDF_MAX_FEATURES = 10000  # Raised from 5000 to 10000 for the larger corpus
# Prefix used in every output filename (e.g. books_tfidf_matrix_50000.npz).
OUTPUT_PREFIX = "books"
# --- Data Loading Function ---
def load_records(path: Path) -> List[Dict[str, Any]]:
    """Read the preprocessed book records from *path*.

    Returns the parsed list of record dicts, or an empty list (after
    printing a diagnostic) when the file is missing or not valid JSON,
    so the caller can bail out gracefully.
    """
    try:
        with open(path, "r", encoding="utf-8") as fh:
            records = json.load(fh)
    except FileNotFoundError:
        print(f"Error: Input file not found at {path}")
        return []
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {path}. Check file format.")
        return []
    return records
# --- Corpus Building Function ---
def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
    """Build one whitespace-joined document per record.

    Each document is the record's title tokens followed by its overview
    tokens; records missing either key contribute whatever they have
    (possibly an empty string).
    """
    return [
        " ".join(rec.get("title_tokens", []) + rec.get("overview_tokens", []))
        for rec in records
    ]
# --- TF-IDF Vectorization ---
def vectorize_tfidf(
    corpus: List[str],
    output_dir: Path,
    prefix: str,
    max_features: Optional[int] = None,
):
    """
    Fit a TF-IDF vectorizer on *corpus* and persist both artifacts.

    Writes, under *output_dir*:
      - {prefix}_tfidf_vectorizer_50000.pkl  (pickled fitted vectorizer)
      - {prefix}_tfidf_matrix_50000.npz      (sparse document-term matrix)

    Args:
        corpus: One whitespace-joined token string per book.
        output_dir: Existing directory the artifacts are written to.
        prefix: Filename prefix for the saved artifacts.
        max_features: Cap on vocabulary size; defaults to the module-level
            TFIDF_MAX_FEATURES when None (backward-compatible behavior).
    """
    # Resolve the default lazily so the module constant remains the single
    # source of truth while the signature stays backward-compatible.
    if max_features is None:
        max_features = TFIDF_MAX_FEATURES
    print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {max_features} features...")
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    # Persist the fitted vectorizer so later queries can be projected into
    # the same feature space.
    with open(output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl", "wb") as f:
        pickle.dump(vectorizer, f)
    # Save the document-term matrix in SciPy's compressed sparse format.
    sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", tfidf_matrix)
    print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
    print(f"   TF-IDF matrix shape: {tfidf_matrix.shape}")
# --- SBERT Vectorization ---
def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL):
    """
    Encode *corpus* with a SentenceTransformer model and persist the results.

    Writes, under *output_dir*:
      - {prefix}_sbert_embeddings_50000.pkl  (pickled embedding matrix)
      - {prefix}_sbert_model_50000.txt       (name of the model used)

    Any failure (model download, encoding, disk write) is reported to
    stdout rather than raised, so the surrounding pipeline keeps going.
    """
    print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
    try:
        encoder = SentenceTransformer(model_name)
        embeddings = encoder.encode(corpus, show_progress_bar=True, batch_size=32)

        # Persist the embedding matrix for downstream similarity search.
        emb_path = output_dir / f"{prefix}_sbert_embeddings_50000.pkl"
        with open(emb_path, "wb") as f:
            pickle.dump(embeddings, f)

        # Record which model produced the embeddings, so queries can be
        # encoded with the same model later.
        model_path = output_dir / f"{prefix}_sbert_model_50000.txt"
        with open(model_path, "w", encoding="utf-8") as f:
            f.write(model_name)

        print(f"✅ [SBERT] Embeddings and model name saved to {output_dir}.")
        print(f"   SBERT embeddings shape: {embeddings.shape}")
    except Exception as e:
        print(f"❌ [SBERT] Error during SBERT embedding generation: {e}")
# --- Main Execution ---
def main():
    """Run the book vectorization pipeline: load → corpus → TF-IDF → SBERT."""
    # Guard clause: fail fast with a clear message if preprocessing hasn't run.
    if not INPUT_PATH.exists():
        raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")

    records = load_records(INPUT_PATH)
    # load_records returns [] on read/parse failure — nothing to vectorize.
    if not records:
        print("No records loaded. Exiting vectorization process.")
        return

    corpus = build_corpus(records)
    print(f"📚 Loaded {len(records)} book records and built corpus of {len(corpus)} documents.")
    print("Starting vectorization process...")

    # TF-IDF first, then the (heavier) SBERT encoding pass.
    vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX)
    vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
    print("✨ Book vectorization complete!")
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()