Spaces:
Sleeping
Sleeping
"""
vectorization/vectorize_books_50000.py

Vectorizes preprocessed book data using TF-IDF and SBERT.

Input:
    - data/book/preprocessed/books_preprocessed_50000.json

Outputs:
    - data/book/vectorized/books_tfidf_vectorizer_50000.pkl
    - data/book/vectorized/books_tfidf_matrix_50000.npz
    - data/book/vectorized/books_sbert_embeddings_50000.pkl
    - data/book/vectorized/books_sbert_model_50000.txt
"""
| import os | |
| import json | |
| import pickle | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sentence_transformers import SentenceTransformer | |
| from scipy import sparse | |
| # --- Path Configurations --- | |
| # Define the base directory for book data | |
| DATA_DIR = Path(__file__).parent.parent / "data" / "book" | |
| # Path to the input preprocessed JSON file (updated for 50,000 records) | |
| INPUT_PATH = DATA_DIR / "preprocessed" / "books_preprocessed_50000.json" | |
| # Directory to save vectorized outputs | |
| OUT_DIR = DATA_DIR / "vectorized" | |
| # Ensure the output directory exists | |
| OUT_DIR.mkdir(parents=True, exist_ok=True) | |
| # --- Constants --- | |
| # Default SBERT model name to use for embeddings | |
| DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2" | |
| # Maximum number of features for TF-IDF vectorizer. Increased for larger dataset. | |
| TFIDF_MAX_FEATURES = 10000 # Adjusted from 5000 to 10000 for a larger corpus | |
| # Prefix for output files | |
| OUTPUT_PREFIX = "books" | |
| # --- Data Loading Function --- | |
| def load_records(path: Path) -> List[Dict[str, Any]]: | |
| """Loads preprocessed book records from a JSON file.""" | |
| try: | |
| with open(path, "r", encoding="utf-8") as f: | |
| return json.load(f) | |
| except FileNotFoundError: | |
| print(f"Error: Input file not found at {path}") | |
| return [] | |
| except json.JSONDecodeError: | |
| print(f"Error: Could not decode JSON from {path}. Check file format.") | |
| return [] | |
| # --- Corpus Building Function --- | |
| def build_corpus(records: List[Dict[str, Any]]) -> List[str]: | |
| """ | |
| Constructs a corpus of text from book records. | |
| Each document in the corpus is a concatenation of title and overview tokens. | |
| """ | |
| corpus = [] | |
| for r in records: | |
| tokens = r.get("title_tokens", []) + r.get("overview_tokens", []) | |
| corpus.append(" ".join(tokens)) | |
| return corpus | |
| # --- TF-IDF Vectorization --- | |
| def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str): | |
| """ | |
| Performs TF-IDF vectorization on the given corpus. | |
| Saves the trained TF-IDF vectorizer and the resulting sparse matrix. | |
| """ | |
| print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...") | |
| vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES) | |
| tfidf_matrix = vectorizer.fit_transform(corpus) | |
| # Save the fitted TF-IDF vectorizer (updated filename) | |
| with open(output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl", "wb") as f: | |
| pickle.dump(vectorizer, f) | |
| # Save the TF-IDF matrix in sparse NPZ format (updated filename) | |
| sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", tfidf_matrix) | |
| print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.") | |
| print(f" TF-IDF matrix shape: {tfidf_matrix.shape}") | |
| # --- SBERT Vectorization --- | |
| def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL): | |
| """ | |
| Performs SBERT embedding generation on the given corpus. | |
| Saves the generated embeddings and the name of the SBERT model used. | |
| """ | |
| print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...") | |
| try: | |
| model = SentenceTransformer(model_name) | |
| embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32) | |
| # Save the generated embeddings (updated filename) | |
| with open(output_dir / f"{prefix}_sbert_embeddings_50000.pkl", "wb") as f: | |
| pickle.dump(embeddings, f) | |
| # Save the name of the SBERT model used (updated filename) | |
| with open(output_dir / f"{prefix}_sbert_model_50000.txt", "w", encoding="utf-8") as f: | |
| f.write(model_name) | |
| print(f"✅ [SBERT] Embeddings and model name saved to {output_dir}.") | |
| print(f" SBERT embeddings shape: {embeddings.shape}") | |
| except Exception as e: | |
| print(f"❌ [SBERT] Error during SBERT embedding generation: {e}") | |
| # --- Main Execution --- | |
| def main(): | |
| """Main function to orchestrate the book vectorization process.""" | |
| if not INPUT_PATH.exists(): | |
| raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.") | |
| records = load_records(INPUT_PATH) | |
| if not records: | |
| print("No records loaded. Exiting vectorization process.") | |
| return | |
| corpus = build_corpus(records) | |
| print(f"📚 Loaded {len(records)} book records and built corpus of {len(corpus)} documents.") | |
| print("Starting vectorization process...") | |
| vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX) | |
| vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL) | |
| print("✨ Book vectorization complete!") | |
| if __name__ == "__main__": | |
| main() |