badminton001 committed on
Commit
0efc436
·
verified ·
1 Parent(s): 8b87258

Update vectorization/vectorize_movies_50000.py

Browse files
Files changed (1) hide show
  1. vectorization/vectorize_movies_50000.py +136 -139
vectorization/vectorize_movies_50000.py CHANGED
@@ -1,140 +1,137 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- vectorization/vectorize_movies_50000.py
6
-
7
- Vectorizes preprocessed TMDb movie data using TF-IDF and SBERT.
8
-
9
- Input:
10
- - data/movie/preprocessed/movies_preprocessed_50000.json
11
-
12
- Outputs:
13
- - data/movie/vectorized/movies_tfidf_vectorizer_50000.pkl
14
- - data/movie/vectorized/movies_tfidf_matrix_50000.npz
15
- - data/movie/vectorized/movies_sbert_embeddings_50000.pkl
16
- - data/movie/vectorized/movies_sbert_model_50000.txt
17
- """
18
-
19
- import os
20
- import json
21
- import pickle
22
- from pathlib import Path
23
- from typing import List, Dict, Any
24
-
25
- from sklearn.feature_extraction.text import TfidfVectorizer
26
- from sentence_transformers import SentenceTransformer
27
- from scipy import sparse
28
-
29
- # --- Path Configurations ---
30
- # Define the base directory for movie data
31
- DATA_DIR = Path(__file__).parent.parent / "data" / "movie"
32
- # Path to the input preprocessed JSON file (updated for 50,000 records)
33
- INPUT_PATH = DATA_DIR / "preprocessed" / "movies_preprocessed_50000.json"
34
- # Directory to save vectorized outputs
35
- OUT_DIR = DATA_DIR / "vectorized"
36
- # Ensure the output directory exists
37
- OUT_DIR.mkdir(parents=True, exist_ok=True)
38
-
39
- # --- Constants ---
40
- # Default SBERT model name to use for embeddings
41
- DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
42
- # Maximum number of features for TF-IDF vectorizer. Increased for larger dataset.
43
- TFIDF_MAX_FEATURES = 10000 # Adjusted from 5000 to 10000 for a larger corpus
44
- # Prefix for output files
45
- OUTPUT_PREFIX = "movies"
46
-
47
-
48
- # --- Data Loading Function ---
49
- def load_records(path: Path) -> List[Dict[str, Any]]:
50
- """Loads preprocessed movie records from a JSON file."""
51
- try:
52
- with open(path, 'r', encoding='utf-8') as f:
53
- return json.load(f)
54
- except FileNotFoundError:
55
- print(f"Error: Input file not found at {path}")
56
- return []
57
- except json.JSONDecodeError:
58
- print(f"Error: Could not decode JSON from {path}. Check file format.")
59
- return []
60
-
61
-
62
- # --- Corpus Building Function ---
63
- def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
64
- """
65
- Constructs a corpus of text from movie records.
66
- Each document in the corpus is a concatenation of title and overview tokens.
67
- """
68
- corpus = []
69
- for r in records:
70
- tokens = r.get('title_tokens', []) + r.get('overview_tokens', [])
71
- corpus.append(' '.join(tokens))
72
- return corpus
73
-
74
-
75
- # --- TF-IDF Vectorization ---
76
- def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str):
77
- """
78
- Performs TF-IDF vectorization on the given corpus.
79
- Saves the trained TF-IDF vectorizer and the resulting sparse matrix.
80
- """
81
- print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...")
82
- vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
83
- tfidf_matrix = vectorizer.fit_transform(corpus)
84
-
85
- # Save the fitted TF-IDF vectorizer (updated filename)
86
- with open(output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl", 'wb') as f:
87
- pickle.dump(vectorizer, f)
88
- # Save the TF-IDF matrix in sparse NPZ format (updated filename)
89
- sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", tfidf_matrix)
90
-
91
- print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
92
- print(f" TF-IDF matrix shape: {tfidf_matrix.shape}")
93
-
94
-
95
- # --- SBERT Vectorization ---
96
- def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL):
97
- """
98
- Performs SBERT embedding generation on the given corpus.
99
- Saves the generated embeddings and the name of the SBERT model used.
100
- """
101
- print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
102
- try:
103
- model = SentenceTransformer(model_name)
104
- embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32)
105
-
106
- # Save the generated embeddings (updated filename)
107
- with open(output_dir / f"{prefix}_sbert_embeddings_50000.pkl", 'wb') as f:
108
- pickle.dump(embeddings, f)
109
- # Save the name of the SBERT model used (updated filename)
110
- with open(output_dir / f"{prefix}_sbert_model_50000.txt", 'w', encoding='utf-8') as f:
111
- f.write(model_name)
112
-
113
- print(f" [SBERT] Embeddings and model name saved to {output_dir}.")
114
- print(f" SBERT embeddings shape: {embeddings.shape}")
115
- except Exception as e:
116
- print(f"❌ [SBERT] Error during SBERT embedding generation: {e}")
117
-
118
-
119
- # --- Main Execution ---
120
- def main():
121
- """Main function to orchestrate the movie vectorization process."""
122
- if not INPUT_PATH.exists():
123
- raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")
124
-
125
- records = load_records(INPUT_PATH)
126
- if not records:
127
- print("No records loaded. Exiting vectorization process.")
128
- return
129
-
130
- corpus = build_corpus(records)
131
- print(f"📽️ Loaded {len(records)} movie records and built corpus of {len(corpus)} documents.")
132
- print("Starting vectorization process...")
133
-
134
- vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX)
135
- vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
136
- print("✨ Movie vectorization complete!")
137
-
138
-
139
- if __name__ == "__main__":
140
  main()
 
1
+ """
2
+ vectorization/vectorize_movies_50000.py
3
+
4
+ Vectorizes preprocessed TMDb movie data using TF-IDF and SBERT.
5
+
6
+ Input:
7
+ - data/movie/preprocessed/movies_preprocessed_50000.json
8
+
9
+ Outputs:
10
+ - data/movie/vectorized/movies_tfidf_vectorizer_50000.pkl
11
+ - data/movie/vectorized/movies_tfidf_matrix_50000.npz
12
+ - data/movie/vectorized/movies_sbert_embeddings_50000.pkl
13
+ - data/movie/vectorized/movies_sbert_model_50000.txt
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import pickle
19
+ from pathlib import Path
20
+ from typing import List, Dict, Any
21
+
22
+ from sklearn.feature_extraction.text import TfidfVectorizer
23
+ from sentence_transformers import SentenceTransformer
24
+ from scipy import sparse
25
+
26
# --- Path Configurations ---
# Base directory for all movie data, resolved relative to this script
# (the vectorization/ folder sits next to data/).
DATA_DIR = Path(__file__).parent.parent / "data" / "movie"
# Input: preprocessed JSON produced by the preprocessing step (50,000 records).
INPUT_PATH = DATA_DIR / "preprocessed" / "movies_preprocessed_50000.json"
# Directory to save vectorized outputs
OUT_DIR = DATA_DIR / "vectorized"
# Created eagerly at import time so later file writes cannot fail on a
# missing directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Constants ---
# sentence-transformers checkpoint used for the dense embeddings.
DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
# Maximum vocabulary size for the TF-IDF vectorizer.
TFIDF_MAX_FEATURES = 10000  # Adjusted from 5000 to 10000 for a larger corpus
# Prefix shared by all output filenames.
OUTPUT_PREFIX = "movies"
43
+
44
+
45
# --- Data Loading Function ---
def load_records(path: Path) -> List[Dict[str, Any]]:
    """Read the preprocessed movie records stored as JSON at *path*.

    Prints a diagnostic and returns an empty list when the file is
    absent or does not contain valid JSON, so callers can bail out
    without handling exceptions themselves.
    """
    try:
        with path.open(encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"Error: Input file not found at {path}")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {path}. Check file format.")
    return []
57
+
58
+
59
# --- Corpus Building Function ---
def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
    """Turn each movie record into one space-joined document string.

    A document is the record's title tokens followed by its overview
    tokens; a record missing either field contributes whatever it has
    (possibly the empty string).
    """
    return [
        ' '.join(record.get('title_tokens', []) + record.get('overview_tokens', []))
        for record in records
    ]
70
+
71
+
72
# --- TF-IDF Vectorization ---
def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str, suffix: str = "50000"):
    """Fit a TF-IDF vectorizer on *corpus* and persist both artifacts.

    Args:
        corpus: One whitespace-joined token string per movie.
        output_dir: Directory that receives the output files.
        prefix: Leading part of every output filename.
        suffix: Trailing part of every output filename. Defaults to
            "50000" so the historical filenames for the 50k dataset are
            produced when the argument is omitted.

    Side effects:
        Writes {prefix}_tfidf_vectorizer_{suffix}.pkl (the fitted
        vectorizer, needed later to transform queries) and
        {prefix}_tfidf_matrix_{suffix}.npz (the sparse document-term
        matrix) into output_dir.
    """
    print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...")
    vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Persist the fitted vectorizer so queries can be transformed with
    # the same vocabulary/IDF weights at search time.
    with open(output_dir / f"{prefix}_tfidf_vectorizer_{suffix}.pkl", 'wb') as f:
        pickle.dump(vectorizer, f)
    # The matrix is sparse; NPZ keeps it compact on disk.
    sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_{suffix}.npz", tfidf_matrix)

    print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
    print(f" TF-IDF matrix shape: {tfidf_matrix.shape}")
90
+
91
+
92
# --- SBERT Vectorization ---
def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL, suffix: str = "50000"):
    """Encode *corpus* with an SBERT model and persist the embeddings.

    Args:
        corpus: One whitespace-joined token string per movie.
        output_dir: Directory that receives the output files.
        prefix: Leading part of every output filename.
        model_name: sentence-transformers checkpoint to load.
        suffix: Trailing part of every output filename. Defaults to
            "50000" so the historical filenames for the 50k dataset are
            produced when the argument is omitted.

    Side effects:
        Writes {prefix}_sbert_embeddings_{suffix}.pkl (the embedding
        array) and {prefix}_sbert_model_{suffix}.txt (the checkpoint
        name, recorded for reproducibility) into output_dir. Any
        failure (model download, encoding, I/O) is caught and reported
        rather than raised, so the overall pipeline can continue.
    """
    print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
    try:
        model = SentenceTransformer(model_name)
        embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32)

        with open(output_dir / f"{prefix}_sbert_embeddings_{suffix}.pkl", 'wb') as f:
            pickle.dump(embeddings, f)
        # Record which checkpoint produced the embeddings so search can
        # load the matching encoder later.
        with open(output_dir / f"{prefix}_sbert_model_{suffix}.txt", 'w', encoding='utf-8') as f:
            f.write(model_name)

        print(f"✅ [SBERT] Embeddings and model name saved to {output_dir}.")
        print(f" SBERT embeddings shape: {embeddings.shape}")
    except Exception as e:
        # ❌ marker restored for consistency with the other status lines.
        print(f"❌ [SBERT] Error during SBERT embedding generation: {e}")
114
+
115
+
116
# --- Main Execution ---
def main():
    """Main function to orchestrate the movie vectorization process."""
    # Fail fast with an explicit exception if preprocessing has not run;
    # load_records alone would only print a message and return [].
    if not INPUT_PATH.exists():
        raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")

    records = load_records(INPUT_PATH)
    # An empty list means the file was unreadable or empty — nothing to do.
    if not records:
        print("No records loaded. Exiting vectorization process.")
        return

    corpus = build_corpus(records)
    print(f"📽️ Loaded {len(records)} movie records and built corpus of {len(corpus)} documents.")
    print("Starting vectorization process...")

    # Sparse lexical features (TF-IDF) first, then dense SBERT embeddings.
    vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX)
    vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
    print("✨ Movie vectorization complete!")


if __name__ == "__main__":
    main()