badminton001 committed on
Commit
0efc436
·
verified ·
1 Parent(s): 8b87258

Update vectorization/vectorize_movies_50000.py

Browse files
Files changed (1) hide show
  1. vectorization/vectorize_movies_50000.py +136 -139
vectorization/vectorize_movies_50000.py CHANGED
@@ -1,140 +1,137 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- vectorization/vectorize_movies_50000.py
6
-
7
- Vectorizes preprocessed TMDb movie data using TF-IDF and SBERT.
8
-
9
- Input:
10
- - data/movie/preprocessed/movies_preprocessed_50000.json
11
-
12
- Outputs:
13
- - data/movie/vectorized/movies_tfidf_vectorizer_50000.pkl
14
- - data/movie/vectorized/movies_tfidf_matrix_50000.npz
15
- - data/movie/vectorized/movies_sbert_embeddings_50000.pkl
16
- - data/movie/vectorized/movies_sbert_model_50000.txt
17
- """
18
-
19
- import os
20
- import json
21
- import pickle
22
- from pathlib import Path
23
- from typing import List, Dict, Any
24
-
25
- from sklearn.feature_extraction.text import TfidfVectorizer
26
- from sentence_transformers import SentenceTransformer
27
- from scipy import sparse
28
-
29
- # --- Path Configurations ---
30
- # Define the base directory for movie data
31
- DATA_DIR = Path(__file__).parent.parent / "data" / "movie"
32
- # Path to the input preprocessed JSON file (updated for 50,000 records)
33
- INPUT_PATH = DATA_DIR / "preprocessed" / "movies_preprocessed_50000.json"
34
- # Directory to save vectorized outputs
35
- OUT_DIR = DATA_DIR / "vectorized"
36
- # Ensure the output directory exists
37
- OUT_DIR.mkdir(parents=True, exist_ok=True)
38
-
39
- # --- Constants ---
40
- # Default SBERT model name to use for embeddings
41
- DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
42
- # Maximum number of features for TF-IDF vectorizer. Increased for larger dataset.
43
- TFIDF_MAX_FEATURES = 10000 # Adjusted from 5000 to 10000 for a larger corpus
44
- # Prefix for output files
45
- OUTPUT_PREFIX = "movies"
46
-
47
-
48
- # --- Data Loading Function ---
49
- def load_records(path: Path) -> List[Dict[str, Any]]:
50
- """Loads preprocessed movie records from a JSON file."""
51
- try:
52
- with open(path, 'r', encoding='utf-8') as f:
53
- return json.load(f)
54
- except FileNotFoundError:
55
- print(f"Error: Input file not found at {path}")
56
- return []
57
- except json.JSONDecodeError:
58
- print(f"Error: Could not decode JSON from {path}. Check file format.")
59
- return []
60
-
61
-
62
- # --- Corpus Building Function ---
63
- def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
64
- """
65
- Constructs a corpus of text from movie records.
66
- Each document in the corpus is a concatenation of title and overview tokens.
67
- """
68
- corpus = []
69
- for r in records:
70
- tokens = r.get('title_tokens', []) + r.get('overview_tokens', [])
71
- corpus.append(' '.join(tokens))
72
- return corpus
73
-
74
-
75
- # --- TF-IDF Vectorization ---
76
- def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str):
77
- """
78
- Performs TF-IDF vectorization on the given corpus.
79
- Saves the trained TF-IDF vectorizer and the resulting sparse matrix.
80
- """
81
- print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...")
82
- vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
83
- tfidf_matrix = vectorizer.fit_transform(corpus)
84
-
85
- # Save the fitted TF-IDF vectorizer (updated filename)
86
- with open(output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl", 'wb') as f:
87
- pickle.dump(vectorizer, f)
88
- # Save the TF-IDF matrix in sparse NPZ format (updated filename)
89
- sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", tfidf_matrix)
90
-
91
- print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
92
- print(f" TF-IDF matrix shape: {tfidf_matrix.shape}")
93
-
94
-
95
- # --- SBERT Vectorization ---
96
- def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL):
97
- """
98
- Performs SBERT embedding generation on the given corpus.
99
- Saves the generated embeddings and the name of the SBERT model used.
100
- """
101
- print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
102
- try:
103
- model = SentenceTransformer(model_name)
104
- embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32)
105
-
106
- # Save the generated embeddings (updated filename)
107
- with open(output_dir / f"{prefix}_sbert_embeddings_50000.pkl", 'wb') as f:
108
- pickle.dump(embeddings, f)
109
- # Save the name of the SBERT model used (updated filename)
110
- with open(output_dir / f"{prefix}_sbert_model_50000.txt", 'w', encoding='utf-8') as f:
111
- f.write(model_name)
112
-
113
- print(f" [SBERT] Embeddings and model name saved to {output_dir}.")
114
- print(f" SBERT embeddings shape: {embeddings.shape}")
115
- except Exception as e:
116
- print(f"❌ [SBERT] Error during SBERT embedding generation: {e}")
117
-
118
-
119
- # --- Main Execution ---
120
- def main():
121
- """Main function to orchestrate the movie vectorization process."""
122
- if not INPUT_PATH.exists():
123
- raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")
124
-
125
- records = load_records(INPUT_PATH)
126
- if not records:
127
- print("No records loaded. Exiting vectorization process.")
128
- return
129
-
130
- corpus = build_corpus(records)
131
- print(f"📽️ Loaded {len(records)} movie records and built corpus of {len(corpus)} documents.")
132
- print("Starting vectorization process...")
133
-
134
- vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX)
135
- vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
136
- print("✨ Movie vectorization complete!")
137
-
138
-
139
- if __name__ == "__main__":
140
  main()
 
1
+ """
2
+ vectorization/vectorize_movies_50000.py
3
+
4
+ Vectorizes preprocessed TMDb movie data using TF-IDF and SBERT.
5
+
6
+ Input:
7
+ - data/movie/preprocessed/movies_preprocessed_50000.json
8
+
9
+ Outputs:
10
+ - data/movie/vectorized/movies_tfidf_vectorizer_50000.pkl
11
+ - data/movie/vectorized/movies_tfidf_matrix_50000.npz
12
+ - data/movie/vectorized/movies_sbert_embeddings_50000.pkl
13
+ - data/movie/vectorized/movies_sbert_model_50000.txt
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import pickle
19
+ from pathlib import Path
20
+ from typing import List, Dict, Any
21
+
22
+ from sklearn.feature_extraction.text import TfidfVectorizer
23
+ from sentence_transformers import SentenceTransformer
24
+ from scipy import sparse
25
+
26
# --- Path Configurations ---
# Base directory for all movie data, resolved relative to this script
# (the vectorization/ folder sits next to data/).
DATA_DIR = Path(__file__).parent.parent / "data" / "movie"
# Input: preprocessed JSON produced by the preprocessing step (50,000 records).
INPUT_PATH = DATA_DIR / "preprocessed" / "movies_preprocessed_50000.json"
# Directory to save vectorized outputs
OUT_DIR = DATA_DIR / "vectorized"
# Created eagerly at import time so later file writes cannot fail on a
# missing directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Constants ---
# sentence-transformers checkpoint used for the dense embeddings.
DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
# Maximum vocabulary size for the TF-IDF vectorizer.
TFIDF_MAX_FEATURES = 10000  # Adjusted from 5000 to 10000 for a larger corpus
# Prefix shared by all output filenames.
OUTPUT_PREFIX = "movies"
43
+
44
+
45
# --- Data Loading Function ---
def load_records(path: Path) -> List[Dict[str, Any]]:
    """Read the preprocessed movie records stored as JSON at *path*.

    Prints a diagnostic and returns an empty list when the file is
    absent or does not contain valid JSON, so callers can bail out
    without handling exceptions themselves.
    """
    try:
        with path.open(encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"Error: Input file not found at {path}")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {path}. Check file format.")
    return []
57
+
58
+
59
# --- Corpus Building Function ---
def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
    """Turn each movie record into one space-joined document string.

    A document is the record's title tokens followed by its overview
    tokens; a record missing either field contributes whatever it has
    (possibly the empty string).
    """
    return [
        ' '.join(record.get('title_tokens', []) + record.get('overview_tokens', []))
        for record in records
    ]
70
+
71
+
72
# --- TF-IDF Vectorization ---
def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str, suffix: str = "50000"):
    """Fit a TF-IDF vectorizer on *corpus* and persist both artifacts.

    Args:
        corpus: One whitespace-joined token string per movie.
        output_dir: Directory that receives the output files.
        prefix: Leading part of every output filename.
        suffix: Trailing part of every output filename. Defaults to
            "50000" so the historical filenames for the 50k dataset are
            produced when the argument is omitted.

    Side effects:
        Writes {prefix}_tfidf_vectorizer_{suffix}.pkl (the fitted
        vectorizer, needed later to transform queries) and
        {prefix}_tfidf_matrix_{suffix}.npz (the sparse document-term
        matrix) into output_dir.
    """
    print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...")
    vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Persist the fitted vectorizer so queries can be transformed with
    # the same vocabulary/IDF weights at search time.
    with open(output_dir / f"{prefix}_tfidf_vectorizer_{suffix}.pkl", 'wb') as f:
        pickle.dump(vectorizer, f)
    # The matrix is sparse; NPZ keeps it compact on disk.
    sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_{suffix}.npz", tfidf_matrix)

    print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
    print(f" TF-IDF matrix shape: {tfidf_matrix.shape}")
90
+
91
+
92
# --- SBERT Vectorization ---
def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL, suffix: str = "50000"):
    """Encode *corpus* with an SBERT model and persist the embeddings.

    Args:
        corpus: One whitespace-joined token string per movie.
        output_dir: Directory that receives the output files.
        prefix: Leading part of every output filename.
        model_name: sentence-transformers checkpoint to load.
        suffix: Trailing part of every output filename. Defaults to
            "50000" so the historical filenames for the 50k dataset are
            produced when the argument is omitted.

    Side effects:
        Writes {prefix}_sbert_embeddings_{suffix}.pkl (the embedding
        array) and {prefix}_sbert_model_{suffix}.txt (the checkpoint
        name, recorded for reproducibility) into output_dir. Any
        failure (model download, encoding, I/O) is caught and reported
        rather than raised, so the overall pipeline can continue.
    """
    print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
    try:
        model = SentenceTransformer(model_name)
        embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32)

        with open(output_dir / f"{prefix}_sbert_embeddings_{suffix}.pkl", 'wb') as f:
            pickle.dump(embeddings, f)
        # Record which checkpoint produced the embeddings so search can
        # load the matching encoder later.
        with open(output_dir / f"{prefix}_sbert_model_{suffix}.txt", 'w', encoding='utf-8') as f:
            f.write(model_name)

        print(f"✅ [SBERT] Embeddings and model name saved to {output_dir}.")
        print(f" SBERT embeddings shape: {embeddings.shape}")
    except Exception as e:
        # ❌ marker restored for consistency with the other status lines.
        print(f"❌ [SBERT] Error during SBERT embedding generation: {e}")
114
+
115
+
116
# --- Main Execution ---
def main():
    """Main function to orchestrate the movie vectorization process."""
    # Fail fast with an explicit exception if preprocessing has not run;
    # load_records alone would only print a message and return [].
    if not INPUT_PATH.exists():
        raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")

    records = load_records(INPUT_PATH)
    # An empty list means the file was unreadable or empty — nothing to do.
    if not records:
        print("No records loaded. Exiting vectorization process.")
        return

    corpus = build_corpus(records)
    print(f"📽️ Loaded {len(records)} movie records and built corpus of {len(corpus)} documents.")
    print("Starting vectorization process...")

    # Sparse lexical features (TF-IDF) first, then dense SBERT embeddings.
    vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX)
    vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
    print("✨ Movie vectorization complete!")


if __name__ == "__main__":
    main()