badminton001 commited on
Commit
8b87258
·
verified ·
1 Parent(s): ecf1e4f

Update vectorization/vectorize_books_50000.py

Browse files
Files changed (1) hide show
  1. vectorization/vectorize_books_50000.py +132 -135
vectorization/vectorize_books_50000.py CHANGED
@@ -1,136 +1,133 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- vectorization/vectorize_books_50000.py
6
-
7
- Vectorizes preprocessed book data using TF-IDF and SBERT.
8
-
9
- Input:
10
- - data/book/preprocessed/books_preprocessed_50000.json
11
-
12
- Outputs:
13
- - data/book/vectorized/books_tfidf_vectorizer_50000.pkl
14
- - data/book/vectorized/books_tfidf_matrix_50000.npz
15
- - data/book/vectorized/books_sbert_embeddings_50000.pkl
16
- - data/book/vectorized/books_sbert_model_50000.txt
17
- """
18
-
19
- import os
20
- import json
21
- import pickle
22
- from pathlib import Path
23
- from typing import List, Dict, Any
24
-
25
- from sklearn.feature_extraction.text import TfidfVectorizer
26
- from sentence_transformers import SentenceTransformer
27
- from scipy import sparse
28
-
29
- # --- Path Configurations ---
30
- # Define the base directory for book data
31
- DATA_DIR = Path(__file__).parent.parent / "data" / "book"
32
- # Path to the input preprocessed JSON file (updated for 50,000 records)
33
- INPUT_PATH = DATA_DIR / "preprocessed" / "books_preprocessed_50000.json"
34
- # Directory to save vectorized outputs
35
- OUT_DIR = DATA_DIR / "vectorized"
36
- # Ensure the output directory exists
37
- OUT_DIR.mkdir(parents=True, exist_ok=True)
38
-
39
- # --- Constants ---
40
- # Default SBERT model name to use for embeddings
41
- DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
42
- # Maximum number of features for TF-IDF vectorizer. Increased for larger dataset.
43
- TFIDF_MAX_FEATURES = 10000 # Adjusted from 5000 to 10000 for a larger corpus
44
- # Prefix for output files
45
- OUTPUT_PREFIX = "books"
46
-
47
- # --- Data Loading Function ---
48
- def load_records(path: Path) -> List[Dict[str, Any]]:
49
- """Loads preprocessed book records from a JSON file."""
50
- try:
51
- with open(path, "r", encoding="utf-8") as f:
52
- return json.load(f)
53
- except FileNotFoundError:
54
- print(f"Error: Input file not found at {path}")
55
- return []
56
- except json.JSONDecodeError:
57
- print(f"Error: Could not decode JSON from {path}. Check file format.")
58
- return []
59
-
60
- # --- Corpus Building Function ---
61
- def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
62
- """
63
- Constructs a corpus of text from book records.
64
- Each document in the corpus is a concatenation of title and overview tokens.
65
- """
66
- corpus = []
67
- for r in records:
68
- tokens = r.get("title_tokens", []) + r.get("overview_tokens", [])
69
- corpus.append(" ".join(tokens))
70
- return corpus
71
-
72
- # --- TF-IDF Vectorization ---
73
- def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str):
74
- """
75
- Performs TF-IDF vectorization on the given corpus.
76
- Saves the trained TF-IDF vectorizer and the resulting sparse matrix.
77
- """
78
- print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...")
79
- vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
80
- tfidf_matrix = vectorizer.fit_transform(corpus)
81
-
82
- # Save the fitted TF-IDF vectorizer (updated filename)
83
- with open(output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl", "wb") as f:
84
- pickle.dump(vectorizer, f)
85
-
86
- # Save the TF-IDF matrix in sparse NPZ format (updated filename)
87
- sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", tfidf_matrix)
88
-
89
- print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
90
- print(f" TF-IDF matrix shape: {tfidf_matrix.shape}")
91
-
92
- # --- SBERT Vectorization ---
93
- def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL):
94
- """
95
- Performs SBERT embedding generation on the given corpus.
96
- Saves the generated embeddings and the name of the SBERT model used.
97
- """
98
- print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
99
- try:
100
- model = SentenceTransformer(model_name)
101
- embeddings = model.encode(corpus, show_progress_bar=True, batch_size=32)
102
-
103
- # Save the generated embeddings (updated filename)
104
- with open(output_dir / f"{prefix}_sbert_embeddings_50000.pkl", "wb") as f:
105
- pickle.dump(embeddings, f)
106
-
107
- # Save the name of the SBERT model used (updated filename)
108
- with open(output_dir / f"{prefix}_sbert_model_50000.txt", "w", encoding="utf-8") as f:
109
- f.write(model_name)
110
-
111
- print(f" [SBERT] Embeddings and model name saved to {output_dir}.")
112
- print(f" SBERT embeddings shape: {embeddings.shape}")
113
- except Exception as e:
114
- print(f"❌ [SBERT] Error during SBERT embedding generation: {e}")
115
-
116
- # --- Main Execution ---
117
- def main():
118
- """Main function to orchestrate the book vectorization process."""
119
- if not INPUT_PATH.exists():
120
- raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")
121
-
122
- records = load_records(INPUT_PATH)
123
- if not records:
124
- print("No records loaded. Exiting vectorization process.")
125
- return
126
-
127
- corpus = build_corpus(records)
128
- print(f"📚 Loaded {len(records)} book records and built corpus of {len(corpus)} documents.")
129
- print("Starting vectorization process...")
130
-
131
- vectorize_tfidf(corpus, OUT_DIR, OUTPUT_PREFIX)
132
- vectorize_sbert(corpus, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
133
- print("✨ Book vectorization complete!")
134
-
135
- if __name__ == "__main__":
136
  main()
 
1
+ """
2
+ vectorization/vectorize_books_50000.py
3
+
4
+ Vectorizes preprocessed book data using TF-IDF and SBERT.
5
+
6
+ Input:
7
+ - data/book/preprocessed/books_preprocessed_50000.json
8
+
9
+ Outputs:
10
+ - data/book/vectorized/books_tfidf_vectorizer_50000.pkl
11
+ - data/book/vectorized/books_tfidf_matrix_50000.npz
12
+ - data/book/vectorized/books_sbert_embeddings_50000.pkl
13
+ - data/book/vectorized/books_sbert_model_50000.txt
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import pickle
19
+ from pathlib import Path
20
+ from typing import List, Dict, Any
21
+
22
+ from sklearn.feature_extraction.text import TfidfVectorizer
23
+ from sentence_transformers import SentenceTransformer
24
+ from scipy import sparse
25
+
26
# --- Path Configurations ---
# Repository-relative location of the book dataset.
DATA_DIR = Path(__file__).parent.parent / "data" / "book"
# Preprocessed input produced by the preprocessing stage (50,000 records).
INPUT_PATH = DATA_DIR / "preprocessed" / "books_preprocessed_50000.json"
# All vectorization artifacts land here; created eagerly at import time.
OUT_DIR = DATA_DIR / "vectorized"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Constants ---
# SBERT model used when the caller does not supply one explicitly.
DEFAULT_SBERT_MODEL = "all-MiniLM-L6-v2"
# TF-IDF vocabulary cap; raised from 5000 to 10000 for the larger corpus.
TFIDF_MAX_FEATURES = 10000  # Adjusted from 5000 to 10000 for a larger corpus
# Common filename prefix for every artifact this script writes.
OUTPUT_PREFIX = "books"
43
+
44
# --- Data Loading Function ---
def load_records(path: Path) -> List[Dict[str, Any]]:
    """Read the preprocessed book records stored as JSON at *path*.

    Prints a diagnostic and returns an empty list when the file is missing
    or does not contain valid JSON, so callers can bail out gracefully.
    """
    try:
        with path.open("r", encoding="utf-8") as fh:
            return json.load(fh)
    except FileNotFoundError:
        print(f"Error: Input file not found at {path}")
        return []
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {path}. Check file format.")
        return []
56
+
57
# --- Corpus Building Function ---
def build_corpus(records: List[Dict[str, Any]]) -> List[str]:
    """Turn each record into one space-joined document string.

    A document is the record's title tokens followed by its overview tokens;
    missing token lists are treated as empty.
    """
    return [
        " ".join(rec.get("title_tokens", []) + rec.get("overview_tokens", []))
        for rec in records
    ]
68
+
69
# --- TF-IDF Vectorization ---
def vectorize_tfidf(corpus: List[str], output_dir: Path, prefix: str):
    """Fit a TF-IDF vectorizer on *corpus* and persist the results.

    Artifacts written to *output_dir*:
      - {prefix}_tfidf_vectorizer_50000.pkl : pickled fitted vectorizer
      - {prefix}_tfidf_matrix_50000.npz     : sparse document-term matrix
    """
    print(f"🚀 [TF-IDF] Starting TF-IDF vectorization with {TFIDF_MAX_FEATURES} features...")
    tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
    matrix = tfidf.fit_transform(corpus)

    # Persist the sparse document-term matrix (NPZ keeps it compact).
    sparse.save_npz(output_dir / f"{prefix}_tfidf_matrix_50000.npz", matrix)

    # Persist the fitted vectorizer so queries can be transformed later.
    with (output_dir / f"{prefix}_tfidf_vectorizer_50000.pkl").open("wb") as fh:
        pickle.dump(tfidf, fh)

    print(f"✅ [TF-IDF] Vectorizer and matrix saved to {output_dir}.")
    print(f" TF-IDF matrix shape: {matrix.shape}")
88
+
89
# --- SBERT Vectorization ---
def vectorize_sbert(corpus: List[str], output_dir: Path, prefix: str, model_name: str = DEFAULT_SBERT_MODEL):
    """Encode *corpus* with an SBERT model and persist the embeddings.

    Artifacts written to *output_dir*:
      - {prefix}_sbert_embeddings_50000.pkl : pickled embedding array
      - {prefix}_sbert_model_50000.txt      : name of the model used

    Any failure (model load, encoding, or file write) is reported and
    swallowed so the rest of the pipeline can continue.
    """
    print(f"🚀 [SBERT] Starting SBERT embedding generation using model: {model_name}...")
    try:
        encoder = SentenceTransformer(model_name)
        vectors = encoder.encode(corpus, show_progress_bar=True, batch_size=32)

        # Persist the embeddings themselves.
        with (output_dir / f"{prefix}_sbert_embeddings_50000.pkl").open("wb") as fh:
            pickle.dump(vectors, fh)

        # Record which model produced them, for reproducibility at query time.
        with (output_dir / f"{prefix}_sbert_model_50000.txt").open("w", encoding="utf-8") as fh:
            fh.write(model_name)

        print(f"✅ [SBERT] Embeddings and model name saved to {output_dir}.")
        print(f" SBERT embeddings shape: {vectors.shape}")
    except Exception as exc:  # deliberate best-effort: report and keep going
        print(f"❌ [SBERT] Error during SBERT embedding generation: {exc}")
112
+
113
# --- Main Execution ---
def main():
    """Entry point: load preprocessed records, build the corpus, run both vectorizers."""
    # Fail fast with a clear message when preprocessing has not been run yet.
    if not INPUT_PATH.exists():
        raise FileNotFoundError(f"Missing input file: {INPUT_PATH}. Please ensure preprocessing is complete.")

    book_records = load_records(INPUT_PATH)
    if not book_records:
        print("No records loaded. Exiting vectorization process.")
        return

    documents = build_corpus(book_records)
    print(f"📚 Loaded {len(book_records)} book records and built corpus of {len(documents)} documents.")
    print("Starting vectorization process...")

    # TF-IDF first (cheap, CPU-only), then SBERT (model load + encoding).
    vectorize_tfidf(documents, OUT_DIR, OUTPUT_PREFIX)
    vectorize_sbert(documents, OUT_DIR, OUTPUT_PREFIX, DEFAULT_SBERT_MODEL)
    print("✨ Book vectorization complete!")


if __name__ == "__main__":
    main()