Spaces:
Sleeping
Sleeping
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from utils import save_to_pickle | |
| from data_loader import load_and_clean_data | |
| import os | |
# Input locations: both are handed to load_and_clean_data() by main().
data_path = "data/books_summary.csv"
cleaned_data_path = "data/cleaned_books_summary.csv"

# Output locations: train_tfidf_model() pickles the fitted vectorizer and
# the resulting TF-IDF matrix to these files.
vectorizer_path = "model/tfidf_vectorizer.pkl"
tfidf_matrix_path = "model/tfidf_matrix.pkl"
def train_tfidf_model(data, *, max_features=10000, stop_words="english"):
    """
    Train a TF-IDF vectorizer on the combined text data and save the model.

    Args:
        data: Object indexed by the key ``"combined_text"`` to obtain the
            documents to fit on (presumably the pandas DataFrame returned by
            ``load_and_clean_data`` -- confirm against the caller).
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
            Defaults to 10000, the value that used to be hard-coded here.
        stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``.
            Defaults to ``"english"``, the value that used to be hard-coded.

    Side effects:
        Saves the fitted vectorizer to ``vectorizer_path`` and the TF-IDF
        matrix to ``tfidf_matrix_path`` through ``save_to_pickle``, and
        prints progress messages to stdout.
    """
    # Initialize TF-IDF vectorizer with the caller-supplied settings
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words)

    # Fit and transform the combined text
    print("Training TF-IDF vectorizer...")
    tfidf_matrix = vectorizer.fit_transform(data["combined_text"])
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

    # Save the TF-IDF vectorizer and matrix
    save_to_pickle(vectorizer, vectorizer_path)
    save_to_pickle(tfidf_matrix, tfidf_matrix_path)
    print(
        f"TF-IDF vectorizer and matrix saved to: {vectorizer_path} and {tfidf_matrix_path}"
    )
def main():
    """
    Create the output directories, load the dataset, and train the model.

    Passes ``data_path`` and ``cleaned_data_path`` to ``load_and_clean_data``
    and hands the result to ``train_tfidf_model``.
    """
    # Derive the output directories from the artifact paths rather than
    # hard-coding "model", so the right directory is still created if the
    # path constants are ever changed.
    for artifact_path in (vectorizer_path, tfidf_matrix_path):
        artifact_dir = os.path.dirname(artifact_path)
        # A bare filename has no directory part, and os.makedirs("") raises.
        if artifact_dir:
            os.makedirs(artifact_dir, exist_ok=True)

    # Load, clean, and prepare data
    books_df = load_and_clean_data(data_path, cleaned_data_path)
    train_tfidf_model(books_df)
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()