Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| # from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.model_selection import train_test_split | |
| # from sklearn.preprocessing import LabelEncoder | |
| from sklearn.metrics import accuracy_score | |
| import numpy as np | |
| from sklearn.metrics import accuracy_score, hamming_loss, classification_report | |
| from sklearn.preprocessing import MultiLabelBinarizer | |
| from sklearn.svm import SVC | |
| from sklearn.multiclass import OneVsRestClassifier | |
| from joblib import dump | |
| from sklearn.ensemble import RandomForestClassifier | |
| from preprocessing import Preprocessing | |
| # class svm_classifier: | |
| # # def __init__(self,path_to_data): | |
| # # self.data = path_to_data | |
| # @staticmethod | |
| # def run_classifier(novels_data): | |
| # novels_data = pd.read_csv(self.data) | |
# Train a multi-label classifier that predicts a novel's title and author
# from its text, then persist the model and its fitted transformers.
#
# Inputs : novels_data.csv (reads the 'title', 'author' and
#          'content_original' columns).
# Outputs: novels_preprocessed_data.csv, rf_classifier.joblib,
#          tfidf_vectorizer.joblib, mlb.joblib.

novels_data = pd.read_csv('novels_data.csv')

# Combined "<title> by <author>" column; it is written to the output CSV
# but is not used as a training target below.
novels_data['title_author'] = novels_data['title'] + " by " + novels_data['author']

# NOTE(review): Preprocessing.preprocess_content is defined in another
# module; it is assumed to map one raw text to one cleaned string.
novels_data['contents_preprocessed'] = novels_data['content_original'].apply(
    Preprocessing.preprocess_content
)

# Remove the differently-spelled 'content_preprocessed' column if the input
# CSV carries one (presumably left over from an earlier export -- confirm).
# errors='ignore' prevents a KeyError when that column is absent, which the
# unconditional drop would raise.
novels_data = novels_data.drop(columns=['content_preprocessed'], errors='ignore')
novels_data.to_csv('novels_preprocessed_data.csv', index=False)

# TF-IDF features over the preprocessed text, capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
print('tfidf_vectorization:', tfidf_vectorizer)

X_tfidf = tfidf_vectorizer.fit_transform(novels_data['contents_preprocessed'])
print('X_tdidf:', X_tfidf)

# Each row's [title, author] pair is treated as that row's label set, so
# titles and authors share a single binarized label space.
y = novels_data[['title', 'author']]
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y.values)

# One binary Random Forest per label.
rf_classifier = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.2, random_state=42
)

rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out rows. On a label-indicator matrix,
# accuracy_score is subset accuracy: every label of a row must match.
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
hamming = hamming_loss(y_test, y_pred_rf)
report = classification_report(y_test, y_pred_rf, target_names=mlb.classes_)

print("Random Forest Accuracy:", accuracy_rf)
print("Hamming Loss: ", hamming)
print("Classification Report:\n", report)

# Persist the classifier together with the fitted vectorizer and label
# binarizer so inference can reproduce the same feature and label encoding.
dump(rf_classifier, 'rf_classifier.joblib')
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
dump(mlb, 'mlb.joblib')