novel_based_chatbot / src /Classifier.py
Bhushan4829
Initital Commit
08ded12
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import accuracy_score, hamming_loss, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from preprocessing import Preprocessing
# class svm_classifier:
# # def __init__(self,path_to_data):
# # self.data = path_to_data
# @staticmethod
# def run_classifier(novels_data):
# novels_data = pd.read_csv(self.data)
# ---------------------------------------------------------------------------
# Training script.
#
# Reads 'novels_data.csv', preprocesses each novel's text, trains a
# multi-label classifier that predicts a novel's title and author from its
# TF-IDF features, prints evaluation metrics, and persists the fitted
# classifier, vectorizer and label binarizer with joblib.
#
# Columns read from the input CSV: 'title', 'author', 'content_original'.
# Files written: 'novels_preprocessed_data.csv', 'rf_classifier.joblib',
# 'tfidf_vectorizer.joblib', 'mlb.joblib'.
# ---------------------------------------------------------------------------

# --- Load and preprocess ---------------------------------------------------
novels_data = pd.read_csv('novels_data.csv')

# Combined "<title> by <author>" column; it is not used for training below
# but is written to the preprocessed CSV.
novels_data['title_author'] = (
    novels_data['title'] + " by " + novels_data['author']
)

# NOTE(review): Preprocessing.preprocess_content is defined in the project's
# `preprocessing` module; presumably it returns a cleaned text string per
# document -- confirm against that module.
novels_data['contents_preprocessed'] = novels_data['content_original'].apply(
    Preprocessing.preprocess_content
)

# Remove the similarly named 'content_preprocessed' column if the input
# carries one. errors='ignore' keeps the script from failing with a KeyError
# when the CSV does not contain that column; the column is never read here.
novels_data = novels_data.drop(
    ['content_preprocessed'], axis=1, errors='ignore'
)
novels_data.to_csv('novels_preprocessed_data.csv', index=False)

# --- Feature extraction ----------------------------------------------------
# TF-IDF over the preprocessed text, capped at 5000 features, with
# scikit-learn's built-in English stop-word list removed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
print('tfidf_vectorization:',tfidf_vectorizer)

X_tfidf = tfidf_vectorizer.fit_transform(novels_data['contents_preprocessed'])
print('X_tdidf:',X_tfidf)

# --- Target encoding -------------------------------------------------------
# Each row's label set is the pair (title, author). MultiLabelBinarizer turns
# these pairs into a binary indicator matrix with one column per distinct
# title or author value.
y = novels_data[['title', 'author']]
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y.values)

# --- Model -----------------------------------------------------------------
# One-vs-rest wrapper: one random forest is fitted per label column.
rf_classifier = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.2, random_state=42
)

rf_classifier.fit(X_train, y_train)

# --- Evaluation ------------------------------------------------------------
y_pred_rf = rf_classifier.predict(X_test)

# For multi-label indicator targets, accuracy_score is subset accuracy: a row
# counts as correct only if every label matches. hamming_loss reports the
# fraction of individual label assignments that are wrong.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
hamming = hamming_loss(y_test, y_pred_rf)
report = classification_report(y_test, y_pred_rf, target_names=mlb.classes_)

print("Random Forest Accuracy:", accuracy_rf)
print("Hamming Loss: ", hamming)
print("Classification Report:\n", report)

# --- Persist artifacts -----------------------------------------------------
# Save the classifier, vectorizer, and label binarizer so they can be
# reloaded together for inference.
dump(rf_classifier, 'rf_classifier.joblib')
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
dump(mlb, 'mlb.joblib')