Ramicaxi committed on
Commit
d688012
·
verified ·
1 Parent(s): 4899d78

Upload 3 files

Browse files
Random_Forest_Predict_Missing_Values.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 27 20:58:48 2024

@author: ramio

Train a random forest on the book register and use it to fill in the
missing 'Tema & Localização' (theme & location) values.

Steps performed when the script runs:
    1. Load the Excel register and split it into labelled rows (training
       data) and rows whose 'Tema & Localização' is missing.
    2. Merge classes with fewer than 5 samples into a single 'Other' label.
    3. Build TF-IDF features from 'Titulo', 'Autor' and 'Editora', using one
       vectorizer per column.
    4. Train and evaluate a RandomForestClassifier (80/20 split).
    5. Predict the missing labels and write them back into ``df``.
    6. Pickle the trained model and the fitted vectorizers.
"""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
import pickle

# Column holding the label to predict, and the text columns used as features.
# The target column must NOT be part of the features: it would leak the
# answer during training/evaluation and is empty for the rows to impute.
TARGET_COLUMN = 'Tema & Localização'
FEATURE_COLUMNS = ['Titulo', 'Autor', 'Editora']


def _clean_text(column):
    """Return *column* with missing values as '' and every cell cast to str.

    The cast protects the vectorizer from non-string cells (for example a
    title that Excel stored as a number).
    """
    return column.fillna('').astype(str)


# NLTK Portuguese stopwords (only needed once)
nltk.download('stopwords')

# Load Portuguese stopwords
portuguese_stopwords = stopwords.words('portuguese')

# Load the dataset
file_path = 'Registo dos livros (Guardado automaticamente).xlsx'
df = pd.read_excel(file_path, header=1)

# Data cleaning: normalise the header names and drop the unused column.
# errors='ignore' keeps the script working if the column is absent.
df.columns = df.columns.str.strip()
df = df.drop(['Unnamed: 14'], axis=1, errors='ignore')

# Filtering data (train and missing)
missing_mask = df[TARGET_COLUMN].isna()
missing_data = df[missing_mask]        # Rows where the target is missing
# Explicit copy: the label column is modified below, and assigning into a
# slice of df would raise SettingWithCopyWarning.
train_data = df[~missing_mask].copy()  # Rows where the target is present

# Calculating class counts
class_counts = train_data[TARGET_COLUMN].value_counts()
print(class_counts)

# Identifying rare classes (less than 5 samples)
rare_classes = class_counts[class_counts < 5].index
print(rare_classes)

# Replacing rare classes with a new label
train_data.loc[train_data[TARGET_COLUMN].isin(rare_classes), TARGET_COLUMN] = 'Other'

# Features selection
x = train_data[FEATURE_COLUMNS]
y = train_data[TARGET_COLUMN]

# Converting text columns to numerical using TF-IDF.
# Each column gets its own vectorizer so that its vocabulary is learned from
# that column (a vectorizer fitted on titles only would discard author and
# publisher tokens that never occur in a title).
vectorizers = {
    name: TfidfVectorizer(stop_words=portuguese_stopwords, max_features=1000)
    for name in FEATURE_COLUMNS
}

# Vectorizing each text column separately
x_tfidf_titulo = vectorizers['Titulo'].fit_transform(_clean_text(x['Titulo']))
x_tfidf_autor = vectorizers['Autor'].fit_transform(_clean_text(x['Autor']))
x_tfidf_editora = vectorizers['Editora'].fit_transform(_clean_text(x['Editora']))

# Combining the TF-IDF features from all columns into one feature matrix
x_combined = hstack([x_tfidf_titulo, x_tfidf_autor, x_tfidf_editora])

# Data split
x_train, x_test, y_train, y_test = train_test_split(
    x_combined, y, test_size=0.2, random_state=42
)

# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Making prediction on the test set
y_pred = rf_model.predict(x_test)

# Calculating and printing accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix
print('confusion matrix:')
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    cmap='Blues',
    colorbar=True
)
plt.xticks(rotation=45, fontsize=5, ha='right')
plt.yticks(fontsize=5)
plt.title('Confusion Matrix')
plt.show()

# Classification report for more evaluation metrics
print('Classification Report:')
print(classification_report(y_test, y_pred))


# --- Predicting missing values ---

if missing_data.empty:
    # Nothing to impute; predicting on zero rows would raise an error.
    print('No missing values to predict.')
else:
    # Select features for rows with a missing target
    x_missing = missing_data[FEATURE_COLUMNS]

    # Vectorizing the missing data with the vectorizers fitted on the
    # training data (transform only, never re-fit here).
    x_missing_tfidf_titulo = vectorizers['Titulo'].transform(_clean_text(x_missing['Titulo']))
    x_missing_tfidf_autor = vectorizers['Autor'].transform(_clean_text(x_missing['Autor']))
    x_missing_tfidf_editora = vectorizers['Editora'].transform(_clean_text(x_missing['Editora']))

    # Combining the TF-IDF features for the missing data
    x_missing_combined = hstack(
        [x_missing_tfidf_titulo, x_missing_tfidf_autor, x_missing_tfidf_editora]
    )

    # Predicting missing values for the target column
    y_missing_pred = rf_model.predict(x_missing_combined)

    # Replacing the missing values in the original dataframe with the
    # predicted values
    df.loc[missing_mask, TARGET_COLUMN] = y_missing_pred

# Displaying the dataframe with the predicted values filled in
print(df.head())


# Saving the trained model
with open('book_category_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

# Saving the fitted vectorizers as well: the model can only be applied to
# new data that has been transformed with exactly these vectorizers.
with open('book_category_vectorizers.pkl', 'wb') as f:
    pickle.dump(vectorizers, f)
Registo dos livros (Guardado automaticamente).xlsx ADDED
Binary file (263 kB). View file
 
book_category_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa3d84bf4a04121dc067947696508d05c8f16ed9e1f60312f4563b5ef20ba14
3
+ size 9616027