######IMPORTS######
# Standard library
from datetime import datetime

# Third-party
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Wall-clock start, used at the end of the script to report total runtime.
start_time = datetime.now()

# Step 1: Load dataset
print("Loading Dataset...")
data = load_iris()  # Bunch object: .data (features), .target (labels), .feature_names
print("Dataset Loaded. Extracting data from the dataset...")
# Features only (no target column) as a DataFrame with the proper column names.
X = pd.DataFrame(data.data, columns=data.feature_names)
print("Extracting Features or target values...")
# Integer class labels (0/1/2) as a Series named 'species'.
y = pd.Series(data.target, name='species')

# Step 2: Exploratory Data Analysis
print("Concating the new df")
# Join features and label side by side for plotting.
# NOTE: no column rename is needed afterwards — concat already yields
# data.feature_names + ['species'], because X and y carry those names;
# the original re-assignment of df.columns was a no-op and has been removed.
df = pd.concat([X, y], axis=1)
print("Plots the graph")
# Pairwise scatter matrix of every feature pair, with each point colored
# by the species it belongs to (shown by plt.show() in the next step).
sns.pairplot(df, hue='species')
print("Showing the graph...")
plt.show()  # blocks until the pairplot window is closed

# Step 3: Split dataset into training and testing sets
print("Splicing...")
# 70/30 train/test split. stratify=y keeps the three species in the same
# proportions in both splits — without it a random 30% cut of this small
# dataset can leave the classes imbalanced between train and test.
# random_state pins the split for reproducible metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 4: Train model
print("Training the model...")
# 100-tree random forest; random_state makes the fit reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Training completed.")

# Step 5: Predictions
print("Testing the model...")
y_pred = model.predict(X_test)  # predicted species for the held-out 30%

# Step 6: Evaluate model
print("Finding Accuracy and classification report")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
# Per-class precision/recall/F1 plus support counts.
print(classification_report(y_test, y_pred))

# Step 7: Feature Importance visualization
print("Extracting Important Features...")
# Impurity-based importances, one value per feature, labeled by feature name.
feature_importances = pd.Series(model.feature_importances_, index=data.feature_names)
print("Sorting and plotting the graph...")
# Horizontal bar chart, least important at the bottom, most important on top.
feature_importances.sort_values(ascending=True).plot(kind='barh')
plt.title('Feature Importances')
print("Plotted.")
plt.show()

# Step 8: Save the trained model
print("Saving the model...")
joblib.dump(model, 'random_forest_model.pkl')  # persist the fitted estimator to disk
print("Model saved as random_forest_model.pkl")

# Report total wall-clock runtime of the script.
end_time = datetime.now()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time)