# Source: uploaded by rajthakkar123 ("Upload 2 files", commit a637628, verified)
######IMPORTS######
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime
start_time = datetime.now()  # wall-clock timer for the whole pipeline

# Step 1: Load dataset
print("Loading Dataset...")
data = load_iris()  # bundled sklearn dataset: iris samples with 4 numeric features and an integer species target
print("Dataset Loaded. Extracting data from the dataset...")
# Feature matrix as a DataFrame, one column per measured feature.
X = pd.DataFrame(data.data, columns=data.feature_names)
print("Extracting Features or target values...")
# Target vector (species encoded as integers); naming the Series 'species'
# means pd.concat below labels its column correctly.
y = pd.Series(data.target, name='species')

# Step 2: Exploratory Data Analysis
print("Concating the new df")
# Column-wise join of features and target for plotting. The resulting columns
# are already data.feature_names + ['species'], so the original follow-up
# rename (df.columns = data.feature_names + ['species']) was a no-op and has
# been removed.
df = pd.concat([X, y], axis=1)
print("Plots the graph")
# Pairwise scatter plots of every feature pair, points colored by species.
sns.pairplot(df, hue='species')
print("Showing the graph...")
plt.show()  # blocks until the figure window is closed
# Step 3: hold out 30% of the samples for evaluation; the fixed random_state
# makes the split reproducible across runs.
print("Splicing...")
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

# Step 4: fit a 100-tree random forest on the training portion.
# RandomForestClassifier.fit returns the estimator itself, so the
# construct-and-fit chain binds the trained model directly.
print("Training the model...")
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Training completed.")

# Step 5: predict species for the held-out samples.
print("Testing the model...")
y_pred = model.predict(X_test)
# Step 6: quantify performance on the held-out set.
print("Finding Accuracy and classification report")
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")
print("Classification Report:")
# Per-class precision/recall/F1 summary from sklearn.
print(classification_report(y_test, y_pred))

# Step 7: visualize how much each feature contributed to the forest's splits.
print("Extracting Important Features...")
feature_importances = pd.Series(model.feature_importances_, index=data.feature_names)
print("Sorting and plotting the graph...")
# Horizontal bars; sort_values() defaults to ascending order, putting the
# least important feature at the bottom of the chart.
feature_importances.sort_values().plot(kind='barh')
plt.title('Feature Importances')
print("Plotted.")
plt.show()  # blocks until the figure window is closed
# Step 8: serialize the fitted classifier to disk so it can be reloaded
# later with joblib.load without retraining.
print("Saving the model...")
model_path = 'random_forest_model.pkl'
joblib.dump(model, model_path)
print("Model saved as random_forest_model.pkl")

# Report total wall-clock runtime of the script as a timedelta.
end_time = datetime.now()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time)