# Source: uploaded by rajthakkar123 ("Upload 2 files", commit a637628, verified)
######IMPORTS######
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime
start_time = datetime.now()  # wall-clock timer for the whole pipeline

# Step 1: Load dataset
print("Loading Dataset...")
data = load_iris()  # bundled sklearn dataset: iris samples with 4 numeric features and an integer species target
print("Dataset Loaded. Extracting data from the dataset...")
# Feature matrix as a DataFrame, one column per measured feature.
X = pd.DataFrame(data.data, columns=data.feature_names)
print("Extracting Features or target values...")
# Target vector (species encoded as integers); naming the Series 'species'
# means pd.concat below labels its column correctly.
y = pd.Series(data.target, name='species')

# Step 2: Exploratory Data Analysis
print("Concating the new df")
# Column-wise join of features and target for plotting. The resulting columns
# are already data.feature_names + ['species'], so the original follow-up
# rename (df.columns = data.feature_names + ['species']) was a no-op and has
# been removed.
df = pd.concat([X, y], axis=1)
print("Plots the graph")
# Pairwise scatter plots of every feature pair, points colored by species.
sns.pairplot(df, hue='species')
print("Showing the graph...")
plt.show()  # blocks until the figure window is closed
# Step 3: hold out 30% of the samples for evaluation; the fixed random_state
# makes the split reproducible across runs.
print("Splicing...")
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

# Step 4: fit a 100-tree random forest on the training portion.
# RandomForestClassifier.fit returns the estimator itself, so the
# construct-and-fit chain binds the trained model directly.
print("Training the model...")
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Training completed.")

# Step 5: predict species for the held-out samples.
print("Testing the model...")
y_pred = model.predict(X_test)
# Step 6: quantify performance on the held-out set.
print("Finding Accuracy and classification report")
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")
print("Classification Report:")
# Per-class precision/recall/F1 summary from sklearn.
print(classification_report(y_test, y_pred))

# Step 7: visualize how much each feature contributed to the forest's splits.
print("Extracting Important Features...")
feature_importances = pd.Series(model.feature_importances_, index=data.feature_names)
print("Sorting and plotting the graph...")
# Horizontal bars; sort_values() defaults to ascending order, putting the
# least important feature at the bottom of the chart.
feature_importances.sort_values().plot(kind='barh')
plt.title('Feature Importances')
print("Plotted.")
plt.show()  # blocks until the figure window is closed
# Step 8: serialize the fitted classifier to disk so it can be reloaded
# later with joblib.load without retraining.
print("Saving the model...")
model_path = 'random_forest_model.pkl'
joblib.dump(model, model_path)
print("Model saved as random_forest_model.pkl")

# Report total wall-clock runtime of the script as a timedelta.
end_time = datetime.now()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time)