"""Streamlit page showing exploratory data analysis for tomato leaf images."""

import io
import math
import random

import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from PIL import Image

# Hugging Face Hub identifier of the dataset explored on this page.
_DATASET_ID = "Heizsenberg/leaf-image-dataset"

# Number of columns in the "one sample per class" image grid.
_GRID_COLS = 3

# Upper bound on the number of rows shown in the dataframe preview.
_PREVIEW_ROWS = 15

# Kept at column 0 so the markdown heading and list are never treated as an
# indented code block by the renderer.
_INSIGHT_MARKDOWN = """
## Insight
1. dataset contains around 16.011 in 10 classes
2. class distribution generally spread evenly with few exceptions on `tomato_tomato_mosaic_virus` has lowest samples and `Tomato_YellowLeaf_curl_virus` having the largest samples, showing complexity in detecting the diseases and easier detection of tomato mosaic virus
3. the dataset images is on size (256x256) which needs to be rescaled for lower GPU load
4. several samples is shown from 10 different classes, showing both healthy and disease afflicted leaves
"""


def _open_image(image_ref: dict) -> Image.Image:
    """Open one cell of the dataframe's ``image`` column as a PIL image.

    The cell is expected to be a mapping with a ``path`` entry and,
    optionally, a ``bytes`` entry. Embedded bytes are used when present;
    otherwise the image is opened from ``path``.

    Raises:
        ValueError: If the cell has neither embedded bytes nor a path.
    """
    payload = image_ref.get("bytes")
    if payload:
        return Image.open(io.BytesIO(payload))

    path = image_ref.get("path")
    if not path:
        raise ValueError("image cell has neither 'bytes' nor 'path'")
    return Image.open(path)


def _show_class_distribution(dataset_df) -> None:
    """Render a count plot of ``label_name``, most frequent class first."""
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(
        data=dataset_df,
        x="label_name",
        order=dataset_df["label_name"].value_counts().index,
        ax=ax,
    )
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_title("Class Distribution")
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; Streamlit reruns
    # this script on each interaction, so close explicitly to avoid a leak.
    plt.close(fig)


def _show_class_samples(dataset_df) -> None:
    """Render one image per class in a grid sized to the number of classes."""
    samples = dataset_df.groupby("label_name").sample(1, random_state=42)

    n_rows = max(1, math.ceil(len(samples) / _GRID_COLS))
    fig, axes_grid = plt.subplots(
        n_rows,
        _GRID_COLS,
        figsize=(12, 3 * n_rows),
        squeeze=False,  # always a 2-D array, even for a single row
    )
    cells = axes_grid.flatten()

    # Hide every cell up front so unused trailing cells do not show an
    # empty framed plot.
    for cell in cells:
        cell.axis("off")

    for cell, (_, row) in zip(cells, samples.iterrows()):
        with _open_image(row["image"]) as img:
            cell.imshow(img)
        cell.set_title(row["label_name"])

    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


def run() -> None:
    """Render the tomato leaf EDA page.

    Loads the train split of the dataset, shows a random preview of the
    dataframe, the image and class counts, the class distribution plot,
    the size and mode of one random image, one sample image per class,
    and a closing list of insights.
    """
    st.title('Tomato Leaf Health Classification')
    st.subheader("this page contains the EDA about tomato leaf health classification")
    st.write("the EDA will explore and analyse classifier tomato leaf health")

    dataset_dict = load_dataset(_DATASET_ID)
    train_split = dataset_dict["train"]
    label_names = train_split.features["label"].names
    dataset_df = train_split.to_pandas()
    dataset_df["label_name"] = dataset_df["label"].map(dict(enumerate(label_names)))

    st.write("sample from the dataframe")
    # Cap the preview so a small split does not make ``sample`` raise.
    st.write(dataset_df.sample(min(_PREVIEW_ROWS, len(dataset_df))))

    st.write("content of the dataframe")
    st.write("Total images:", len(dataset_df))
    st.write("Total classes:", dataset_df["label"].nunique())

    st.write("Tomato Leaf Training dataset class distribution")
    _show_class_distribution(dataset_df)

    st.write("sample image size and mode")
    sample_ref = random.choice(dataset_df["image"].values)
    with _open_image(sample_ref) as img:
        st.write("Image size:", img.size)
        st.write("Image mode:", img.mode)

    st.write("sample from each classes")
    _show_class_samples(dataset_df)

    st.write(_INSIGHT_MARKDOWN)


if __name__ == '__main__':
    run()