File size: 2,820 Bytes
4d8779f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import streamlit as st 
import seaborn as sns 
import matplotlib.pyplot as plt
from PIL import Image
from datasets import load_dataset
import random

def _open_image(image_record):
    """Open a PIL image from one entry of the dataframe's ``image`` column.

    Args:
        image_record: Mapping with a ``'path'`` key and, optionally, a
            ``'bytes'`` key. Presumably this is the dict that Hugging Face
            ``datasets`` produces for an Image feature in ``to_pandas()``
            -- TODO confirm against the dataset in use.

    Returns:
        An opened ``PIL.Image.Image``; the caller is responsible for closing
        it (it can be used as a context manager).
    """
    import io  # local import so the file's top-level import block is untouched

    raw_bytes = image_record["bytes"] if "bytes" in image_record else None
    if raw_bytes:
        # Embedded bytes do not depend on the file still existing on disk.
        return Image.open(io.BytesIO(raw_bytes))
    return Image.open(image_record["path"])


def run():
    """Render the tomato leaf EDA page in Streamlit.

    Loads the ``Heizsenberg/leaf-image-dataset`` train split, then shows a
    random sample of rows, the image/class counts, a class-distribution
    count plot, the size and mode of one random image, one sample image per
    class, and a closing list of insights.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.title('Tomato Leaf Health Classification')
    st.subheader("this page contains the EDA about tomato leaf health classification")

    # write
    st.write("the EDA will explore and analyse classifier tomato leaf health")

    # fetch dataset
    # NOTE(review): this runs on every call to run(); consider caching the
    # load if page reruns turn out to be slow.
    dataset_dict = load_dataset("Heizsenberg/leaf-image-dataset")
    label_names = dataset_dict["train"].features["label"].names

    dataset_df = dataset_dict['train'].to_pandas()
    # Map the integer label ids to their human-readable class names.
    dataset_df["label_name"] = dataset_df["label"].map(dict(enumerate(label_names)))

    st.write("sample from the dataframe")
    st.write(dataset_df.sample(15))

    st.write("content of the dataframe")
    st.write("Total images:", len(dataset_df))
    st.write("Total classes:", dataset_df["label"].nunique())

    st.write("Tomato Leaf Training dataset class distribution")
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(
        data=dataset_df,
        x="label_name",
        order=dataset_df["label_name"].value_counts().index,
        ax=ax,
    )
    # Use the explicit Axes instead of pyplot's implicit "current axes".
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_title("Class Distribution")

    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    st.write("sample image size and mode")
    sample_record = random.choice(dataset_df["image"].values)
    with _open_image(sample_record) as img:
        img_size, img_mode = img.size, img.mode

    st.write("Image size:", img_size)
    st.write("Image mode:", img_mode)

    st.write("sample from each classes")
    samples = dataset_df.groupby("label_name").sample(1, random_state=42)

    # Size the grid from the number of classes (ceil division) so no class is
    # dropped; 10 classes still gives the original 4x3 grid at 12x12 inches.
    n_cols = 3
    n_rows = max(1, -(-len(samples) // n_cols))
    fig_samp, ax_samp = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows))
    axes = ax_samp.flatten()

    for ax, (_, row) in zip(axes, samples.iterrows()):
        with _open_image(row["image"]) as img:
            ax.imshow(img)
        ax.set_title(row["label_name"])
        ax.axis("off")

    # Hide the leftover cells so they are not drawn as empty framed plots.
    for unused_ax in axes[len(samples):]:
        unused_ax.axis("off")

    fig_samp.tight_layout()

    # Show inside Streamlit
    st.pyplot(fig_samp)
    plt.close(fig_samp)

    st.write("""
             ## Insight

                1. dataset contains around 16.011 in 10 classes
                2. class distribution generally spread evenly with few exceptions on `tomato_tomato_mosaic_virus` has lowest samples and `Tomato_YellowLeaf_curl_virus` having the largest samples, showing complexity in detecting the diseases and easier detection of tomato mosaic virus
                3. the dataset images is on size (256x256) which needs to be rescaled for lower GPU load
                4. several samples is shown from 10 different classes, showing both healthy and disease afflicted leaves
             """)

    
# Script entry point: render the page only when this file is executed
# directly, not when it is imported by another module.
if "__main__" == __name__:
    run()