Spaces:

darly9991
/

Water_Quality

Sleeping

App Files Files Community

Water_Quality / eda.py

darly9991

Update eda.py

6ccde22 verified almost 2 years ago

raw

history blame contribute delete

10.1 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	from PIL import Image
	import streamlit as st
	import pandas as pd
	import random
	import matplotlib.pyplot as plt
	from bokeh.plotting import figure
	from bokeh.models import ColumnDataSource, HoverTool
	from bokeh.models.formatters import NumeralTickFormatter
	from bokeh.plotting import figure
	from bokeh.models import HoverTool, NumeralTickFormatter
	from bokeh.layouts import gridplot
	from bokeh.palettes import Category20
	import seaborn as sns

	# EDA helper: scatter plot with a fitted regression line, rendered with Bokeh.
	def scatter_plot_regression(
	    df,
	    x_col='Year',
	    y_col='Salary',
	    x_label='Tahun',
	    y_label='Gaji Rata-rata',
	    title='Diagram Sebaran dengan Garis Regresi',
	):
	    """Draw a scatter plot of two columns with a least-squares line.

	    The chart is sent to the Streamlit page with ``st.bokeh_chart``; nothing
	    is returned.

	    Args:
	        df: DataFrame holding both columns.
	        x_col: Name of the column plotted on the x axis.
	        y_col: Name of the column plotted on the y axis.
	        x_label: Axis label and tooltip caption for ``x_col``.
	        y_label: Axis label, legend entry and tooltip caption for ``y_col``.
	        title: Plot title.
	    """
	    # Keep only rows where both values are present: NaNs cannot be drawn
	    # and would make np.polyfit produce a NaN line (or fail outright).
	    complete = df[x_col].notna() & df[y_col].notna()
	    x_data = df.loc[complete, x_col].values
	    y_data = df.loc[complete, y_col].values

	    # A degree-1 fit needs at least two observations.
	    if len(x_data) < 2:
	        st.warning(f"Not enough data to plot '{x_col}' against '{y_col}'.")
	        return

	    # Fit a straight line and evaluate it at every observed x.
	    model = np.polyfit(x_data, y_data, 1)
	    y_pred = np.polyval(model, x_data)

	    source = ColumnDataSource(
	        data={x_col: x_data, y_col: y_data, 'regression_line': y_pred}
	    )

	    p = figure(
	        title=title,
	        x_axis_label=x_label,
	        y_axis_label=y_label,
	        width=800,
	        height=400,
	        tools="pan,box_zoom,wheel_zoom,reset,save",
	    )

	    # scatter() draws circle markers by default and matches the call style
	    # used by the box plot helper in this file.
	    points = p.scatter(
	        x_col, y_col, source=source,
	        size=8, color="navy", alpha=0.5, legend_label=y_label,
	    )

	    p.line(
	        x_col, 'regression_line', source=source,
	        line_width=2, line_color="red", legend_label="Garis Regresi",
	    )

	    # Hover only over the data points. Braces around the field name keep
	    # the tooltip working for column names that contain spaces.
	    hover = HoverTool(renderers=[points])
	    hover.tooltips = [(x_label, f"@{{{x_col}}}"), (y_label, f"@{{{y_col}}}")]
	    p.add_tools(hover)

	    p.xaxis.axis_label_text_font_style = "bold"
	    p.yaxis.axis_label_text_font_style = "bold"

	    # Plain integers on the y axis instead of scientific notation.
	    p.yaxis.formatter = NumeralTickFormatter(format="0")

	    p.legend.location = "top_left"
	    p.legend.click_policy = "hide"

	    st.bokeh_chart(p)

	# EDA helper: histogram and box plot (IQR) for a single column.
	def histogram_boxplot(df, nama_kolom, judul="Contoh Bar Plot"):
	    """Render a 20-bin histogram next to a box plot for one column.

	    Both figures are laid out side by side and sent to the Streamlit page
	    with ``st.bokeh_chart``; nothing is returned.

	    Args:
	        df: DataFrame holding the column.
	        nama_kolom: Name of the column to plot.
	        judul: Title prefix for both figures and x-axis label of the
	            histogram.
	    """
	    # Missing values make np.histogram raise, so drop them up front.
	    data_kolom = df[nama_kolom].dropna()
	    if data_kolom.empty:
	        st.warning(f"Column '{nama_kolom}' has no data to plot.")
	        return

	    hist, edges = np.histogram(data_kolom, bins=20)

	    # One randomly chosen palette colour per bar.
	    colors = random.choices(Category20[20], k=len(hist))

	    p1 = figure(
	        title=f"{judul} (Histogram)",
	        tools="save,hover",
	        background_fill_color="#fafafa",
	        width=600,
	        height=400,
	        tooltips=[("Jumlah", "@top"), ("Interval", "@left{0.00} hingga @right{0.00}")],
	        x_axis_label=judul,
	        y_axis_label="Frequency",
	    )
	    p1.quad(
	        top=hist, bottom=0, left=edges[:-1], right=edges[1:],
	        fill_color=colors, line_color="white", alpha=0.7,
	    )

	    # Quartiles and whiskers: the whiskers are the most extreme
	    # observations that still lie within 1.5 * IQR of the box.
	    q1 = data_kolom.quantile(0.25)
	    q2 = data_kolom.quantile(0.50)
	    q3 = data_kolom.quantile(0.75)
	    iqr = q3 - q1

	    lower_whisker = data_kolom[data_kolom >= (q1 - 1.5 * iqr)].min()
	    upper_whisker = data_kolom[data_kolom <= (q3 + 1.5 * iqr)].max()

	    # Element-wise OR of the two boolean masks.
	    outliers = data_kolom[(data_kolom > upper_whisker) | (data_kolom < lower_whisker)]

	    p2 = figure(
	        title=f"{judul} (Boxplot)",
	        tools="save,hover",
	        background_fill_color="#fafafa",
	        width=400,
	        height=400,
	        tooltips=[
	            ("Nilai", "@y"),
	            ("Q1", f"{q1:.2f}"),
	            ("Q2 (Median)", f"{q2:.2f}"),
	            ("Q3", f"{q3:.2f}"),
	            ("Lower Whisker", f"{lower_whisker:.2f}"),
	            ("Upper Whisker", f"{upper_whisker:.2f}"),
	        ],
	    )

	    # Box plot elements: stems, the box itself and the median line.
	    p2.segment(1, lower_whisker, 1, q1, line_color="black")
	    p2.segment(1, q3, 1, upper_whisker, line_color="black")
	    p2.vbar(1, 0.7, q1, q3, fill_color="navy", line_color="black")
	    p2.vbar(1, 0.7, q2, q2, line_color="black")

	    # Whisker caps.
	    p2.rect(1, lower_whisker, 0.2, 0.01, line_color="black")
	    p2.rect(1, upper_whisker, 0.2, 0.01, line_color="black")

	    # Outliers.
	    p2.scatter([1] * len(outliers), outliers, size=6, color="red", fill_alpha=0.6)

	    # The box plot carries no axis labels or ticks.
	    p2.xaxis.axis_label = ""
	    p2.yaxis.axis_label = ""
	    p2.xaxis.visible = False
	    p2.yaxis.visible = False

	    # Axis label and tick label styling for the histogram.
	    p1.xaxis.axis_label_text_font_style = "bold"
	    p1.xaxis.axis_label_text_font_size = "10pt"
	    p1.xaxis.major_label_text_font_style = "bold"
	    p1.xaxis.major_label_text_font_size = "8pt"
	    p1.yaxis.axis_label_text_font_style = "bold"
	    p1.yaxis.axis_label_text_font_size = "10pt"
	    p1.yaxis.major_label_text_font_style = "bold"
	    p1.yaxis.major_label_text_font_size = "8pt"

	    # No grid lines on either figure.
	    p1.grid.grid_line_color = None
	    p2.grid.grid_line_color = None

	    # Plain numbers on the histogram axes instead of scientific notation.
	    p1.yaxis.formatter.use_scientific = False
	    p1.xaxis.formatter.use_scientific = False

	    grid = gridplot([[p1, p2]])
	    st.bokeh_chart(grid)


	def _evaluate_distribution(col):
	    """Return mean, median, mode and a rough shape label for one column.

	    The column is labelled 'Normal Distribution' when median and mode both
	    lie within 5% of the mean (relative to the mean's magnitude); otherwise
	    the label is derived from the sign of ``mean - median``.
	    """
	    mean = col.mean()
	    median = col.median()
	    modes = col.mode()
	    if modes.empty:
	        # Nothing but missing values: no mode exists and no shape can be judged.
	        return pd.Series({'Mean': mean, 'Median': median, 'Mode': np.nan, 'Evaluasi': 'No Data'})
	    mode = modes.iloc[0]  # first mode when there are several

	    # Compare against 5% of |mean| rather than dividing by the signed mean:
	    # a negative mean used to make the test always pass, and a zero mean
	    # caused a division by zero.
	    tolerance = 0.05 * abs(mean)
	    if abs(mean - median) <= tolerance and abs(mean - mode) <= tolerance:
	        evaluasi = 'Normal Distribution'
	    elif mean > median:
	        evaluasi = 'Positive Skewness'
	    else:
	        evaluasi = 'Negative Skewness'
	    return pd.Series({'Mean': mean, 'Median': median, 'Mode': mode, 'Evaluasi': evaluasi})


	def _show_correlation(df):
	    """Show the pairwise correlation table and heatmap on the page."""
	    # Restrict to numeric columns so text columns cannot make corr() raise.
	    cor = df.select_dtypes(include=[np.number]).corr()

	    # Long format: one row per (variable, variable) pair.
	    cor_df = (
	        cor.stack()
	        .rename_axis(['Variable 1', 'Variable 2'])
	        .reset_index(name='Correlation')
	    )

	    # Judge by the magnitude of r. Comparing the signed coefficient with
	    # `< 0.05` labelled the diagonal (r = 1) and every strong positive
	    # correlation as "no correlation".
	    # NOTE(review): 0.05 is kept from the original code and looks like a
	    # p-value level; confirm whether a significance test was intended.
	    has_correlation = cor_df['Correlation'].abs() >= 0.05
	    cor_df['Interpretation'] = np.where(has_correlation, 'Ada korelasi', 'Tidak ada korelasi')

	    st.dataframe(cor_df)

	    st.write("Heatmap Korelasi:")
	    fig, ax = plt.subplots(figsize=(10, 8))
	    sns.heatmap(cor, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
	    st.pyplot(fig)
	    # Release the figure so reruns of the script do not accumulate them.
	    plt.close(fig)


	def _plot_pie_chart(df, column):
	    """Draw a pie chart of the value distribution of one column."""
	    value_counts = df[column].value_counts()

	    fig, ax = plt.subplots(figsize=(8, 8))
	    ax.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors)
	    ax.set_title(f'Pie Chart for {column}')

	    st.pyplot(fig)
	    plt.close(fig)


	def run():
	    """Build the water-quality EDA page.

	    Reads 'water_quality.csv' and the images 'water.jpg', 'output.png' and
	    'water1.jpeg' by relative path, then renders, in order: title and
	    banner image, the raw dataframe, descriptive statistics, four scatter
	    plots with regression lines, an on-demand correlation table and
	    heatmap, two histogram/box plot pairs and the feature-selection image.
	    """
	    st.title('Water Quality')
	    st.header('Water Quality Data Visualization', divider='gray')

	    # Banner image; the context manager closes the file handle afterwards.
	    with Image.open('water.jpg') as image:
	        st.image(image, caption = 'Water Pollution (wallpapers.com)', channels='RGB')
	    st.divider()

	    # Raw data.
	    st.header('Dataframe', divider='gray')
	    df = pd.read_csv('water_quality.csv')
	    st.dataframe(df)
	    st.divider()

	    # Descriptive statistics for every numeric column.
	    numerical_cols = df.select_dtypes(include=[np.number])
	    result = numerical_cols.apply(_evaluate_distribution)
	    st.header('Descriptive Statistics', divider='gray')
	    st.write(result)
	    st.divider()

	    # Scatter plots: (heading and plot title, x column, y column, x label, y label).
	    scatter_sections = [
	        ('Correlation Bacteria and Viruses', 'bacteria', 'viruses', 'Bacteria', 'Viruses'),
	        ('Correlation Arsenic and Selenium', 'arsenic', 'selenium', 'Arsenic', 'Selenium'),
	        ('Correlation Lead and Copper', 'lead', 'copper', 'Lead', 'copper'),
	        ('Correlation Chloramine and Bacteria', 'chloramine', 'bacteria', 'Chloramine', 'Bacteria'),
	    ]
	    for heading, x_col, y_col, x_label, y_label in scatter_sections:
	        st.header(heading, divider='gray')
	        scatter_plot_regression(df, x_col=x_col, y_col=y_col, x_label=x_label, y_label=y_label, title=heading)
	        st.divider()

	    # Correlation table and heatmap are computed only on request.
	    if st.button('Hitung Korelasi'):
	        _show_correlation(df)

	    st.divider()

	    # Histogram and box plot pairs: (heading, column, plot title).
	    histogram_sections = [
	        ('Aluminium Bar Plot', 'aluminium', "Aluminium Plot"),
	        ('Arsenic Bar Plot', 'arsenic', "Arsenic Plot"),
	    ]
	    for heading, column, plot_title in histogram_sections:
	        st.header(heading, divider='gray')
	        histogram_boxplot(df, column, judul=plot_title)
	        st.divider()

	    # Feature-selection image.
	    with Image.open('output.png') as image2:
	        st.header('Feature Selection', divider='gray')
	        st.image(image2, caption = 'Feature Selection', channels='RGB')

	    # Extra image shown only when the button is pressed.
	    if st.button('Show Image'):
	        image_path = 'water1.jpeg'
	        st.image(image_path, caption='Nickel Processing Factory (Smelter) in the Obi Island Industrial Area, North Maluku Province.', channels='RGB')

	# Build the page when this file is executed as a script rather than imported.
	if __name__ == "__main__":
	    run()