# kmi_dashboard / eda_functions.py
# delima1234-Sunbright
# KMI Dashboard
# 5e0490f
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
class SprayDryerEDAPipeline:
    """
    Pipeline for Exploratory Data Analysis (EDA) and preprocessing of spray dryer data.
    """
    def __init__(self, data_path=None, dataframe=None):
        """
        Initialize the pipeline from either a file path or an existing DataFrame.

        Parameters:
        -----------
        data_path : str, optional
            Path to the data file (CSV or Excel); used only when no dataframe is given.
        dataframe : pd.DataFrame, optional
            An already-loaded DataFrame; takes precedence over data_path.

        Raises:
        -------
        ValueError
            If neither data_path nor dataframe is provided.
        """
        # Copy the input so the caller's dataframe is never mutated by the pipeline.
        if dataframe is not None:
            self.df_original = dataframe.copy()
        elif data_path:
            self.df_original = self.load_data(data_path)
        else:
            raise ValueError("Harus memberikan data_path atau dataframe")
        # Working copy mutated by the pipeline steps; df_original stays pristine.
        self.df = self.df_original.copy()
        # Per-product slices, filled later by separate_data_by_product().
        self.product_dataframes = {}
        self.setup_visualization()
def setup_visualization(self):
    """Apply global plotting defaults: matplotlib 'default' style, seaborn 'husl' palette."""
    plt.style.use('default')
    sns.set_palette("husl")
def load_data(self, path):
"""Load data dari file"""
if path.endswith('.csv'):
return pd.read_csv(path)
elif path.endswith(('.xlsx', '.xls')):
return pd.read_excel(path)
else:
raise ValueError("Format file tidak didukung")
# ============= STEP 1: PEMERIKSAAN KOLOM AWAL =============
def check_and_fix_columns(self):
"""
Step 1: Pemeriksaan dan perbaikan nama kolom
"""
print("="*80)
print("STEP 1: PEMERIKSAAN KOLOM AWAL")
print("="*80)
# Daftar kolom standar
standard_columns = [
'Date_time', 'Drier_On_Product', 'D101330TT', 'D102260TIC_CV',
'D102265TIC_PV', 'D102265TIC_CV', 'D102266TIC', 'D101264FTSCL',
'Product', 'GAS_MMBTU', 'fixed_rounded_time'
]
print(f"Kolom yang ada di dataframe: {list(self.df.columns)}")
print(f"\nKolom standar yang diharapkan: {standard_columns}")
standard_lookup = {col.lower(): col for col in standard_columns}
column_mapping = {}
unmatched_column = []
for actual_col in self.df.columns:
actual_col_lower = actual_col.lower()
if actual_col_lower in standard_lookup:
standard_name = standard_lookup[actual_col_lower]
if actual_col != standard_name:
column_mapping[actual_col] = standard_name
else:
unmatched_column.append(actual_col)
# Rename kolom
self.df.rename(columns=column_mapping, inplace=True)
# Hapus kolom yang tidak ada dalam daftar standar
cols_to_keep = [col for col in self.df.columns if col in standard_columns]
cols_removed = [col for col in self.df.columns if col not in standard_columns]
if cols_removed:
print(f"\nKolom yang dihapus: {cols_removed}")
self.df = self.df[cols_to_keep]
# Cek apakah fixed_rounded_time ada
if 'fixed_rounded_time' not in self.df.columns:
print("\nKolom 'fixed_rounded_time' tidak ditemukan. Akan dibuat nanti.")
print(f"\nKolom final: {list(self.df.columns)}")
print(f"Shape dataframe: {self.df.shape}")
# ============= STEP 2: VALIDASI KOLOM PRODUCT =============
def validate_product_names(self):
"""
Step 2: Validasi dan standardisasi nama produk secara otomatis.
"""
print("\n" + "="*80)
print("STEP 2: VALIDASI DAN STANDARDISASI KOLOM PRODUCT")
print("="*80)
# Pastikan kolom 'Product' ada
if 'Product' not in self.df.columns:
print("PERINGATAN: Kolom 'Product' tidak ditemukan. Melewati langkah ini.")
print("="*80 + "\n")
return self.df
# 1. Daftar nama produk standar (sumber kebenaran)
standard_products = [
'CKP BASE', 'CMP BASE', 'BMP BASE', 'MORIGRO BASE', 'CKH BASE',
'CMH BASE', 'BMH BASE', 'CKR BASE', 'CMR BASE', 'BMR BASE',
'CGI BASE', 'NL33 BASE POWDER', 'CKS BASE', 'CHIL SCHOOL',
'CHIL MIL SOYA', 'CIP', 'CIP CHAMBER'
]
# 2. Mapping HANYA untuk kasus-kasus khusus/salah ketik yang tidak bisa ditebak
# Contoh: ada kata 'BASE' ganda, atau singkatan yang tidak standar.
special_product_mapping = {
'CMR BASE BASE': 'CMR BASE',
'CGI 6-12 BASE' : 'CGI BASE',
'CMH BASE': 'CMH BASE',
'BMH BASE': 'BMH BASE'
}
print(f"Produk unik sebelum standardisasi: {self.df['Product'].unique()}")
# 3. Buat kamus pencocokan (lookup map) utama secara otomatis
# Kunci: nama produk dalam format UPPERCASE dan tanpa spasi berlebih
# Nilai: nama produk standar yang benar
# Mulai dengan standard products
product_lookup = {prod.upper().strip(): prod for prod in standard_products}
# Timpa/tambahkan dengan special mapping. Ini memastikan kasus khusus diutamakan.
for key, value in special_product_mapping.items():
product_lookup[key.upper().strip()] = value
# 4. Gunakan metode .map() dari Pandas untuk efisiensi tinggi
# Ini jauh lebih cepat daripada .apply() untuk data besar
# Simpan kolom produk asli untuk perbandingan
original_products = self.df['Product'].copy()
# Buat series baru dengan nilai yang sudah dinormalisasi (uppercase, strip)
normalized_products = self.df['Product'].astype(str).str.upper().str.strip()
# Gunakan .map() untuk mengganti nilai. Nilai yang tidak ada di `product_lookup` akan menjadi NaN
self.df['Product'] = normalized_products.map(product_lookup)
# Isi kembali nilai yang menjadi NaN dengan nilai aslinya.
# Ini memastikan produk yang tidak dikenali tidak akan hilang/diubah.
self.df['Product'].fillna(original_products, inplace=True)
print(f"\nProduk unik setelah standardisasi: {self.df['Product'].unique()}")
print(f"\nJumlah setiap produk:\n{self.df['Product'].value_counts()}")
# 5. (Opsional tapi sangat direkomendasikan) Laporkan produk yang tidak berhasil distandardisasi
final_products_set = set(self.df['Product'].unique())
standard_products_set = set(standard_products)
unstandardized = final_products_set - standard_products_set
# Hapus None atau NaN jika ada dalam hasil
unstandardized = {item for item in unstandardized if pd.notna(item)}
if unstandardized:
print("\n" + "-"*40)
print(f"PERINGATAN: Ditemukan {len(unstandardized)} produk yang tidak sesuai standar:")
for item in unstandardized:
print(f" - '{item}'")
print("Pertimbangkan untuk menambahkannya ke `standard_products` atau `special_product_mapping`.")
print("-"*40)
print("\n" + "="*80)
print("STEP 2 SELESAI")
print("="*80 + "\n")
return self.df
# ============= STEP 3: PEMISAHAN DATA PER PRODUK =============
def separate_data_by_product(self):
"""
Step 3: Pemisahan data berdasarkan produk
"""
print("\n" + "="*80)
print("STEP 3: PEMISAHAN DATA PER PRODUK")
print("="*80)
unique_products = self.df['Product'].unique()
print(f"Memisahkan data untuk {len(unique_products)} produk...")
for product in unique_products:
self.product_dataframes[product] = self.df[self.df['Product'] == product].copy()
print(f"\n{product}: {len(self.product_dataframes[product])} baris")
# Tampilkan statistik deskriptif
print("\n" + "-"*50)
print("STATISTIK DESKRIPTIF - DATA KESELURUHAN")
print("-"*50)
print(self.df.describe())
print("\n" + "-"*50)
print("INFO DATA KESELURUHAN")
print("-"*50)
print(self.df.info())
# Statistik per produk
for product, df_product in self.product_dataframes.items():
print("\n" + "-"*50)
print(f"STATISTIK DESKRIPTIF - {product}")
print("-"*50)
print(df_product.describe())
print(f"\nINFO - {product}")
print(df_product.info())
# ============= STEP 4: IDENTIFIKASI ANOMALI DATA =============
def identify_anomalies(self):
"""
Step 4: Identifikasi anomali berdasarkan aturan teknis
"""
print("\n" + "="*80)
print("STEP 4: IDENTIFIKASI ANOMALI DATA")
print("="*80)
anomaly_rules = {
'D101330TT': {'min': 20, 'max': 130, 'zero_anomaly': True},
'D102265TIC_PV': {'min': 20, 'zero_anomaly': True},
'D102265TIC_CV': {'zero_allowed_products': ['CIP', 'CIP CHAMBER']},
'D102266TIC': {'zero_anomaly': True}
}
anomalies = []
for product, df_product in self.product_dataframes.items():
print(f"\nMemeriksa anomali untuk produk: {product}")
for column, rules in anomaly_rules.items():
if column not in df_product.columns:
continue
# Cek nilai 0
if 'zero_anomaly' in rules and rules['zero_anomaly']:
zero_count = (df_product[column] == 0).sum()
if zero_count > 0:
anomalies.append({
'Product': product,
'Column': column,
'Anomaly': 'Nilai 0',
'Count': zero_count
})
print(f" - {column}: Ditemukan {zero_count} nilai 0 (anomali)")
# Cek nilai 0 untuk D102265TIC_CV
if 'zero_allowed_products' in rules:
if product not in rules['zero_allowed_products']:
zero_count = (df_product[column] == 0).sum()
if zero_count > 0:
anomalies.append({
'Product': product,
'Column': column,
'Anomaly': 'Nilai 0 (tidak diizinkan untuk produk ini)',
'Count': zero_count
})
print(f" - {column}: Ditemukan {zero_count} nilai 0 (anomali untuk produk non-CIP)")
# Cek nilai minimum
if 'min' in rules:
below_min = (df_product[column] < rules['min']).sum()
if below_min > 0:
anomalies.append({
'Product': product,
'Column': column,
'Anomaly': f'Nilai < {rules["min"]}',
'Count': below_min
})
print(f" - {column}: Ditemukan {below_min} nilai < {rules['min']}")
# Cek nilai maksimum
if 'max' in rules:
above_max = (df_product[column] > rules['max']).sum()
if above_max > 0:
anomalies.append({
'Product': product,
'Column': column,
'Anomaly': f'Nilai > {rules["max"]}',
'Count': above_max
})
print(f" - {column}: Ditemukan {above_max} nilai > {rules['max']}")
if anomalies:
anomaly_df = pd.DataFrame(anomalies)
print("\n" + "-"*50)
print("RINGKASAN ANOMALI")
print("-"*50)
print(anomaly_df.to_string())
else:
print("\nTidak ditemukan anomali berdasarkan aturan yang ditetapkan.")
# ============= STEP 5: VALIDASI KOLOM DRIER_ON_PRODUCT =============
def validate_drier_on_product(self):
"""
Step 5: Validasi kolom Drier_On_Product
"""
print("\n" + "="*80)
print("STEP 5: VALIDASI KOLOM DRIER_ON_PRODUCT")
print("="*80)
production_products = [
'CKP BASE', 'CMP BASE', 'BMP BASE', 'MORIGRO BASE', 'CKH BASE',
'CMH BASE', 'BMH BASE', 'CKR BASE', 'CMR BASE', 'BMR BASE',
'CGI BASE', 'NL33 BASE POWDER', 'CKS BASE', 'CHIL SCHOOL',
'CHIL MIL SOYA'
]
cip_products = ['CIP', 'CIP CHAMBER']
validation_errors = []
for product in self.df['Product'].unique():
df_product = self.df[self.df['Product'] == product]
if product in production_products:
# Harus 1
wrong_values = df_product[df_product['Drier_On_Product'] != 1]
if len(wrong_values) > 0:
validation_errors.append({
'Product': product,
'Expected': 1,
'Wrong_Count': len(wrong_values)
})
print(f"ERROR: {product} memiliki {len(wrong_values)} baris dengan Drier_On_Product != 1")
elif product in cip_products:
# Harus 0
wrong_values = df_product[df_product['Drier_On_Product'] != 0]
if len(wrong_values) > 0:
validation_errors.append({
'Product': product,
'Expected': 0,
'Wrong_Count': len(wrong_values)
})
print(f"ERROR: {product} memiliki {len(wrong_values)} baris dengan Drier_On_Product != 0")
if not validation_errors:
print("✓ Semua nilai Drier_On_Product sesuai dengan ketentuan")
else:
error_df = pd.DataFrame(validation_errors)
print("\nRingkasan Error Validasi:")
print(error_df)
# ============= STEP 6: CEK MISSING VALUES DAN DUPLIKASI =============
def check_missing_and_duplicates(self):
"""
Step 6: Periksa missing values dan hapus duplikasi
"""
print("\n" + "="*80)
print("STEP 6: CEK MISSING VALUES DAN DUPLIKASI")
print("="*80)
# Cek missing values
print("Missing Values per Kolom:")
missing_counts = self.df.isnull().sum()
print(missing_counts[missing_counts > 0] if any(missing_counts > 0) else "Tidak ada missing values")
# Cek duplikasi berdasarkan Date_time
duplicates = self.df[self.df.duplicated(subset=['Date_time'], keep=False)]
print(f"\nJumlah baris duplikat berdasarkan Date_time: {len(duplicates)}")
if len(duplicates) > 0:
print("Menghapus duplikasi...")
self.df = self.df.drop_duplicates(subset=['Date_time'], keep='first')
print(f"Shape setelah menghapus duplikasi: {self.df.shape}")
# Update product dataframes
for product in self.product_dataframes.keys():
self.product_dataframes[product] = self.df[self.df['Product'] == product].copy()
# ============= STEP 7: PERHITUNGAN DURASI PRODUKSI =============
def calculate_production_duration(self):
    """
    Step 7: Compute total production duration per product from contiguous
    production segments.

    Logic:
    1. Rows are sorted by time.
    2. A production "segment" is a run of consecutive rows sharing the
       same product name.
    3. A change of product between consecutive rows starts a new segment.
    4. Duration is computed per segment (end_time - start_time).
    5. A product's total duration is the sum of its segment durations.
    """
    print("\n" + "="*80)
    print("STEP 7: PERHITUNGAN DURASI PRODUKSI (METODE SEGMENTASI)")
    print("="*80)
    # Make sure Date_time is a real datetime before sorting/grouping.
    try:
        self.df['Date_time'] = pd.to_datetime(self.df['Date_time'])
    except Exception as e:
        print(f"Error saat konversi 'Date_time': {e}")
        return
    if self.df.empty:
        print("DataFrame kosong, tidak ada durasi untuk dihitung.")
        return
    # Sorting by time is crucial for the segmentation logic below.
    df_sorted = self.df.sort_values('Date_time').copy()
    # --- Core logic: identify contiguous production segments ---
    # .shift() compares each row's product with the previous row;
    # .cumsum() turns every change point (True=1) into a new unique segment id.
    df_sorted['segment_id'] = (df_sorted['Product'] != df_sorted['Product'].shift()).cumsum()
    # Group by product and segment id to get each segment's start/end time.
    production_segments = df_sorted.groupby(['Product', 'segment_id']).agg(
        Start_Time=('Date_time', 'min'),
        End_Time=('Date_time', 'max'),
        Data_Points=('Date_time', 'count')
    ).reset_index()
    # Per-segment duration.
    production_segments['Duration'] = production_segments['End_Time'] - production_segments['Start_Time']
    # Keep only real production (exclude CIP cleaning runs).
    production_segments_filtered = production_segments[
        ~production_segments['Product'].isin(['CIP', 'CIP CHAMBER'])
    ].copy()
    if production_segments_filtered.empty:
        print("Tidak ada data produksi (non-CIP) untuk dihitung durasinya.")
        return
    # Total duration per product = sum of its segment durations.
    total_durations = production_segments_filtered.groupby('Product')['Duration'].sum().reset_index()
    # Convert to hours for the summary table.
    total_durations['Total_Duration_Hours'] = round(total_durations['Duration'].dt.total_seconds() / 3600, 2)
    # Attach total data-point counts per product.
    total_data_points = production_segments_filtered.groupby('Product')['Data_Points'].sum().reset_index()
    summary_df = pd.merge(total_durations, total_data_points, on='Product')
    print("--- RINGKASAN TOTAL DURASI PRODUKSI PER PRODUK ---")
    print(summary_df[['Product', 'Total_Duration_Hours', 'Data_Points']].to_string(index=False))
    print("\n" + "-"*80)
    print("--- DETAIL SEGMEN PRODUKSI ---")
    # Per-segment detail listing for each product.
    for product in summary_df['Product'].unique():
        print(f"\nProduk: {product}")
        product_segment_details = production_segments_filtered[production_segments_filtered['Product'] == product].copy()
        # Segment duration in minutes for readability.
        product_segment_details['Duration_Minutes'] = round(product_segment_details['Duration'].dt.total_seconds() / 60, 2)
        print(product_segment_details[[
            'Start_Time',
            'End_Time',
            'Duration_Minutes',
            'Data_Points'
        ]].to_string(index=False))
# ============= STEP 8: PEMBUATAN KOLOM FIXED_ROUNDED_TIME =============
def create_fixed_rounded_time(self):
"""
Step 8: Buat kolom fixed_rounded_time jika belum ada
"""
print("\n" + "="*80)
print("STEP 8: PEMBUATAN KOLOM FIXED_ROUNDED_TIME")
print("="*80)
if 'fixed_rounded_time' not in self.df.columns:
print("Membuat kolom fixed_rounded_time...")
self.df['Date_time'] = pd.to_datetime(self.df['Date_time'])
self.df['fixed_rounded_time'] = (self.df['Date_time'] + pd.Timedelta(hours=1)).dt.floor('H')
print("Sample hasil:")
print(self.df[['Date_time', 'fixed_rounded_time']].head(10))
# Update product dataframes
for product in self.product_dataframes.keys():
self.product_dataframes[product] = self.df[self.df['Product'] == product].copy()
else:
print("Kolom fixed_rounded_time sudah ada")
# ============= STEP 9: PERHITUNGAN JUMLAH MENIT =============
def calculate_minutes_per_hour(self):
"""
Step 9: Hitung jumlah data per jam berdasarkan fixed_rounded_time
"""
print("\n" + "="*80)
print("STEP 9: PERHITUNGAN JUMLAH DATA PER JAM")
print("="*80)
if 'fixed_rounded_time' not in self.df.columns:
print("ERROR: Kolom fixed_rounded_time tidak ditemukan!")
return
# Hitung jumlah data per jam
jumlah_data_per_jam = self.df.groupby('fixed_rounded_time').size()
jumlah_data_per_jam_df = jumlah_data_per_jam.reset_index(name='Jumlah Data Per Jam')
# Filter data dengan jumlah kurang dari 60
jumlah_data_kurang_60 = jumlah_data_per_jam_df[jumlah_data_per_jam_df['Jumlah Data Per Jam'] <= 60]
# Urutkan dari jumlah terkecil ke terbesar
jumlah_data_kurang_60 = jumlah_data_kurang_60.sort_values(by='Jumlah Data Per Jam', ascending=True)
print(f"Jam dengan data < 60 menit ({len(jumlah_data_kurang_60)} jam):")
pd.set_option("display.max_rows", None)
print(jumlah_data_kurang_60.to_string())
pd.set_option("display.max_rows", 10)
# ============= STEP 10: VISUALISASI DATA =============
def create_line_plots(self, show_all_products=True, show_overall=True):
    """
    Step 10: Line plots of every numeric parameter over time.

    Only rows with Drier_On_Product == 1 are plotted, and the
    Drier_On_Product flag itself is excluded from the parameter list.
    One matplotlib figure is shown per parameter (blocking plt.show()).

    Parameters
    ----------
    show_overall : bool
        Plot all products combined.
    show_all_products : bool
        Additionally plot each non-CIP product separately.
    """
    print("\n" + "="*80)
    print("STEP 10: VISUALISASI DATA (LINE PLOTS)")
    print("="*80)
    numeric_columns = self.df.select_dtypes(include=[np.number]).columns
    numeric_columns = [col for col in numeric_columns if col not in ['Drier_On_Product']]
    # One figure per parameter for the combined dataset.
    if show_overall:
        print("\nMembuat plot untuk keseluruhan produk...")
        df_plot = self.df[self.df['Drier_On_Product'] == 1].copy()
        if len(df_plot) > 0:
            df_plot = df_plot.sort_values('Date_time')
            for column in numeric_columns:
                if column in df_plot.columns:
                    plt.figure(figsize=(30, 5))
                    plt.plot(df_plot['Date_time'], df_plot[column], marker='o', markersize=2, label=column)
                    plt.title(f'Line Plot of {column} Over Time - All Products', fontsize=14)
                    plt.xlabel('Date_time')
                    plt.ylabel(column)
                    plt.xticks(rotation=45)
                    plt.legend()
                    plt.grid(True, alpha=0.3)
                    plt.tight_layout()
                    plt.show()
    # One figure per parameter per production product (CIP runs skipped).
    if show_all_products:
        for product, df_product in self.product_dataframes.items():
            if product not in ['CIP', 'CIP CHAMBER']:
                print(f"\nMembuat plot untuk produk: {product}")
                df_plot = df_product[df_product['Drier_On_Product'] == 1].copy()
                if len(df_plot) > 0:
                    df_plot = df_plot.sort_values('Date_time')
                    for column in numeric_columns:
                        if column in df_plot.columns:
                            plt.figure(figsize=(20, 4))
                            plt.plot(df_plot['Date_time'], df_plot[column], marker='o', markersize=3, label=column)
                            plt.title(f'{product} - {column} Over Time', fontsize=12)
                            plt.xlabel('Date_time')
                            plt.ylabel(column)
                            plt.xticks(rotation=45)
                            plt.legend()
                            plt.grid(True, alpha=0.3)
                            plt.tight_layout()
                            plt.show()
# ============= STEP 11: IDENTIFIKASI OUTLIERS =============
def identify_outliers(self, show_plots=True):
    """
    Step 11: Detect outliers per parameter using the 1.5*IQR rule.

    Runs the analysis first on the full dataset, then once per non-CIP
    product. Only rows with Drier_On_Product == 1 are analysed. When a
    parameter has outliers, the median is preferred over the mean as the
    representative ("chosen") value.

    Parameters
    ----------
    show_plots : bool
        If True, show a histogram + time-series figure for every
        parameter that has outliers.
    """
    print("\n" + "="*80)
    print("STEP 11: IDENTIFIKASI OUTLIERS")
    print("="*80)
    def analyze_outliers(dataframe, product_name="Overall"):
        """Run the IQR analysis for one dataframe; returns (stats_df, outliers_df), either may be None."""
        df_copy = dataframe.copy()
        df_copy['Date_time'] = pd.to_datetime(df_copy['Date_time'])
        # Only analyse rows where the drier was actually producing.
        drier_on_data = df_copy[df_copy['Drier_On_Product'] == 1].copy()
        if drier_on_data.empty:
            print(f"Tidak ada data dengan Drier_On_Product == 1 untuk {product_name}")
            return None, None
        print(f"\n{'='*60}")
        print(f"Analisis Outliers - {product_name}")
        print(f"Total data yang dianalisis: {len(drier_on_data)} baris")
        print(f"{'='*60}")
        all_stats_data = []
        list_of_outliers = []
        # Every numeric column except the on/off flag.
        numeric_columns = drier_on_data.select_dtypes(include=np.number).columns.drop('Drier_On_Product', errors='ignore')
        for column in numeric_columns:
            if column in drier_on_data.columns:
                param_data = drier_on_data[column].dropna()
                if len(param_data) > 0:
                    # Tukey fences: [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
                    Q1 = param_data.quantile(0.25)
                    Q3 = param_data.quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - 1.5 * IQR
                    upper_bound = Q3 + 1.5 * IQR
                    outliers = param_data[(param_data < lower_bound) | (param_data > upper_bound)]
                    has_outliers = not outliers.empty
                    mean_val = param_data.mean()
                    median_val = param_data.median()
                    std_val = param_data.std()
                    # Median is robust to outliers, so prefer it when any exist.
                    chosen_val = median_val if has_outliers else mean_val
                    all_stats_data.append({
                        'Parameter': column,
                        'Mean': mean_val,
                        'Median': median_val,
                        'Std_Dev': std_val,
                        'Batas_Bawah': lower_bound,
                        'Batas_Atas': upper_bound,
                        'Has_Outliers': has_outliers,
                        'Outliers_Count': len(outliers),
                        'Chosen_Value': chosen_val
                    })
                    if has_outliers:
                        outlier_mask = (drier_on_data[column] < lower_bound) | (drier_on_data[column] > upper_bound)
                        outlier_rows = drier_on_data[outlier_mask]
                        # Record every outlier with its timestamp for the report.
                        for index, row in outlier_rows.iterrows():
                            list_of_outliers.append({
                                'Tanggal dan Jam': row['Date_time'],
                                'Kolom Outliers': column,
                                'Nilai Outliers': row[column],
                                'Produk': row['Product'] if 'Product' in row else product_name
                            })
                    # Optional diagnostics: distribution + time series per parameter.
                    if show_plots and has_outliers:
                        fig, axes = plt.subplots(1, 2, figsize=(18, 5))
                        # Histogram with mean/median markers.
                        axes[0].hist(param_data, bins=30, edgecolor='black', alpha=0.7)
                        axes[0].axvline(mean_val, color='green', linestyle='--', label=f'Mean: {mean_val:.2f}')
                        axes[0].axvline(median_val, color='red', linestyle='--', label=f'Median: {median_val:.2f}')
                        axes[0].set_title(f'Distribution - {column}')
                        axes[0].legend()
                        # Time series with the IQR fences drawn as horizontal lines.
                        axes[1].plot(range(len(param_data)), param_data.values, 'b-', alpha=0.5)
                        axes[1].axhline(upper_bound, color='purple', linestyle='--', label=f'Upper: {upper_bound:.2f}')
                        axes[1].axhline(lower_bound, color='orange', linestyle='--', label=f'Lower: {lower_bound:.2f}')
                        if has_outliers:
                            outlier_indices = []
                            outlier_values = []
                            # Positional indices so the scatter aligns with the range() x-axis above.
                            for i, (idx, val) in enumerate(param_data.items()):
                                if val < lower_bound or val > upper_bound:
                                    outlier_indices.append(i)
                                    outlier_values.append(val)
                            axes[1].scatter(outlier_indices, outlier_values, color='red', s=50, zorder=5, label='Outliers')
                        axes[1].set_title(f'Time Series - {column}')
                        axes[1].legend()
                        plt.suptitle(f'{product_name}: {column}', fontsize=14)
                        plt.tight_layout()
                        plt.show()
        result_df = pd.DataFrame(all_stats_data) if all_stats_data else None
        outliers_df = pd.DataFrame(list_of_outliers) if list_of_outliers else None
        if outliers_df is not None and not outliers_df.empty:
            outliers_df = outliers_df.sort_values(by='Tanggal dan Jam').reset_index(drop=True)
        return result_df, outliers_df
    # --- Whole dataset first ---
    print("\n" + "="*70)
    print("ANALISIS OUTLIERS - KESELURUHAN DATA")
    print("="*70)
    overall_stats, overall_outliers = analyze_outliers(self.df, "OVERALL")
    if overall_stats is not None:
        print("\nRingkasan Statistik - Keseluruhan:")
        print(overall_stats.to_string())
    if overall_outliers is not None and not overall_outliers.empty:
        print(f"\nTotal Outliers Keseluruhan: {len(overall_outliers)}")
        print("\nSample Outliers (10 pertama):")
        print(overall_outliers.head(10).to_string())
    # --- Then each production product (CIP runs skipped) ---
    for product, df_product in self.product_dataframes.items():
        if product not in ['CIP', 'CIP CHAMBER']:
            stats, outliers = analyze_outliers(df_product, product)
            if stats is not None:
                print(f"\n{'='*50}")
                print(f"Ringkasan Statistik - {product}:")
                print(stats.to_string())
            if outliers is not None and not outliers.empty:
                print(f"\nTotal Outliers {product}: {len(outliers)}")
# ============= MAIN PIPELINE EXECUTION =============
def run_full_pipeline(self, show_visualizations=True):
    """
    Run all 11 EDA steps in order.

    Parameters
    ----------
    show_visualizations : bool
        If False, skip the line plots (step 10); step 11 still runs but
        without its figures.

    Returns
    -------
    tuple
        (processed dataframe, dict of per-product dataframes).

    Raises
    ------
    Exception
        Re-raises whatever a step failed with, after printing it.
    """
    print("\n" + "="*80)
    print(" " * 20 + "SPRAY DRYER EDA PIPELINE")
    print(" " * 25 + "STARTING ANALYSIS")
    print("="*80)
    try:
        # Step 1: normalise/drop columns
        self.check_and_fix_columns()
        # Step 2: standardise product names
        self.validate_product_names()
        # Step 3: split per product
        self.separate_data_by_product()
        # Step 4: rule-based anomaly report
        self.identify_anomalies()
        # Step 5: Drier_On_Product consistency check
        self.validate_drier_on_product()
        # Step 6: missing values + duplicate removal
        self.check_missing_and_duplicates()
        # Step 7: production duration per product
        self.calculate_production_duration()
        # Step 8: hourly bucket column
        self.create_fixed_rounded_time()
        # Step 9: rows per hour
        self.calculate_minutes_per_hour()
        # Step 10: plots (optional)
        if show_visualizations:
            self.create_line_plots(show_all_products=False, show_overall=True)
        # Step 11: IQR outlier report
        self.identify_outliers(show_plots=show_visualizations)
        print("\n" + "="*80)
        print(" " * 25 + "PIPELINE COMPLETED SUCCESSFULLY")
        print("="*80)
        return self.df, self.product_dataframes
    except Exception as e:
        print(f"\nERROR dalam pipeline: {str(e)}")
        raise
def get_summary(self):
"""
Mendapatkan ringkasan hasil analisis
"""
summary = {
'total_rows': len(self.df),
'total_columns': len(self.df.columns),
'unique_products': self.df['Product'].nunique(),
'date_range': {
'start': self.df['Date_time'].min(),
'end': self.df['Date_time'].max()
},
'missing_values': self.df.isnull().sum().to_dict(),
'product_counts': self.df['Product'].value_counts().to_dict()
}
print("\n" + "="*50)
print("📊 DATA SUMMARY")
print("="*50)
print(f"{'Total rows':20}: {summary['total_rows']:,}")
print(f"{'Total columns':20}: {summary['total_columns']}")
print(f"{'Unique products':20}: {summary['unique_products']}")
print(f"{'Date range':20}: {summary['date_range']['start']}{summary['date_range']['end']}")
print("\n🔍 Missing values per column")
print("-"*50)
for col, val in summary['missing_values'].items():
print(f"{col:25} : {val}")
print("\n📦 Product counts")
print("-"*50)
for prod, count in summary['product_counts'].items():
print(f"{prod:25} : {count:,}")
return summary
# ======================================================================
# HELPER FUNCTIONS UNTUK DASHBOARD STREAMLIT (EDA)
# ======================================================================
def compute_eda_summary(df: pd.DataFrame,
                        date_col: str = "Date_time",
                        product_col: str = "Product") -> dict:
    """General dataset summary used for metric cards on the dashboard."""
    data = df.copy()
    # Date range (NaT when the timestamp column is absent).
    if date_col in data.columns:
        # errors="coerce" turns unparsable timestamps into NaT instead of raising.
        data[date_col] = pd.to_datetime(data[date_col], errors="coerce")
        start, end = data[date_col].min(), data[date_col].max()
    else:
        start, end = pd.NaT, pd.NaT
    has_products = product_col in data.columns
    return {
        "total_rows": len(data),
        "total_columns": data.shape[1],
        "date_min": start,
        "date_max": end,
        "total_missing": int(data.isna().sum().sum()),
        "duplicate_rows": int(data.duplicated().sum()),
        "unique_products": int(data[product_col].nunique()) if has_products else 0,
        "product_counts": data[product_col].value_counts().to_dict() if has_products else {},
    }
def compute_anomaly_table(df: pd.DataFrame,
                          product_col: str = "Product") -> pd.DataFrame:
    """
    Rule-based anomaly counts (validation errors) per product/column.
    Output columns: Product, Column, Anomaly, Count.
    """
    empty = pd.DataFrame(columns=["Product", "Column", "Anomaly", "Count"])
    if product_col not in df.columns:
        return empty
    rules_by_column = {
        'D101330TT': {'min': 20, 'max': 130, 'zero_anomaly': True},
        'D102265TIC_PV': {'min': 20, 'zero_anomaly': True},
        'D102265TIC_CV': {'zero_allowed_products': ['CIP', 'CIP CHAMBER']},
        'D102266TIC': {'zero_anomaly': True}
    }
    rows = []

    def add(product, column, label, count):
        # Only record actual findings.
        if count > 0:
            rows.append({"Product": product, "Column": column,
                         "Anomaly": label, "Count": int(count)})

    for product, group in df.groupby(product_col):
        for column, rules in rules_by_column.items():
            if column not in group.columns:
                continue
            values = group[column]
            # Zero values that are always anomalous for this column.
            if rules.get("zero_anomaly", False):
                add(product, column, "Nilai 0", (values == 0).sum())
            # Zero values only allowed for specific (CIP) products.
            allowed = rules.get("zero_allowed_products")
            if allowed is not None and product not in allowed:
                add(product, column,
                    "Nilai 0 (tidak diizinkan untuk produk ini)",
                    (values == 0).sum())
            # Values outside the technical range.
            if "min" in rules:
                add(product, column, f"Nilai < {rules['min']}",
                    (values < rules["min"]).sum())
            if "max" in rules:
                add(product, column, f"Nilai > {rules['max']}",
                    (values > rules["max"]).sum())
    if not rows:
        return empty
    # Collapse duplicate findings into summed counts.
    return (pd.DataFrame(rows)
            .groupby(["Product", "Column", "Anomaly"], as_index=False)["Count"]
            .sum())
def compute_production_segments(df: pd.DataFrame,
                                product_col: str = "Product",
                                time_col: str = "Date_time") -> pd.DataFrame:
    """
    Contiguous production segments per product (CIP runs excluded).

    Output columns:
        Product | Start_Time | End_Time | Duration_Minutes | Data_Points
    """
    out_cols = ["Product", "Start_Time", "End_Time", "Duration_Minutes", "Data_Points"]
    if product_col not in df.columns or time_col not in df.columns:
        return pd.DataFrame(columns=out_cols)
    work = df[[product_col, time_col]].copy()
    work[time_col] = pd.to_datetime(work[time_col], errors="coerce")
    work = work.dropna(subset=[time_col]).sort_values(time_col)
    # A new segment starts whenever the product differs from the previous row.
    change_points = work[product_col] != work[product_col].shift()
    work["segment_id"] = change_points.cumsum()
    segments = work.groupby([product_col, "segment_id"]).agg(
        Start_Time=(time_col, "min"),
        End_Time=(time_col, "max"),
        Data_Points=(time_col, "count"),
    ).reset_index()
    # Drop CIP cleaning runs; they are not production time.
    segments = segments[~segments[product_col].isin(["CIP", "CIP CHAMBER"])]
    if segments.empty:
        return pd.DataFrame(columns=out_cols)
    elapsed = segments["End_Time"] - segments["Start_Time"]
    segments["Duration_Minutes"] = elapsed.dt.total_seconds() / 60.0
    result = segments[[product_col, "Start_Time", "End_Time", "Duration_Minutes", "Data_Points"]].copy()
    return result.rename(columns={product_col: "Product"})
def create_line_plots(df: pd.DataFrame,
                      params: list,
                      product_label: str = "All Data",
                      time_col: str = "Date_time"):
    """
    Build a 2x3 grid of parameter-vs-time plots for the Streamlit dashboard.

    Returns one matplotlib Figure. Only the first 6 parameters are
    plotted (the grid has 6 axes); passing more than 6 used to raise
    IndexError and extra entries are now ignored.
    """
    df_plot = df.copy()
    if time_col in df_plot.columns:
        df_plot[time_col] = pd.to_datetime(df_plot[time_col], errors="coerce")
        df_plot = df_plot.dropna(subset=[time_col]).sort_values(time_col)
    fig, axes = plt.subplots(2, 3, figsize=(18, 8), sharex=True)
    axes = axes.flatten()
    # Guard against more parameters than axes.
    shown = params[:len(axes)]
    for i, param in enumerate(shown):
        ax = axes[i]
        if param in df_plot.columns:
            ax.plot(df_plot[time_col], df_plot[param], marker=".", linewidth=0.7)
            ax.set_title(param)
            ax.grid(True, alpha=0.3)
        else:
            ax.set_title(f"{param} (not found)")
            ax.axis("off")
    # Switch off any unused axes.
    for j in range(len(shown), len(axes)):
        axes[j].axis("off")
    fig.suptitle(f"Distribusi Parameter Proses – {product_label}", fontsize=14)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    return fig
def identify_outliers(df: pd.DataFrame,
                      params: list,
                      product_label: str = "All Data",
                      time_col: str = "Date_time"):
    """
    IQR-based outlier detection for each parameter in `params`.

    Returns:
        - fig             : 2x3 Figure, time series with outliers highlighted
        - total_outliers  : total outlier count over all parameters
        - outlier_stats_df: per-parameter summary table (Q1/Q3/bounds/count)

    Only the first 6 parameters are processed (the grid has 6 axes);
    passing more than 6 used to raise IndexError and extra entries are
    now ignored.
    """
    df_proc = df.copy()
    if time_col in df_proc.columns:
        df_proc[time_col] = pd.to_datetime(df_proc[time_col], errors="coerce")
        df_proc = df_proc.dropna(subset=[time_col]).sort_values(time_col)
    stats_rows = []
    total_outliers = 0
    fig, axes = plt.subplots(2, 3, figsize=(18, 8), sharex=True)
    axes = axes.flatten()
    # Guard against more parameters than axes.
    shown = params[:len(axes)]
    for i, param in enumerate(shown):
        ax = axes[i]
        if param not in df_proc.columns:
            ax.set_title(f"{param} (not found)")
            ax.axis("off")
            continue
        series = df_proc[param].astype(float)
        series_no_na = series.dropna()
        if series_no_na.empty:
            ax.set_title(f"{param} (no data)")
            ax.axis("off")
            continue
        # Tukey fences: [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        Q1 = series_no_na.quantile(0.25)
        Q3 = series_no_na.quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outlier_mask = (series < lower) | (series > upper)
        outlier_idx = df_proc.index[outlier_mask]
        outlier_vals = series[outlier_mask]
        count_out = int(outlier_mask.sum())
        total_outliers += count_out
        stats_rows.append({
            "Parameter": param,
            "Q1": Q1,
            "Q3": Q3,
            "IQR": IQR,
            "Lower_Bound": lower,
            "Upper_Bound": upper,
            "Outliers_Count": count_out,
        })
        # Time series with fences drawn and outliers highlighted.
        ax.plot(df_proc[time_col], series, linewidth=0.7)
        if count_out > 0:
            ax.scatter(df_proc.loc[outlier_idx, time_col], outlier_vals, s=15)
        ax.axhline(lower, linestyle="--")
        ax.axhline(upper, linestyle="--")
        ax.set_title(f"{param} (outliers: {count_out})")
        ax.grid(True, alpha=0.3)
    # Switch off any unused axes.
    for j in range(len(shown), len(axes)):
        axes[j].axis("off")
    fig.suptitle(f"Outlier Detection – {product_label}", fontsize=14)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    outlier_stats_df = pd.DataFrame(stats_rows)
    return fig, total_outliers, outlier_stats_df
def compute_stats_table(df: pd.DataFrame,
                        params: list,
                        target_col: str = None) -> pd.DataFrame:
    """
    Descriptive-statistics table (one row per parameter) for the process
    parameters plus the gas target column, when present.
    """
    selected = [name for name in params if name in df.columns]
    # Truthiness check mirrors the original: an empty/None target_col is skipped.
    if target_col and target_col in df.columns:
        selected.append(target_col)
    if not selected:
        return pd.DataFrame()
    # .T puts parameters on the index and the statistics on the columns.
    return df[selected].describe().T
# ============= USAGE EXAMPLE =============
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute Windows path — adjust before running elsewhere.
    df = pd.read_csv(r"C:\Dokumen\One To Many_17_10_2025\MMBTU\DASHBOARD\One To Many\disagregasi_data_spraydryer_terbaru_10_17_2025.csv")
    pipeline = SprayDryerEDAPipeline(dataframe=df)
    # Runs all 11 EDA steps; returns the processed frame and the per-product frames.
    processed_df, product_dfs = pipeline.run_full_pipeline(show_visualizations=True)
    # Output lands in the current working directory.
    processed_df.to_csv(r"Processed Data Pipeline EDA_21_10_2025.csv", index=False)
    summary = pipeline.get_summary()
    print(summary)