kmi_Dashboard1 / src /Disagregasi_mmbtu.py
Delima1's picture
Upload dashbord terbaru
6c260ef verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
def pipeline_disagregasi(Data_Terbaru, min_minutes_threshold=50):
"""
Pipeline untuk disagregasi data dengan validasi jam produksi
Parameters:
-----------
Data_Terbaru : DataFrame
DataFrame input dengan kolom yang diperlukan
min_minutes_threshold : int
Minimum jumlah menit per jam untuk proses disagregasi (default: 50)
Returns:
--------
df_disaggregated_final : DataFrame
DataFrame hasil disagregasi
validation_report : dict
Laporan validasi dan statistik
"""
print("="*60)
print("PIPELINE DISAGREGASI DATA DENGAN VALIDASI JAM PRODUKSI")
print("="*60)
# Simpan daftar kolom asli untuk memastikan semuanya dipertahankan
original_cols = list(Data_Terbaru.columns)
# ========================================
# TAHAP 1: VALIDASI JAM PRODUKSI
# ========================================
print("\n--- TAHAP 1: Validasi Jam Produksi ---")
# Hitung jumlah data (menit) per jam
jumlah_data_per_jam = Data_Terbaru.groupby('fixed_rounded_time').size()
jumlah_data_per_jam_df = jumlah_data_per_jam.reset_index(name='Jumlah_Menit')
# Identifikasi jam yang valid (>= 50 menit)
jam_valid = jumlah_data_per_jam_df[jumlah_data_per_jam_df['Jumlah_Menit'] >= min_minutes_threshold]['fixed_rounded_time'].tolist()
jam_tidak_valid = jumlah_data_per_jam_df[jumlah_data_per_jam_df['Jumlah_Menit'] < min_minutes_threshold]
print(f"Total jam dalam dataset: {len(jumlah_data_per_jam_df)}")
print(f"Jam valid (>= {min_minutes_threshold} menit): {len(jam_valid)}")
print(f"Jam tidak valid (< {min_minutes_threshold} menit): {len(jam_tidak_valid)}")
# Tampilkan detail jam yang tidak valid
if len(jam_tidak_valid) > 0:
print("\nDetail jam yang TIDAK akan diproses:")
print(jam_tidak_valid.sort_values('Jumlah_Menit')[['fixed_rounded_time', 'Jumlah_Menit']].to_string(index=False))
else:
print("\nSemua jam valid untuk diproses!")
# Filter data hanya untuk jam yang valid
df_work = Data_Terbaru[Data_Terbaru['fixed_rounded_time'].isin(jam_valid)].copy()
if df_work.empty:
print("\n⚠️ PERINGATAN: Tidak ada data yang memenuhi kriteria validasi!")
return pd.DataFrame(), {"status": "failed", "reason": "no_valid_hours"}
print(f"\nData yang akan diproses: {len(df_work)} baris dari {len(Data_Terbaru)} baris total")
print(f"Persentase data yang diproses: {len(df_work)/len(Data_Terbaru)*100:.2f}%")
# ========================================
# TAHAP 2: PERHITUNGAN BOBOT INDIKATOR
# ========================================
print("\n--- TAHAP 2: Perhitungan Bobot Indikator ---")
indicator_cols = [
"D101330TT", "D102260TIC_CV", "D102265TIC_PV",
"D102265TIC_CV", "D102266TIC", "D101264FTSCL"
]
# Handle missing values
df_work[indicator_cols] = df_work[indicator_cols].fillna(0)
# Normalisasi menggunakan MinMaxScaler
scaled_cols = [col + '_scaled' for col in indicator_cols]
scaler = MinMaxScaler()
df_work[scaled_cols] = scaler.fit_transform(df_work[indicator_cols])
# Hitung bobot total per baris
df_work['w_m'] = df_work[scaled_cols].sum(axis=1)
print(f"Indikator yang digunakan: {', '.join(indicator_cols)}")
print(f"Range bobot (w_m): Min={df_work['w_m'].min():.4f}, Max={df_work['w_m'].max():.4f}")
# ========================================
# TAHAP 3: APLIKASI ALGORITMA DISAGREGASI
# ========================================
print("\n--- TAHAP 3: Aplikasi Algoritma Disagregasi ---")
# Hitung total bobot per jam
total_weight_per_block = df_work.groupby('fixed_rounded_time')['w_m'].transform('sum')
total_weight_per_block[total_weight_per_block == 0] = 1
# Hitung proporsi share
df_work['proportional_share'] = df_work['w_m'] / total_weight_per_block
# Ambil nilai GAS_MMBTU total per jam
gas_total_per_block = df_work.groupby('fixed_rounded_time')['GAS_MMBTU'].transform('first')
# Hitung GAS_MMBTU yang sudah didisagregasi
df_work['GAS_MMBTU_Disaggregated'] = gas_total_per_block * df_work['proportional_share']
# Handle kasus khusus: jam dengan total bobot = 0
zero_weight_blocks = df_work.groupby('fixed_rounded_time')['w_m'].sum()
zero_weight_blocks = zero_weight_blocks[zero_weight_blocks == 0].index
if not zero_weight_blocks.empty:
print(f"⚠️ Ditemukan {len(zero_weight_blocks)} jam dengan total bobot = 0")
print(" Menggunakan distribusi merata untuk jam tersebut")
for block_time in zero_weight_blocks:
mask = df_work['fixed_rounded_time'] == block_time
gas_value = df_work.loc[mask, 'GAS_MMBTU'].iloc[0]
count_in_block = mask.sum()
df_work.loc[mask, 'GAS_MMBTU_Disaggregated'] = gas_value / count_in_block if count_in_block > 0 else 0
print("✓ Disagregasi selesai dilakukan")
# ========================================
# TAHAP 4: VALIDASI HASIL DISAGREGASI
# ========================================
print("\n--- TAHAP 4: Validasi Hasil Disagregasi ---")
# Bandingkan total per jam
original_total = df_work.groupby('fixed_rounded_time')['GAS_MMBTU'].first()
disaggregated_total = df_work.groupby('fixed_rounded_time')['GAS_MMBTU_Disaggregated'].sum()
validation_df = pd.DataFrame({
'Original_Total': original_total,
'Disaggregated_Total': disaggregated_total,
'Difference': original_total - disaggregated_total
})
# Statistik validasi
max_diff = validation_df['Difference'].abs().max()
total_diff = validation_df['Difference'].abs().sum()
print(f"Jumlah jam yang divalidasi: {len(validation_df)}")
print(f"Total selisih absolut: {total_diff:.10f}")
print(f"Selisih maksimum: {max_diff:.10f}")
if total_diff < 1e-8:
print("✓ Validasi BERHASIL: Total gas terjaga dengan sempurna")
else:
print("⚠️ PERINGATAN: Terdapat selisih kecil dalam disagregasi")
# Tampilkan 5 jam dengan selisih terbesar
if max_diff > 1e-10:
print("\n5 Jam dengan selisih terbesar:")
top_diff = validation_df.nlargest(5, 'Difference')[['Original_Total', 'Disaggregated_Total', 'Difference']]
print(top_diff.to_string())
# ========================================
# TAHAP 5: PERSIAPAN OUTPUT FINAL
# ========================================
print("\n--- TAHAP 5: Persiapan Output Final ---")
# --- PERUBAHAN KUNCI ---
# Daripada menghapus kolom, kita secara eksplisit memilih semua kolom asli
# ditambah kolom hasil disagregasi yang baru. Ini memastikan semua
# kolom lain yang tidak terpakai tetap ada di hasil akhir.
# Tentukan daftar kolom final
final_cols = original_cols + ['GAS_MMBTU_Disaggregated']
# Hapus duplikat jika 'GAS_MMBTU_Disaggregated' sudah ada
final_cols = list(dict.fromkeys(final_cols))
# Buat dataframe final dengan memilih kolom yang relevan dari df_work
df_disaggregated_final = df_work[final_cols]
print(f"Dimensi data final: {df_disaggregated_final.shape}")
print("Kolom-kolom asli yang tidak digunakan dalam proses telah berhasil dipertahankan.")
# ========================================
# LAPORAN RINGKASAN
# ========================================
print("\n" + "="*60)
print("RINGKASAN PIPELINE")
print("="*60)
validation_report = {
"total_jam_input": len(jumlah_data_per_jam_df),
"jam_valid": len(jam_valid),
"jam_tidak_valid": len(jam_tidak_valid),
"total_baris_input": len(Data_Terbaru),
"total_baris_diproses": len(df_work),
"persentase_data_diproses": len(df_work)/len(Data_Terbaru)*100 if len(Data_Terbaru) > 0 else 0,
"total_selisih_disagregasi": total_diff,
"jam_dengan_bobot_nol": len(zero_weight_blocks) if not zero_weight_blocks.empty else 0,
"validation_df": validation_df,
"jam_tidak_valid_detail": jam_tidak_valid,
"weight_min": float(df_work["w_m"].min()),
"weight_max": float(df_work["w_m"].max())
}
print(f"• Total jam input: {validation_report['total_jam_input']}")
print(f"• Jam valid untuk disagregasi: {validation_report['jam_valid']}")
print(f"• Jam tidak valid (skip): {validation_report['jam_tidak_valid']}")
print(f"• Total baris yang diproses: {validation_report['total_baris_diproses']:,} dari {validation_report['total_baris_input']:,}")
print(f"• Persentase data diproses: {validation_report['persentase_data_diproses']:.2f}%")
print(f"• Akurasi disagregasi (total selisih): {validation_report['total_selisih_disagregasi']:.10f}")
print("\n✅ Pipeline selesai dijalankan!")
return df_disaggregated_final, validation_report
# ========================================
# FUNGSI UTILITAS TAMBAHAN
# ========================================
def analyze_disagregation_results(df_result, validation_report):
"""
Analisis mendalam hasil disagregasi
"""
print("\n" + "="*60)
print("ANALISIS HASIL DISAGREGASI")
print("="*60)
# Cek jika df_result kosong
if df_result.empty:
print("Tidak ada hasil untuk dianalisis.")
return None
# Statistik GAS_MMBTU sebelum dan sesudah
print("\n📊 Statistik GAS_MMBTU_Disaggregated:")
print(df_result['GAS_MMBTU_Disaggregated'].describe())
# Distribusi per jam
print("\n📊 Distribusi data per jam (top 10):")
hourly_stats = df_result.groupby('fixed_rounded_time').agg({
'GAS_MMBTU_Disaggregated': ['sum', 'mean', 'std', 'count']
}).round(4)
print(hourly_stats.head(10))
# Jam dengan nilai ekstrem
print("\n⚠️ Jam dengan total GAS tertinggi:")
top_hours = df_result.groupby('fixed_rounded_time')['GAS_MMBTU_Disaggregated'].sum().nlargest(5)
print(top_hours)
print("\n⚠️ Jam dengan total GAS terendah:")
bottom_hours = df_result.groupby('fixed_rounded_time')['GAS_MMBTU_Disaggregated'].sum().nsmallest(5)
print(bottom_hours)
return hourly_stats
def run_disagregasi_pipeline(file_obj, min_minutes_threshold=50):
"""
Wrapper yang dipakai dashboard Streamlit.
- file_obj bisa berupa path string atau UploadedFile dari Streamlit.
- Mengembalikan: (df_hasil_disagregasi, validation_report_diperluas)
"""
# Baca CSV dari file_obj (path string atau UploadedFile)
if isinstance(file_obj, str):
df_input = pd.read_csv(file_obj)
else:
# Asumsikan ini adalah UploadedFile dari st.file_uploader
df_input = pd.read_csv(file_obj)
# Jalankan pipeline utama
df_hasil, laporan = pipeline_disagregasi(
Data_Terbaru=df_input,
min_minutes_threshold=min_minutes_threshold
)
# Jika hasil kosong → langsung kembalikan
if df_hasil.empty:
return df_hasil, laporan
# Tambah statistik GAS_MMBTU_Disaggregated untuk kebutuhan dashboard
if "GAS_MMBTU_Disaggregated" in df_hasil.columns:
stats = df_hasil["GAS_MMBTU_Disaggregated"].describe()
laporan["gas_disagg_stats"] = stats
# Total per jam
hourly_total = (
df_hasil
.groupby("fixed_rounded_time")["GAS_MMBTU_Disaggregated"]
.sum()
)
# 10 jam dengan konsumsi gas tertinggi
laporan["top_hours"] = hourly_total.sort_values(ascending=False).head(10)
# Jam dengan total GAS = 0 (kemungkinan shutdown)
laporan["zero_hours"] = hourly_total[hourly_total == 0.0]
return df_hasil, laporan
# ========================================
# CARA PENGGUNAAN
# ========================================
# Ganti dengan path file Anda yang sebenarnya
if __name__ == "__main__":
# Jalankan pipeline utama (mode CLI / testing)
df_hasil, laporan = pipeline_disagregasi(
Data_Terbaru=pd.read_csv("/work/Dataset 18 Mar - 19 Jun/Processed Data Pipeline EDA_10_17_2025.csv"),
min_minutes_threshold=50
)
# Analisis hasil (opsional)
stats = analyze_disagregation_results(df_hasil, laporan)
# Simpan hasil ke CSV (opsional)
df_hasil = df_hasil[~df_hasil['Product'].isin(['CIP', 'CIP CHAMBER'])]
df_hasil.to_csv('/work/Dataset 18 Mar - 19 Jun/disagregasi_data_spraydryer_terbaru_10_17_2025.csv', index=False)
# Akses detail validasi
print(laporan['validation_df'])
print(laporan['jam_tidak_valid_detail'])