# NOTE: notebook-export artifact removed here (platform status lines "Spaces: Sleeping").
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import MinMaxScaler | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
def pipeline_disagregasi(Data_Terbaru, min_minutes_threshold=50):
    """
    Disaggregate hourly GAS_MMBTU totals down to minute-level rows.

    Hours with too few minute samples are skipped; remaining rows receive a
    share of their hour's gas total proportional to a weight built from six
    min-max-scaled process indicators. Hours whose total weight is zero are
    split evenly instead.

    Parameters
    ----------
    Data_Terbaru : DataFrame
        Input frame. Must contain 'fixed_rounded_time', 'GAS_MMBTU' and the
        six indicator columns listed in ``indicator_cols`` below.
    min_minutes_threshold : int
        Minimum number of minute rows an hour must have to be processed
        (default: 50).

    Returns
    -------
    df_disaggregated_final : DataFrame
        All original input columns plus 'GAS_MMBTU_Disaggregated'.
    validation_report : dict
        Validation statistics, per-hour comparison frame, and weight range.
        On failure (no valid hours) returns an empty DataFrame and
        ``{"status": "failed", "reason": "no_valid_hours"}``.

    Raises
    ------
    KeyError
        If any required indicator column is missing from the input.
    """
    print("="*60)
    print("PIPELINE DISAGREGASI DATA DENGAN VALIDASI JAM PRODUKSI")
    print("="*60)

    # Remember the original column list so every input column survives
    # into the final output.
    original_cols = list(Data_Terbaru.columns)

    # ========================================
    # STAGE 1: PRODUCTION-HOUR VALIDATION
    # ========================================
    print("\n--- TAHAP 1: Validasi Jam Produksi ---")

    # Count minute rows per hour.
    jumlah_data_per_jam = Data_Terbaru.groupby('fixed_rounded_time').size()
    jumlah_data_per_jam_df = jumlah_data_per_jam.reset_index(name='Jumlah_Menit')

    # Hours with at least `min_minutes_threshold` minute rows are valid.
    jam_valid = jumlah_data_per_jam_df[jumlah_data_per_jam_df['Jumlah_Menit'] >= min_minutes_threshold]['fixed_rounded_time'].tolist()
    jam_tidak_valid = jumlah_data_per_jam_df[jumlah_data_per_jam_df['Jumlah_Menit'] < min_minutes_threshold]

    print(f"Total jam dalam dataset: {len(jumlah_data_per_jam_df)}")
    print(f"Jam valid (>= {min_minutes_threshold} menit): {len(jam_valid)}")
    print(f"Jam tidak valid (< {min_minutes_threshold} menit): {len(jam_tidak_valid)}")

    # Show detail for the hours that will be skipped.
    if len(jam_tidak_valid) > 0:
        print("\nDetail jam yang TIDAK akan diproses:")
        print(jam_tidak_valid.sort_values('Jumlah_Menit')[['fixed_rounded_time', 'Jumlah_Menit']].to_string(index=False))
    else:
        print("\nSemua jam valid untuk diproses!")

    # Keep only rows belonging to valid hours.
    df_work = Data_Terbaru[Data_Terbaru['fixed_rounded_time'].isin(jam_valid)].copy()

    if df_work.empty:
        print("\n⚠️ PERINGATAN: Tidak ada data yang memenuhi kriteria validasi!")
        return pd.DataFrame(), {"status": "failed", "reason": "no_valid_hours"}

    print(f"\nData yang akan diproses: {len(df_work)} baris dari {len(Data_Terbaru)} baris total")
    print(f"Persentase data yang diproses: {len(df_work)/len(Data_Terbaru)*100:.2f}%")

    # ========================================
    # STAGE 2: INDICATOR WEIGHT COMPUTATION
    # ========================================
    print("\n--- TAHAP 2: Perhitungan Bobot Indikator ---")
    indicator_cols = [
        "D101330TT", "D102260TIC_CV", "D102265TIC_PV",
        "D102265TIC_CV", "D102266TIC", "D101264FTSCL"
    ]

    # Fail fast with a clear message instead of an opaque pandas KeyError
    # when the input is missing a required sensor column.
    missing_cols = [c for c in indicator_cols if c not in df_work.columns]
    if missing_cols:
        raise KeyError(f"Kolom indikator tidak ditemukan di input: {missing_cols}")

    # Missing sensor readings contribute zero weight.
    df_work[indicator_cols] = df_work[indicator_cols].fillna(0)

    # Min-max scale each indicator to [0, 1] so they are comparable.
    scaled_cols = [col + '_scaled' for col in indicator_cols]
    scaler = MinMaxScaler()
    df_work[scaled_cols] = scaler.fit_transform(df_work[indicator_cols])

    # Per-row weight = sum of the scaled indicators.
    df_work['w_m'] = df_work[scaled_cols].sum(axis=1)

    print(f"Indikator yang digunakan: {', '.join(indicator_cols)}")
    print(f"Range bobot (w_m): Min={df_work['w_m'].min():.4f}, Max={df_work['w_m'].max():.4f}")

    # ========================================
    # STAGE 3: DISAGGREGATION ALGORITHM
    # ========================================
    print("\n--- TAHAP 3: Aplikasi Algoritma Disagregasi ---")

    # Total weight per hour; zeros are replaced by 1 to avoid division by
    # zero (those hours are redistributed evenly further below).
    total_weight_per_block = df_work.groupby('fixed_rounded_time')['w_m'].transform('sum')
    total_weight_per_block[total_weight_per_block == 0] = 1

    # Each row's proportional share of its hour.
    df_work['proportional_share'] = df_work['w_m'] / total_weight_per_block

    # 'first' picks the hourly GAS_MMBTU value — assumes GAS_MMBTU is
    # constant within an hour; TODO confirm against the data source.
    gas_total_per_block = df_work.groupby('fixed_rounded_time')['GAS_MMBTU'].transform('first')

    # Disaggregated gas value per minute row.
    df_work['GAS_MMBTU_Disaggregated'] = gas_total_per_block * df_work['proportional_share']

    # Special case: hours whose total weight is 0 get an even split.
    zero_weight_blocks = df_work.groupby('fixed_rounded_time')['w_m'].sum()
    zero_weight_blocks = zero_weight_blocks[zero_weight_blocks == 0].index

    if not zero_weight_blocks.empty:
        print(f"⚠️ Ditemukan {len(zero_weight_blocks)} jam dengan total bobot = 0")
        print(" Menggunakan distribusi merata untuk jam tersebut")
        for block_time in zero_weight_blocks:
            mask = df_work['fixed_rounded_time'] == block_time
            gas_value = df_work.loc[mask, 'GAS_MMBTU'].iloc[0]
            count_in_block = mask.sum()
            df_work.loc[mask, 'GAS_MMBTU_Disaggregated'] = gas_value / count_in_block if count_in_block > 0 else 0

    print("✓ Disagregasi selesai dilakukan")

    # ========================================
    # STAGE 4: VALIDATE DISAGGREGATION RESULT
    # ========================================
    print("\n--- TAHAP 4: Validasi Hasil Disagregasi ---")

    # Compare per-hour totals before and after disaggregation.
    original_total = df_work.groupby('fixed_rounded_time')['GAS_MMBTU'].first()
    disaggregated_total = df_work.groupby('fixed_rounded_time')['GAS_MMBTU_Disaggregated'].sum()

    validation_df = pd.DataFrame({
        'Original_Total': original_total,
        'Disaggregated_Total': disaggregated_total,
        'Difference': original_total - disaggregated_total
    })

    # Validation statistics.
    max_diff = validation_df['Difference'].abs().max()
    total_diff = validation_df['Difference'].abs().sum()

    print(f"Jumlah jam yang divalidasi: {len(validation_df)}")
    print(f"Total selisih absolut: {total_diff:.10f}")
    print(f"Selisih maksimum: {max_diff:.10f}")

    if total_diff < 1e-8:
        print("✓ Validasi BERHASIL: Total gas terjaga dengan sempurna")
    else:
        print("⚠️ PERINGATAN: Terdapat selisih kecil dalam disagregasi")

    # Show the 5 hours with the largest discrepancy.
    if max_diff > 1e-10:
        print("\n5 Jam dengan selisih terbesar:")
        # BUGFIX: rank by |Difference|, not the signed value, so large
        # negative discrepancies are not hidden from the report.
        worst_idx = validation_df['Difference'].abs().nlargest(5).index
        top_diff = validation_df.loc[worst_idx, ['Original_Total', 'Disaggregated_Total', 'Difference']]
        print(top_diff.to_string())

    # ========================================
    # STAGE 5: FINAL OUTPUT PREPARATION
    # ========================================
    print("\n--- TAHAP 5: Persiapan Output Final ---")
    # Rather than dropping columns, explicitly select all original columns
    # plus the new disaggregated column, so unused input columns are kept.
    final_cols = original_cols + ['GAS_MMBTU_Disaggregated']
    # De-duplicate in case 'GAS_MMBTU_Disaggregated' already existed.
    final_cols = list(dict.fromkeys(final_cols))
    df_disaggregated_final = df_work[final_cols]

    print(f"Dimensi data final: {df_disaggregated_final.shape}")
    print("Kolom-kolom asli yang tidak digunakan dalam proses telah berhasil dipertahankan.")

    # ========================================
    # SUMMARY REPORT
    # ========================================
    print("\n" + "="*60)
    print("RINGKASAN PIPELINE")
    print("="*60)
    validation_report = {
        "total_jam_input": len(jumlah_data_per_jam_df),
        "jam_valid": len(jam_valid),
        "jam_tidak_valid": len(jam_tidak_valid),
        "total_baris_input": len(Data_Terbaru),
        "total_baris_diproses": len(df_work),
        "persentase_data_diproses": len(df_work)/len(Data_Terbaru)*100 if len(Data_Terbaru) > 0 else 0,
        "total_selisih_disagregasi": total_diff,
        "jam_dengan_bobot_nol": len(zero_weight_blocks) if not zero_weight_blocks.empty else 0,
        "validation_df": validation_df,
        "jam_tidak_valid_detail": jam_tidak_valid,
        "weight_min": float(df_work["w_m"].min()),
        "weight_max": float(df_work["w_m"].max())
    }
    print(f"• Total jam input: {validation_report['total_jam_input']}")
    print(f"• Jam valid untuk disagregasi: {validation_report['jam_valid']}")
    print(f"• Jam tidak valid (skip): {validation_report['jam_tidak_valid']}")
    print(f"• Total baris yang diproses: {validation_report['total_baris_diproses']:,} dari {validation_report['total_baris_input']:,}")
    print(f"• Persentase data diproses: {validation_report['persentase_data_diproses']:.2f}%")
    print(f"• Akurasi disagregasi (total selisih): {validation_report['total_selisih_disagregasi']:.10f}")
    print("\n✅ Pipeline selesai dijalankan!")
    return df_disaggregated_final, validation_report
| # ======================================== | |
| # FUNGSI UTILITAS TAMBAHAN | |
| # ======================================== | |
def analyze_disagregation_results(df_result, validation_report):
    """
    Print an in-depth summary of the disaggregation output.

    Parameters
    ----------
    df_result : DataFrame
        Output of ``pipeline_disagregasi``; expected to contain
        'fixed_rounded_time' and 'GAS_MMBTU_Disaggregated'.
    validation_report : dict
        Accepted for API symmetry with the pipeline; not read here.

    Returns
    -------
    DataFrame or None
        Per-hour sum/mean/std/count of the disaggregated gas column,
        or ``None`` when ``df_result`` is empty.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ANALISIS HASIL DISAGREGASI")
    print(banner)

    # Nothing to analyze for an empty result.
    if df_result.empty:
        print("Tidak ada hasil untuk dianalisis.")
        return None

    # Overall distribution of the disaggregated values.
    print("\n📊 Statistik GAS_MMBTU_Disaggregated:")
    print(df_result['GAS_MMBTU_Disaggregated'].describe())

    # Per-hour aggregates (first 10 hours shown).
    print("\n📊 Distribusi data per jam (top 10):")
    by_hour = df_result.groupby('fixed_rounded_time')
    hourly_summary = by_hour.agg({
        'GAS_MMBTU_Disaggregated': ['sum', 'mean', 'std', 'count']
    }).round(4)
    print(hourly_summary.head(10))

    # Hours at both extremes of total gas consumption.
    hourly_totals = by_hour['GAS_MMBTU_Disaggregated'].sum()
    print("\n⚠️ Jam dengan total GAS tertinggi:")
    print(hourly_totals.nlargest(5))
    print("\n⚠️ Jam dengan total GAS terendah:")
    print(hourly_totals.nsmallest(5))

    return hourly_summary
def run_disagregasi_pipeline(file_obj, min_minutes_threshold=50):
    """
    Wrapper used by the Streamlit dashboard.

    Parameters
    ----------
    file_obj : str or file-like
        CSV path string, or an UploadedFile from ``st.file_uploader``.
    min_minutes_threshold : int
        Forwarded to ``pipeline_disagregasi`` (default: 50).

    Returns
    -------
    tuple
        ``(df_hasil_disagregasi, extended_validation_report)``.
    """
    # BUGFIX: the previous isinstance(file_obj, str) branch executed the
    # identical pd.read_csv(file_obj) call in both arms. pd.read_csv
    # accepts path strings and file-like objects alike, so one call does.
    df_input = pd.read_csv(file_obj)

    # Run the main pipeline.
    df_hasil, laporan = pipeline_disagregasi(
        Data_Terbaru=df_input,
        min_minutes_threshold=min_minutes_threshold
    )

    # Empty result (no valid hours) → return as-is.
    if df_hasil.empty:
        return df_hasil, laporan

    # Enrich the report with dashboard-oriented statistics.
    if "GAS_MMBTU_Disaggregated" in df_hasil.columns:
        laporan["gas_disagg_stats"] = df_hasil["GAS_MMBTU_Disaggregated"].describe()

        # Total disaggregated gas per hour.
        hourly_total = (
            df_hasil
            .groupby("fixed_rounded_time")["GAS_MMBTU_Disaggregated"]
            .sum()
        )
        # Top 10 hours by gas consumption.
        laporan["top_hours"] = hourly_total.sort_values(ascending=False).head(10)
        # Hours with zero total gas (likely shutdown periods).
        laporan["zero_hours"] = hourly_total[hourly_total == 0.0]

    return df_hasil, laporan
# ========================================
# USAGE EXAMPLE (CLI / testing mode)
# ========================================
# Replace the hard-coded paths below with your actual file locations.
if __name__ == "__main__":
    # Run the main pipeline on the preprocessed dataset.
    df_hasil, laporan = pipeline_disagregasi(
        Data_Terbaru=pd.read_csv("/work/Dataset 18 Mar - 19 Jun/Processed Data Pipeline EDA_10_17_2025.csv"),
        min_minutes_threshold=50
    )
    # Analyze the results (optional).
    stats = analyze_disagregation_results(df_hasil, laporan)
    # Drop CIP rows (presumably cleaning-in-place cycles, not production —
    # TODO confirm) before saving the CSV output.
    df_hasil = df_hasil[~df_hasil['Product'].isin(['CIP', 'CIP CHAMBER'])]
    df_hasil.to_csv('/work/Dataset 18 Mar - 19 Jun/disagregasi_data_spraydryer_terbaru_10_17_2025.csv', index=False)
    # Access validation details from the report.
    print(laporan['validation_df'])
    print(laporan['jam_tidak_valid_detail'])