import streamlit as st import pandas as pd import plotly.express as px import plotly.graph_objects as go import numpy as np from datetime import datetime, timedelta from typing import List import os import sklearn import kaleido from transformers import pipeline import transformers print("Transformers version:", transformers.__version__) from transformers import pipeline print("โœ… pipeline imported successfully") # =================== PAGE CONFIG =================== st.set_page_config( page_title="Proactive Safety Intelligence & Analytics Dashboard", page_icon="", layout="wide", initial_sidebar_state="expanded" ) # =================== CUSTOM CSS (Updated for PLN Colors) =================== st.markdown("""""", unsafe_allow_html=True) # =================== DATA LOADING (FROM data.xlsx) =================== @st.cache_data(ttl=300) # refresh every 5 min def load_data(): file_path = "data.xlsx" if not os.path.exists(file_path): st.error(f"โŒ File **`{file_path}`** not found. Please ensure it's in the same directory as this script.") return pd.DataFrame() try: # Load Excel file df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl') # Check for required columns required_cols = ['created_at'] missing = [c for c in required_cols if c not in df.columns] if missing: st.error(f"โŒ Missing required columns: {missing}. 
Available: {list(df.columns)}") return pd.DataFrame() # Parse datetime df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') if df['created_at'].isna().all(): st.error("โŒ `created_at` column could not be parsed as datetime.") return pd.DataFrame() # Optional: close_at if 'close_at' in df.columns: df['close_at'] = pd.to_datetime(df['close_at'], errors='coerce') df['days_to_close'] = (df['close_at'] - df['created_at']).dt.total_seconds() / (24 * 3600) df['days_to_close'] = df['days_to_close'].apply(lambda x: x if x >= 0 else np.nan) else: df['days_to_close'] = np.nan # Derived columns df['created_month'] = df['created_at'].dt.to_period('M') df['created_date'] = df['created_at'].dt.date df['created_week'] = df['created_at'].dt.to_period('W') # Keep only valid rows df = df.dropna(subset=['created_at']).copy() # Log shape st.sidebar.success(f"Loaded {len(df):,} Audit Findings from `data.xlsx`") return df except Exception as e: st.exception(f"Error loading data.xlsx: {e}") return pd.DataFrame() df = load_data() if df.empty: st.stop() # =================== SIDEBAR FILTERS (Perbaikan) =================== st.sidebar.markdown('
', unsafe_allow_html=True) st.sidebar.markdown('

Filter Dashboard

', unsafe_allow_html=True) # Inisialisasi df_filtered df_filtered = df.copy() # Flag to track if filters were applied filters_applied = False # 1. Date Range Filter min_date = df['created_at'].min().date() max_date = df['created_at'].max().date() date_range = st.sidebar.date_input( "Date Range", value=(min_date, max_date), min_value=min_date, max_value=max_date ) # 2. Filter by Vendor (nama_perusahaan) - Default to All if 'nama_perusahaan' in df.columns: unique_vendors = sorted(df['nama_perusahaan'].dropna().astype(str).unique()) all_vendors_option = "All Vendors" vendor_options = [all_vendors_option] + list(unique_vendors) selected_vendor = st.sidebar.selectbox("Vendor", vendor_options, index=0) # Default to "All" if selected_vendor != all_vendors_option: df_filtered = df_filtered[df_filtered['nama_perusahaan'].astype(str) == selected_vendor] filters_applied = True # 3. Filter by Area/Unit Type (temuan_nama_distrik or creator_nama_distrik) - Renamed area_col = None if 'temuan_nama_distrik' in df_filtered.columns: area_col = 'temuan_nama_distrik' elif 'creator_nama_distrik' in df_filtered.columns: area_col = 'creator_nama_distrik' if area_col: # Define mapping for display names area_mapping = { 'UMRO': 'Unit Maintenance', 'UP GRESIK': 'Unit Pembangkit' } unique_areas_raw = sorted(df_filtered[area_col].dropna().astype(str).unique()) # Map raw values to display names, keep unmapped values as is unique_areas_display = [area_mapping.get(area, area) for area in unique_areas_raw] # Prepend "All" option all_areas_option = "All Units" area_options = [all_areas_option] + unique_areas_display selected_area_display = st.sidebar.selectbox("Unit Type", area_options, index=0) # Default to "All" if selected_area_display != all_areas_option: # Reverse map the selected display name back to the raw value for filtering selected_area_raw = next((raw for raw, disp in area_mapping.items() if disp == selected_area_display), selected_area_display) df_filtered = 
df_filtered[df_filtered[area_col].astype(str) == selected_area_raw] filters_applied = True # 4. Status filter - Changed to selectbox (dropdown) status_filter_applied = False if 'temuan_status' in df_filtered.columns: all_status = sorted(df_filtered['temuan_status'].dropna().astype(str).unique()) # Prepend "All" option all_status_option = "All Status" status_options = [all_status_option] + list(all_status) selected_status = st.sidebar.selectbox( "Status", status_options, index=0 # Default to "All" ) if selected_status != all_status_option: df_filtered = df_filtered[df_filtered['temuan_status'].astype(str) == selected_status] status_filter_applied = True filters_applied = True # Apply date filter *after* other filters if len(date_range) == 2: df_filtered = df_filtered[ (df_filtered['created_at'].dt.date >= date_range[0]) & (df_filtered['created_at'].dt.date <= date_range[1]) ] if date_range[0] != min_date or date_range[1] != max_date: filters_applied = True # Submit Button submit_clicked = st.sidebar.button("Apply Filters") # Apply filters logic when button is clicked if submit_clicked: # The filtering based on selections already happened above # Here we just update the summary based on the current state of df_filtered active_filters = [] if 'selected_vendor' in locals() and selected_vendor != all_vendors_option: active_filters.append(f"Vendor: {selected_vendor}") if 'selected_area_display' in locals() and selected_area_display != all_areas_option: active_filters.append(f"Unit: {selected_area_display}") if 'selected_status' in locals() and selected_status != all_status_option: active_filters.append(f"Status: {selected_status}") if len(date_range) == 2 and (date_range[0] != min_date or date_range[1] != max_date): active_filters.append(f"Date: {date_range[0]} to {date_range[1]}") if active_filters: st.sidebar.success("**Active Filters:**") for f in active_filters: st.sidebar.markdown(f"- {f}") st.sidebar.info(f"Showing {len(df_filtered)} records based on filters.") 
else: st.sidebar.info("No specific filters applied (showing all records).") else: # Show default message when not submitted yet st.sidebar.info("Set filters and click 'Apply Filters'.") st.sidebar.markdown('
', unsafe_allow_html=True) # =================== HEADER =================== import streamlit as st import os import streamlit as st import os # ====================== CSS UNTUK HEADER =========================== # CSS biar header terlihat seperti 1 kotak besar st.markdown(""" """, unsafe_allow_html=True) # ==== HEADER LAYOUT (Columns) ==== with st.container(): st.markdown('
', unsafe_allow_html=True) col1, col2, col3 = st.columns([1, 5, 1]) # tengah lebih besar # LEFT (kosong) with col1: st.write("") # CENTER โ€” TITLE with col2: st.markdown("""

Proactive Safety Intelligence & Analytics Dashboard

""", unsafe_allow_html=True) # RIGHT โ€” LOGO with col3: try: st.image("pln.png", width=120) # ๐Ÿ”ฅ Diperbesar dari 80px except: st.write("") # Jika logo tidak ditemukan, kosongkan st.markdown('
', unsafe_allow_html=True) # =================== OBJECTIVE 1 - Company Reporting Activity (Polar Bar Chart) =================== st.markdown( """

OBJECTIVE 1 โ€” Company Reporting Activity: Who Reports the Most?

""", unsafe_allow_html=True ) df_local = df_filtered.copy() df_local['created_month'] = df_local['created_at'].dt.to_period('M') if 'temuan_kode_distrik' in df_local.columns: df_local['Area_Type'] = df_local['temuan_kode_distrik'].apply( lambda x: 'PG' if 'PG' in str(x).upper() else 'UM' if 'UM' in str(x).upper() else 'Other' ) df_pg = df_local[df_local['Area_Type'] == 'PG'].copy() df_um = df_local[df_local['Area_Type'] == 'UM'].copy() # --- Hitung rasio per perusahaan --- def calculate_avg_ratio_per_company(df_area): if df_area.empty: return pd.DataFrame() findings = df_area.groupby(['created_month', 'nama_perusahaan']).size().reset_index(name='findings_count') creators = df_area.groupby(['created_month', 'nama_perusahaan'])['creator_nid'].nunique().reset_index(name='unique_creators') merged = findings.merge(creators, on=['created_month', 'nama_perusahaan'], how='outer') merged = merged.fillna({'findings_count': 0, 'unique_creators': 0}) merged = merged[merged['unique_creators'] > 0] merged['ratio'] = merged['findings_count'] / merged['unique_creators'] merged['ratio'] = merged['ratio'].replace([np.inf, -np.inf], np.nan) avg_ratio = merged.groupby('nama_perusahaan')['ratio'].mean().reset_index(name='avg_monthly_ratio') return avg_ratio.dropna(subset=['avg_monthly_ratio']) avg_ratio_pg = calculate_avg_ratio_per_company(df_pg) avg_ratio_um = calculate_avg_ratio_per_company(df_um) # ๐Ÿ”‘ PALET & COLOR MAPPING GLOBAL โ€” KONSISTEN ANTAR CHART PLN_COLOR = "#FFD700" PASTEL_BLUES = [ "#A8DADC", "#E2ECE9", "#CCE4E7", "#B5D9D9", "#98C8D1", "#7FB9C1", "#6BA9B3", "#5A9CB5", "#4A8FA7", "#3A8399" ] # diperpanjang sedikit untuk antisipasi banyak kontraktor # ๐Ÿ”ฅ Dapatkan daftar *semua* perusahaan non-PLN unik (dari seluruh data), urut alfabetis โ†’ assign warna deterministik all_companies = pd.concat([avg_ratio_pg, avg_ratio_um])['nama_perusahaan'].dropna().unique() non_pln_companies = sorted([c for c in all_companies if 'PLN' not in str(c).upper()]) # Mapping: company โ†’ 
color (PLN khusus, lainnya dari palet biru berurut) COMPANY_COLOR_MAP = {} for i, company in enumerate(non_pln_companies): COMPANY_COLOR_MAP[company] = PASTEL_BLUES[i % len(PASTEL_BLUES)] # Pastikan PLN juga masuk mapping (jika muncul) for company in all_companies: if 'PLN' in str(company).upper(): COMPANY_COLOR_MAP[company] = PLN_COLOR # ๐Ÿ”ฅ Fungsi helper: dapatkan warna berdasarkan nama perusahaan (konsisten!) def get_color(company_name): return COMPANY_COLOR_MAP.get(company_name, "#CCCCCC") # fallback abu-abu # ๐Ÿ”ฅ Fungsi chart โ€” pakai mapping global def create_polar_bar_chart(df, area_name): if df.empty: return None # Urutkan untuk stabilitas visual (misal: ascending ratio) df = df.sort_values('avg_monthly_ratio', ascending=True).reset_index(drop=True) companies = df['nama_perusahaan'].tolist() ratios = df['avg_monthly_ratio'].tolist() colors = [get_color(comp) for comp in companies] # Hitung proporsi temuan (untuk lebar bar) total_findings = df_local[df_local['Area_Type'] == area_name].groupby('nama_perusahaan').size() angles = [total_findings.get(comp, 0) / total_findings.sum() * 360 if total_findings.sum() > 0 else 0 for comp in companies] # Hitung posisi tengah mid_angles = [] current = 0 for a in angles: mid_angles.append(current + a / 2) current += a fig = go.Figure() # Satu trace saja โ€” lebih clean, legend bisa diatur manual jika perlu fig.add_trace(go.Barpolar( r=ratios, theta=mid_angles, width=angles, marker_color=colors, marker_line_color="white", marker_line_width=1.2, opacity=0.9, hovertemplate="%{text}
Avg Ratio: %{r:.2f}", text=companies, showlegend=False # Kita akan buat legend manual yang rapi )) # ๐Ÿ”ฅ LEGEND MANUAL โ€” hanya tampilkan setiap perusahaan sekali, dengan warna konsisten # Tambahkan satu scatter "dummy" per perusahaan untuk legend for company in sorted(set(companies)): color = get_color(company) fig.add_trace(go.Scatterpolar( r=[None], theta=[None], # invisible mode='markers', marker=dict(color=color, size=10), name=company, showlegend=True )) fig.update_layout( title=f'{area_name} Area', polar=dict( radialaxis=dict( visible=True, tickfont=dict(size=9, color="gray"), gridcolor='lightgray', title=dict(text='Avg Finding/Person', font=dict(size=10, color="gray")) ), angularaxis=dict( visible=True, direction='clockwise', tickfont=dict(size=9, color="white"), showline=False, gridcolor="lightgray" ), ), showlegend=True, legend=dict( orientation="v", yanchor="top", y=1, xanchor="right", x=1.02, font=dict(size=10) ), height=450, margin=dict(t=40, b=20, l=20, r=40) ) return fig # Plot col1, col2 = st.columns(2) with col1: st.markdown("
Unit Pembangkit: Monthly Finding by Company
", unsafe_allow_html=True) fig_pg = create_polar_bar_chart(avg_ratio_pg, 'PG') if fig_pg: st.plotly_chart(fig_pg, use_container_width=True) if not avg_ratio_pg.empty: top = avg_ratio_pg.loc[avg_ratio_pg['avg_monthly_ratio'].idxmax()] low = avg_ratio_pg.loc[avg_ratio_pg['avg_monthly_ratio'].idxmin()] st.markdown("### Insight") st.markdown( f"
" f"In PG Area, {top['nama_perusahaan']} has the highest ratio ({top['avg_monthly_ratio']:.2f}), " f"while {low['nama_perusahaan']} has the lowest ({low['avg_monthly_ratio']:.2f}). " f"Consider cross-learning between them to standardize reporting culture." f"
", unsafe_allow_html=True ) else: st.warning("No data for PG area.") with col2: st.markdown("
Unit Maintenance: Monthly Finding by Company
", unsafe_allow_html=True) fig_um = create_polar_bar_chart(avg_ratio_um, 'UM') if fig_um: st.plotly_chart(fig_um, use_container_width=True) if not avg_ratio_um.empty: top = avg_ratio_um.loc[avg_ratio_um['avg_monthly_ratio'].idxmax()] low = avg_ratio_um.loc[avg_ratio_um['avg_monthly_ratio'].idxmin()] st.markdown("### Insight") st.markdown( f"
" f"Across all companies, the finding-per-person ratio is similar in the UM Area " f"
", unsafe_allow_html=True ) else: st.warning("No data for UM area.") else: st.error("Column 'temuan_kode_distrik' not found.") st.stop() # =================== OBJECTIVE 2 โ€” Active vs Inactive Locations (Treemap with Color Gradient) =================== st.markdown( """

OBJECTIVE 2 โ€” Active vs Inactive Locations: Who Leads?

""", unsafe_allow_html=True ) df_local = df_filtered.copy() if df_local.empty: st.warning("No data available after filtering.") st.stop() df_local['created_month'] = df_local['created_at'].dt.to_period('M') # Hitung temuan per bulan per lokasi findings_by_location_month = df_local.groupby(['created_month', 'nama_lokasi_full']).size().reset_index(name='findings_count') # Hitung jumlah orang unik per bulan per lokasi creators_by_location_month = df_local.groupby(['created_month', 'nama_lokasi_full'])['creator_nid'].nunique().reset_index(name='unique_creators') # Gabung merged_loc = findings_by_location_month.merge(creators_by_location_month, on=['created_month', 'nama_lokasi_full'], how='outer') # Isi NaN dengan 0 merged_loc = merged_loc.fillna({'findings_count': 0, 'unique_creators': 0}) # Hindari pembagian dengan nol merged_loc = merged_loc[merged_loc['unique_creators'] > 0] # Hitung rasio merged_loc['ratio'] = merged_loc['findings_count'] / merged_loc['unique_creators'] merged_loc['ratio'] = merged_loc['ratio'].replace([np.inf, -np.inf], np.nan) # Rata-rata bulanan per lokasi avg_ratio_per_location = merged_loc.groupby('nama_lokasi_full')['ratio'].mean().reset_index(name='avg_monthly_ratio') avg_ratio_per_location = avg_ratio_per_location.dropna(subset=['avg_monthly_ratio']) if avg_ratio_per_location.empty: st.write("Data kosong.") else: # Treemap โ€” hanya dibuat & ditampilkan sekali fig_treemap = px.treemap( avg_ratio_per_location, path=['nama_lokasi_full'], values='avg_monthly_ratio', title='Avg Monthly Finding/Person Ratio by Location', labels={ 'avg_monthly_ratio': 'Avg Monthly Finding/Person Ratio', 'nama_lokasi_full': 'Location' }, color='avg_monthly_ratio', color_continuous_scale=[ [0.0, '#D32F2F'], # Red (low activity) [0.5, '#FFB300'], # Amber/orange-yellow (medium) [1.0, '#4CAF50'] # Green (high activity) ], hover_data={'avg_monthly_ratio': ':.2f'} # pastikan format hover sudah 2 desimal ) # Perbaiki tata letak & hover โ€” lakukan *sebelum* menampilkan 
fig_treemap.update_traces( hovertemplate="%{label}
Avg Ratio: %{value:.2f}" ) fig_treemap.update_layout( title=dict(text='Avg Monthly Finding/Person Ratio by Location', x=0.5, xanchor='center'), height=600, margin=dict(t=50, l=25, r=25, b=25) ) # Tampilkan hanya sekali! st.plotly_chart(fig_treemap, use_container_width=True) # === AI Insight === top_location = avg_ratio_per_location.loc[avg_ratio_per_location['avg_monthly_ratio'].idxmax()] low_location = avg_ratio_per_location.loc[avg_ratio_per_location['avg_monthly_ratio'].idxmin()] st.markdown("### Insight") insight_text = ( f"
" f"{top_location['nama_lokasi_full']} shows the highest activity level " f"({top_location['avg_monthly_ratio']:.2f} findings per person/month). " f"In contrast, {low_location['nama_lokasi_full']} has the lowest activity level " f"({low_location['avg_monthly_ratio']:.2f}).

" f"๐ŸŸข High-ratio (green) locations indicate either high reporting diligence or high exposure to risks โ€” both warrant deeper root-cause analysis.
" f"๐ŸŸก Medium-ratio (yellow) areas need monitoring to ensure trends do not deteriorate.
" f"๐Ÿ”ด Low-ratio (red) locations may reflect under-reporting, lack of engagement, or low hazard visibility โ€” recommend follow-up audits to verify data completeness." f"
" ) st.markdown(insight_text, unsafe_allow_html=True) # =================== OBJECTIVE 3 - Frequency & Response Time =================== st.markdown( """

OBJECTIVE 3 โ€” Frequency & Response Time: Who Reports Well? Who Executes Well?

""", unsafe_allow_html=True ) df_local = df_filtered.copy() if df_local.empty: st.warning("No data available after filtering.") st.stop() df_local['created_month'] = df_local['created_at'].dt.to_period('M') # โ”€โ”€โ”€ Helper: Hitung rasio per division (reporter) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ def compute_reporter_ratio_by_nama(df): if 'nama' not in df.columns: return pd.DataFrame() findings_by_nama_month = df.groupby(['created_month', 'nama']).size().reset_index(name='findings_count') creators_by_nama_month = df.groupby(['created_month', 'nama'])['creator_nid'].nunique().reset_index(name='unique_creators') merged_rep = findings_by_nama_month.merge(creators_by_nama_month, on=['created_month', 'nama'], how='outer') merged_rep = merged_rep.fillna({'findings_count': 0, 'unique_creators': 0}) merged_rep = merged_rep[merged_rep['unique_creators'] > 0] merged_rep['ratio'] = merged_rep['findings_count'] / merged_rep['unique_creators'] merged_rep['ratio'] = merged_rep['ratio'].replace([np.inf, -np.inf], np.nan) avg_ratio_per_nama = merged_rep.groupby('nama')['ratio'].mean().reset_index(name='avg_monthly_ratio') avg_ratio_per_nama = avg_ratio_per_nama.dropna(subset=['avg_monthly_ratio']) return avg_ratio_per_nama # โ”€โ”€โ”€ Helper: Hitung rata-rata temuan per reporter (individu) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ def compute_reporter_rate_by_creator(df): if 'creator_name' not in df.columns: return pd.DataFrame() findings_by_creator_month = df.groupby(['created_month', 'creator_name']).size().reset_index(name='findings_count') active_months_by_creator = findings_by_creator_month.groupby('creator_name')['created_month'].nunique().reset_index(name='active_months') total_findings_by_creator = findings_by_creator_month.groupby('creator_name')['findings_count'].sum().reset_index() merged_rep_creator = total_findings_by_creator.merge(active_months_by_creator, on='creator_name', how='outer') 
merged_rep_creator = merged_rep_creator.fillna({'findings_count': 0, 'active_months': 0}) merged_rep_creator = merged_rep_creator[merged_rep_creator['active_months'] > 0] merged_rep_creator['avg_monthly_rate'] = merged_rep_creator['findings_count'] / merged_rep_creator['active_months'] merged_rep_creator['avg_monthly_rate'] = merged_rep_creator['avg_monthly_rate'].replace([np.inf, -np.inf], np.nan) avg_rate_per_creator = merged_rep_creator.dropna(subset=['avg_monthly_rate']) return avg_rate_per_creator # โ”€โ”€โ”€ Helper: Hitung lead time per division (executor) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ def compute_executor_leadtime_by_nama(df): if 'nama' not in df.columns or 'days_to_close' not in df.columns: return pd.DataFrame() # Filter hanya data dengan lead time valid df_valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)] leadtime_by_nama_month = df_valid.groupby(['created_month', 'nama'])['days_to_close'].mean().reset_index(name='avg_leadtime') avg_leadtime_nama = leadtime_by_nama_month.groupby('nama')['avg_leadtime'].mean().reset_index(name='avg_monthly_leadtime') avg_leadtime_nama = avg_leadtime_nama.dropna(subset=['avg_monthly_leadtime']) return avg_leadtime_nama # โ”€โ”€โ”€ Helper: Hitung lead time per individu executor (deteksi kolom otomatis) โ”€ def compute_executor_leadtime_by_individual(df, name_col='creator_name'): if name_col not in df.columns or 'days_to_close' not in df.columns: return pd.DataFrame() df_valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)] leadtime_by_indiv_month = df_valid.groupby(['created_month', name_col])['days_to_close'].mean().reset_index(name='avg_leadtime') avg_leadtime_indiv = leadtime_by_indiv_month.groupby(name_col)['avg_leadtime'].mean().reset_index(name='avg_monthly_leadtime') avg_leadtime_indiv = avg_leadtime_indiv.dropna(subset=['avg_monthly_leadtime']) return avg_leadtime_indiv # โ”€โ”€โ”€ Deteksi kolom executor individu 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ EXECUTOR_INDIV_COL = None candidate_executor_cols = ['pic', 'pic_name', 'responsible', 'responsible_name', 'assigned_to', 'closed_by', 'executor_name', 'executor'] for col in candidate_executor_cols: if col in df_local.columns: EXECUTOR_INDIV_COL = col break if EXECUTOR_INDIV_COL is None: # Fallback โ€” gunakan creator_name (dengan warning transparan) EXECUTOR_INDIV_COL = 'nama_pic' # st.warning( # "โš ๏ธ No dedicated executor column (e.g., 'pic', 'responsible') found. " # "Using 'creator_name' as proxy for executor โ€” insights may conflate reporters & executors. " # "Consider adding an executor identifier column for accuracy." # ) # โ”€โ”€โ”€ Hitung semua metrik โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ avg_ratio_per_nama = compute_reporter_ratio_by_nama(df_local) # 3a avg_rate_per_creator = compute_reporter_rate_by_creator(df_local) # 3c avg_leadtime_nama = compute_executor_leadtime_by_nama(df_local) # 3b avg_leadtime_per_indiv = compute_executor_leadtime_by_individual(df_local, name_col=EXECUTOR_INDIV_COL) # 3d # โ”€โ”€โ”€ Helper: Warna berdasarkan ranking global โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ def add_color_by_global_rank(df, value_col, top_n=5, worst_n=5, high_is_good=True): df = df.copy() df['color'] = '#1f77b4' # default biru if len(df) == 0: return df if high_is_good: # Nilai tinggi = baik โ†’ top N โ†’ hijau top_names = df.nlargest(top_n, value_col)['nama' if 'nama' in df.columns else df.columns[0]] df.loc[df[df.columns[0]].isin(top_names), 'color'] = '#4CAF50' else: # Nilai tinggi = buruk (e.g., lead time) โ†’ worst N (tertinggi) โ†’ merah worst_names = df.nlargest(worst_n, value_col)['nama' if 'nama' in df.columns else df.columns[0]] 
df.loc[df[df.columns[0]].isin(worst_names), 'color'] = '#D32F2F' return df # โ”€โ”€โ”€ Layout: 2 baris ร— 2 kolom โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ col_3a, col_3c = st.columns(2) # โ”€โ”€โ”€ 3a: Reporter by Division (Rasio Temuan/Orang) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ with col_3a: st.markdown("
3a. Finding/Person Ratio by Division (Reporter)
", unsafe_allow_html=True) if avg_ratio_per_nama.empty: st.warning("No data for division-level reporter analysis.") else: sort_opt = st.selectbox("Show:", ["Top 10", "Bottom 10"], key='sort_3a') full_sorted = avg_ratio_per_nama.sort_values('avg_monthly_ratio', ascending=False) subset = full_sorted.head(10) if sort_opt == "Top 10" else full_sorted.tail(10).sort_values('avg_monthly_ratio', ascending=True) # Tambahkan warna: top 5 โ†’ hijau colored = add_color_by_global_rank(avg_ratio_per_nama, 'avg_monthly_ratio', top_n=5, high_is_good=True) subset = subset.merge(colored[['nama', 'color']], on='nama', how='left').fillna({'color': '#1f77b4'}) # Reverse untuk visual (tertinggi di atas) if sort_opt == "Top 10": subset = subset.iloc[::-1] fig = px.bar( subset, x='avg_monthly_ratio', y='nama', orientation='h', title=f'{sort_opt} Divisions', labels={'avg_monthly_ratio': 'Monthly Ratio', 'nama': 'Division'}, color='color', color_discrete_map={c: c for c in subset['color'].unique()}, text=subset['avg_monthly_ratio'].apply(lambda x: f'{x:.2f}') ) fig.update_layout(height=450, showlegend=False, yaxis={'categoryorder': 'array', 'categoryarray': subset['nama'].tolist()}) fig.update_traces(textposition='auto') st.plotly_chart(fig, use_container_width=True) # ๐Ÿ” Insight (dari full data) if len(full_sorted) >= 2: min_r, max_r, mean_r = full_sorted['avg_monthly_ratio'].min(), full_sorted['avg_monthly_ratio'].max(), full_sorted['avg_monthly_ratio'].mean() best, worst = full_sorted.iloc[0]['nama'], full_sorted.iloc[-1]['nama'] st.markdown( f"
" f"Insight: Division reporting ratio ranges from {min_r:.2f} to {max_r:.2f} (avg: {mean_r:.2f}). " f"{best} leads." # {worst} lags. f"Recommendation: Benchmark processes from {best}; assess capacity/tooling gaps in {worst}." f"
", unsafe_allow_html=True ) # โ”€โ”€โ”€ 3c: Reporter by Individual โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ with col_3c: st.markdown("
3c. Monthly Findings per Reporter (Individual)
", unsafe_allow_html=True) if avg_rate_per_creator.empty: st.warning("No data for individual reporter analysis.") else: sort_opt = st.selectbox("Show:", ["Top 10", "Bottom 10"], key='sort_3c') full_sorted = avg_rate_per_creator.sort_values('avg_monthly_rate', ascending=False) subset = full_sorted.head(10) if sort_opt == "Top 10" else full_sorted.tail(10).sort_values('avg_monthly_rate', ascending=True) colored = add_color_by_global_rank(avg_rate_per_creator, 'avg_monthly_rate', top_n=5, high_is_good=True) subset = subset.merge(colored[['creator_name', 'color']], on='creator_name', how='left').fillna({'color': '#1f77b4'}) if sort_opt == "Top 10": subset = subset.iloc[::-1] fig = px.bar( subset, x='avg_monthly_rate', y='creator_name', orientation='h', title=f'{sort_opt} Reporters', labels={'avg_monthly_rate': 'Monthly Findings', 'creator_name': 'Reporter'}, color='color', color_discrete_map={c: c for c in subset['color'].unique()}, text=subset['avg_monthly_rate'].apply(lambda x: f'{x:.2f}') ) fig.update_layout(height=450, showlegend=False, yaxis={'categoryorder': 'array', 'categoryarray': subset['creator_name'].tolist()}) fig.update_traces(textposition='auto') st.plotly_chart(fig, use_container_width=True) if len(full_sorted) >= 2: min_r, max_r, mean_r = full_sorted['avg_monthly_rate'].min(), full_sorted['avg_monthly_rate'].max(), full_sorted['avg_monthly_rate'].mean() top_reporter = full_sorted.iloc[0]['creator_name'] st.markdown( f"
" f"Insight: Individual reporting ranges from {min_r:.2f} to {max_r:.2f} findings/month (avg: {mean_r:.2f}). " f"{top_reporter} is the most active contributor. " f"Recommendation: Recognize top reporters; inspection causes of low activity (<0.5/month) via 1:1 review." f"
", unsafe_allow_html=True ) # โ”€โ”€โ”€ Baris 2: Executor โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ col_3b, col_3d = st.columns(2) # โ”€โ”€โ”€ 3b: Executor by Division (Lead Time) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ with col_3b: st.markdown("
3b. Monthly Lead Time by Division (Executor)
", unsafe_allow_html=True) if avg_leadtime_nama.empty: st.warning("No data for division-level executor analysis.") else: sort_opt = st.selectbox( "Show:", ["Top 10", "Bottom 10"], key='sort_3b' ) # Sort penuh sekali โ€” ascending: tercepat โ†’ terlambat full_sorted = avg_leadtime_nama.sort_values('avg_monthly_leadtime', ascending=True) # Ambil subset sesuai pilihan if sort_opt == "Top 10": # 10 tercepat: ascending (kecil โ†’ besar), tetap diurut ascending โ†’ tercepat di atas subset = full_sorted.head(10).sort_values('avg_monthly_leadtime', ascending=False) else: # "Bottom 10 Slowest" # 10 terlambat: descending (besar โ†’ kecil), agar terlambat di atas subset = full_sorted.tail(10).sort_values('avg_monthly_leadtime', ascending=False) # Warna: 5 terlambat secara global (bukan di subset!) โ†’ warna merah colored = add_color_by_global_rank( avg_leadtime_nama, 'avg_monthly_leadtime', worst_n=5, high_is_good=False # lebih tinggi = buruk ) subset = subset.merge(colored[['nama', 'color']], on='nama', how='left').fillna({'color': '#1f77b4'}) fig = px.bar( subset, x='avg_monthly_leadtime', y='nama', orientation='h', title=sort_opt, labels={'avg_monthly_leadtime': 'Monthly Lead Time (Days)', 'nama': 'Division'}, color='color', color_discrete_map={c: c for c in subset['color'].unique()}, text=subset['avg_monthly_leadtime'].apply(lambda x: f'{x:.1f}') ) fig.update_layout( height=450, showlegend=False, yaxis={'categoryorder': 'array', 'categoryarray': subset['nama'].tolist()} ) fig.update_traces(textposition='auto') st.plotly_chart(fig, use_container_width=True) if len(full_sorted) >= 2: min_lt = full_sorted['avg_monthly_leadtime'].min() max_lt = full_sorted['avg_monthly_leadtime'].max() mean_lt = full_sorted['avg_monthly_leadtime'].mean() fastest = full_sorted.iloc[0]['nama'] slowest = full_sorted.iloc[-1]['nama'] st.markdown( f"
" f"Insight: Resolution time ranges from {min_lt:.1f} to {max_lt:.1f} days (avg: {mean_lt:.1f}). " f"{slowest} has highest risk of SLA breach. " f"Recommendation: Initiate RCA for {slowest}; replicate workflow from {fastest}. Set SLA threshold at 7 days." f"
", unsafe_allow_html=True ) # โ”€โ”€โ”€ 3d: Executor by Individual โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ with col_3d: st.markdown( f"
3d. Monthly Lead Time per Executor (Individual)
", unsafe_allow_html=True ) if avg_leadtime_per_indiv.empty: st.warning(f"No data for individual executor analysis (column: '{EXECUTOR_INDIV_COL}').") else: sort_opt = st.selectbox( "Show:", ["Top 10", "Bottom 10"], key='sort_3d' ) full_sorted = avg_leadtime_per_indiv.sort_values('avg_monthly_leadtime', ascending=True) if sort_opt == "Top 10": subset = full_sorted.head(10).subset = full_sorted.head(10).sort_values('avg_monthly_leadtime', ascending=False) else: # "Bottom 10 Slowest" subset = full_sorted.tail(10).sort_values('avg_monthly_leadtime', ascending=False) # Warna berdasarkan ranking global colored = add_color_by_global_rank( avg_leadtime_per_indiv, 'avg_monthly_leadtime', worst_n=5, high_is_good=False ) id_col = EXECUTOR_INDIV_COL subset = subset.merge(colored[[id_col, 'color']], on=id_col, how='left').fillna({'color': '#1f77b4'}) fig = px.bar( subset, x='avg_monthly_leadtime', y=id_col, orientation='h', title=sort_opt, labels={'avg_monthly_leadtime': 'Monthly Lead Time (Days)', id_col: 'Executor'}, color='color', color_discrete_map={c: c for c in subset['color'].unique()}, text=subset['avg_monthly_leadtime'].apply(lambda x: f'{x:.1f}') ) fig.update_layout( height=450, showlegend=False, yaxis={'categoryorder': 'array', 'categoryarray': subset[id_col].tolist()} ) fig.update_traces(textposition='auto') st.plotly_chart(fig, use_container_width=True) if len(full_sorted) >= 2: min_lt = full_sorted['avg_monthly_leadtime'].min() max_lt = full_sorted['avg_monthly_leadtime'].max() mean_lt = full_sorted['avg_monthly_leadtime'].mean() slowest_exec = full_sorted.iloc[-1][id_col] fastest_exec = full_sorted.iloc[0][id_col] st.markdown( f"
" f"Insight: Executor performance ranges from {min_lt:.1f} to {max_lt:.1f} days (avg: {mean_lt:.1f}). " f"{slowest_exec} requires support to meet SLA. " f"Recommendation: Assign mentor to executors >7 days; document & share best practices from top performers (e.g., {fastest_exec})." f"
", unsafe_allow_html=True ) #Objective 4 try: from wordcloud import WordCloud import matplotlib.pyplot as plt import plotly.express as px from collections import Counter WORDCLOUD_AVAILABLE = True except ImportError: WORDCLOUD_AVAILABLE = False st.markdown("

OBJECTIVE 4 - Unsafe Issues: Which One is the Most Often?

", unsafe_allow_html=True) # ๐Ÿ”น Fungsi untuk membuat judul seragam def create_consistent_title(title_text): return f"
{title_text}
" if WORDCLOUD_AVAILABLE: # ๐Ÿ”ฅ Data untuk pie chart: semua kategori df_all_kategori = df_local.copy() # ๐Ÿ”ฅ Data untuk wordcloud: hanya Non-Positive df_filtered_kategori = df_local[df_local['temuan_kategori'] != 'Positive'] \ if 'temuan_kategori' in df_local.columns else df_local # 2 Kolom col1, col2 = st.columns(2) # === PIE CHART: Semua temuan_kategori (Dengan Custom Text Position untuk Unsafe Action & Near Miss) === with col1: st.markdown(create_consistent_title("Distribution of All Issue Categories"), unsafe_allow_html=True) if 'temuan_kategori' in df_all_kategori.columns: category_counts = df_all_kategori['temuan_kategori'].value_counts() if not category_counts.empty: # Mapping warna color_map = { 'Positive': '#2E7D32', # Hijau 'Unsafe Condition': '#EF5350', # Merah Muda 'Unsafe Action': '#F48FB1', # Pink 'Near Miss': '#BDBDBD' # Abu-abu } colors = [color_map.get(cat, '#9E9E9E') for cat in category_counts.index] # Buat pie chart tanpa legend fig_pie = px.pie( names=category_counts.index, values=category_counts.values, color_discrete_sequence=colors ) # โš™๏ธ Custom: tarik keluar slice untuk Unsafe Action & Near Miss pull_values = [] for cat in category_counts.index: if cat in ['Unsafe Action', 'Near Miss']: pull_values.append(0.1) else: pull_values.append(0.0) fig_pie.update_traces( textposition='outside', textinfo='percent+label', pull=pull_values, marker=dict(line=dict(color='#FFFFFF', width=1)) ) fig_pie.update_layout( showlegend=False, height=450, margin=dict(t=20, b=20, l=40, r=40) ) st.plotly_chart(fig_pie, use_container_width=True) # โœ… CUSTOM LEGEND st.markdown("
", unsafe_allow_html=True) legend_items = [] for cat in category_counts.index: color = color_map.get(cat, '#9E9E9E') item = ( f"{cat}" ) legend_items.append(item) legend_html = " | ".join(legend_items) st.markdown(f"
{legend_html}
", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) else: st.warning("No data available for pie chart.") else: st.warning("Column 'temuan_kategori' not available.") # === WORDCLOUD: Hanya Non-Positive (dari keyword_kategori) === with col2: st.markdown(create_consistent_title("Unsafe Issues"), unsafe_allow_html=True) st.markdown( """

Categorization uses NLP โ€” Natural Language Processing from random text

""", unsafe_allow_html=True ) if df_filtered_kategori.empty: st.warning("No data available after filtering out 'Positive' category.") else: if 'keyword_kategori' in df_filtered_kategori.columns: # Gabungkan & bersihkan teks import re text_series = df_filtered_kategori['keyword_kategori'].dropna().astype(str) text = ' '.join(text_series) text = re.sub(r'[^a-zA-Z\s]', ' ', text).strip() if text: # Generate wordcloud wordcloud = WordCloud( width=1600, height=800, background_color='white', colormap='viridis', max_words=1000, random_state=42 ).generate(text) # Tampilkan fig, ax = plt.subplots(figsize=(3, 2), dpi=200) ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') plt.tight_layout() st.pyplot(fig, use_container_width=True) # ๐Ÿ” ๐Ÿ”น Insight dinamis: top 3 keywords words_clean = [w.lower() for w in text.split() if len(w) > 2] top3 = [word.capitalize() for word, _ in Counter(words_clean).most_common(3)] if top3: insight = f"The most frequent unsafe issue themes are: {', '.join(top3)}." else: insight = "Recurring unsafe issue patterns are detectable in textual findings." st.markdown( f"""
Insight: {insight}
""", unsafe_allow_html=True ) else: st.warning("No valid text remaining after cleaning.") else: st.warning("Column 'keyword_kategori' not available.") else: st.info("WordCloud library not installed. Install `wordcloud` and `matplotlib` to enable this feature.") # =================== 5. Matrix (Tetap Dipertahankan) =================== # =================== 5. Matrix (Tetap Dipertahankan) =================== st.markdown("

OBJECTIVE 5 - Findings vs Lead Time: Which Companies Move Slow?

", unsafe_allow_html=True) import math import plotly.express as px import pandas as pd try: df_local_matrix = df.copy() # ============================ # 0. Filter: ONLY 1 COMPANY & 1 PROFILE (if applicable) # ============================ # (Skipped for general dashboard view) # ============================ # 1. Exclude Positive findings # ============================ if 'temuan_kategori' in df_local_matrix.columns: df_local_matrix = df_local_matrix[df_local_matrix["temuan_kategori"] != "Positive"] # ============================ # 2. Ensure datetime columns # ============================ df_local_matrix['created_at'] = pd.to_datetime(df_local_matrix['created_at'], errors='coerce') df_local_matrix['close_at'] = pd.to_datetime(df_local_matrix['close_at'], errors='coerce') # ============================ # 3. Compute LEAD TIME # ============================ df_local_matrix['lead_time_days'] = (df_local_matrix['close_at'] - df_local_matrix['created_at']).dt.days df_local_matrix['lead_time_days'] = df_local_matrix['lead_time_days'].fillna(0) # ============================ # 4. Average Monthly Finding Count per Operator # ============================ if 'nama' not in df_local_matrix.columns: st.error("โŒ Kolom 'nama' (operator) tidak ditemukan.") # st.stop() # Stop bisa dihilangkan agar script tetap jalan else: # Buat kolom bulan (YYYY-MM) df_local_matrix = df_local_matrix.assign(month=df_local_matrix['created_at'].dt.to_period('M').astype(str)) # Hitung jumlah temuan per operator per bulan monthly_counts = ( df_local_matrix .groupby(['nama', 'month'])['kode_temuan'] .nunique() .reset_index(name='monthly_count') ) # Hitung rata-rata bulanan per operator operator_avg = ( monthly_counts .groupby('nama')['monthly_count'] .mean() # <-- RATA-RATA per bulan (bukan total!) .reset_index(name='Finding Count') ) # ============================ # 5. 
Average Lead Time per Operator # ============================ operator_lead = ( df_local_matrix.groupby('nama')['lead_time_days'] .mean() .reset_index(name='Average Lead Time') ) # ============================ # 6. Merge Risk Matrix # ============================ risk_matrix = operator_avg.merge(operator_lead, on='nama', how='left') risk_matrix = risk_matrix.rename(columns={'nama': 'Operator Name'}) # Handle operator tanpa lead time (e.g., belum closed) risk_matrix['Average Lead Time'] = risk_matrix['Average Lead Time'].fillna(0) # ============================ # 7. Quadrant Logic (unchanged) # ============================ X_LIMIT = 20 Y_LIMIT = 3 def assign_quadrant(row): if row['Finding Count'] >= X_LIMIT and row['Average Lead Time'] >= Y_LIMIT: return "Quadrant I โ€“ High Leadtime & High Count" elif row['Finding Count'] < X_LIMIT and row['Average Lead Time'] >= Y_LIMIT: return "Quadrant II โ€“ High Leadtime but Low Count" elif row['Finding Count'] >= X_LIMIT and row['Average Lead Time'] < Y_LIMIT: return "Quadrant III โ€“ Low Leadtime but High Count" else: return "Quadrant IV โ€“ Low Leadtime & Low Count" risk_matrix['quadrant'] = risk_matrix.apply(assign_quadrant, axis=1) quadrant_count = risk_matrix['quadrant'].value_counts() # ============================ # 8. 
Scatter Plot (format visual tetap sam persis) # ============================ max_x = risk_matrix['Finding Count'].max() + 1 max_y = risk_matrix['Average Lead Time'].max() + 5 fig = px.scatter( risk_matrix, x='Finding Count', y='Average Lead Time', hover_name="Operator Name", size=[12] * len(risk_matrix), size_max=15, title="Audit Findings Risk Matrix: Avg Monthly Count vs Lead Time" ) # Background quadrant (same as original) fig.add_shape(type="rect", x0=X_LIMIT, x1=max_x, y0=Y_LIMIT, y1=max_y, fillcolor="rgba(255,0,0,0.25)", line_width=0) # Q1 fig.add_shape(type="rect", x0=0, x1=X_LIMIT, y0=Y_LIMIT, y1=max_y, fillcolor="rgba(255,150,50,0.25)", line_width=0) # Q2 fig.add_shape(type="rect", x0=X_LIMIT, x1=max_x, y0=0, y1=Y_LIMIT, fillcolor="rgba(255,200,200,0.25)", line_width=0) # Q3 fig.add_shape(type="rect", x0=0, x1=X_LIMIT, y0=0, y1=Y_LIMIT, fillcolor="rgba(0,120,255,0.15)", line_width=0) # Q4 fig.add_vline(x=X_LIMIT, line_dash="dash", line_color="black") fig.add_hline(y=Y_LIMIT, line_dash="dash", line_color="black") # Quadrant count annotations (same positions & style) fig.add_annotation(x=X_LIMIT + (max_x - X_LIMIT)/2, y=Y_LIMIT + (max_y - Y_LIMIT)/2, text=f"{quadrant_count.get('Quadrant I โ€“ High Leadtime & High Count',0)}", showarrow=False, font=dict(size=22, color="darkred")) fig.add_annotation(x=X_LIMIT/2, y=Y_LIMIT + (max_y - Y_LIMIT)/2, text=f"{quadrant_count.get('Quadrant II โ€“ High Leadtime but Low Count',0)}", showarrow=False, font=dict(size=22, color="orange")) fig.add_annotation(x=X_LIMIT + (max_x - X_LIMIT)/2, y=Y_LIMIT/2, text=f"{quadrant_count.get('Quadrant III โ€“ Low Leadtime but High Count',0)}", showarrow=False, font=dict(size=22, color="red")) fig.add_annotation(x=X_LIMIT/2, y=Y_LIMIT/2, text=f"{quadrant_count.get('Quadrant IV โ€“ Low Leadtime & Low Count',0)}", showarrow=False, font=dict(size=22, color="green")) st.plotly_chart(fig, use_container_width=True) # ============================ # 9. 
Summary Table # ============================ st.subheader("Summary (Avg Monthly Count vs Avg Lead Time)") st.dataframe( risk_matrix.sort_values("Finding Count", ascending=False), use_container_width=True ) except Exception as e: st.error(f"โš ๏ธ Error Risk Matrix: {e}") # st.exception(e) # Uncomment for debugging # st.exception(e) # Uncomment for debugging import streamlit as st import plotly.graph_objects as go import numpy as np import pandas as pd # =================== OBJECTIVE 6 - Predictive Dashboard & Early Warning Signals =================== st.markdown("

OBJECTIVE 6 โ€” Prediction for Early Warning Signals: Which Ones Have Less Future Inspections?

", unsafe_allow_html=True) # โœ… Enhanced CSS + Minimal Sortable JS st.markdown(""" """, unsafe_allow_html=True) #bloob line # ๐Ÿ”น Helper: Sparkline ala chart saham โ€” hijau kalau naik, merah kalau turun def ascii_sparkline_pln(data): if not data or len(data) == 0: return "โ€”" try: data = [float(x) for x in data] n = len(data) if n == 1: return "โ€ข" # Hitung slope (linear trend) xs = list(range(n)) x_mean = sum(xs) / n y_mean = sum(data) / n numerator = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, data)) denominator = sum((x - x_mean) ** 2 for x in xs) slope = numerator / (denominator + 1e-12) # Tentukan warna & label berdasarkan slope if slope > 0.001: color = "#2E7D32" # hijau (naik) label = "โ†‘" elif slope < -0.001: color = "#C62828" # merah (turun) label = "โ†“" else: color = "#546E7A" # abu-abu (datar) label = "โ†’" # Normalisasi koordinat Y untuk SVG height = 16 margin = 2 usable_h = height - 2 * margin min_y, max_y = min(data), max(data) if max_y == min_y: y_coords = [margin + usable_h / 2] * n else: y_coords = [ margin + usable_h * (1 - (v - min_y) / (max_y - min_y + 1e-9)) for v in data ] # Buat polyline points = " ".join(f"{i * 8},{y:.1f}" for i, y in enumerate(y_coords)) width = max(24, (n - 1) * 8 + 4) # SVG inline svg = ( f'' f'' f'{label}' f'' ) return svg except Exception: return "โš " # โ€”โ€”โ€”โ€”โ€”โ€”โ€” 1. 
# ——————— 1. Creators: ONLY Coverage < 90% AND Slope < 0 ———————
def predict_creators(df):
    """Flag reporters whose monthly reporting activity is declining.

    No Non-Positive filter is applied here (intentional — all findings count).
    A creator qualifies when monthly coverage (share of months with at least
    one report) is below 90% AND the linear trend slope is negative.

    Returns a DataFrame of the 10 steepest decliners (most negative slope
    first), or an empty DataFrame when the column is missing / no data.
    """
    if 'creator_name' not in df.columns or df.empty:
        return pd.DataFrame()

    start_month = df['created_at'].min().to_period('M')
    end_month = df['created_at'].max().to_period('M')
    all_months = pd.period_range(start=start_month, end=end_month, freq='M')

    # Pivot to a zero-filled creator × month matrix, then back to long form,
    # so months with no reports count as explicit zeros in the trend.
    df_monthly = (
        df.groupby(['creator_name', df['created_at'].dt.to_period('M')])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=all_months, fill_value=0)
        .stack()
        .reset_index(name='count')
    )
    df_monthly.columns = ['Creator', 'Month', 'Count']

    results = []
    for creator, group in df_monthly.groupby('Creator'):
        ts = group.set_index('Month')['Count']
        total = len(all_months)
        active = (ts > 0).sum()
        coverage = active / total if total > 0 else 0
        avg_rate = ts.mean()
        if len(ts) >= 2:
            try:
                slope = np.polyfit(np.arange(len(ts)), ts.values, 1)[0]
            except Exception:
                # FIX: was a bare `except:` — never swallow SystemExit /
                # KeyboardInterrupt; skip only genuine fit failures.
                continue
            # FILTER: coverage < 90% AND declining trend
            if slope < 0 and coverage < 0.9:
                reason = f"Slope = {slope:.3f}, Coverage = {coverage*100:.1f}%. Avg: {avg_rate:.2f}/mo."
                results.append({
                    'Creator': creator,
                    'Reports/Month': round(avg_rate, 2),
                    'Monthly Consistency (%)': round(coverage * 100, 1),
                    'Trend Slope': round(slope, 3),
                    'Trend': ascii_sparkline_pln(ts.values.tolist()),
                    'Reason': reason
                })

    df_res = pd.DataFrame(results)
    # Take the 10 creators with the most negative slope (steepest decline).
    return df_res.sort_values('Trend Slope', ascending=True).head(10) if not df_res.empty else df_res
# ——————— 2. Locations: ONLY Coverage < 90% AND Slope < 0 ———————
def predict_locations(df):
    """Flag locations whose monthly inspection reporting is declining.

    No Non-Positive filter is applied (intentional). A location qualifies
    when monthly coverage is below 90% AND the linear trend slope is
    negative. Returns the 10 steepest decliners (most negative slope
    first), or an empty DataFrame when the column is missing / no data.
    """
    if 'nama_lokasi_full' not in df.columns or df.empty:
        return pd.DataFrame()

    start_month = df['created_at'].min().to_period('M')
    end_month = df['created_at'].max().to_period('M')
    all_months = pd.period_range(start=start_month, end=end_month, freq='M')

    # Zero-fill missing months so reporting gaps lower the trend.
    df_monthly = (
        df.groupby(['nama_lokasi_full', df['created_at'].dt.to_period('M')])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=all_months, fill_value=0)
        .stack()
        .reset_index(name='count')
    )
    df_monthly.columns = ['Location', 'Month', 'Count']

    results = []
    for lokasi, group in df_monthly.groupby('Location'):
        ts = group.set_index('Month')['Count']
        total = len(all_months)
        active = (ts > 0).sum()
        coverage = active / total if total > 0 else 0
        avg_rate = ts.mean()
        if len(ts) >= 2:
            try:
                slope = np.polyfit(np.arange(len(ts)), ts.values, 1)[0]
            except Exception:
                # FIX: was a bare `except:` — narrow to Exception so
                # interpreter-level signals are not swallowed.
                continue
            # FILTER: coverage < 90% AND declining trend
            if slope < 0 and coverage < 0.9:
                reason = f"Slope = {slope:.3f}, Coverage = {coverage*100:.1f}%. Avg: {avg_rate:.2f}/mo."
                results.append({
                    'Location': lokasi,
                    'Reports/Month': round(avg_rate, 2),
                    'Monthly Consistency (%)': round(coverage * 100, 1),
                    'Trend Slope': round(slope, 3),
                    'Trend': ascii_sparkline_pln(ts.values.tolist()),
                    'Reason': reason
                })

    df_res = pd.DataFrame(results)
    # Take the 10 locations with the most negative slope (steepest decline).
    return df_res.sort_values('Trend Slope', ascending=True).head(10) if not df_res.empty else df_res
# ——————— 3. Divisions: ONLY Coverage < 90% AND Slope < 0 ———————
def predict_divisions(df):
    """Flag divisions (column 'nama') with declining monthly reporting.

    No Non-Positive filter is applied (intentional). A division qualifies
    when monthly coverage is below 90% AND the linear trend slope is
    negative. Returns the 10 steepest decliners (most negative slope
    first), or an empty DataFrame when the column is missing / no data.
    """
    if 'nama' not in df.columns or df.empty:
        return pd.DataFrame()

    start_month = df['created_at'].min().to_period('M')
    end_month = df['created_at'].max().to_period('M')
    all_months = pd.period_range(start=start_month, end=end_month, freq='M')

    # Zero-fill missing months so reporting gaps lower the trend.
    df_monthly = (
        df.groupby(['nama', df['created_at'].dt.to_period('M')])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=all_months, fill_value=0)
        .stack()
        .reset_index(name='count')
    )
    df_monthly.columns = ['Division', 'Month', 'Count']

    results = []
    for div, group in df_monthly.groupby('Division'):
        ts = group.set_index('Month')['Count']
        total = len(all_months)
        active = (ts > 0).sum()
        coverage = active / total if total > 0 else 0
        avg_rate = ts.mean()
        if len(ts) >= 2:
            try:
                slope = np.polyfit(np.arange(len(ts)), ts.values, 1)[0]
            except Exception:
                # FIX: was a bare `except:` — narrow to Exception so
                # interpreter-level signals are not swallowed.
                continue
            # FILTER: coverage < 90% AND declining trend
            if slope < 0 and coverage < 0.9:
                reason = f"Slope = {slope:.3f}, Coverage = {coverage*100:.1f}%. Avg: {avg_rate:.2f}/mo."
                results.append({
                    'Division': div,
                    'Reports/Month': round(avg_rate, 2),
                    'Monthly Consistency (%)': round(coverage * 100, 1),
                    'Trend Slope': round(slope, 3),
                    'Trend': ascii_sparkline_pln(ts.values.tolist()),
                    'Reason': reason
                })

    df_res = pd.DataFrame(results)
    # Take the 10 divisions with the most negative slope (steepest decline).
    return df_res.sort_values('Trend Slope', ascending=True).head(10) if not df_res.empty else df_res
# ——————— 4. Categories: Non-Positive only, ranked by rising trend ———————
def predict_categories(df):
    """Rank non-'Positive' issue categories by their monthly trend slope.

    Unlike the creator/location/division predictors, the 100%-coverage &
    positive-slope filter is intentionally DISABLED here (data too sparse);
    every category is kept, labelled 'High-Risk Rising' (slope > 0.2) or
    'Emerging Rising', and the top 10 by slope are returned.
    """
    # FIX: guard the column before filtering — the original indexed
    # df['temuan_kategori'] unconditionally and raised KeyError when the
    # column was absent, unlike the sibling predictors which all guard.
    if 'temuan_kategori' in df.columns:
        df = df[df['temuan_kategori'] != 'Positive'].copy()
    if 'kategori' not in df.columns or df.empty:
        return pd.DataFrame()

    start_month = df['created_at'].min().to_period('M')
    end_month = df['created_at'].max().to_period('M')
    all_months = pd.period_range(start=start_month, end=end_month, freq='M')
    n_months = len(all_months)

    results = []
    for cat, group in df.groupby('kategori'):
        # Monthly counts, zero-filled for months with no findings.
        ts_data = (
            group.groupby(group['created_at'].dt.to_period('M'))
            .size()
            .reindex(all_months, fill_value=0)
        )
        total_reports = ts_data.sum()
        avg_per_month = total_reports / n_months if n_months > 0 else 0
        active_months = (ts_data > 0).sum()
        coverage = active_months / n_months if n_months > 0 else 0

        slope = 0.0
        if len(ts_data) >= 2:
            try:
                slope = np.polyfit(np.arange(len(ts_data)), ts_data.values, 1)[0]
            except Exception:
                # FIX: was a bare `except:` — keep best-effort behaviour
                # (slope stays 0.0) without swallowing interpreter signals.
                pass

        results.append({
            'Category': cat,
            'Avg/Month': round(avg_per_month, 2),
            'Monthly Consistency (%)': round(coverage * 100, 1),
            'Trend Slope': round(slope, 3),
            'Trend': ascii_sparkline_pln(ts_data.values.tolist())
        })

    df_res = pd.DataFrame(results)
    if not df_res.empty:
        # NOTE: the Coverage == 100% AND slope > 0 filter is deliberately
        # disabled (dead `df_res = df_res` and commented mask removed);
        # re-enable here if sparse data stops being a concern.
        df_res['Status'] = df_res['Trend Slope'].apply(
            lambda s: "High-Risk Rising" if s > 0.2 else "Emerging Rising"
        )
        df_res = df_res.sort_values('Trend Slope', ascending=False)

    return df_res.reset_index(drop=True).head(10) if not df_res.empty else df_res

# ——————— RUN ———————
df_creator = predict_creators(df_filtered)
df_location = predict_locations(df_filtered)
df_division = predict_divisions(df_filtered)
df_category = predict_categories(df_filtered)

# 🎯 PANEL 1: Creators (FILTERED: Coverage < 90% & Slope < 0)
", unsafe_allow_html=True) st.markdown("
1. Which Reporters Are Predicted to Have Less Future Inspections? (Top 10 Most Declining)
", unsafe_allow_html=True) if not df_creator.empty: cols = ['Creator', 'Reports/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Trend'] # ๐Ÿ”ฅ Rename hanya untuk DISPLAY, bukan data asli df_display = df_creator[cols].rename(columns={ "Reason": "Reason Forecast" }) html = df_display.to_html(escape=False, index=False, table_id="tbl-creators") st.markdown(f"
{html}
", unsafe_allow_html=True) # st.markdown( # "
" # "Criteria: Coverage < 90% AND negative slope. " # "High-risk: steep negative slope + low baseline activity." # "
", # unsafe_allow_html=True # ) # else: # st.markdown( # "
" # "

" # "No creators meet criteria: Coverage < 90% and negative trend." # "

" # "
" # "๐Ÿ’ก Note: Creators with Coverage โ‰ฅ 90% are excluded โ€” they are considered stable reporters." # "
" # "
", # unsafe_allow_html=True # ) st.markdown("
", unsafe_allow_html=True) # ๐ŸŽฏ PANEL 2: Locations (FILTERED: Coverage < 90% & Slope < 0) st.markdown("
", unsafe_allow_html=True) st.markdown("
2. Which Locations Are Predicted to Have Less Future Inspections? (Top 10 Most Declining)
", unsafe_allow_html=True) if not df_location.empty: cols = ['Location', 'Reports/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Trend'] # # ๐Ÿ”ฅ Rename hanya untuk DISPLAY, bukan data asli df_display = df_location[cols].rename(columns={ "Reason": "Reason Forecast" }) html = df_display.to_html(escape=False, index=False, table_id="tbl-locations") st.markdown(f"
{html}
", unsafe_allow_html=True) # st.markdown( # "
" # "Criteria: Coverage < 90% AND negative slope. " # "High-risk: steep negative slope + low baseline activity." # "
", # unsafe_allow_html=True # ) # else: # st.markdown( # "
" # "

" # "No locations meet criteria: Coverage < 90% and negative trend." # "

" # "
" # "๐Ÿ’ก Note: Locations with Coverage โ‰ฅ 90% are excluded โ€” they are considered stable reporting zones." # "
" # "
", # unsafe_allow_html=True # ) st.markdown("
", unsafe_allow_html=True) # ๐ŸŽฏ PANEL 3: Divisions (FILTERED: Coverage < 90% & Slope < 0) st.markdown("
", unsafe_allow_html=True) st.markdown("
3. Which Divisions Are Predicted to Have Less Future Inspections? (Top 10 Most Declining)
", unsafe_allow_html=True) if not df_division.empty: cols = ['Division', 'Reports/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Trend'] # # ๐Ÿ”ฅ Rename hanya untuk DISPLAY, bukan data asli df_display = df_division[cols].rename(columns={ "Reason": "Reason Forecast" }) html = df_display.to_html(escape=False, index=False, table_id="tbl-divisions") st.markdown(f"
{html}
", unsafe_allow_html=True) # st.markdown( # "
" # "Criteria: Coverage < 90% AND negative slope. " # "High-risk: steep negative slope + low baseline activity." # "
", # unsafe_allow_html=True # ) # else: # st.markdown( # "
" # "

" # "No divisions meet criteria: Coverage < 90% and negative trend." # "

" # "
" # "๐Ÿ’ก Note: Divisions with Coverage โ‰ฅ 90% are excluded โ€” they are considered stable reporting zones." # "
" # "
", # unsafe_allow_html=True # ) st.markdown("
", unsafe_allow_html=True) # ๐ŸŽฏ PANEL 4: Categories (FILTERED: Non-Positive + Coverage=100% & Rising) # st.markdown("
", unsafe_allow_html=True) st.markdown( "
" "4. Which Issue Categories Are Likely to Appear in the Next 3 Months (Increasing Trend Only)" "" "   (* Categorization uses NLP โ€” Natural Language Processing from random text)" "" "
", unsafe_allow_html=True ) # if not df_category.empty: # cols = ['Category', 'Avg/Month', 'Coverage (%)', 'Trend Slope', 'Status', 'Trend'] # # ๐Ÿ”ต Rename ONLY for display # df_display = df_category[cols].rename(columns={ # "Status": "Status Issue for Next Month" # }) # html = df_display.to_html(escape=False, index=False, table_id="tbl-categories") # st.markdown(f"
{html}
", unsafe_allow_html=True) # st.markdown( # "
" # "Filtered: Reported every month (100% coverage) with increasing trend. " # "Avg/Month = total รท months. " # "High-Risk Rising = slope > 0.2." # "
", # unsafe_allow_html=True # ) # else: # st.markdown( # "
" # "

" # "โš ๏ธ No rising categories with 100% monthly coverage." # "

" # "

" # "Consider relaxing coverage filter if data is sparse." # "

", # unsafe_allow_html=True # ) # st.markdown("
", unsafe_allow_html=True) # =================== WHITEBOARD STYLE CHART FOR PANEL 4 =================== # st.markdown("

Whiteboard Insight: Trend vs Frequency

", unsafe_allow_html=True) # Buat chart scatter dengan gaya whiteboard if not df_category.empty: # Ambil data untuk scatter df_plot = df_category.copy() df_plot['Size'] = df_plot['Avg/Month'] # Ukuran lingkaran = frekuensi (Avg/Month) df_plot['Y'] = df_plot['Trend Slope'] # Y = Trend Slope # Buat scatter plot fig = go.Figure() # Tambahkan scatter fig.add_trace(go.Scatter( x=df_plot['Category'], y=df_plot['Y'], mode='markers', marker=dict( size=df_plot['Size'] * 1.5, # Skala ukuran agar tidak terlalu besar color='#003DA5', line=dict(width=2, color='white'), opacity=0.8 ), text=df_plot['Category'], textposition="top center", textfont=dict(size=10, color="#003DA5", family="Arial"), hovertemplate="%{text}
Trend Slope: %{y:.3f}
Avg/Month: %{marker.size:.1f}", )) # Layout fig.update_layout( # title=dict(text="Issue Category Trend vs Frequency (Non-Positive)", x=0.5, y=0.95), xaxis=dict( title="Category", tickangle=45, showgrid=True, gridcolor="#e0e0e0", gridwidth=1, tickfont=dict(size=10, color="#003DA5") ), yaxis=dict( title="Trend Slope", showgrid=True, gridcolor="#e0e0e0", gridwidth=1, tickfont=dict(size=10, color="#003DA5") ), plot_bgcolor="white", paper_bgcolor="white", height=500, margin=dict(t=60, b=60, l=60, r=60), font=dict(family="Arial", size=12, color="#003DA5"), showlegend=False ) # Tambahkan teks manual seperti whiteboard fig.add_annotation( xref="paper", yref="paper", x=0.05, y=0.95, showarrow=False, font=dict(size=12, color="#003DA5"), textangle=-90, align="center", visible=False ) fig.add_annotation( xref="paper", yref="paper", x=0.5, y=0.05, # text="", showarrow=False, font=dict(size=12, color="#003DA5"), align="center", visible=False ) fig.add_annotation( xref="paper", yref="paper", x=0.95, y=0.95, text="Number of Findings
โ€ข 10
โ€ขโ€ข 20
โ€ขโ€ขโ€ข 30
โ€ขโ€ขโ€ขโ€ข 40", showarrow=False, font=dict(size=12, color="#003DA5"), align="left" ) fig.add_annotation( xref="paper", yref="paper", x=0.05, y=0.1, text="Semakin tinggi = semakin sering ditemukan deviasi
Semakin besar = semakin banyak ditemukan deviasi
", showarrow=False, font=dict(size=12, color="#003DA5"), align="left" ) st.plotly_chart(fig, use_container_width=True) # Insight # st.markdown("### ๐Ÿ’ก Insight") # insight_text = ( # f"
" # f"The scatter plot visualizes issue categories with 100% coverage and positive trend. " # f"Categories higher on the Y-axis indicate stronger upward trends, while larger circles indicate higher average monthly frequency. " # f"This helps identify which non-positive issues are both increasing in trend and frequently reported โ€” these should be prioritized for intervention." # f"
" # ) # st.markdown(insight_text, unsafe_allow_html=True) else: st.info("No data available for non-positive issue categories with 100% coverage and positive trend.") import streamlit as st import pandas as pd from huggingface_hub import InferenceClient # ========================== # LLM FUNCTION (HuggingFace) # ========================== def llm_generate_recommendation(insights_text): client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct") prompt = f""" You are an expert Safety & Reliability Agentic AI. Based on the following structured INSIGHT SUMMARY, create: 1. Recommended Action (max 2 sentences) 2. Risk Mitigation Strategy (max 2 sentences) The insights: {insights_text} Now generate concise, high-impact: - "recommendation" - "mitigation" Return output in EXACT JSON format: {{ "recommendation": "...", "mitigation": "..." }} """ output = client.text_generation(prompt, max_new_tokens=500, temperature=0.4) return output # ============================================== # === INSIGHT COMPUTATION FUNCTION (your code) # ============================================== def extract_agentic_insights_v5(df: pd.DataFrame): dev = { "lowest_ratio_9_locs": [], "obj3a_lowest_div": None, "obj3b_slowest_executor": None, "obj3c_lowest_reporter": None, "obj3d_slowest_div": None, "obj4_unsafe_condition_pct": 0.0, "obj4_unsafe_action_pct": 0.0, "obj4_near_miss_pct": 0.0, "obj5_q1_divs": [], "obj5_q2_divs": [], "obj6_top2_categories": [], } # === 1. 
9 locations with lowest finding-to-reporter ratio === if {'nama_lokasi_full', 'creator_nid', 'created_at', 'kode_temuan'}.issubset(df.columns): calc = df[['nama_lokasi_full', 'creator_nid', 'created_at', 'kode_temuan']].copy() calc['created_at'] = pd.to_datetime(calc['created_at'], errors='coerce') calc = calc.dropna(subset=['created_at', 'nama_lokasi_full', 'creator_nid']) calc['bulan'] = calc['created_at'].dt.to_period('M') monthly = calc.groupby(['nama_lokasi_full', 'bulan']).agg( findings=('kode_temuan', 'size'), reporters=('creator_nid', 'nunique') ).reset_index() monthly = monthly[monthly['reporters'] > 0] monthly['ratio'] = monthly['findings'] / monthly['reporters'] loc_avg = monthly.groupby('nama_lokasi_full')['ratio'].mean() lowest_9 = loc_avg.nsmallest(9) dev["lowest_ratio_9_locs"] = [(loc, round(ratio, 3)) for loc, ratio in lowest_9.items()] # === 2a Lowest-ratio division === if {'nama', 'creator_nid', 'created_at', 'kode_temuan'}.issubset(df.columns): calc = df[['nama', 'creator_nid', 'created_at', 'kode_temuan']].copy() calc['bulan'] = pd.to_datetime(calc['created_at']).dt.to_period('M') agg = calc.groupby(['nama', 'bulan']).agg( findings=('kode_temuan', 'size'), reporters=('creator_nid', 'nunique') ) agg = agg[agg['reporters'] > 0].reset_index() agg['ratio'] = agg['findings'] / agg['reporters'] div_ratio = agg.groupby('nama')['ratio'].mean() if not div_ratio.empty: dev["obj3a_lowest_div"] = (div_ratio.idxmin(), round(div_ratio.min(), 2)) # === 2b Slowest executor if 'days_to_close' in df.columns: valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)] exec_col = 'nama_pic' if 'nama_pic' in valid.columns else 'creator_name' if exec_col in valid.columns: lead = valid.groupby(exec_col)['days_to_close'].mean() if not lead.empty: dev["obj3b_slowest_executor"] = (lead.idxmax(), round(lead.max(), 1)) # === 2c Lowest reporter freq if {'creator_name', 'created_at'}.issubset(df.columns): calc = df[['creator_name', 'created_at']].copy() 
calc['bulan'] = pd.to_datetime(calc['created_at']).dt.to_period('M') monthly = calc.groupby(['creator_name', 'bulan']).size().reset_index(name='count') avg = monthly.groupby('creator_name')['count'].mean() avg = avg[avg > 0] if not avg.empty: dev["obj3c_lowest_reporter"] = (avg.idxmin(), round(avg.min(), 2)) # === 2d Slowest division resolution if 'days_to_close' in df.columns and 'nama' in df.columns: valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)] if not valid.empty: lead = valid.groupby('nama')['days_to_close'].mean() if not lead.empty: dev["obj3d_slowest_div"] = (lead.idxmax(), round(lead.max(), 1)) # === 3. Non-Positive findings composition if 'temuan_kategori' in df.columns: cnt = df['temuan_kategori'].value_counts(normalize=True) * 100 dev["obj4_unsafe_condition_pct"] = round(cnt.get("Unsafe Condition", 0), 1) dev["obj4_unsafe_action_pct"] = round(cnt.get("Unsafe Action", 0), 1) dev["obj4_near_miss_pct"] = round(cnt.get("Near Miss", 0), 1) # === 4. Quadrants X_LIMIT, Y_LIMIT = 20, 3 if {'nama', 'created_at', 'days_to_close', 'kode_temuan'}.issubset(df.columns): calc = df.copy() calc['created_at'] = pd.to_datetime(calc['created_at'], errors='coerce') calc = calc.assign(month=calc['created_at'].dt.to_period('M').astype(str)) monthly_counts = calc.groupby(['nama', 'month'])['kode_temuan'].nunique().reset_index() avg_count = monthly_counts.groupby('nama')['kode_temuan'].mean().reset_index(name='Finding Count') leadtime = calc.groupby('nama')['days_to_close'].mean().reset_index(name='Avg Lead Time') mat = avg_count.merge(leadtime, on='nama', how='left').fillna(0) for _, r in mat.iterrows(): if r['Finding Count'] >= X_LIMIT and r['Avg Lead Time'] >= Y_LIMIT: dev["obj5_q1_divs"].append(r['nama']) elif r['Finding Count'] < X_LIMIT and r['Avg Lead Time'] >= Y_LIMIT: dev["obj5_q2_divs"].append(r['nama']) # === 5. 
Top categories if {'kategori', 'temuan_kategori', 'created_at'}.issubset(df.columns): nonpos = df[df['temuan_kategori'] != 'Positive'] if not nonpos.empty: start = nonpos['created_at'].min().to_period('M') end = nonpos['created_at'].max().to_period('M') n_months = len(pd.period_range(start=start, end=end, freq='M')) cat_avg = (nonpos.groupby('kategori').size() / n_months).sort_values(ascending=False).head(2) dev["obj6_top2_categories"] = [(cat, round(val, 1)) for cat, val in cat_avg.items()] return dev # ========================== # ===== MAIN APP =========== # ========================== # app.py import streamlit as st import pandas as pd import json from typing import List, Dict # Transformers pipeline for lightweight local LLM (text2text) from transformers import pipeline import math # ===================================================================== # OBJECTIVE 7 โ€” INSIGHT & RECOMMENDATION (LLM FIRST, RULE-BASED IF FAIL) # ===================================================================== st.markdown("

OBJECTIVE 7 โ€” Risk Mitigation: What are the Insights and Recommendations?

", unsafe_allow_html=True) # ============================================================ # 1. LLM LOADER # ============================================================ import torch from transformers import AutoTokenizer, AutoModelForCausalLM @st.cache_resource def load_llm_model(): try: tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it") model = AutoModelForCausalLM.from_pretrained( "google/gemma-2-2b-it", torch_dtype=torch.float16, device_map="auto" ) return tokenizer, model except: return None, None def llm_generate_recommendation(insight_summary_text): """ Try to generate JSON recommendation using LLM. If LLM fails โ†’ return None, and rule-based will be used. """ tok, mdl = load_llm_model() if tok is None or mdl is None: return None try: prompt = f""" You are an expert Industrial Safety Analyst AI. Below is an INSIGHT SUMMARY from a Safety Reporting System: --- {insight_summary_text} --- Generate 5 Recommended Actions and Risk Mitigation in clean JSON: {{ "recommendations": [ {{"point":"1","rec":"...","mit":"..."}}, {{"point":"2","rec":"...","mit":"..."}}, {{"point":"3","rec":"...","mit":"..."}} ] }} """ inputs = tok(prompt, return_tensors="pt").to(mdl.device) out = mdl.generate( **inputs, max_new_tokens=380, temperature=0.25, do_sample=True ) text = tok.decode(out[0], skip_special_tokens=True) import re, json json_match = re.search(r"\{[\s\S]*\}", text) if not json_match: return None return json.loads(json_match.group(0)) except: return None # ============================================================ # 2. 
# 2. RULE-BASED ENGINE (YOUR ORIGINAL SCRIPT)
# ============================================================
def extract_agentic_insights_v5(df: pd.DataFrame):
    """Derive deterministic safety insights from the findings dataframe.

    Every section guards on the columns it needs, so missing columns simply
    leave the corresponding entry at its empty default.

    Args:
        df: findings dataframe; relevant columns include 'nama_lokasi_full',
            'creator_nid', 'created_at', 'kode_temuan', 'nama', 'nama_pic',
            'creator_name', 'days_to_close', 'temuan_kategori', 'kategori'.

    Returns:
        dict keyed by objective ("lowest_ratio_9_locs", "obj3a_lowest_div",
        ... "obj6_top2_categories") holding tuples/lists/percentages.
    """
    dev = {
        "lowest_ratio_9_locs": [],
        "obj3a_lowest_div": None,
        "obj3b_slowest_executor": None,
        "obj3c_lowest_reporter": None,
        "obj3d_slowest_div": None,
        "obj4_unsafe_condition_pct": 0.0,
        "obj4_unsafe_action_pct": 0.0,
        "obj4_near_miss_pct": 0.0,
        "obj5_q1_divs": [],
        "obj5_q2_divs": [],
        "obj6_top2_categories": [],
    }

    # === 1. 9 lowest locations ===
    # Mean monthly findings-per-reporter ratio per location; lowest nine.
    if {'nama_lokasi_full', 'creator_nid', 'created_at', 'kode_temuan'}.issubset(df.columns):
        calc = df[['nama_lokasi_full', 'creator_nid', 'created_at', 'kode_temuan']].copy()
        calc['created_at'] = pd.to_datetime(calc['created_at'], errors='coerce')
        calc = calc.dropna(subset=['created_at', 'nama_lokasi_full', 'creator_nid'])
        calc['bulan'] = calc['created_at'].dt.to_period('M')
        monthly = calc.groupby(['nama_lokasi_full', 'bulan']).agg(
            findings=('kode_temuan', 'size'),
            reporters=('creator_nid', 'nunique')
        ).reset_index()
        monthly = monthly[monthly['reporters'] > 0]
        monthly['ratio'] = monthly['findings'] / monthly['reporters']
        loc_avg = monthly.groupby('nama_lokasi_full')['ratio'].mean()
        lowest_9 = loc_avg.nsmallest(9)
        dev["lowest_ratio_9_locs"] = [(loc, round(ratio, 3)) for loc, ratio in lowest_9.items()]

    # === 2a: Lowest division ratio ===
    if {'nama', 'creator_nid', 'created_at', 'kode_temuan'}.issubset(df.columns):
        calc = df[['nama', 'creator_nid', 'created_at', 'kode_temuan']].copy()
        # FIX: coerce bad timestamps instead of raising (consistent with section 1).
        calc['bulan'] = pd.to_datetime(calc['created_at'], errors='coerce').dt.to_period('M')
        agg = calc.groupby(['nama', 'bulan']).agg(
            findings=('kode_temuan', 'size'),
            reporters=('creator_nid', 'nunique')
        )
        agg = agg[agg['reporters'] > 0].reset_index()
        agg['ratio'] = agg['findings'] / agg['reporters']
        div_ratio = agg.groupby('nama')['ratio'].mean()
        if not div_ratio.empty:
            dev["obj3a_lowest_div"] = (div_ratio.idxmin(), round(div_ratio.min(), 2))

    # === 2b: Slowest executor ===
    if 'days_to_close' in df.columns:
        valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)]
        # Prefer the PIC column when present; fall back to the creator.
        exec_col = 'nama_pic' if 'nama_pic' in valid.columns else 'creator_name'
        if exec_col in valid.columns:
            lead = valid.groupby(exec_col)['days_to_close'].mean()
            if not lead.empty:
                dev["obj3b_slowest_executor"] = (lead.idxmax(), round(lead.max(), 1))

    # === 2c: Lowest reporter ===
    if {'creator_name', 'created_at'}.issubset(df.columns):
        calc = df[['creator_name', 'created_at']].copy()
        # FIX: coerce bad timestamps instead of raising (consistent with section 1).
        calc['bulan'] = pd.to_datetime(calc['created_at'], errors='coerce').dt.to_period('M')
        monthly = calc.groupby(['creator_name', 'bulan']).size().reset_index(name='count')
        avg = monthly.groupby('creator_name')['count'].mean()
        avg = avg[avg > 0]
        if not avg.empty:
            dev["obj3c_lowest_reporter"] = (avg.idxmin(), round(avg.min(), 2))

    # === 2d: Slowest division ===
    if 'days_to_close' in df.columns and 'nama' in df.columns:
        valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)]
        if not valid.empty:
            lead = valid.groupby('nama')['days_to_close'].mean()
            if not lead.empty:
                dev["obj3d_slowest_div"] = (lead.idxmax(), round(lead.max(), 1))

    # === 3. Non-positive ===
    # Percentage composition of each finding category over all rows.
    if 'temuan_kategori' in df.columns:
        cnt = df['temuan_kategori'].value_counts(normalize=True) * 100
        dev["obj4_unsafe_condition_pct"] = round(cnt.get("Unsafe Condition", 0), 1)
        dev["obj4_unsafe_action_pct"] = round(cnt.get("Unsafe Action", 0), 1)
        dev["obj4_near_miss_pct"] = round(cnt.get("Near Miss", 0), 1)

    # === 4. Risk Quadrant ===
    # QI: many findings AND slow closure; QII: few findings but slow closure.
    X_LIMIT, Y_LIMIT = 20, 3
    if {'nama', 'created_at', 'days_to_close', 'kode_temuan'}.issubset(df.columns):
        calc = df.copy()
        calc['created_at'] = pd.to_datetime(calc['created_at'], errors='coerce')
        calc = calc.assign(month=calc['created_at'].dt.to_period('M').astype(str))
        monthly_counts = calc.groupby(['nama', 'month'])['kode_temuan'].nunique().reset_index()
        avg_count = monthly_counts.groupby('nama')['kode_temuan'].mean().reset_index(name='Finding Count')
        leadtime = calc.groupby('nama')['days_to_close'].mean().reset_index(name='Avg Lead Time')
        mat = avg_count.merge(leadtime, on='nama', how='left').fillna(0)
        for _, r in mat.iterrows():
            if r['Finding Count'] >= X_LIMIT and r['Avg Lead Time'] >= Y_LIMIT:
                dev["obj5_q1_divs"].append(r['nama'])
            elif r['Finding Count'] < X_LIMIT and r['Avg Lead Time'] >= Y_LIMIT:
                dev["obj5_q2_divs"].append(r['nama'])

    # === 5. Top categories ===
    # Average monthly count of the two most frequent non-"Positive" categories.
    if {'kategori', 'temuan_kategori', 'created_at'}.issubset(df.columns):
        nonpos = df[df['temuan_kategori'] != 'Positive']
        if not nonpos.empty:
            start = nonpos['created_at'].min().to_period('M')
            end = nonpos['created_at'].max().to_period('M')
            n_months = len(pd.period_range(start=start, end=end, freq='M'))
            cat_avg = (nonpos.groupby('kategori').size() / n_months).sort_values(ascending=False).head(2)
            dev["obj6_top2_categories"] = [(cat, round(v, 1)) for cat, v in cat_avg.items()]

    return dev


# ============================================================
# 3. RUN INSIGHT ENGINE
# ============================================================
dev = extract_agentic_insights_v5(df_filtered)

# Build Insight Summary Text
insight_lines = []

if dev["lowest_ratio_9_locs"]:
    loc_list = ", ".join([f"{loc} ({ratio})" for loc, ratio in dev["lowest_ratio_9_locs"]])
    insight_lines.append(f"1. Nine locations with the lowest finding-to-reporter ratio: {loc_list}.")

parts = []
if dev["obj3a_lowest_div"]:
    parts.append(f"division {dev['obj3a_lowest_div'][0]} (ratio: {dev['obj3a_lowest_div'][1]})")
if dev["obj3c_lowest_reporter"]:
    parts.append(f"reporter {dev['obj3c_lowest_reporter'][0]} ({dev['obj3c_lowest_reporter'][1]} findings/month)")
if dev["obj3d_slowest_div"]:
    parts.append(f"division {dev['obj3d_slowest_div'][0]} ({dev['obj3d_slowest_div'][1]} days)")
if dev["obj3b_slowest_executor"]:
    parts.append(f"executor {dev['obj3b_slowest_executor'][0]} ({dev['obj3b_slowest_executor'][1]} days)")
if parts:
    insight_lines.append("2. Uneven operational capacity: " + "; ".join(parts))

uc, ua, nm = dev["obj4_unsafe_condition_pct"], dev["obj4_unsafe_action_pct"], dev["obj4_near_miss_pct"]
if uc + ua + nm > 0:
    insight_lines.append(f"3. Non-Positive composition: Unsafe Condition ({uc}%), Unsafe Action ({ua}%), Near Miss ({nm}%).")

if dev["obj5_q1_divs"] or dev["obj5_q2_divs"]:
    q1 = ", ".join([f"{x}" for x in dev["obj5_q1_divs"][:5]])
    q2 = ", ".join([f"{x}" for x in dev["obj5_q2_divs"][:5]])
    insight_lines.append(f"4. High-risk divisions (QI): {q1 or '—'}. Hidden-risk (QII): {q2 or '—'}.")

# FIX: the original did `c1, c2 = dev["obj6_top2_categories"]`, which raises
# ValueError whenever fewer than two non-Positive categories exist. Guard on
# length and degrade gracefully to a single-category message.
if len(dev["obj6_top2_categories"]) >= 2:
    c1, c2 = dev["obj6_top2_categories"][:2]
    insight_lines.append(f"5. Top recurring non-Positive categories: {c1[0]} ({c1[1]}/mo) & {c2[0]} ({c2[1]}/mo).")
elif len(dev["obj6_top2_categories"]) == 1:
    c1 = dev["obj6_top2_categories"][0]
    insight_lines.append(f"5. Top recurring non-Positive category: {c1[0]} ({c1[1]}/mo).")

insight_text = "\n\n".join(insight_lines)

# ============================================================
# 4. TRY LLM FIRST → ELSE FALLBACK RULE-BASED
# ============================================================
llm_json = llm_generate_recommendation(insight_text)

if llm_json is not None and "recommendations" in llm_json:
    recs = llm_json["recommendations"]
else:
    # FALLBACK RULE-BASED: one canned recommendation per insight that fired.
    recs = []
    if dev["lowest_ratio_9_locs"]:
        # FIX: "at across" typo in the user-facing text.
        recs.append({"point": "1",
                     "rec": "Launch spot-inspection sprint across the 9 lowest-ratio locations.",
                     "mit": "Enable 3-min QR checklist + automated WhatsApp reminders."})
    if parts:
        recs.append({"point": "2",
                     "rec": "Real-time monitoring of finding/reporter ratios and resolution lead times per division/individual.",
                     "mit": "Trigger coaching alerts to Area PICs & Division"})
    if uc + ua + nm > 0:
        recs.append({"point": "3",
                     "rec": "Enforce photo-based validation for Unsafe Condition/Action/Near Miss submissions to ensure accurate categorization.",
                     "mit": "System blocks submission if photo evidence or category justification is missing."})
    if dev["obj5_q1_divs"] or dev["obj5_q2_divs"]:
        recs.append({"point": "4",
                     "rec": "Assign dedicated safety crews to Quadrant I divisions; enforce One Finding, One Day closure policy for Quadrant II.",
                     "mit": "Auto-generate executive escalation reports to VP Operations if any division remains in QI/QII for ≥2 consecutive months."})
    # FIX: same two-element unpacking crash as above; guard on length.
    if len(dev["obj6_top2_categories"]) >= 2:
        c1, c2 = dev["obj6_top2_categories"][:2]
        recs.append({"point": "5",
                     "rec": f"Create RCA task force for {c1[0]} & {c2[0]}.",
                     "mit": "Update tender specs with required mitigations."})
    elif len(dev["obj6_top2_categories"]) == 1:
        c1 = dev["obj6_top2_categories"][0]
        recs.append({"point": "5",
                     "rec": f"Create RCA task force for {c1[0]}.",
                     "mit": "Update tender specs with required mitigations."})

# ============================================================
# 5. RENDERING
# ============================================================
# NOTE(review): the original HTML markup inside these st.markdown strings was
# stripped by extraction; reconstructed from the surviving text remnants
# ("Summary", "Recommended Actions & Agentic Risk Mitigation", and the
# "# / Recommended Action / Risk Mitigation Strategy" header cells). Confirm
# class names against the dashboard's CSS.

# Insight Summary Card
st.markdown(
    f"""
<div class="insight-card">
  <h4>Summary</h4>
  <p style="white-space: pre-line;">{insight_text}</p>
</div>
""",
    unsafe_allow_html=True
)

# Recommendation Table
if recs:
    rows = "".join([
        f"<tr><td>{r['point']}</td>"
        f"<td>{r['rec']}</td>"
        f"<td>{r['mit']}</td></tr>"
        for r in recs
    ])
    st.markdown(
        f"""
<div class="insight-card">
  <h4>Recommended Actions & Agentic Risk Mitigation</h4>
  <table>
    <thead>
      <tr><th>#</th><th>Recommended Action</th><th>Risk Mitigation Strategy</th></tr>
    </thead>
    <tbody>{rows}</tbody>
  </table>
</div>
""",
        unsafe_allow_html=True
    )
else:
    st.info("No actionable insights generated.")