import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from datetime import datetime, timedelta
from typing import List
import os
import sklearn
import kaleido
from transformers import pipeline
# =================== PAGE CONFIG ===================
st.set_page_config(
page_title="Proactive Safety Intelligence & Analytics Dashboard",
page_icon="",
layout="wide",
initial_sidebar_state="expanded"
)
# =================== CUSTOM CSS (Updated for PLN Colors) ===================
st.markdown("""""", unsafe_allow_html=True)
# =================== DATA LOADING (FROM data.xlsx) ===================
@st.cache_data(ttl=300) # refresh every 5 min
def load_data():
file_path = "data.xlsx"
if not os.path.exists(file_path):
st.error(f"โ File **`{file_path}`** not found. Please ensure it's in the same directory as this script.")
return pd.DataFrame()
try:
# Load Excel file
df = pd.read_excel(file_path, sheet_name='Sheet1', engine='openpyxl')
# Check for required columns
required_cols = ['created_at']
missing = [c for c in required_cols if c not in df.columns]
if missing:
st.error(f"โ Missing required columns: {missing}. Available: {list(df.columns)}")
return pd.DataFrame()
# Parse datetime
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
if df['created_at'].isna().all():
st.error("โ `created_at` column could not be parsed as datetime.")
return pd.DataFrame()
# Optional: close_at
if 'close_at' in df.columns:
df['close_at'] = pd.to_datetime(df['close_at'], errors='coerce')
df['days_to_close'] = (df['close_at'] - df['created_at']).dt.total_seconds() / (24 * 3600)
df['days_to_close'] = df['days_to_close'].apply(lambda x: x if x >= 0 else np.nan)
else:
df['days_to_close'] = np.nan
# Derived columns
df['created_month'] = df['created_at'].dt.to_period('M')
df['created_date'] = df['created_at'].dt.date
df['created_week'] = df['created_at'].dt.to_period('W')
# Keep only valid rows
df = df.dropna(subset=['created_at']).copy()
# Log shape
st.sidebar.success(f"Loaded {len(df):,} Audit Findings from `data.xlsx`")
return df
    except Exception as e:
        st.error(f"Error loading data.xlsx: {e}")
        return pd.DataFrame()
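# Expected data.xlsx schema (columns referenced throughout this script; adjust
# to your export): created_at, close_at, kode_temuan, nama, nama_perusahaan,
# nama_lokasi_full, creator_nid, creator_name, temuan_status, temuan_kategori,
# kategori, keyword_kategori, temuan_nama_distrik / creator_nama_distrik,
# temuan_kode_distrik, nama_pic.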
df = load_data()
if df.empty:
st.stop()
# =================== SIDEBAR FILTERS (Revised) ===================
st.sidebar.markdown("""Filter Dashboard""", unsafe_allow_html=True)
# Initialize df_filtered
df_filtered = df.copy()
# Flag to track if filters were applied
filters_applied = False
# 1. Date Range Filter
min_date = df['created_at'].min().date()
max_date = df['created_at'].max().date()
date_range = st.sidebar.date_input(
"Date Range",
value=(min_date, max_date),
min_value=min_date,
max_value=max_date
)
# 2. Filter by Vendor (nama_perusahaan) - Default to All
if 'nama_perusahaan' in df.columns:
unique_vendors = sorted(df['nama_perusahaan'].dropna().astype(str).unique())
all_vendors_option = "All Vendors"
vendor_options = [all_vendors_option] + list(unique_vendors)
selected_vendor = st.sidebar.selectbox("Vendor", vendor_options, index=0) # Default to "All"
if selected_vendor != all_vendors_option:
df_filtered = df_filtered[df_filtered['nama_perusahaan'].astype(str) == selected_vendor]
filters_applied = True
# 3. Filter by Area/Unit Type (temuan_nama_distrik or creator_nama_distrik) - Renamed
area_col = None
if 'temuan_nama_distrik' in df_filtered.columns:
area_col = 'temuan_nama_distrik'
elif 'creator_nama_distrik' in df_filtered.columns:
area_col = 'creator_nama_distrik'
if area_col:
# Define mapping for display names
area_mapping = {
'UMRO': 'Unit Maintenance',
'UP GRESIK': 'Unit Pembangkit'
}
unique_areas_raw = sorted(df_filtered[area_col].dropna().astype(str).unique())
# Map raw values to display names, keep unmapped values as is
unique_areas_display = [area_mapping.get(area, area) for area in unique_areas_raw]
# Prepend "All" option
all_areas_option = "All Units"
area_options = [all_areas_option] + unique_areas_display
selected_area_display = st.sidebar.selectbox("Unit Type", area_options, index=0) # Default to "All"
if selected_area_display != all_areas_option:
# Reverse map the selected display name back to the raw value for filtering
selected_area_raw = next((raw for raw, disp in area_mapping.items() if disp == selected_area_display), selected_area_display)
df_filtered = df_filtered[df_filtered[area_col].astype(str) == selected_area_raw]
filters_applied = True
# 4. Status filter - Changed to selectbox (dropdown)
status_filter_applied = False
if 'temuan_status' in df_filtered.columns:
all_status = sorted(df_filtered['temuan_status'].dropna().astype(str).unique())
# Prepend "All" option
all_status_option = "All Status"
status_options = [all_status_option] + list(all_status)
selected_status = st.sidebar.selectbox(
"Status",
status_options,
index=0 # Default to "All"
)
if selected_status != all_status_option:
df_filtered = df_filtered[df_filtered['temuan_status'].astype(str) == selected_status]
status_filter_applied = True
filters_applied = True
# Apply date filter *after* other filters
if len(date_range) == 2:
df_filtered = df_filtered[
(df_filtered['created_at'].dt.date >= date_range[0]) &
(df_filtered['created_at'].dt.date <= date_range[1])
]
if date_range[0] != min_date or date_range[1] != max_date:
filters_applied = True
# Submit Button
submit_clicked = st.sidebar.button("Apply Filters")
# Apply filters logic when button is clicked
if submit_clicked:
# The filtering based on selections already happened above
# Here we just update the summary based on the current state of df_filtered
active_filters = []
if 'selected_vendor' in locals() and selected_vendor != all_vendors_option:
active_filters.append(f"Vendor: {selected_vendor}")
if 'selected_area_display' in locals() and selected_area_display != all_areas_option:
active_filters.append(f"Unit: {selected_area_display}")
if 'selected_status' in locals() and selected_status != all_status_option:
active_filters.append(f"Status: {selected_status}")
if len(date_range) == 2 and (date_range[0] != min_date or date_range[1] != max_date):
active_filters.append(f"Date: {date_range[0]} to {date_range[1]}")
if active_filters:
st.sidebar.success("**Active Filters:**")
for f in active_filters:
st.sidebar.markdown(f"- {f}")
st.sidebar.info(f"Showing {len(df_filtered)} records based on filters.")
else:
st.sidebar.info("No specific filters applied (showing all records).")
else:
# Show default message when not submitted yet
st.sidebar.info("Set filters and click 'Apply Filters'.")
# =================== HEADER ===================
# ====================== HEADER CSS ===========================
# CSS so the header renders as a single large box
st.markdown("""
""", unsafe_allow_html=True)
# ==== HEADER LAYOUT (Columns) ====
with st.container():
    col1, col2, col3 = st.columns([1, 5, 1])  # center column wider
    # LEFT (empty)
    with col1:
        st.write("")
    # CENTER - TITLE
    with col2:
        st.markdown("""Proactive Safety Intelligence & Analytics Dashboard""", unsafe_allow_html=True)
    # RIGHT - LOGO
    with col3:
        try:
            st.image("pln.png", width=120)  # enlarged from 80px
        except Exception:
            st.write("")  # leave blank if the logo is missing
# =================== OBJECTIVE 1 - Company Reporting Activity (Polar Bar Chart) ===================
st.markdown(
    """
    OBJECTIVE 1 - Company Reporting Activity: Who Reports the Most?
    """,
    unsafe_allow_html=True
)
df_local = df_filtered.copy()
df_local['created_month'] = df_local['created_at'].dt.to_period('M')
if 'temuan_kode_distrik' in df_local.columns:
df_local['Area_Type'] = df_local['temuan_kode_distrik'].apply(
lambda x: 'PG' if 'PG' in str(x).upper()
else 'UM' if 'UM' in str(x).upper()
else 'Other'
)
df_pg = df_local[df_local['Area_Type'] == 'PG'].copy()
df_um = df_local[df_local['Area_Type'] == 'UM'].copy()
    # --- Compute the per-company ratio ---
def calculate_avg_ratio_per_company(df_area):
if df_area.empty:
return pd.DataFrame()
findings = df_area.groupby(['created_month', 'nama_perusahaan']).size().reset_index(name='findings_count')
creators = df_area.groupby(['created_month', 'nama_perusahaan'])['creator_nid'].nunique().reset_index(name='unique_creators')
merged = findings.merge(creators, on=['created_month', 'nama_perusahaan'], how='outer')
merged = merged.fillna({'findings_count': 0, 'unique_creators': 0})
merged = merged[merged['unique_creators'] > 0]
merged['ratio'] = merged['findings_count'] / merged['unique_creators']
merged['ratio'] = merged['ratio'].replace([np.inf, -np.inf], np.nan)
avg_ratio = merged.groupby('nama_perusahaan')['ratio'].mean().reset_index(name='avg_monthly_ratio')
return avg_ratio.dropna(subset=['avg_monthly_ratio'])
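# Worked example for the ratio above: a company with 12 findings and 4 unique
# reporters in a month scores 12 / 4 = 3.0 for that month; avg_monthly_ratio
# is then the mean of those monthly scores per company.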
avg_ratio_pg = calculate_avg_ratio_per_company(df_pg)
avg_ratio_um = calculate_avg_ratio_per_company(df_um)
# GLOBAL PALETTE & COLOR MAPPING - consistent across charts
PLN_COLOR = "#FFD700"
PASTEL_BLUES = [
    "#A8DADC", "#E2ECE9", "#CCE4E7", "#B5D9D9",
    "#98C8D1", "#7FB9C1", "#6BA9B3", "#5A9CB5", "#4A8FA7", "#3A8399"
]  # extended a bit in case there are many contractors
# Get all unique non-PLN companies (from the full data), sorted alphabetically -> deterministic color assignment
all_companies = pd.concat([avg_ratio_pg, avg_ratio_um])['nama_perusahaan'].dropna().unique()
non_pln_companies = sorted([c for c in all_companies if 'PLN' not in str(c).upper()])
# Mapping: company -> color (PLN gets a dedicated color, others take the pastel blues in order)
COMPANY_COLOR_MAP = {}
for i, company in enumerate(non_pln_companies):
COMPANY_COLOR_MAP[company] = PASTEL_BLUES[i % len(PASTEL_BLUES)]
# Make sure PLN is also in the mapping (if present)
for company in all_companies:
if 'PLN' in str(company).upper():
COMPANY_COLOR_MAP[company] = PLN_COLOR
# Helper: look up a company's color by name (consistent across charts!)
def get_color(company_name):
    return COMPANY_COLOR_MAP.get(company_name, "#CCCCCC")  # grey fallback
# Chart builder - uses the global color mapping
def create_polar_bar_chart(df, area_name):
if df.empty:
return None
    # Sort for visual stability (ascending ratio)
df = df.sort_values('avg_monthly_ratio', ascending=True).reset_index(drop=True)
companies = df['nama_perusahaan'].tolist()
ratios = df['avg_monthly_ratio'].tolist()
colors = [get_color(comp) for comp in companies]
    # Compute each company's share of findings (used for bar widths)
total_findings = df_local[df_local['Area_Type'] == area_name].groupby('nama_perusahaan').size()
angles = [total_findings.get(comp, 0) / total_findings.sum() * 360 if total_findings.sum() > 0 else 0 for comp in companies]
    # Compute mid-angle positions
mid_angles = []
current = 0
for a in angles:
mid_angles.append(current + a / 2)
current += a
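    # Each bar's angular width is the company's share of findings times 360
    # degrees, and mid_angles centers each bar on its wedge so the bars tile
    # the full circle.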
fig = go.Figure()
    # Single trace only - cleaner; the legend is assembled manually below
fig.add_trace(go.Barpolar(
r=ratios,
theta=mid_angles,
width=angles,
marker_color=colors,
marker_line_color="white",
marker_line_width=1.2,
opacity=0.9,
hovertemplate="%{text} Avg Ratio: %{r:.2f}",
text=companies,
showlegend=False # Kita akan buat legend manual yang rapi
))
    # MANUAL LEGEND - show each company exactly once, with its consistent color
    # Add one invisible "dummy" scatter per company to populate the legend
for company in sorted(set(companies)):
color = get_color(company)
fig.add_trace(go.Scatterpolar(
r=[None], theta=[None], # invisible
mode='markers',
marker=dict(color=color, size=10),
name=company,
showlegend=True
))
fig.update_layout(
title=f'{area_name} Area',
polar=dict(
radialaxis=dict(
visible=True,
tickfont=dict(size=9, color="gray"),
gridcolor='lightgray',
title=dict(text='Avg Finding/Person', font=dict(size=10, color="gray"))
),
angularaxis=dict(
visible=True,
direction='clockwise',
tickfont=dict(size=9, color="white"),
showline=False,
gridcolor="lightgray"
),
),
showlegend=True,
legend=dict(
orientation="v",
yanchor="top",
y=1,
xanchor="right",
x=1.02,
font=dict(size=10)
),
height=450,
margin=dict(t=40, b=20, l=20, r=40)
)
return fig
# Plot
col1, col2 = st.columns(2)
with col1:
st.markdown("
Unit Pembangkit: Monthly Finding by Company
", unsafe_allow_html=True)
fig_pg = create_polar_bar_chart(avg_ratio_pg, 'PG')
if fig_pg:
st.plotly_chart(fig_pg, use_container_width=True)
if not avg_ratio_pg.empty:
top = avg_ratio_pg.loc[avg_ratio_pg['avg_monthly_ratio'].idxmax()]
low = avg_ratio_pg.loc[avg_ratio_pg['avg_monthly_ratio'].idxmin()]
st.markdown("### Insight")
st.markdown(
f"
"
f"In PG Area, {top['nama_perusahaan']} has the highest ratio ({top['avg_monthly_ratio']:.2f}), "
f"while {low['nama_perusahaan']} has the lowest ({low['avg_monthly_ratio']:.2f}). "
f"Consider cross-learning between them to standardize reporting culture."
f"
",
unsafe_allow_html=True
)
else:
st.warning("No data for PG area.")
with col2:
st.markdown("
Unit Maintenance: Monthly Finding by Company
", unsafe_allow_html=True)
fig_um = create_polar_bar_chart(avg_ratio_um, 'UM')
if fig_um:
st.plotly_chart(fig_um, use_container_width=True)
if not avg_ratio_um.empty:
top = avg_ratio_um.loc[avg_ratio_um['avg_monthly_ratio'].idxmax()]
low = avg_ratio_um.loc[avg_ratio_um['avg_monthly_ratio'].idxmin()]
st.markdown("### Insight")
st.markdown(
f"
"
f"Across all companies, the finding-per-person ratio is similar in the UM Area "
f"
",
unsafe_allow_html=True
)
else:
st.warning("No data for UM area.")
else:
st.error("Column 'temuan_kode_distrik' not found.")
st.stop()
# =================== OBJECTIVE 2 โ Active vs Inactive Locations (Treemap with Color Gradient) ===================
st.markdown(
    """
    OBJECTIVE 2 - Active vs Inactive Locations: Who Leads?
    """,
    unsafe_allow_html=True
)
df_local = df_filtered.copy()
if df_local.empty:
st.warning("No data available after filtering.")
st.stop()
df_local['created_month'] = df_local['created_at'].dt.to_period('M')
# Count findings per month per location
findings_by_location_month = df_local.groupby(['created_month', 'nama_lokasi_full']).size().reset_index(name='findings_count')
# Count unique reporters per month per location
creators_by_location_month = df_local.groupby(['created_month', 'nama_lokasi_full'])['creator_nid'].nunique().reset_index(name='unique_creators')
# Merge
merged_loc = findings_by_location_month.merge(creators_by_location_month, on=['created_month', 'nama_lokasi_full'], how='outer')
# Fill NaN with 0
merged_loc = merged_loc.fillna({'findings_count': 0, 'unique_creators': 0})
# Avoid division by zero
merged_loc = merged_loc[merged_loc['unique_creators'] > 0]
# Compute the ratio
merged_loc['ratio'] = merged_loc['findings_count'] / merged_loc['unique_creators']
merged_loc['ratio'] = merged_loc['ratio'].replace([np.inf, -np.inf], np.nan)
# Monthly average per location
avg_ratio_per_location = merged_loc.groupby('nama_lokasi_full')['ratio'].mean().reset_index(name='avg_monthly_ratio')
avg_ratio_per_location = avg_ratio_per_location.dropna(subset=['avg_monthly_ratio'])
if avg_ratio_per_location.empty:
st.write("Data kosong.")
else:
    # Treemap - built & rendered only once
fig_treemap = px.treemap(
avg_ratio_per_location,
path=['nama_lokasi_full'],
values='avg_monthly_ratio',
title='Avg Monthly Finding/Person Ratio by Location',
labels={
'avg_monthly_ratio': 'Avg Monthly Finding/Person Ratio',
'nama_lokasi_full': 'Location'
},
color='avg_monthly_ratio',
color_continuous_scale=[
[0.0, '#D32F2F'], # Red (low activity)
[0.5, '#FFB300'], # Amber/orange-yellow (medium)
[1.0, '#4CAF50'] # Green (high activity)
],
        hover_data={'avg_monthly_ratio': ':.2f'}  # make sure the hover format shows 2 decimals
)
    # Fix layout & hover - do this *before* rendering
fig_treemap.update_traces(
hovertemplate="%{label} Avg Ratio: %{value:.2f}"
)
fig_treemap.update_layout(
title=dict(text='Avg Monthly Finding/Person Ratio by Location', x=0.5, xanchor='center'),
height=600,
margin=dict(t=50, l=25, r=25, b=25)
)
    # Render only once!
st.plotly_chart(fig_treemap, use_container_width=True)
# === AI Insight ===
top_location = avg_ratio_per_location.loc[avg_ratio_per_location['avg_monthly_ratio'].idxmax()]
low_location = avg_ratio_per_location.loc[avg_ratio_per_location['avg_monthly_ratio'].idxmin()]
st.markdown("### Insight")
insight_text = (
f"
"
f"{top_location['nama_lokasi_full']} shows the highest activity level "
f"({top_location['avg_monthly_ratio']:.2f} findings per person/month). "
f"In contrast, {low_location['nama_lokasi_full']} has the lowest activity level "
f"({low_location['avg_monthly_ratio']:.2f}).
"
f"๐ข High-ratio (green) locations indicate either high reporting diligence or high exposure to risks โ both warrant deeper root-cause analysis. "
f"๐ก Medium-ratio (yellow) areas need monitoring to ensure trends do not deteriorate. "
f"๐ด Low-ratio (red) locations may reflect under-reporting, lack of engagement, or low hazard visibility โ recommend follow-up audits to verify data completeness."
f"
"
)
st.markdown(insight_text, unsafe_allow_html=True)
# =================== OBJECTIVE 3 - Frequency & Response Time ===================
st.markdown(
"""
OBJECTIVE 3 โ Frequency & Response Time: Who Reports Well? Who Executes Well?
""",
unsafe_allow_html=True
)
df_local = df_filtered.copy()
if df_local.empty:
st.warning("No data available after filtering.")
st.stop()
df_local['created_month'] = df_local['created_at'].dt.to_period('M')
# ─── Helper: Compute the ratio per division (reporter) ────────────────────────
def compute_reporter_ratio_by_nama(df):
if 'nama' not in df.columns:
return pd.DataFrame()
findings_by_nama_month = df.groupby(['created_month', 'nama']).size().reset_index(name='findings_count')
creators_by_nama_month = df.groupby(['created_month', 'nama'])['creator_nid'].nunique().reset_index(name='unique_creators')
merged_rep = findings_by_nama_month.merge(creators_by_nama_month, on=['created_month', 'nama'], how='outer')
merged_rep = merged_rep.fillna({'findings_count': 0, 'unique_creators': 0})
merged_rep = merged_rep[merged_rep['unique_creators'] > 0]
merged_rep['ratio'] = merged_rep['findings_count'] / merged_rep['unique_creators']
merged_rep['ratio'] = merged_rep['ratio'].replace([np.inf, -np.inf], np.nan)
avg_ratio_per_nama = merged_rep.groupby('nama')['ratio'].mean().reset_index(name='avg_monthly_ratio')
avg_ratio_per_nama = avg_ratio_per_nama.dropna(subset=['avg_monthly_ratio'])
return avg_ratio_per_nama
# ─── Helper: Compute the average findings per reporter (individual) ───────────
def compute_reporter_rate_by_creator(df):
if 'creator_name' not in df.columns:
return pd.DataFrame()
findings_by_creator_month = df.groupby(['created_month', 'creator_name']).size().reset_index(name='findings_count')
active_months_by_creator = findings_by_creator_month.groupby('creator_name')['created_month'].nunique().reset_index(name='active_months')
total_findings_by_creator = findings_by_creator_month.groupby('creator_name')['findings_count'].sum().reset_index()
merged_rep_creator = total_findings_by_creator.merge(active_months_by_creator, on='creator_name', how='outer')
merged_rep_creator = merged_rep_creator.fillna({'findings_count': 0, 'active_months': 0})
merged_rep_creator = merged_rep_creator[merged_rep_creator['active_months'] > 0]
merged_rep_creator['avg_monthly_rate'] = merged_rep_creator['findings_count'] / merged_rep_creator['active_months']
merged_rep_creator['avg_monthly_rate'] = merged_rep_creator['avg_monthly_rate'].replace([np.inf, -np.inf], np.nan)
avg_rate_per_creator = merged_rep_creator.dropna(subset=['avg_monthly_rate'])
return avg_rate_per_creator
# ─── Helper: Compute lead time per division (executor) ────────────────────────
def compute_executor_leadtime_by_nama(df):
if 'nama' not in df.columns or 'days_to_close' not in df.columns:
return pd.DataFrame()
    # Keep only rows with a valid lead time
df_valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)]
leadtime_by_nama_month = df_valid.groupby(['created_month', 'nama'])['days_to_close'].mean().reset_index(name='avg_leadtime')
avg_leadtime_nama = leadtime_by_nama_month.groupby('nama')['avg_leadtime'].mean().reset_index(name='avg_monthly_leadtime')
avg_leadtime_nama = avg_leadtime_nama.dropna(subset=['avg_monthly_leadtime'])
return avg_leadtime_nama
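# Note: avg_monthly_leadtime above is a mean of monthly means, so each active
# month weighs equally regardless of how many findings it had; a direct
# groupby('nama')['days_to_close'].mean() would weight by finding count instead.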
# ─── Helper: Compute lead time per individual executor (auto column detection) ─
def compute_executor_leadtime_by_individual(df, name_col='creator_name'):
if name_col not in df.columns or 'days_to_close' not in df.columns:
return pd.DataFrame()
df_valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)]
leadtime_by_indiv_month = df_valid.groupby(['created_month', name_col])['days_to_close'].mean().reset_index(name='avg_leadtime')
avg_leadtime_indiv = leadtime_by_indiv_month.groupby(name_col)['avg_leadtime'].mean().reset_index(name='avg_monthly_leadtime')
avg_leadtime_indiv = avg_leadtime_indiv.dropna(subset=['avg_monthly_leadtime'])
return avg_leadtime_indiv
# ─── Detect the individual-executor column ────────────────────────────────────
EXECUTOR_INDIV_COL = None
candidate_executor_cols = ['pic', 'pic_name', 'responsible', 'responsible_name', 'assigned_to', 'closed_by', 'executor_name', 'executor']
for col in candidate_executor_cols:
if col in df_local.columns:
EXECUTOR_INDIV_COL = col
break
if EXECUTOR_INDIV_COL is None:
    # Fallback - use 'nama_pic' (transparent warning kept below, currently disabled)
    EXECUTOR_INDIV_COL = 'nama_pic'
    # st.warning(
    #     "No dedicated executor column (e.g., 'pic', 'responsible') found. "
    #     "Using 'nama_pic' as a proxy for executor - insights may conflate reporters & executors. "
    #     "Consider adding an executor identifier column for accuracy."
    # )
# ─── Compute all metrics ──────────────────────────────────────────────────────
avg_ratio_per_nama = compute_reporter_ratio_by_nama(df_local) # 3a
avg_rate_per_creator = compute_reporter_rate_by_creator(df_local) # 3c
avg_leadtime_nama = compute_executor_leadtime_by_nama(df_local) # 3b
avg_leadtime_per_indiv = compute_executor_leadtime_by_individual(df_local, name_col=EXECUTOR_INDIV_COL) # 3d
# ─── Helper: Color by global ranking ──────────────────────────────────────────
def add_color_by_global_rank(df, value_col, top_n=5, worst_n=5, high_is_good=True):
    df = df.copy()
    df['color'] = '#1f77b4'  # default blue
    if len(df) == 0:
        return df
    name_col = 'nama' if 'nama' in df.columns else df.columns[0]
    if high_is_good:
        # High value = good -> top N -> green
        top_names = df.nlargest(top_n, value_col)[name_col]
        df.loc[df[name_col].isin(top_names), 'color'] = '#4CAF50'
    else:
        # High value = bad (e.g., lead time) -> worst N (highest) -> red
        worst_names = df.nlargest(worst_n, value_col)[name_col]
        df.loc[df[name_col].isin(worst_names), 'color'] = '#D32F2F'
    return df
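# Usage note: with high_is_good=True the 5 globally largest values turn green
# (reporting ratios); with high_is_good=False the 5 largest turn red (lead
# times, where bigger is worse). Everything else keeps the default blue.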
# ─── Layout: 2 rows x 2 columns ───────────────────────────────────────────────
col_3a, col_3c = st.columns(2)
# ─── 3a: Reporter by Division (Finding/Person Ratio) ──────────────────────────
with col_3a:
st.markdown("
3a. Finding/Person Ratio by Division (Reporter)
", unsafe_allow_html=True)
if avg_ratio_per_nama.empty:
st.warning("No data for division-level reporter analysis.")
else:
sort_opt = st.selectbox("Show:", ["Top 10", "Bottom 10"], key='sort_3a')
full_sorted = avg_ratio_per_nama.sort_values('avg_monthly_ratio', ascending=False)
subset = full_sorted.head(10) if sort_opt == "Top 10" else full_sorted.tail(10).sort_values('avg_monthly_ratio', ascending=True)
        # Add color: global top 5 -> green
colored = add_color_by_global_rank(avg_ratio_per_nama, 'avg_monthly_ratio', top_n=5, high_is_good=True)
subset = subset.merge(colored[['nama', 'color']], on='nama', how='left').fillna({'color': '#1f77b4'})
        # Reverse for display (highest at the top)
if sort_opt == "Top 10":
subset = subset.iloc[::-1]
fig = px.bar(
subset, x='avg_monthly_ratio', y='nama', orientation='h',
title=f'{sort_opt} Divisions',
labels={'avg_monthly_ratio': 'Monthly Ratio', 'nama': 'Division'},
color='color', color_discrete_map={c: c for c in subset['color'].unique()},
text=subset['avg_monthly_ratio'].apply(lambda x: f'{x:.2f}')
)
fig.update_layout(height=450, showlegend=False, yaxis={'categoryorder': 'array', 'categoryarray': subset['nama'].tolist()})
fig.update_traces(textposition='auto')
st.plotly_chart(fig, use_container_width=True)
        # Insight (from the full data)
if len(full_sorted) >= 2:
min_r, max_r, mean_r = full_sorted['avg_monthly_ratio'].min(), full_sorted['avg_monthly_ratio'].max(), full_sorted['avg_monthly_ratio'].mean()
best, worst = full_sorted.iloc[0]['nama'], full_sorted.iloc[-1]['nama']
            st.markdown(
                f"Insight: Division reporting ratio ranges from {min_r:.2f} to {max_r:.2f} (avg: {mean_r:.2f}). "
                f"{best} leads; {worst} lags. "
                f"Recommendation: Benchmark processes from {best}; assess capacity/tooling gaps in {worst}.",
                unsafe_allow_html=True
            )
# ─── 3c: Reporter by Individual ───────────────────────────────────────────────
with col_3c:
    st.markdown("""3c. Avg Monthly Findings per Reporter (Individual)""", unsafe_allow_html=True)
    if avg_rate_per_creator.empty:
        st.warning("No data for individual reporter analysis.")
    else:
        full_sorted = avg_rate_per_creator.sort_values('avg_monthly_rate', ascending=False)
        if len(full_sorted) >= 2:
            min_r = full_sorted['avg_monthly_rate'].min()
            max_r = full_sorted['avg_monthly_rate'].max()
            mean_r = full_sorted['avg_monthly_rate'].mean()
            top_reporter = full_sorted.iloc[0]['creator_name']
            st.markdown(
                f"Insight: Individual reporting ranges from {min_r:.2f} to {max_r:.2f} findings/month (avg: {mean_r:.2f}). "
                f"{top_reporter} is the most active contributor. "
                f"Recommendation: Recognize top reporters; investigate causes of low activity (<0.5/month) via 1:1 review.",
                unsafe_allow_html=True
            )
# ─── Row 2: Executor ──────────────────────────────────────────────────────────
col_3b, col_3d = st.columns(2)
# ─── 3b: Executor by Division (Lead Time) ─────────────────────────────────────
with col_3b:
st.markdown("
3b. Monthly Lead Time by Division (Executor)
", unsafe_allow_html=True)
if avg_leadtime_nama.empty:
st.warning("No data for division-level executor analysis.")
else:
sort_opt = st.selectbox(
"Show:",
["Top 10", "Bottom 10"],
key='sort_3b'
)
        # Full sort once - ascending: fastest -> slowest
full_sorted = avg_leadtime_nama.sort_values('avg_monthly_leadtime', ascending=True)
        # Take the subset per selection
        if sort_opt == "Top 10":
            # 10 fastest; sorted descending so the fastest renders at the top of the horizontal bar
            subset = full_sorted.head(10).sort_values('avg_monthly_leadtime', ascending=False)
        else:  # "Bottom 10" (slowest)
            # 10 slowest; descending so the slowest renders at the top
            subset = full_sorted.tail(10).sort_values('avg_monthly_leadtime', ascending=False)
        # Color: the 5 globally slowest (not slowest within the subset!) -> red
colored = add_color_by_global_rank(
avg_leadtime_nama,
'avg_monthly_leadtime',
worst_n=5,
            high_is_good=False  # higher = worse
)
subset = subset.merge(colored[['nama', 'color']], on='nama', how='left').fillna({'color': '#1f77b4'})
fig = px.bar(
subset,
x='avg_monthly_leadtime',
y='nama',
orientation='h',
title=sort_opt,
labels={'avg_monthly_leadtime': 'Monthly Lead Time (Days)', 'nama': 'Division'},
color='color',
color_discrete_map={c: c for c in subset['color'].unique()},
text=subset['avg_monthly_leadtime'].apply(lambda x: f'{x:.1f}')
)
fig.update_layout(
height=450,
showlegend=False,
yaxis={'categoryorder': 'array', 'categoryarray': subset['nama'].tolist()}
)
fig.update_traces(textposition='auto')
st.plotly_chart(fig, use_container_width=True)
if len(full_sorted) >= 2:
min_lt = full_sorted['avg_monthly_leadtime'].min()
max_lt = full_sorted['avg_monthly_leadtime'].max()
mean_lt = full_sorted['avg_monthly_leadtime'].mean()
fastest = full_sorted.iloc[0]['nama']
slowest = full_sorted.iloc[-1]['nama']
            st.markdown(
                f"Insight: Resolution time ranges from {min_lt:.1f} to {max_lt:.1f} days (avg: {mean_lt:.1f}). "
                f"{slowest} has the highest risk of SLA breach. "
                f"Recommendation: Initiate RCA for {slowest}; replicate workflow from {fastest}. Set SLA threshold at 7 days.",
                unsafe_allow_html=True
            )
# ─── 3d: Executor by Individual ───────────────────────────────────────────────
with col_3d:
    st.markdown("""3d. Monthly Lead Time per Executor (Individual)""", unsafe_allow_html=True)
if avg_leadtime_per_indiv.empty:
st.warning(f"No data for individual executor analysis (column: '{EXECUTOR_INDIV_COL}').")
else:
sort_opt = st.selectbox(
"Show:",
["Top 10", "Bottom 10"],
key='sort_3d'
)
full_sorted = avg_leadtime_per_indiv.sort_values('avg_monthly_leadtime', ascending=True)
if sort_opt == "Top 10":
subset = full_sorted.head(10).subset = full_sorted.head(10).sort_values('avg_monthly_leadtime', ascending=False)
else: # "Bottom 10 Slowest"
subset = full_sorted.tail(10).sort_values('avg_monthly_leadtime', ascending=False)
        # Color by global ranking
colored = add_color_by_global_rank(
avg_leadtime_per_indiv,
'avg_monthly_leadtime',
worst_n=5,
high_is_good=False
)
id_col = EXECUTOR_INDIV_COL
subset = subset.merge(colored[[id_col, 'color']], on=id_col, how='left').fillna({'color': '#1f77b4'})
fig = px.bar(
subset,
x='avg_monthly_leadtime',
y=id_col,
orientation='h',
title=sort_opt,
labels={'avg_monthly_leadtime': 'Monthly Lead Time (Days)', id_col: 'Executor'},
color='color',
color_discrete_map={c: c for c in subset['color'].unique()},
text=subset['avg_monthly_leadtime'].apply(lambda x: f'{x:.1f}')
)
fig.update_layout(
height=450,
showlegend=False,
yaxis={'categoryorder': 'array', 'categoryarray': subset[id_col].tolist()}
)
fig.update_traces(textposition='auto')
st.plotly_chart(fig, use_container_width=True)
if len(full_sorted) >= 2:
min_lt = full_sorted['avg_monthly_leadtime'].min()
max_lt = full_sorted['avg_monthly_leadtime'].max()
mean_lt = full_sorted['avg_monthly_leadtime'].mean()
slowest_exec = full_sorted.iloc[-1][id_col]
fastest_exec = full_sorted.iloc[0][id_col]
            st.markdown(
                f"Insight: Executor performance ranges from {min_lt:.1f} to {max_lt:.1f} days (avg: {mean_lt:.1f}). "
                f"{slowest_exec} requires support to meet SLA. "
                f"Recommendation: Assign a mentor to executors >7 days; document & share best practices from top performers (e.g., {fastest_exec}).",
                unsafe_allow_html=True
            )
# Objective 4
try:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
from collections import Counter
WORDCLOUD_AVAILABLE = True
except ImportError:
WORDCLOUD_AVAILABLE = False
st.markdown("
OBJECTIVE 4 - Unsafe Issues: Which One is the Most Often?
",
unsafe_allow_html=True)
# Helper to render section titles in a consistent style
def create_consistent_title(title_text):
    return f"{title_text}"
if WORDCLOUD_AVAILABLE:
    # Pie chart data: all categories
    df_all_kategori = df_local.copy()
    # Wordcloud data: Non-Positive only
    df_filtered_kategori = df_local[df_local['temuan_kategori'] != 'Positive'] \
        if 'temuan_kategori' in df_local.columns else df_local
    # Two columns
col1, col2 = st.columns(2)
    # === PIE CHART: all temuan_kategori (custom text positions for Unsafe Action & Near Miss) ===
with col1:
st.markdown(create_consistent_title("Distribution of All Issue Categories"), unsafe_allow_html=True)
if 'temuan_kategori' in df_all_kategori.columns:
category_counts = df_all_kategori['temuan_kategori'].value_counts()
if not category_counts.empty:
                # Color mapping
                color_map = {
                    'Positive': '#2E7D32',          # green
                    'Unsafe Condition': '#EF5350',  # red
                    'Unsafe Action': '#F48FB1',     # pink
                    'Near Miss': '#BDBDBD'          # grey
                }
colors = [color_map.get(cat, '#9E9E9E') for cat in category_counts.index]
                # Build the pie chart without a legend
fig_pie = px.pie(
names=category_counts.index,
values=category_counts.values,
color_discrete_sequence=colors
)
                # Custom: pull out the slices for Unsafe Action & Near Miss
pull_values = []
for cat in category_counts.index:
if cat in ['Unsafe Action', 'Near Miss']:
pull_values.append(0.1)
else:
pull_values.append(0.0)
fig_pie.update_traces(
textposition='outside',
textinfo='percent+label',
pull=pull_values,
marker=dict(line=dict(color='#FFFFFF', width=1))
)
fig_pie.update_layout(
showlegend=False,
height=450,
margin=dict(t=20, b=20, l=40, r=40)
)
st.plotly_chart(fig_pie, use_container_width=True)
                # CUSTOM LEGEND
                legend_items = []
                for cat in category_counts.index:
                    color = color_map.get(cat, '#9E9E9E')
                    item = f"{cat}"
                    legend_items.append(item)
                legend_html = " | ".join(legend_items)
                st.markdown(f"{legend_html}", unsafe_allow_html=True)
else:
st.warning("No data available for pie chart.")
else:
st.warning("Column 'temuan_kategori' not available.")
    # === WORDCLOUD: Non-Positive only (from keyword_kategori) ===
    with col2:
        st.markdown(create_consistent_title("Unsafe Issues"), unsafe_allow_html=True)
        st.markdown(
            """
            Categorization uses NLP - Natural Language Processing on free-text findings
            """,
            unsafe_allow_html=True
        )
if df_filtered_kategori.empty:
st.warning("No data available after filtering out 'Positive' category.")
else:
if 'keyword_kategori' in df_filtered_kategori.columns:
                # Concatenate & clean the text
import re
text_series = df_filtered_kategori['keyword_kategori'].dropna().astype(str)
text = ' '.join(text_series)
text = re.sub(r'[^a-zA-Z\s]', ' ', text).strip()
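                # The regex keeps letters and whitespace only, e.g.
                # "no-helmet (2x)" -> "no helmet   x"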
if text:
# Generate wordcloud
wordcloud = WordCloud(
width=1600,
height=800,
background_color='white',
colormap='viridis',
max_words=1000,
random_state=42
).generate(text)
                    # Render
fig, ax = plt.subplots(figsize=(3, 2), dpi=200)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.tight_layout()
st.pyplot(fig, use_container_width=True)
                    # Dynamic insight: top 3 keywords
words_clean = [w.lower() for w in text.split() if len(w) > 2]
top3 = [word.capitalize() for word, _ in Counter(words_clean).most_common(3)]
if top3:
insight = f"The most frequent unsafe issue themes are: {', '.join(top3)}."
else:
insight = "Recurring unsafe issue patterns are detectable in textual findings."
st.markdown(
f"""
Insight: {insight}
""",
unsafe_allow_html=True
)
else:
st.warning("No valid text remaining after cleaning.")
else:
st.warning("Column 'keyword_kategori' not available.")
else:
st.info("WordCloud library not installed. Install `wordcloud` and `matplotlib` to enable this feature.")
# =================== 5. Matrix (Retained) ===================
st.markdown("""OBJECTIVE 5 - Findings vs Lead Time: Which Companies Move Slowly?""", unsafe_allow_html=True)
import math
try:
df_local_matrix = df.copy()
# ============================
# 0. Filter: ONLY 1 COMPANY & 1 PROFILE (if applicable)
# ============================
# (Skipped for general dashboard view)
# ============================
# 1. Exclude Positive findings
# ============================
if 'temuan_kategori' in df_local_matrix.columns:
df_local_matrix = df_local_matrix[df_local_matrix["temuan_kategori"] != "Positive"]
# ============================
# 2. Ensure datetime columns
# ============================
df_local_matrix['created_at'] = pd.to_datetime(df_local_matrix['created_at'], errors='coerce')
df_local_matrix['close_at'] = pd.to_datetime(df_local_matrix['close_at'], errors='coerce')
# ============================
# 3. Compute LEAD TIME
# ============================
df_local_matrix['lead_time_days'] = (df_local_matrix['close_at'] - df_local_matrix['created_at']).dt.days
df_local_matrix['lead_time_days'] = df_local_matrix['lead_time_days'].fillna(0)
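    # Caveat: fillna(0) treats findings that are still open as closed same-day,
    # which pulls the average lead time down; dropping NaNs instead would
    # measure closed findings only.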
# ============================
# 4. Average Monthly Finding Count per Operator
# ============================
    if 'nama' not in df_local_matrix.columns:
        st.error("Column 'nama' (operator) not found.")
        # st.stop()  # omitted so the rest of the script keeps running
else:
        # Build a month column (YYYY-MM)
df_local_matrix = df_local_matrix.assign(month=df_local_matrix['created_at'].dt.to_period('M').astype(str))
        # Count findings per operator per month
monthly_counts = (
df_local_matrix
.groupby(['nama', 'month'])['kode_temuan']
.nunique()
.reset_index(name='monthly_count')
)
        # Average monthly count per operator
operator_avg = (
monthly_counts
.groupby('nama')['monthly_count']
            .mean()  # <-- monthly AVERAGE (not the total!)
.reset_index(name='Finding Count')
)
# ============================
# 5. Average Lead Time per Operator
# ============================
operator_lead = (
df_local_matrix.groupby('nama')['lead_time_days']
.mean()
.reset_index(name='Average Lead Time')
)
# ============================
# 6. Merge Risk Matrix
# ============================
risk_matrix = operator_avg.merge(operator_lead, on='nama', how='left')
risk_matrix = risk_matrix.rename(columns={'nama': 'Operator Name'})
        # Handle operators without a lead time (e.g., not yet closed)
risk_matrix['Average Lead Time'] = risk_matrix['Average Lead Time'].fillna(0)
# ============================
# 7. Quadrant Logic (unchanged)
# ============================
X_LIMIT = 20
Y_LIMIT = 3
        def assign_quadrant(row):
            if row['Finding Count'] >= X_LIMIT and row['Average Lead Time'] >= Y_LIMIT:
                return "Quadrant I - High Leadtime & High Count"
            elif row['Finding Count'] < X_LIMIT and row['Average Lead Time'] >= Y_LIMIT:
                return "Quadrant II - High Leadtime but Low Count"
            elif row['Finding Count'] >= X_LIMIT and row['Average Lead Time'] < Y_LIMIT:
                return "Quadrant III - Low Leadtime but High Count"
            else:
                return "Quadrant IV - Low Leadtime & Low Count"
        risk_matrix['quadrant'] = risk_matrix.apply(assign_quadrant, axis=1)
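        # Example: an operator averaging 25 findings/month with a 5-day average
        # lead time satisfies both thresholds (>= 20 and >= 3) -> Quadrant I.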
quadrant_count = risk_matrix['quadrant'].value_counts()
# ============================
        # 8. Scatter Plot (visual format kept exactly the same)
# ============================
max_x = risk_matrix['Finding Count'].max() + 1
max_y = risk_matrix['Average Lead Time'].max() + 5
fig = px.scatter(
risk_matrix,
x='Finding Count',
y='Average Lead Time',
hover_name="Operator Name",
size=[12] * len(risk_matrix),
size_max=15,
title="Audit Findings Risk Matrix: Avg Monthly Count vs Lead Time"
)
# Background quadrant (same as original)
fig.add_shape(type="rect", x0=X_LIMIT, x1=max_x, y0=Y_LIMIT, y1=max_y,
fillcolor="rgba(255,0,0,0.25)", line_width=0) # Q1
fig.add_shape(type="rect", x0=0, x1=X_LIMIT, y0=Y_LIMIT, y1=max_y,
fillcolor="rgba(255,150,50,0.25)", line_width=0) # Q2
fig.add_shape(type="rect", x0=X_LIMIT, x1=max_x, y0=0, y1=Y_LIMIT,
fillcolor="rgba(255,200,200,0.25)", line_width=0) # Q3
fig.add_shape(type="rect", x0=0, x1=X_LIMIT, y0=0, y1=Y_LIMIT,
fillcolor="rgba(0,120,255,0.15)", line_width=0) # Q4
fig.add_vline(x=X_LIMIT, line_dash="dash", line_color="black")
fig.add_hline(y=Y_LIMIT, line_dash="dash", line_color="black")
# Quadrant count annotations (same positions & style)
        fig.add_annotation(x=X_LIMIT + (max_x - X_LIMIT)/2,
                           y=Y_LIMIT + (max_y - Y_LIMIT)/2,
                           text=f"{quadrant_count.get('Quadrant I - High Leadtime & High Count', 0)}",
                           showarrow=False, font=dict(size=22, color="darkred"))
        fig.add_annotation(x=X_LIMIT/2,
                           y=Y_LIMIT + (max_y - Y_LIMIT)/2,
                           text=f"{quadrant_count.get('Quadrant II - High Leadtime but Low Count', 0)}",
                           showarrow=False, font=dict(size=22, color="orange"))
        fig.add_annotation(x=X_LIMIT + (max_x - X_LIMIT)/2,
                           y=Y_LIMIT/2,
                           text=f"{quadrant_count.get('Quadrant III - Low Leadtime but High Count', 0)}",
                           showarrow=False, font=dict(size=22, color="red"))
        fig.add_annotation(x=X_LIMIT/2,
                           y=Y_LIMIT/2,
                           text=f"{quadrant_count.get('Quadrant IV - Low Leadtime & Low Count', 0)}",
                           showarrow=False, font=dict(size=22, color="green"))
st.plotly_chart(fig, use_container_width=True)
# ============================
# 9. Summary Table
# ============================
st.subheader("Summary (Avg Monthly Count vs Avg Lead Time)")
st.dataframe(
risk_matrix.sort_values("Finding Count", ascending=False),
use_container_width=True
)
except Exception as e:
    st.error(f"Error in Risk Matrix: {e}")
    # st.exception(e)  # Uncomment for debugging
# =================== OBJECTIVE 6 - Predictive Dashboard & Early Warning Signals ===================
st.markdown("
OBJECTIVE 6 โ Prediction for Early Warning Signals: Which Ones Have Less Future Inspections?
", unsafe_allow_html=True)
# โ Enhanced CSS + Minimal Sortable JS
st.markdown("""
""", unsafe_allow_html=True)
#bloob line
# Helper: sparkline in a stock-chart style - green when rising, red when falling
def ascii_sparkline_pln(data):
    if not data or len(data) == 0:
        return "-"
    try:
        data = [float(x) for x in data]
        n = len(data)
        if n == 1:
            return "•"
        # Compute the slope (linear trend)
xs = list(range(n))
x_mean = sum(xs) / n
y_mean = sum(data) / n
numerator = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, data))
denominator = sum((x - x_mean) ** 2 for x in xs)
slope = numerator / (denominator + 1e-12)
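        # Least-squares slope of the series: sum((x - x_mean) * (y - y_mean))
        # divided by sum((x - x_mean)^2); the 1e-12 term keeps the division
        # safe if the denominator were ever zero.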
        # Pick the color & label based on the slope
        if slope > 0.001:
            color = "#2E7D32"  # green (rising)
            label = "↑"
        elif slope < -0.001:
            color = "#C62828"  # red (falling)
            label = "↓"
        else:
            color = "#546E7A"  # grey (flat)
            label = "→"
        # Normalize Y coordinates for the SVG
height = 16
margin = 2
usable_h = height - 2 * margin
min_y, max_y = min(data), max(data)
if max_y == min_y:
y_coords = [margin + usable_h / 2] * n
else:
y_coords = [
margin + usable_h * (1 - (v - min_y) / (max_y - min_y + 1e-9))
for v in data
]
        # Build the polyline
points = " ".join(f"{i * 8},{y:.1f}" for i, y in enumerate(y_coords))
width = max(24, (n - 1) * 8 + 4)
        # Inline SVG: polyline drawn in the trend color, followed by the arrow label
        svg = (
            f'<svg width="{width}" height="{height}" style="vertical-align:middle">'
            f'<polyline points="{points}" fill="none" stroke="{color}" stroke-width="1.5"/>'
            f'</svg> <span style="color:{color}">{label}</span>'
        )
        return svg
    except Exception:
        return "-"
# ─────── 1. Creators: ONLY Coverage < 90% AND Slope < 0 ───────
def predict_creators(df):
    # Note: intentionally no Non-Positive filter here
if 'creator_name' not in df.columns or df.empty:
return pd.DataFrame()
start_month = df['created_at'].min().to_period('M')
end_month = df['created_at'].max().to_period('M')
all_months = pd.period_range(start=start_month, end=end_month, freq='M')
df_monthly = (
df.groupby(['creator_name', df['created_at'].dt.to_period('M')])
.size()
.unstack(fill_value=0)
.reindex(columns=all_months, fill_value=0)
.stack()
.reset_index(name='count')
)
df_monthly.columns = ['Creator', 'Month', 'Count']
results = []
for creator, group in df_monthly.groupby('Creator'):
ts = group.set_index('Month')['Count']
total = len(all_months)
active = (ts > 0).sum()
coverage = active / total if total > 0 else 0
avg_rate = ts.mean()
if len(ts) >= 2:
try:
slope = np.polyfit(np.arange(len(ts)), ts.values, 1)[0]
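                # np.polyfit(x, y, 1) is a least-squares straight-line fit;
                # element [0] is the slope, i.e. the average change in monthly
                # report count per month.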
                # FILTER: Coverage < 90% AND Slope < 0
if slope < 0 and coverage < 0.9:
reason = f"Slope = {slope:.3f}, Coverage = {coverage*100:.1f}%. Avg: {avg_rate:.2f}/mo."
results.append({
'Creator': creator,
'Reports/Month': round(avg_rate, 2),
'Monthly Consistency (%)': round(coverage * 100, 1),
'Trend Slope': round(slope, 3),
'Trend': ascii_sparkline_pln(ts.values.tolist()),
'Reason': reason
})
            except Exception:
continue
df_res = pd.DataFrame(results)
    # Take the 10 creators with the most negative slope (steepest decline)
return df_res.sort_values('Trend Slope', ascending=True).head(10) if not df_res.empty else df_res
# ─────── 2. Locations: ONLY Coverage < 90% AND Slope < 0 ───────
def predict_locations(df):
    # Note: intentionally no Non-Positive filter here
if 'nama_lokasi_full' not in df.columns or df.empty:
return pd.DataFrame()
start_month = df['created_at'].min().to_period('M')
end_month = df['created_at'].max().to_period('M')
all_months = pd.period_range(start=start_month, end=end_month, freq='M')
df_monthly = (
df.groupby(['nama_lokasi_full', df['created_at'].dt.to_period('M')])
.size()
.unstack(fill_value=0)
.reindex(columns=all_months, fill_value=0)
.stack()
.reset_index(name='count')
)
df_monthly.columns = ['Location', 'Month', 'Count']
results = []
for lokasi, group in df_monthly.groupby('Location'):
ts = group.set_index('Month')['Count']
total = len(all_months)
active = (ts > 0).sum()
coverage = active / total if total > 0 else 0
avg_rate = ts.mean()
if len(ts) >= 2:
try:
slope = np.polyfit(np.arange(len(ts)), ts.values, 1)[0]
                # FILTER: Coverage < 90% AND Slope < 0
if slope < 0 and coverage < 0.9:
reason = f"Slope = {slope:.3f}, Coverage = {coverage*100:.1f}%. Avg: {avg_rate:.2f}/mo."
results.append({
'Location': lokasi,
'Reports/Month': round(avg_rate, 2),
'Monthly Consistency (%)': round(coverage * 100, 1),
'Trend Slope': round(slope, 3),
'Trend': ascii_sparkline_pln(ts.values.tolist()),
'Reason': reason
})
            except Exception:
continue
df_res = pd.DataFrame(results)
    # Take the 10 locations with the most negative slope (steepest decline)
return df_res.sort_values('Trend Slope', ascending=True).head(10) if not df_res.empty else df_res
# ─────── 3. Divisions: ONLY Coverage < 90% AND Slope < 0 ───────
def predict_divisions(df):
    # Note: intentionally no Non-Positive filter here
if 'nama' not in df.columns or df.empty:
return pd.DataFrame()
start_month = df['created_at'].min().to_period('M')
end_month = df['created_at'].max().to_period('M')
all_months = pd.period_range(start=start_month, end=end_month, freq='M')
df_monthly = (
df.groupby(['nama', df['created_at'].dt.to_period('M')])
.size()
.unstack(fill_value=0)
.reindex(columns=all_months, fill_value=0)
.stack()
.reset_index(name='count')
)
df_monthly.columns = ['Division', 'Month', 'Count']
results = []
for div, group in df_monthly.groupby('Division'):
ts = group.set_index('Month')['Count']
total = len(all_months)
active = (ts > 0).sum()
coverage = active / total if total > 0 else 0
avg_rate = ts.mean()
if len(ts) >= 2:
try:
slope = np.polyfit(np.arange(len(ts)), ts.values, 1)[0]
                # FILTER: Coverage < 90% AND Slope < 0
if slope < 0 and coverage < 0.9:
reason = f"Slope = {slope:.3f}, Coverage = {coverage*100:.1f}%. Avg: {avg_rate:.2f}/mo."
results.append({
'Division': div,
'Reports/Month': round(avg_rate, 2),
'Monthly Consistency (%)': round(coverage * 100, 1),
'Trend Slope': round(slope, 3),
'Trend': ascii_sparkline_pln(ts.values.tolist()),
'Reason': reason
})
            except Exception:
continue
df_res = pd.DataFrame(results)
    # Take the 10 divisions with the most negative slope (steepest decline)
return df_res.sort_values('Trend Slope', ascending=True).head(10) if not df_res.empty else df_res
# ─────── 4. Categories: Non-Positive only + Coverage = 100% & Trend Slope > 0 ───────
def predict_categories(df):
    # Filter: keep only non-'Positive' findings
    if 'temuan_kategori' in df.columns:
        df = df[df['temuan_kategori'] != 'Positive'].copy()
    if 'kategori' not in df.columns or df.empty:
        return pd.DataFrame()
return pd.DataFrame()
start_month = df['created_at'].min().to_period('M')
end_month = df['created_at'].max().to_period('M')
all_months = pd.period_range(start=start_month, end=end_month, freq='M')
n_months = len(all_months)
results = []
for cat, group in df.groupby('kategori'):
ts_data = (
group.groupby(group['created_at'].dt.to_period('M'))
.size()
.reindex(all_months, fill_value=0)
)
total_reports = ts_data.sum()
avg_per_month = total_reports / n_months if n_months > 0 else 0
active_months = (ts_data > 0).sum()
coverage = active_months / n_months if n_months > 0 else 0
slope = 0.0
if len(ts_data) >= 2:
try:
slope = np.polyfit(np.arange(len(ts_data)), ts_data.values, 1)[0]
            except Exception:
pass
results.append({
'Category': cat,
'Avg/Month': round(avg_per_month, 2),
'Monthly Consistency (%)': round(coverage * 100, 1),
'Trend Slope': round(slope, 3),
'Trend': ascii_sparkline_pln(ts_data.values.tolist())
})
df_res = pd.DataFrame(results)
    # FILTER: Coverage = 100% AND Trend Slope > 0 (currently disabled)
    if not df_res.empty:
        # df_res = df_res[
        #     (df_res['Monthly Consistency (%)'] == 100.0) &
        #     (df_res['Trend Slope'] > 0)
        # ].copy()
        df_res['Status'] = df_res['Trend Slope'].apply(
            lambda s: "High-Risk Rising" if s > 0.2 else "Emerging Rising"
        )
df_res = df_res.sort_values('Trend Slope', ascending=False)
return df_res.reset_index(drop=True).head(10) if not df_res.empty else df_res
# ─────── RUN ───────
df_creator = predict_creators(df_filtered)
df_location = predict_locations(df_filtered)
df_division = predict_divisions(df_filtered)
df_category = predict_categories(df_filtered)
# PANEL 1: Creators (FILTERED: Coverage < 90% & Slope < 0)
st.markdown("""1. Which Reporters Are Predicted to Have Fewer Future Inspections? (Top 10 Most Declining)""", unsafe_allow_html=True)
if not df_creator.empty:
    cols = ['Creator', 'Reports/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Trend']
    # Rename for DISPLAY only, not the underlying data
    df_display = df_creator[cols].rename(columns={"Reason": "Reason Forecast"})
    html = df_display.to_html(escape=False, index=False, table_id="tbl-creators")
    st.markdown(html, unsafe_allow_html=True)

# PANEL 2: Locations (FILTERED: Coverage < 90% & Slope < 0)
st.markdown("""2. Which Locations Are Predicted to Have Fewer Future Inspections? (Top 10 Most Declining)""", unsafe_allow_html=True)
if not df_location.empty:
    cols = ['Location', 'Reports/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Trend']
    # Rename for DISPLAY only, not the underlying data
    df_display = df_location[cols].rename(columns={"Reason": "Reason Forecast"})
    html = df_display.to_html(escape=False, index=False, table_id="tbl-locations")
    st.markdown(html, unsafe_allow_html=True)

# PANEL 3: Divisions (FILTERED: Coverage < 90% & Slope < 0)
st.markdown("""3. Which Divisions Are Predicted to Have Fewer Future Inspections? (Top 10 Most Declining)""", unsafe_allow_html=True)
if not df_division.empty:
    cols = ['Division', 'Reports/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Trend']
    # Rename for DISPLAY only, not the underlying data
    df_display = df_division[cols].rename(columns={"Reason": "Reason Forecast"})
    html = df_display.to_html(escape=False, index=False, table_id="tbl-divisions")
    st.markdown(html, unsafe_allow_html=True)

# PANEL 4: Categories
st.markdown(
    "4. Which Issue Categories Are Likely to Appear in the Next 3 Months (Increasing Trend Only)"
    " (* Categorization uses NLP - Natural Language Processing on free-text findings)",
    unsafe_allow_html=True
)
# if not df_category.empty:
#     cols = ['Category', 'Avg/Month', 'Monthly Consistency (%)', 'Trend Slope', 'Status', 'Trend']
#     # Rename ONLY for display
#     df_display = df_category[cols].rename(columns={"Status": "Status Issue for Next Month"})
#     html = df_display.to_html(escape=False, index=False, table_id="tbl-categories")
#     st.markdown(html, unsafe_allow_html=True)
#     st.markdown(
#         "Filtered: Reported every month (100% coverage) with increasing trend. "
#         "Avg/Month = total / months. High-Risk Rising = slope > 0.2.",
#         unsafe_allow_html=True)
# Build a whiteboard-style scatter chart
if not df_category.empty:
    # Prepare the scatter data
    df_plot = df_category.copy()
    df_plot['Size'] = df_plot['Avg/Month']   # circle size = frequency (Avg/Month)
    df_plot['Y'] = df_plot['Trend Slope']    # Y = trend slope
    # Build the scatter plot
fig = go.Figure()
    # Add the scatter trace
fig.add_trace(go.Scatter(
x=df_plot['Category'],
y=df_plot['Y'],
mode='markers',
marker=dict(
            size=df_plot['Size'] * 1.5,  # scale marker sizes so they aren't too large
color='#003DA5',
line=dict(width=2, color='white'),
opacity=0.8
),
text=df_plot['Category'],
textposition="top center",
textfont=dict(size=10, color="#003DA5", family="Arial"),
hovertemplate="%{text} Trend Slope: %{y:.3f} Avg/Month: %{marker.size:.1f}",
))
# Layout
fig.update_layout(
# title=dict(text="Issue Category Trend vs Frequency (Non-Positive)", x=0.5, y=0.95),
xaxis=dict(
title="Category",
tickangle=45,
showgrid=True,
gridcolor="#e0e0e0",
gridwidth=1,
tickfont=dict(size=10, color="#003DA5")
),
yaxis=dict(
title="Trend Slope",
showgrid=True,
gridcolor="#e0e0e0",
gridwidth=1,
tickfont=dict(size=10, color="#003DA5")
),
plot_bgcolor="white",
paper_bgcolor="white",
height=500,
margin=dict(t=60, b=60, l=60, r=60),
font=dict(family="Arial", size=12, color="#003DA5"),
showlegend=False
)
    # Add manual whiteboard-style annotations
fig.add_annotation(
xref="paper", yref="paper",
x=0.05, y=0.95,
showarrow=False,
font=dict(size=12, color="#003DA5"),
textangle=-90,
align="center",
visible=False
)
fig.add_annotation(
xref="paper", yref="paper",
x=0.5, y=0.05,
# text="",
showarrow=False,
font=dict(size=12, color="#003DA5"),
align="center",
visible=False
)
fig.add_annotation(
xref="paper", yref="paper",
x=0.95, y=0.95,
text="Number of Findings โข 10 โขโข 20 โขโขโข 30 โขโขโขโข 40",
showarrow=False,
font=dict(size=12, color="#003DA5"),
align="left"
)
fig.add_annotation(
xref="paper", yref="paper",
x=0.05, y=0.1,
text="Semakin tinggi = semakin sering ditemukan deviasi Semakin besar = semakin banyak ditemukan deviasi ",
showarrow=False,
font=dict(size=12, color="#003DA5"),
align="left"
)
st.plotly_chart(fig, use_container_width=True)
    # Insight
    # st.markdown("### Insight")
    # insight_text = (
    #     "The scatter plot visualizes issue categories with 100% coverage and positive trend. "
    #     "Categories higher on the Y-axis indicate stronger upward trends, while larger circles indicate higher average monthly frequency. "
    #     "This helps identify which non-positive issues are both increasing in trend and frequently reported; these should be prioritized for intervention."
    # )
    # st.markdown(insight_text, unsafe_allow_html=True)
else:
st.info("No data available for non-positive issue categories with 100% coverage and positive trend.")
from huggingface_hub import InferenceClient
# ==========================
# LLM FUNCTION (HuggingFace)
# ==========================
def llm_generate_recommendation(insights_text):
client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
prompt = f"""
You are an expert Safety & Reliability Agentic AI.
Based on the following structured INSIGHT SUMMARY, create:
1. Recommended Action (max 2 sentences)
2. Risk Mitigation Strategy (max 2 sentences)
The insights:
{insights_text}
Now generate concise, high-impact:
- "recommendation"
- "mitigation"
Return output in EXACT JSON format:
{{
"recommendation": "...",
"mitigation": "..."
}}
"""
output = client.text_generation(prompt, max_new_tokens=500, temperature=0.4)
return output
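# Usage sketch (assumption: a valid Hugging Face token with access to the gated
# Llama 3 model is configured; the raw completion still needs JSON parsing):
#   raw = llm_generate_recommendation("Slowest division: X (12.3 days avg)")
#   parsed = json.loads(raw)  # -> {"recommendation": "...", "mitigation": "..."}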
# ==============================================
# ==========================
# ===== MAIN APP ===========
# ==========================
# streamlit, pandas, and the transformers pipeline are already imported at the
# top of this script; only the extra stdlib helpers are needed here.
import json
import math
import re
from typing import List, Dict
# =====================================================================
# OBJECTIVE 7 - INSIGHT & RECOMMENDATION (LLM FIRST, RULE-BASED FALLBACK)
# =====================================================================
st.markdown(
"<h3>OBJECTIVE 7 - Risk Mitigation: What are the Insights and Recommendations?</h3>",
unsafe_allow_html=True
)
# ============================================================
# 1. LLM LOADER
# ============================================================
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
@st.cache_resource
def load_llm_model():
try:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2-2b-it",
torch_dtype=torch.float16,
device_map="auto"
)
return tokenizer, model
except Exception:
# Loading can fail (gated model, missing auth token, insufficient memory,
# offline machine); return (None, None) so the caller falls back to the
# rule-based engine instead of crashing the dashboard.
return None, None
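# NOTE: google/gemma-2-2b-it is a gated checkpoint on the Hugging Face Hub, so the
# first load requires accepting the model license and authenticating (e.g. via
# `huggingface-cli login` or an HF_TOKEN environment variable), plus roughly 5 GB
# of memory in float16. A quick smoke test of the loader (illustrative only):
#   tok, mdl = load_llm_model()
#   st.sidebar.info("LLM ready" if mdl is not None else "LLM unavailable - rule-based fallback")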
def llm_generate_recommendation(insight_summary_text):
"""
Try to generate JSON recommendation using LLM.
If LLM fails โ return None, and rule-based will be used.
"""
tok, mdl = load_llm_model()
if tok is None or mdl is None:
return None
try:
prompt = f"""
You are an expert Industrial Safety Analyst AI.
Below is an INSIGHT SUMMARY from a Safety Reporting System:
---
{insight_summary_text}
---
Generate 5 Recommended Actions and Risk Mitigations as clean JSON only,
continuing the pattern below through point "5":
{{
"recommendations": [
{{"point":"1","rec":"...","mit":"..."}},
{{"point":"2","rec":"...","mit":"..."}},
{{"point":"3","rec":"...","mit":"..."}}
]
}}
"""
inputs = tok(prompt, return_tensors="pt").to(mdl.device)
out = mdl.generate(
**inputs,
max_new_tokens=380,
temperature=0.25,
do_sample=True
)
# Decode only the newly generated tokens; decoding the full sequence would
# echo the prompt (including its JSON template) and break extraction below.
text = tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
# Grab the first {...} block; instruction-tuned models often wrap JSON in
# extra prose or markdown fences. `re` and `json` are imported above.
json_match = re.search(r"\{[\s\S]*\}", text)
if not json_match:
return None
return json.loads(json_match.group(0))
except Exception:
# Any generation or JSON-parsing failure routes to the rule-based fallback.
return None
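# Illustrative round-trip (names and values are hypothetical, not from real data):
#   out = llm_generate_recommendation("1. Nine locations with the lowest ratio: ...")
#   # -> {"recommendations": [{"point": "1", "rec": "...", "mit": "..."}, ...]} or None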
# ============================================================
# 2. RULE-BASED ENGINE (original insight-extraction script)
# ============================================================
def extract_agentic_insights_v5(df: pd.DataFrame):
dev = {
"lowest_ratio_9_locs": [],
"obj3a_lowest_div": None,
"obj3b_slowest_executor": None,
"obj3c_lowest_reporter": None,
"obj3d_slowest_div": None,
"obj4_unsafe_condition_pct": 0.0,
"obj4_unsafe_action_pct": 0.0,
"obj4_near_miss_pct": 0.0,
"obj5_q1_divs": [],
"obj5_q2_divs": [],
"obj6_top2_categories": [],
}
# === 1. 9 lowest locations ===
if {'nama_lokasi_full', 'creator_nid', 'created_at', 'kode_temuan'}.issubset(df.columns):
calc = df[['nama_lokasi_full', 'creator_nid', 'created_at', 'kode_temuan']].copy()
calc['created_at'] = pd.to_datetime(calc['created_at'], errors='coerce')
calc = calc.dropna(subset=['created_at', 'nama_lokasi_full', 'creator_nid'])
calc['bulan'] = calc['created_at'].dt.to_period('M')
monthly = calc.groupby(['nama_lokasi_full', 'bulan']).agg(
findings=('kode_temuan', 'size'),
reporters=('creator_nid', 'nunique')
).reset_index()
monthly = monthly[monthly['reporters'] > 0]
monthly['ratio'] = monthly['findings'] / monthly['reporters']
loc_avg = monthly.groupby('nama_lokasi_full')['ratio'].mean()
lowest_9 = loc_avg.nsmallest(9)
dev["lowest_ratio_9_locs"] = [(loc, round(ratio, 3)) for loc, ratio in lowest_9.items()]
# === 2a: Lowest division ratio ===
if {'nama', 'creator_nid', 'created_at', 'kode_temuan'}.issubset(df.columns):
calc = df[['nama', 'creator_nid', 'created_at', 'kode_temuan']].copy()
calc['bulan'] = pd.to_datetime(calc['created_at']).dt.to_period('M')
agg = calc.groupby(['nama', 'bulan']).agg(
findings=('kode_temuan', 'size'),
reporters=('creator_nid', 'nunique')
)
agg = agg[agg['reporters'] > 0].reset_index()
agg['ratio'] = agg['findings'] / agg['reporters']
div_ratio = agg.groupby('nama')['ratio'].mean()
if not div_ratio.empty:
dev["obj3a_lowest_div"] = (div_ratio.idxmin(), round(div_ratio.min(), 2))
# === 2b: Slowest executor ===
if 'days_to_close' in df.columns:
valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)]
exec_col = 'nama_pic' if 'nama_pic' in valid.columns else 'creator_name'
if exec_col in valid.columns:
lead = valid.groupby(exec_col)['days_to_close'].mean()
if not lead.empty:
dev["obj3b_slowest_executor"] = (lead.idxmax(), round(lead.max(), 1))
# === 2c: Lowest reporter ===
if {'creator_name', 'created_at'}.issubset(df.columns):
calc = df[['creator_name', 'created_at']].copy()
calc['bulan'] = pd.to_datetime(calc['created_at']).dt.to_period('M')
monthly = calc.groupby(['creator_name', 'bulan']).size().reset_index(name='count')
avg = monthly.groupby('creator_name')['count'].mean()
avg = avg[avg > 0]
if not avg.empty:
dev["obj3c_lowest_reporter"] = (avg.idxmin(), round(avg.min(), 2))
# === 2d: Slowest division ===
if 'days_to_close' in df.columns and 'nama' in df.columns:
valid = df[df['days_to_close'].notna() & (df['days_to_close'] >= 0)]
if not valid.empty:
lead = valid.groupby('nama')['days_to_close'].mean()
if not lead.empty:
dev["obj3d_slowest_div"] = (lead.idxmax(), round(lead.max(), 1))
# === 3. Non-positive ===
if 'temuan_kategori' in df.columns:
cnt = df['temuan_kategori'].value_counts(normalize=True) * 100
dev["obj4_unsafe_condition_pct"] = round(cnt.get("Unsafe Condition", 0), 1)
dev["obj4_unsafe_action_pct"] = round(cnt.get("Unsafe Action", 0), 1)
dev["obj4_near_miss_pct"] = round(cnt.get("Near Miss", 0), 1)
# === 4. Risk Quadrant ===
X_LIMIT, Y_LIMIT = 20, 3
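# Quadrant semantics with these hard-coded thresholds: QI = >= X_LIMIT findings/month
# AND >= Y_LIMIT days average lead time (frequent findings, slow closure);
# QII = fewer findings than X_LIMIT but still slow to close (hidden risk).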
if {'nama', 'created_at', 'days_to_close', 'kode_temuan'}.issubset(df.columns):
calc = df.copy()
calc['created_at'] = pd.to_datetime(calc['created_at'], errors='coerce')
calc = calc.assign(month=calc['created_at'].dt.to_period('M').astype(str))
monthly_counts = calc.groupby(['nama', 'month'])['kode_temuan'].nunique().reset_index()
avg_count = monthly_counts.groupby('nama')['kode_temuan'].mean().reset_index(name='Finding Count')
leadtime = calc.groupby('nama')['days_to_close'].mean().reset_index(name='Avg Lead Time')
mat = avg_count.merge(leadtime, on='nama', how='left').fillna(0)
for _, r in mat.iterrows():
if r['Finding Count'] >= X_LIMIT and r['Avg Lead Time'] >= Y_LIMIT:
dev["obj5_q1_divs"].append(r['nama'])
elif r['Finding Count'] < X_LIMIT and r['Avg Lead Time'] >= Y_LIMIT:
dev["obj5_q2_divs"].append(r['nama'])
# === 5. Top categories ===
if {'kategori', 'temuan_kategori', 'created_at'}.issubset(df.columns):
nonpos = df[df['temuan_kategori'] != 'Positive']
if not nonpos.empty:
start = nonpos['created_at'].min().to_period('M')
end = nonpos['created_at'].max().to_period('M')
n_months = len(pd.period_range(start=start, end=end, freq='M'))
cat_avg = (nonpos.groupby('kategori').size() / n_months).sort_values(ascending=False).head(2)
dev["obj6_top2_categories"] = [(cat, round(v, 1)) for cat, v in cat_avg.items()]
return dev
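# The returned dict is consumed directly below; an illustrative (not real) payload:
#   {"lowest_ratio_9_locs": [("Lokasi A", 0.5), ...],
#    "obj3a_lowest_div": ("Divisi X", 0.8),
#    "obj4_unsafe_condition_pct": 41.2, ...}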
# ============================================================
# 3. RUN INSIGHT ENGINE
# ============================================================
dev = extract_agentic_insights_v5(df_filtered)
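# Insights are computed on df_filtered, so Objective 7 always reflects the
# sidebar filters (date range, vendor, area) currently applied by the user.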
# Build Insight Summary Text
insight_lines = []
if dev["lowest_ratio_9_locs"]:
loc_list = ", ".join([f"{loc} ({ratio})" for loc, ratio in dev["lowest_ratio_9_locs"]])
insight_lines.append(f"1. Nine locations with the lowest finding-to-reporter ratio: {loc_list}.")
parts = []
if dev["obj3a_lowest_div"]:
parts.append(f"division {dev['obj3a_lowest_div'][0]} (ratio: {dev['obj3a_lowest_div'][1]})")
if dev["obj3c_lowest_reporter"]:
parts.append(f"reporter {dev['obj3c_lowest_reporter'][0]} ({dev['obj3c_lowest_reporter'][1]} findings/month)")
if dev["obj3d_slowest_div"]:
parts.append(f"division {dev['obj3d_slowest_div'][0]} ({dev['obj3d_slowest_div'][1]} days)")
if dev["obj3b_slowest_executor"]:
parts.append(f"executor {dev['obj3b_slowest_executor'][0]} ({dev['obj3b_slowest_executor'][1]} days)")
if parts:
insight_lines.append("2. Uneven operational capacity: " + "; ".join(parts))
uc, ua, nm = dev["obj4_unsafe_condition_pct"], dev["obj4_unsafe_action_pct"], dev["obj4_near_miss_pct"]
if uc + ua + nm > 0:
insight_lines.append(f"3. Non-Positive composition: Unsafe Condition ({uc}%), Unsafe Action ({ua}%), Near Miss ({nm}%).")
if dev["obj5_q1_divs"] or dev["obj5_q2_divs"]:
q1 = ", ".join([f"{x}" for x in dev["obj5_q1_divs"][:5]])
q2 = ", ".join([f"{x}" for x in dev["obj5_q2_divs"][:5]])
insight_lines.append(f"4. High-risk divisions (QI): {q1 or 'โ'}. Hidden-risk (QII): {q2 or 'โ'}.")
if dev["obj6_top2_categories"]:
c1, c2 = dev["obj6_top2_categories"]
insight_lines.append(f"5. Top recurring non-Positive categories: {c1[0]} ({c1[1]}/mo) & {c2[0]} ({c2[1]}/mo).")
insight_text = "
".join(insight_lines)
# ============================================================
# 4. TRY LLM FIRST โ ELSE FALLBACK RULE-BASED
# ============================================================
llm_json = llm_generate_recommendation(insight_text)
if llm_json is not None and "recommendations" in llm_json:
recs = llm_json["recommendations"]
else:
# FALLBACK RULE-BASED
recs = []
if dev["lowest_ratio_9_locs"]:
recs.append({"point":"1","rec":"Launch spot-inspection sprint at across the 9 lowest-ratio locations.","mit":"Enable 3-min QR checklist + automated WhatsApp reminders."})
if parts:
recs.append({"point":"2","rec":"Real-time monitoring of finding/reporter ratios and resolution lead times per division/individual.","mit":"Trigger coaching alerts to Area PICs & Division"})
if uc + ua + nm > 0:
recs.append({"point":"3","rec":"Enforce photo-based validation for Unsafe Condition/Action/Near Miss submissions to ensure accurate categorization.","mit":"System blocks submission if photo evidence or category justification is missing."})
if dev["obj5_q1_divs"] or dev["obj5_q2_divs"]:
recs.append({"point":"4","rec":"Assign dedicated safety crews to Quadrant I divisions; enforce One Finding, One Day closure policy for Quadrant II.","mit":"Auto-generate executive escalation reports to VP Operations if any division remains in QI/QII for โฅ2 consecutive months."})
if dev["obj6_top2_categories"]:
c1, c2 = dev["obj6_top2_categories"]
recs.append({"point":"5","rec":f"Create RCA task force for {c1[0]} & {c2[0]}.","mit":"Update tender specs with required mitigations."})
# ============================================================
# 5. RENDERING (NO CHANGES)
# ============================================================
# Insight Summary Card
st.markdown(
f"""