Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1975,7 +1975,7 @@ else:
|
|
| 1975 |
st.info("No data available for non-positive issue categories with 100% coverage and positive trend.")
|
| 1976 |
|
| 1977 |
st.markdown("<h3 class='section-title'>OBJECTIVE 7 - Insight and Recommendation</h3>", unsafe_allow_html=True)
|
| 1978 |
-
|
| 1979 |
|
| 1980 |
def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
| 1981 |
"""
|
|
@@ -1985,52 +1985,70 @@ def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
|
| 1985 |
- Agentic Safety Behaviors (proactive vs reactive)
|
| 1986 |
- Wordcloud-based Emerging Risk Detection
|
| 1987 |
- Actionable coverage-balancing strategies
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1988 |
"""
|
| 1989 |
insights = []
|
| 1990 |
|
| 1991 |
if df.empty:
|
| 1992 |
return insights
|
| 1993 |
|
| 1994 |
-
# --- Helper: Detect risk terms from
|
| 1995 |
def detect_emerging_risks(df):
|
| 1996 |
-
#
|
|
|
|
| 1997 |
text_col = None
|
| 1998 |
-
for col in
|
| 1999 |
if col in df.columns and df[col].notna().any():
|
| 2000 |
text_col = col
|
| 2001 |
break
|
| 2002 |
if text_col is None:
|
| 2003 |
return [], []
|
| 2004 |
|
| 2005 |
-
# Combine
|
| 2006 |
-
|
| 2007 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2008 |
risk_keywords = [
|
|
|
|
| 2009 |
'terbuka', 'tidak terkunci', 'tanpa izin', 'tanpa alat', 'tanpa pelindung',
|
| 2010 |
-
'overload', '
|
| 2011 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2012 |
]
|
| 2013 |
found_risks = [kw for kw in risk_keywords if kw in all_text]
|
| 2014 |
return risk_keywords, found_risks
|
| 2015 |
|
| 2016 |
-
# --- 1. Coverage Equity by Location
|
| 2017 |
if 'nama_lokasi_full' in df.columns and 'creator_nid' in df.columns:
|
|
|
|
| 2018 |
loc_activity = df.groupby('nama_lokasi_full').agg(
|
| 2019 |
-
findings_count=('
|
| 2020 |
unique_reporters=('creator_nid', 'nunique')
|
| 2021 |
).reset_index()
|
| 2022 |
-
total_locations = loc_activity.shape[0]
|
| 2023 |
-
low_coverage_locs = loc_activity[loc_activity['unique_reporters'] <= 1]
|
| 2024 |
-
high_volume_locs = loc_activity[loc_activity['findings_count'] > loc_activity['findings_count'].quantile(0.75)]
|
| 2025 |
|
| 2026 |
-
|
|
|
|
| 2027 |
risky_high_low = loc_activity[
|
| 2028 |
(loc_activity['findings_count'] > loc_activity['findings_count'].median()) &
|
| 2029 |
(loc_activity['unique_reporters'] <= 2)
|
| 2030 |
]
|
| 2031 |
|
| 2032 |
if not risky_high_low.empty:
|
| 2033 |
-
loc_list = risky_high_low['nama_lokasi_full'].tolist()
|
| 2034 |
loc_names = ', '.join(loc_list)
|
| 2035 |
insight = (
|
| 2036 |
f"Locations {loc_names} show high finding volume but rely on ≤2 reporters, indicating potential blind spots "
|
|
@@ -2042,7 +2060,7 @@ def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
|
| 2042 |
)
|
| 2043 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2044 |
|
| 2045 |
-
if
|
| 2046 |
insight = (
|
| 2047 |
f"Over 30% of locations ({len(low_coverage_locs)}/{total_locations}) are covered by only 1 reporter, "
|
| 2048 |
f"increasing the risk of unreported hazards due to observer fatigue or familiarity bias."
|
|
@@ -2053,63 +2071,71 @@ def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
|
| 2053 |
)
|
| 2054 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2055 |
|
| 2056 |
-
# --- 2. Divisional Load & Frequency Risk
|
| 2057 |
-
if 'nama' in df.columns and 'created_at' in df.columns:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2058 |
div_summary = df.groupby('nama').agg(
|
| 2059 |
-
total_findings=('
|
| 2060 |
unique_people=('creator_nid', 'nunique'),
|
| 2061 |
first_report=('created_at', 'min'),
|
| 2062 |
last_report=('created_at', 'max')
|
| 2063 |
)
|
| 2064 |
-
div_summary['reporting_span_days'] = (div_summary['last_report'] - div_summary['first_report']).dt.days + 1
|
| 2065 |
-
div_summary['avg_freq_per_person'] = div_summary['total_findings'] / div_summary['unique_people']
|
| 2066 |
-
div_summary['findings_per_day'] = div_summary['total_findings'] / div_summary['reporting_span_days']
|
| 2067 |
-
|
| 2068 |
-
# Define thresholds (adjust as needed)
|
| 2069 |
-
HIGH_LOAD_THRESHOLD = 8 # avg > 8 findings/person
|
| 2070 |
-
LOW_ACTIVITY_THRESHOLD = 0.2 # < 0.2 findings/day
|
| 2071 |
|
| 2072 |
-
|
| 2073 |
-
|
| 2074 |
-
|
| 2075 |
-
|
| 2076 |
-
|
| 2077 |
-
|
| 2078 |
-
|
| 2079 |
-
|
| 2080 |
-
|
| 2081 |
-
|
| 2082 |
-
|
| 2083 |
-
|
| 2084 |
-
|
| 2085 |
-
|
| 2086 |
-
|
| 2087 |
-
|
| 2088 |
-
|
| 2089 |
-
|
| 2090 |
-
|
| 2091 |
-
|
| 2092 |
-
|
| 2093 |
-
|
| 2094 |
-
|
| 2095 |
-
|
| 2096 |
-
|
| 2097 |
-
|
| 2098 |
-
|
| 2099 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2100 |
|
| 2101 |
-
# --- 3. Agentic Safety Mitigation (Proactive vs Reactive
|
| 2102 |
if 'temuan_kategori' in df.columns:
|
| 2103 |
-
# Assume: 'Positive' = proactive (e.g., good housekeeping, initiative)
|
| 2104 |
-
# Others (e.g., 'Unsafe Condition', 'Unsafe Act') = reactive
|
| 2105 |
total = len(df)
|
| 2106 |
proactive = (df['temuan_kategori'] == 'Positive').sum()
|
| 2107 |
-
reactive = total - proactive
|
| 2108 |
proactive_rate = proactive / total if total > 0 else 0
|
| 2109 |
|
| 2110 |
insight = (
|
| 2111 |
f"Only {proactive_rate:.1%} of findings reflect *proactive* safety behaviors (e.g., positive interventions, improvements). "
|
| 2112 |
-
f"The remaining {100 - proactive_rate*100:.1f}% are *reactive* (hazards already present)."
|
| 2113 |
)
|
| 2114 |
recommendation = (
|
| 2115 |
f"Shift incentives from 'finding count' to 'prevention impact'. "
|
|
@@ -2118,18 +2144,17 @@ def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
|
| 2118 |
)
|
| 2119 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2120 |
|
| 2121 |
-
# --- 4. Emerging Risk Detection via Wordcloud (
|
| 2122 |
all_risk_terms, detected_terms = detect_emerging_risks(df)
|
| 2123 |
if detected_terms:
|
| 2124 |
-
|
| 2125 |
-
|
| 2126 |
-
high_sev_terms = ['exposed', 'fire hazard', 'fall', 'short circuit', 'unauthorized']
|
| 2127 |
detected_high = [t for t in detected_terms if t in high_sev_terms]
|
| 2128 |
if detected_high:
|
| 2129 |
-
terms_str = ', '.join(detected_high)
|
| 2130 |
insight = (
|
| 2131 |
f"Wordcloud analysis indicates emerging high-severity risks: *{terms_str}*. "
|
| 2132 |
-
f"These signal active hazards
|
| 2133 |
)
|
| 2134 |
recommendation = (
|
| 2135 |
f"Launch a 14-day *Targeted Risk Blitz* on locations reporting these terms. "
|
|
@@ -2138,43 +2163,51 @@ def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
|
| 2138 |
)
|
| 2139 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2140 |
|
| 2141 |
-
# Cloud
|
| 2142 |
-
if
|
| 2143 |
insight = (
|
| 2144 |
f"Despite mitigation efforts, the risk 'cloud' persists — likely due to: "
|
| 2145 |
-
f"(1)
|
| 2146 |
f"(2) Incomplete closure verification, or "
|
| 2147 |
-
f"(3) Findings
|
| 2148 |
)
|
| 2149 |
recommendation = (
|
| 2150 |
-
f"Adopt *closed-loop verification*: require geo-tagged before/after photos +
|
| 2151 |
-
f"Map recurring findings to contractor IDs —
|
| 2152 |
-
f"Use AI to
|
| 2153 |
)
|
| 2154 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2155 |
|
| 2156 |
-
# --- 5. Coverage Balancing Strategy
|
| 2157 |
-
# Based on location & reporter distribution
|
| 2158 |
if 'nama_lokasi_full' in df.columns and 'creator_nid' in df.columns:
|
| 2159 |
reporters_per_location = df.groupby('nama_lokasi_full')['creator_nid'].nunique()
|
| 2160 |
-
|
| 2161 |
-
|
| 2162 |
-
|
| 2163 |
-
|
| 2164 |
-
|
| 2165 |
-
|
| 2166 |
-
|
| 2167 |
-
|
| 2168 |
-
|
| 2169 |
-
|
| 2170 |
-
|
| 2171 |
-
|
| 2172 |
-
|
|
|
|
|
|
|
|
|
|
| 2173 |
|
| 2174 |
return insights
|
| 2175 |
|
| 2176 |
-
#
|
| 2177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2178 |
|
| 2179 |
if risk_insights:
|
| 2180 |
for i, ir in enumerate(risk_insights, 1):
|
|
@@ -2182,7 +2215,8 @@ if risk_insights:
|
|
| 2182 |
st.markdown(f"<div class='ai-recommendation'><strong>Action {i}:</strong> {ir['recommendation']}</div>", unsafe_allow_html=True)
|
| 2183 |
else:
|
| 2184 |
st.markdown(
|
| 2185 |
-
"<div class='ai-insight'>No risk-mitigation insights generated. Ensure
|
| 2186 |
-
"<code>nama_lokasi_full</code>, <code>nama</code
|
|
|
|
| 2187 |
unsafe_allow_html=True
|
| 2188 |
)
|
|
|
|
| 1975 |
st.info("No data available for non-positive issue categories with 100% coverage and positive trend.")
|
| 1976 |
|
| 1977 |
st.markdown("<h3 class='section-title'>OBJECTIVE 7 - Insight and Recommendation</h3>", unsafe_allow_html=True)
|
| 1978 |
+
st.markdown("<h3 class='section-title'>OBJECTIVE 7 – Risk Mitigation Insights & Actions</h3>", unsafe_allow_html=True)
|
| 1979 |
|
| 1980 |
def compute_risk_mitigation_insights(df: pd.DataFrame) -> List[dict]:
|
| 1981 |
"""
|
|
|
|
| 1985 |
- Agentic Safety Behaviors (proactive vs reactive)
|
| 1986 |
- Wordcloud-based Emerging Risk Detection
|
| 1987 |
- Actionable coverage-balancing strategies
|
| 1988 |
+
|
| 1989 |
+
✅ Uses ONLY columns confirmed in your schema.
|
| 1990 |
+
✅ Replaces 'temuan_id' with 'kode_temuan' or row count logic.
|
| 1991 |
+
✅ Handles missing text fields gracefully.
|
| 1992 |
"""
|
| 1993 |
insights = []
|
| 1994 |
|
| 1995 |
if df.empty:
|
| 1996 |
return insights
|
| 1997 |
|
| 1998 |
+
# --- Helper: Detect risk terms from free-text fields ---
def detect_emerging_risks(df):
    """Scan the richest available free-text column for known risk phrases.

    Parameters
    ----------
    df : pd.DataFrame
        Findings data. May or may not contain any of the candidate
        text columns; all-null columns are treated as absent.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(risk_keywords, found_risks)`` — the full lexicon searched and
        the subset of phrases actually present in the text. Both lists
        are duplicate-free. Returns ``([], [])`` when no usable text
        column exists or the chosen column holds no non-null values.
    """
    import re  # local import: keeps this helper self-contained

    # Prioritize the richest free-text field, in order of preference.
    text_cols = ['hasil_keyword_dan_kondisi', 'judul_dan_kondisi', 'kondisi',
                 'judul', 'keterangan_lokasi', 'note']
    text_col = next(
        (c for c in text_cols if c in df.columns and df[c].notna().any()),
        None,
    )
    if text_col is None:
        return [], []

    # Combine all non-null text into one lowercase haystack.
    texts = df[text_col].dropna().astype(str)
    if texts.empty:
        return [], []
    all_text = ' '.join(texts.str.lower())

    # Domain-specific risk lexicon (Indonesian + English).
    # NOTE(fix): the original listed 'overload' in BOTH language sections,
    # which duplicated it in the returned lists and inflated the lexicon
    # size used by downstream persistence checks.
    raw_keywords = [
        # Bahasa Indonesia
        'terbuka', 'tidak terkunci', 'tanpa izin', 'tanpa alat', 'tanpa pelindung',
        'overload', 'korsleting', 'grounding buruk', 'kabel terkelupas', 'tanpa grounding',
        'jatuh', 'terpeleset', 'tergelincir', 'kebakaran', 'panas berlebih',
        'tidak kompeten', 'tanpa pelatihan', 'tidak tersertifikasi',
        'tidak sesuai prosedur', 'prosedur tidak diikuti',
        # English (for bilingual reports)
        'exposed', 'unlocked', 'unauthorized', 'no ppe', 'no permit',
        'short circuit', 'poor grounding', 'stripped cable',
        'fall hazard', 'slip', 'trip', 'fire hazard', 'overheating',
        'untrained', 'incompetent', 'not certified',
        'procedure violated', 'bypassed'
    ]
    risk_keywords = list(dict.fromkeys(raw_keywords))  # order-preserving dedupe

    # NOTE(fix): match on word boundaries so short terms do not fire on
    # substrings — the original plain `kw in all_text` made 'trip' match
    # inside 'stripped', for example.
    found_risks = [
        kw for kw in risk_keywords
        if re.search(r'\b' + re.escape(kw) + r'\b', all_text)
    ]
    return risk_keywords, found_risks
|
| 2034 |
|
| 2035 |
+
# --- 1. Coverage Equity by Location ---
|
| 2036 |
if 'nama_lokasi_full' in df.columns and 'creator_nid' in df.columns:
|
| 2037 |
+
# Group by location
|
| 2038 |
loc_activity = df.groupby('nama_lokasi_full').agg(
|
| 2039 |
+
findings_count=('kode_temuan', 'size'), # ✅ SAFE: uses row count
|
| 2040 |
unique_reporters=('creator_nid', 'nunique')
|
| 2041 |
).reset_index()
|
|
|
|
|
|
|
|
|
|
| 2042 |
|
| 2043 |
+
total_locations = len(loc_activity)
|
| 2044 |
+
low_coverage_locs = loc_activity[loc_activity['unique_reporters'] <= 1]
|
| 2045 |
risky_high_low = loc_activity[
|
| 2046 |
(loc_activity['findings_count'] > loc_activity['findings_count'].median()) &
|
| 2047 |
(loc_activity['unique_reporters'] <= 2)
|
| 2048 |
]
|
| 2049 |
|
| 2050 |
if not risky_high_low.empty:
|
| 2051 |
+
loc_list = risky_high_low['nama_lokasi_full'].head(3).tolist()
|
| 2052 |
loc_names = ', '.join(loc_list)
|
| 2053 |
insight = (
|
| 2054 |
f"Locations {loc_names} show high finding volume but rely on ≤2 reporters, indicating potential blind spots "
|
|
|
|
| 2060 |
)
|
| 2061 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2062 |
|
| 2063 |
+
if len(low_coverage_locs) > total_locations * 0.3 and total_locations > 3:
|
| 2064 |
insight = (
|
| 2065 |
f"Over 30% of locations ({len(low_coverage_locs)}/{total_locations}) are covered by only 1 reporter, "
|
| 2066 |
f"increasing the risk of unreported hazards due to observer fatigue or familiarity bias."
|
|
|
|
| 2071 |
)
|
| 2072 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2073 |
|
| 2074 |
+
# --- 2. Divisional Load & Frequency Risk ---
|
| 2075 |
+
if 'nama' in df.columns and 'created_at' in df.columns:
|
| 2076 |
+
# Ensure 'created_at' is datetime
|
| 2077 |
+
if not pd.api.types.is_datetime64_any_dtype(df['created_at']):
|
| 2078 |
+
df = df.copy()
|
| 2079 |
+
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
|
| 2080 |
+
|
| 2081 |
div_summary = df.groupby('nama').agg(
|
| 2082 |
+
total_findings=('kode_temuan', 'size'), # ✅
|
| 2083 |
unique_people=('creator_nid', 'nunique'),
|
| 2084 |
first_report=('created_at', 'min'),
|
| 2085 |
last_report=('created_at', 'max')
|
| 2086 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2087 |
|
| 2088 |
+
# Drop groups with NaT (invalid dates)
|
| 2089 |
+
div_summary = div_summary.dropna(subset=['first_report', 'last_report'])
|
| 2090 |
+
if not div_summary.empty:
|
| 2091 |
+
div_summary['reporting_span_days'] = (
|
| 2092 |
+
(div_summary['last_report'] - div_summary['first_report']).dt.days + 1
|
| 2093 |
+
).clip(lower=1) # avoid zero division
|
| 2094 |
+
div_summary['avg_freq_per_person'] = div_summary['total_findings'] / div_summary['unique_people'].replace(0, 1)
|
| 2095 |
+
div_summary['findings_per_day'] = div_summary['total_findings'] / div_summary['reporting_span_days']
|
| 2096 |
+
|
| 2097 |
+
HIGH_LOAD_THRESHOLD = 8.0
|
| 2098 |
+
LOW_ACTIVITY_THRESHOLD = 0.2
|
| 2099 |
+
|
| 2100 |
+
high_load_div = div_summary[div_summary['avg_freq_per_person'] >= HIGH_LOAD_THRESHOLD]
|
| 2101 |
+
low_activity_div = div_summary[div_summary['findings_per_day'] <= LOW_ACTIVITY_THRESHOLD]
|
| 2102 |
+
|
| 2103 |
+
if not high_load_div.empty:
|
| 2104 |
+
top_div = high_load_div['avg_freq_per_person'].idxmax()
|
| 2105 |
+
rate = high_load_div.loc[top_div, 'avg_freq_per_person']
|
| 2106 |
+
insight = (
|
| 2107 |
+
f"Division '{top_div}' has an elevated reporting load ({rate:.1f} findings/person), "
|
| 2108 |
+
f"which may lead to fatigue, rushed inspections, or selective reporting."
|
| 2109 |
+
)
|
| 2110 |
+
recommendation = (
|
| 2111 |
+
f"Augment the division’s safety team with cross-trained support staff. "
|
| 2112 |
+
f"Introduce AI-assisted checklist validation to reduce cognitive load. "
|
| 2113 |
+
f"Monitor for declining finding quality (e.g., vague descriptions)."
|
| 2114 |
+
)
|
| 2115 |
+
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2116 |
+
|
| 2117 |
+
if not low_activity_div.empty:
|
| 2118 |
+
low_divs = low_activity_div.index.tolist()[:3]
|
| 2119 |
+
div_names = ', '.join(low_divs)
|
| 2120 |
+
insight = (
|
| 2121 |
+
f"Divisions {div_names} show persistently low reporting frequency (<0.2 findings/day), "
|
| 2122 |
+
f"suggesting either excellent safety performance or significant under-reporting."
|
| 2123 |
+
)
|
| 2124 |
+
recommendation = (
|
| 2125 |
+
f"Conduct a *silent audit* (observation-only, no prior notice) in these divisions to validate safety status. "
|
| 2126 |
+
f"Review training records and psychological safety survey scores — fear of blame suppresses reporting."
|
| 2127 |
+
)
|
| 2128 |
+
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2129 |
|
| 2130 |
+
# --- 3. Agentic Safety Mitigation (Proactive vs Reactive) ---
|
| 2131 |
if 'temuan_kategori' in df.columns:
|
|
|
|
|
|
|
| 2132 |
total = len(df)
|
| 2133 |
proactive = (df['temuan_kategori'] == 'Positive').sum()
|
|
|
|
| 2134 |
proactive_rate = proactive / total if total > 0 else 0
|
| 2135 |
|
| 2136 |
insight = (
|
| 2137 |
f"Only {proactive_rate:.1%} of findings reflect *proactive* safety behaviors (e.g., positive interventions, improvements). "
|
| 2138 |
+
f"The remaining {100 - proactive_rate * 100:.1f}% are *reactive* (hazards already present)."
|
| 2139 |
)
|
| 2140 |
recommendation = (
|
| 2141 |
f"Shift incentives from 'finding count' to 'prevention impact'. "
|
|
|
|
| 2144 |
)
|
| 2145 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2146 |
|
| 2147 |
+
# --- 4. Emerging Risk Detection via Wordcloud (Risk Cloud) ---
|
| 2148 |
all_risk_terms, detected_terms = detect_emerging_risks(df)
|
| 2149 |
if detected_terms:
|
| 2150 |
+
high_sev_terms = ['exposed', 'fire hazard', 'fall', 'short circuit', 'unauthorized',
|
| 2151 |
+
'korsleting', 'kebakaran', 'jatuh', 'tanpa izin', 'kabel terkelupas']
|
|
|
|
| 2152 |
detected_high = [t for t in detected_terms if t in high_sev_terms]
|
| 2153 |
if detected_high:
|
| 2154 |
+
terms_str = ', '.join(detected_high[:4])
|
| 2155 |
insight = (
|
| 2156 |
f"Wordcloud analysis indicates emerging high-severity risks: *{terms_str}*. "
|
| 2157 |
+
f"These signal active hazards not yet fully mitigated."
|
| 2158 |
)
|
| 2159 |
recommendation = (
|
| 2160 |
f"Launch a 14-day *Targeted Risk Blitz* on locations reporting these terms. "
|
|
|
|
| 2163 |
)
|
| 2164 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2165 |
|
| 2166 |
+
# Cloud persists? → Why?
|
| 2167 |
+
if len(detected_terms) >= 4 and (len(all_risk_terms) - len(detected_terms)) > 5:
|
| 2168 |
insight = (
|
| 2169 |
f"Despite mitigation efforts, the risk 'cloud' persists — likely due to: "
|
| 2170 |
+
f"(1) Contractor turnover re-introducing old hazards, "
|
| 2171 |
f"(2) Incomplete closure verification, or "
|
| 2172 |
+
f"(3) Findings migrating across locations after local fixes."
|
| 2173 |
)
|
| 2174 |
recommendation = (
|
| 2175 |
+
f"Adopt *closed-loop verification*: require geo-tagged before/after photos + PIC sign-off. "
|
| 2176 |
+
f"Map recurring findings to contractor IDs — enforce SLA penalties for repeat failures. "
|
| 2177 |
+
f"Use AI clustering to detect systemic patterns (e.g., 'grounding failure' across 3 locations in 2 weeks)."
|
| 2178 |
)
|
| 2179 |
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2180 |
|
| 2181 |
+
# --- 5. Coverage Balancing Strategy ---
|
|
|
|
| 2182 |
if 'nama_lokasi_full' in df.columns and 'creator_nid' in df.columns:
|
| 2183 |
reporters_per_location = df.groupby('nama_lokasi_full')['creator_nid'].nunique()
|
| 2184 |
+
if len(reporters_per_location) > 1:
|
| 2185 |
+
mean_r = reporters_per_location.mean()
|
| 2186 |
+
std_r = reporters_per_location.std()
|
| 2187 |
+
coverage_gini = std_r / mean_r if mean_r > 0 else 0
|
| 2188 |
+
|
| 2189 |
+
if coverage_gini > 0.6:
|
| 2190 |
+
insight = (
|
| 2191 |
+
f"Coverage inequality (Gini ≈ {coverage_gini:.2f}) is high — a few locations dominate reporting effort. "
|
| 2192 |
+
f"This creates surveillance deserts in low-coverage zones."
|
| 2193 |
+
)
|
| 2194 |
+
recommendation = (
|
| 2195 |
+
f"1. Assign *minimum 2 unique reporters per high-risk location* monthly. "
|
| 2196 |
+
f"2. Use route optimization (e.g., VRP algorithm) to balance travel + inspection load. "
|
| 2197 |
+
f"3. Deploy mobile micro-checklists for non-auditors (e.g., operators) to increase eyes-on-ground."
|
| 2198 |
+
)
|
| 2199 |
+
insights.append({"insight": insight, "recommendation": recommendation})
|
| 2200 |
|
| 2201 |
return insights
|
| 2202 |
|
| 2203 |
+
# ==============================
|
| 2204 |
+
# Execute & Display
|
| 2205 |
+
# ==============================
|
| 2206 |
+
try:
|
| 2207 |
+
risk_insights = compute_risk_mitigation_insights(df_filtered)
|
| 2208 |
+
except Exception as e:
|
| 2209 |
+
st.error(f"Error in risk insight generation: {str(e)}")
|
| 2210 |
+
risk_insights = []
|
| 2211 |
|
| 2212 |
if risk_insights:
|
| 2213 |
for i, ir in enumerate(risk_insights, 1):
|
|
|
|
| 2215 |
st.markdown(f"<div class='ai-recommendation'><strong>Action {i}:</strong> {ir['recommendation']}</div>", unsafe_allow_html=True)
|
| 2216 |
else:
|
| 2217 |
st.markdown(
|
| 2218 |
+
"<div class='ai-insight'>No risk-mitigation insights generated. Ensure your data includes: "
|
| 2219 |
+
"<code>nama_lokasi_full</code>, <code>nama</code>, <code>creator_nid</code>, "
|
| 2220 |
+
"<code>temuan_kategori</code>, and at least one text field (e.g., <code>kondisi</code> or <code>judul</code>).</div>",
|
| 2221 |
unsafe_allow_html=True
|
| 2222 |
)
|