# Spaces: gapura-dev / gapura-ai-api
# Build error
# File size: 18,205 Bytes
# 13c3f2c

"""
Category Summarization Service
Provides aggregated summaries for Non-cargo and CGO categories
"""

import os
import logging
from typing import Dict, Any, List, Optional
from collections import Counter
from datetime import datetime, timedelta
import pandas as pd

logger = logging.getLogger(__name__)


class CategorySummarizationService:
    """Service for generating category-based summaries"""

    def __init__(self):
        self.severity_keywords = {
            "critical": [
                "emergency",
                "darurat",
                "critical",
                "kritis",
                "accident",
                "kecelakaan",
                "injury",
                "cedera",
            ],
            "high": [
                "damage",
                "rusak",
                "torn",
                "robek",
                "broken",
                "pecah",
                "urgent",
                "mendesak",
                "lost",
                "hilang",
                "stolen",
                "dicuri",
            ],
            "medium": [
                "delay",
                "terlambat",
                "wrong",
                "salah",
                "error",
                "kesalahan",
                "missing",
                "problem",
                "masalah",
            ],
            "low": ["minor", "kecil", "small", "sedikit", "normal", "biasa"],
        }
        self._data_cache = {}
        self._last_updated = None

    def summarize_category(
        self, data: List[Dict], category_type: str = "all"
    ) -> Dict[str, Any]:
        """
        Generate summary for a specific category

        Args:
            data: List of records with _sheet_name field
            category_type: "non_cargo", "cgo", or "all"
        """
        if category_type == "all":
            return {
                "non_cargo": self._summarize_single_category(data, "NON CARGO"),
                "cgo": self._summarize_single_category(data, "CGO"),
                "comparison": self._compare_categories(data),
            }
        elif category_type == "non_cargo":
            return self._summarize_single_category(data, "NON CARGO")
        elif category_type == "cgo":
            return self._summarize_single_category(data, "CGO")
        else:
            return {"error": f"Unknown category type: {category_type}"}

    def _summarize_single_category(
        self, data: List[Dict], sheet_name: str
    ) -> Dict[str, Any]:
        """Generate summary for a single category"""
        filtered_data = [r for r in data if r.get("_sheet_name") == sheet_name]

        if not filtered_data:
            return {
                "sheet_name": sheet_name,
                "total_records": 0,
                "message": "No data available for this category",
            }

        total_records = len(filtered_data)

        severity_dist = Counter()
        category_dist = Counter()
        airline_dist = Counter()
        hub_dist = Counter()
        branch_dist = Counter()
        area_dist = Counter()
        status_dist = Counter()
        issue_type_dist = Counter()
        root_cause_dist = Counter()
        monthly_trend = Counter()

        reports_text = []
        root_causes_text = []
        actions_text = []

        for record in filtered_data:
            report_text = record.get("Report", "") or ""
            root_cause = record.get("Root_Caused", "") or ""
            combined = f"{report_text} {root_cause}".lower()

            severity = self._classify_severity(combined)
            severity_dist[severity] += 1

            category = record.get("Irregularity_Complain_Category", "Unknown")
            category_dist[category] += 1

            airline = record.get("Airlines", "Unknown")
            airline_dist[airline] += 1

            hub = record.get("HUB", "Unknown")
            hub_dist[hub] += 1

            branch = record.get("Branch", "Unknown")
            branch_dist[branch] += 1

            area = record.get("Area", "Unknown")
            area_dist[area] += 1

            status = record.get("Status", "Unknown")
            status_dist[status] += 1

            if category and category != "Unknown":
                issue_type_dist[category] += 1

            if root_cause:
                root_cause_dist[self._categorize_root_cause(root_cause)] += 1

            date_str = record.get("Date_of_Event", "")
            if date_str:
                try:
                    date_obj = pd.to_datetime(date_str, errors="coerce")
                    if not pd.isna(date_obj):
                        month_key = date_obj.strftime("%Y-%m")
                        monthly_trend[month_key] += 1
                except:
                    pass

            if report_text:
                reports_text.append(report_text)
            if root_cause:
                root_causes_text.append(root_cause)
            action = record.get("Action_Taken", "")
            if action:
                actions_text.append(action)

        critical_high_count = severity_dist.get("Critical", 0) + severity_dist.get(
            "High", 0
        )
        critical_high_pct = (
            round((critical_high_count / total_records) * 100, 1)
            if total_records > 0
            else 0
        )

        open_count = status_dist.get("Open", 0) + status_dist.get("In Progress", 0)
        open_pct = (
            round((open_count / total_records) * 100, 1) if total_records > 0 else 0
        )

        key_insights = self._generate_key_insights(
            sheet_name,
            total_records,
            severity_dist,
            category_dist,
            airline_dist,
            critical_high_pct,
            open_pct,
        )

        common_issues = self._extract_common_issues(reports_text)

        return {
            "sheet_name": sheet_name,
            "total_records": total_records,
            "severity_distribution": dict(severity_dist),
            "critical_high_percentage": critical_high_pct,
            "open_issues_percentage": open_pct,
            "top_categories": dict(category_dist.most_common(5)),
            "top_airlines": dict(airline_dist.most_common(5)),
            "top_hubs": dict(hub_dist.most_common(5)),
            "top_branches": dict(branch_dist.most_common(5)),
            "area_distribution": dict(area_dist),
            "status_distribution": dict(status_dist),
            "root_cause_categories": dict(root_cause_dist.most_common(5)),
            "monthly_trend": dict(sorted(monthly_trend.items())[-6:]),
            "key_insights": key_insights,
            "common_issues": common_issues,
            "recommendations": self._generate_recommendations(
                severity_dist, category_dist, root_cause_dist
            ),
            "last_updated": datetime.now().isoformat(),
        }

    def _classify_severity(self, text: str) -> str:
        """Classify severity based on keywords"""
        text_lower = text.lower()

        for level, keywords in self.severity_keywords.items():
            for kw in keywords:
                if kw in text_lower:
                    return level.capitalize()

        return "Low"

    def _categorize_root_cause(self, root_cause: str) -> str:
        """Categorize root cause into categories"""
        rc_lower = root_cause.lower()

        categories = {
            "Equipment Failure": [
                "equipment",
                "mesin",
                "alat",
                "tool",
                "machine",
                "device",
                "broken",
                "rusak",
                "malfunction",
            ],
            "Staff Competency": [
                "staff",
                "staffing",
                "kompetensi",
                "skill",
                "training",
                "pelatihan",
                "human error",
                "kurang",
            ],
            "Process/Procedure": [
                "procedure",
                "prosedur",
                "process",
                "proses",
                "sop",
                "workflow",
                "system",
            ],
            "Communication": [
                "communication",
                "komunikasi",
                "informasi",
                "koordinasi",
                "coordination",
                "miscommunication",
            ],
            "External Factors": [
                "weather",
                "cuaca",
                "external",
                "flight delay",
                "airline",
                "airline delay",
                "faktor luar",
            ],
            "Resource/Manpower": [
                "manpower",
                "tenaga",
                "shortage",
                "kurang",
                "resource",
                "sumber daya",
                "lack of",
            ],
            "Documentation": [
                "document",
                "dokumen",
                "paperwork",
                "paper",
                "label",
                "tag",
                "manifest",
            ],
        }

        for category, keywords in categories.items():
            for kw in keywords:
                if kw in rc_lower:
                    return category

        return "Other"

    def _generate_key_insights(
        self,
        sheet_name: str,
        total_records: int,
        severity_dist: Counter,
        category_dist: Counter,
        airline_dist: Counter,
        critical_high_pct: float,
        open_pct: float,
    ) -> List[str]:
        """Generate key insights from the data"""
        insights = []

        category_label = "Non-Cargo" if sheet_name == "NON CARGO" else "Cargo"
        insights.append(f"Total {total_records} {category_label} reports analyzed")

        if critical_high_pct > 20:
            insights.append(
                f"High priority attention needed: {critical_high_pct}% Critical/High severity issues"
            )
        elif critical_high_pct > 10:
            insights.append(
                f"Moderate concern: {critical_high_pct}% Critical/High severity issues"
            )
        else:
            insights.append(
                f"Severity levels manageable: Only {critical_high_pct}% Critical/High severity"
            )

        if open_pct > 30:
            insights.append(f"Action required: {open_pct}% issues still open/pending")

        top_category = category_dist.most_common(1)
        if top_category:
            insights.append(
                f"Most common issue type: {top_category[0][0]} ({top_category[0][1]} occurrences)"
            )

        top_airline = airline_dist.most_common(1)
        if top_airline and top_airline[0][0] != "Unknown":
            insights.append(
                f"Highest reporting airline: {top_airline[0][0]} ({top_airline[0][1]} reports)"
            )

        critical_count = severity_dist.get("Critical", 0)
        if critical_count > 0:
            insights.append(
                f"ATTENTION: {critical_count} Critical severity issues require immediate action"
            )

        return insights

    def _extract_common_issues(self, reports: List[str]) -> List[Dict[str, Any]]:
        """Extract common issues from reports"""
        issue_keywords = {
            "Damage/Destruction": [
                "damage",
                "rusak",
                "broken",
                "pecah",
                "torn",
                "robek",
                "destroyed",
            ],
            "Delay/Late": ["delay", "terlambat", "late", "telat", "waiting", "tunggu"],
            "Missing/Lost Items": [
                "missing",
                "hilang",
                "lost",
                "not found",
                "tidak ada",
            ],
            "Documentation Error": [
                "wrong document",
                "salah dokumen",
                "incorrect",
                "label error",
                "tag salah",
            ],
            "Handling Issue": [
                "handling",
                "penanganan",
                "mishandled",
                "rough",
                "kasar",
            ],
            "Communication Issue": [
                "communication",
                "komunikasi",
                "information",
                "informasi",
                "coordinate",
            ],
            "Equipment Problem": [
                "equipment",
                "alat",
                "mesin",
                "machine",
                "device",
                "tool",
            ],
            "Security Concern": [
                "security",
                "keamanan",
                "unauthorized",
                "access",
                "theft",
                "pencurian",
            ],
        }

        issue_counts = Counter()

        for report in reports:
            report_lower = report.lower()
            for issue_type, keywords in issue_keywords.items():
                for kw in keywords:
                    if kw in report_lower:
                        issue_counts[issue_type] += 1
                        break

        return [
            {"issue": issue, "count": count}
            for issue, count in issue_counts.most_common(8)
        ]

    def _generate_recommendations(
        self, severity_dist: Counter, category_dist: Counter, root_cause_dist: Counter
    ) -> List[str]:
        """Generate actionable recommendations"""
        recommendations = []

        critical_count = severity_dist.get("Critical", 0) + severity_dist.get("High", 0)
        if critical_count > 10:
            recommendations.append(
                "Establish dedicated task force for critical/high severity issues"
            )

        top_root_cause = root_cause_dist.most_common(1)
        if top_root_cause:
            rc = top_root_cause[0][0]
            if rc == "Equipment Failure":
                recommendations.append("Schedule preventive maintenance for equipment")
            elif rc == "Staff Competency":
                recommendations.append(
                    "Implement refresher training programs for staff"
                )
            elif rc == "Process/Procedure":
                recommendations.append(
                    "Review and update standard operating procedures"
                )
            elif rc == "Communication":
                recommendations.append(
                    "Improve inter-department communication protocols"
                )
            elif rc == "Resource/Manpower":
                recommendations.append(
                    "Evaluate resource allocation and staffing levels"
                )

        top_category = category_dist.most_common(1)
        if top_category:
            cat = top_category[0][0]
            if "damage" in cat.lower() or "broken" in cat.lower():
                recommendations.append(
                    "Implement enhanced handling protocols to reduce damage"
                )
            elif "delay" in cat.lower():
                recommendations.append("Optimize workflow to minimize delays")

        if not recommendations:
            recommendations.append(
                "Continue monitoring trends and maintain current performance"
            )

        return recommendations

    def _compare_categories(self, data: List[Dict]) -> Dict[str, Any]:
        """Compare Non-cargo and CGO categories"""
        non_cargo_data = [r for r in data if r.get("_sheet_name") == "NON CARGO"]
        cgo_data = [r for r in data if r.get("_sheet_name") == "CGO"]

        non_cargo_summary = self._summarize_single_category(data, "NON CARGO")
        cgo_summary = self._summarize_single_category(data, "CGO")

        comparison = {
            "total_records": {
                "non_cargo": len(non_cargo_data),
                "cgo": len(cgo_data),
                "difference": len(non_cargo_data) - len(cgo_data),
            },
            "critical_high_percentage": {
                "non_cargo": non_cargo_summary.get("critical_high_percentage", 0),
                "cgo": cgo_summary.get("critical_high_percentage", 0),
            },
            "open_issues_percentage": {
                "non_cargo": non_cargo_summary.get("open_issues_percentage", 0),
                "cgo": cgo_summary.get("open_issues_percentage", 0),
            },
            "top_categories_comparison": {
                "non_cargo": list(non_cargo_summary.get("top_categories", {}).keys())[
                    :3
                ],
                "cgo": list(cgo_summary.get("top_categories", {}).keys())[:3],
            },
            "severity_comparison": {
                "non_cargo": non_cargo_summary.get("severity_distribution", {}),
                "cgo": cgo_summary.get("severity_distribution", {}),
            },
            "insights": [],
        }

        if len(non_cargo_data) > len(cgo_data):
            comparison["insights"].append(
                f"Non-Cargo has {len(non_cargo_data) - len(cgo_data)} more reports than Cargo"
            )
        elif len(cgo_data) > len(non_cargo_data):
            comparison["insights"].append(
                f"Cargo has {len(cgo_data) - len(non_cargo_data)} more reports than Non-Cargo"
            )

        nc_crit = non_cargo_summary.get("critical_high_percentage", 0)
        cgo_crit = cgo_summary.get("critical_high_percentage", 0)
        if nc_crit > cgo_crit + 5:
            comparison["insights"].append(
                "Non-Cargo has higher critical/high severity rate - needs attention"
            )
        elif cgo_crit > nc_crit + 5:
            comparison["insights"].append(
                "Cargo has higher critical/high severity rate - needs attention"
            )

        return comparison


# Module-wide service instance; stays None until the accessor below is first
# called.
_service_instance = None


def get_category_summarization_service() -> CategorySummarizationService:
    """Return the shared service instance, creating it on the first call."""
    global _service_instance
    if _service_instance is not None:
        return _service_instance
    _service_instance = CategorySummarizationService()
    return _service_instance