# gapura-ai-api / data / subcategory_service.py
# Muhammad Ridzki Nugraha
# Upload folder using huggingface_hub
# 13c3f2c verified
"""
Subcategory Classification Service for Gapura AI
Predicts Terminal_Area_Category and Apron_Area_Category from report text
"""
import os
import logging
import pickle
from typing import List, Dict, Any, Optional, Tuple
import re
# Module-level logger named after this module; used below to report
# whether the optional pickled model could be loaded.
logger = logging.getLogger(__name__)
class SubcategoryClassifier:
    """
    Keyword-based classifier that assigns a report to a subcategory.

    Each category owns a list of lowercase keywords (English and
    Indonesian). A category's score is the number of its keywords that
    occur as substrings of the lower-cased report text, plus a bonus of 2
    when the caller-supplied issue type maps to that category in
    ``ISSUE_TYPE_MAPPING``.

    NOTE(review): matching is plain substring containment, so short
    keywords also hit inside longer words (e.g. "uld" inside "should").
    This is the existing behaviour and is intentionally left unchanged.

    Terminal Area Categories:
    - Baggage/Special/Irregularities Handling
    - Passenger, Baggage & Document Profilling
    - Boarding Management
    - Procedure Competencies
    - Accuracy & Completeness of Service
    - Lack communication skills

    Apron Area Categories:
    - Procedure Competencies
    - Accurancy & Completeness of Service (Apron)
    - The Availability of GSE
    - Flight Document Handling
    - Preparation Before ETA
    - Safety Performance
    - Officer Competencies
    - Qualified Competencies (Apron)
    - Cleanliness of GSE
    - Prompt service and certainty
    """

    # Category label -> keywords searched for in the lower-cased text.
    # The labels are runtime values returned to callers; their spelling
    # (including "Profilling") is kept exactly as-is.
    TERMINAL_CATEGORIES: Dict[str, List[str]] = {
        "Baggage/Special/Irregularities Handling": [
            "bagasi",
            "baggage",
            "baggages",
            "koper",
            "lost baggage",
            "missing baggage",
            "baggage claim",
            "baggage handling",
            "special handling",
            "irregularities",
            "torn",
            "dented",
            "wet",
            "damaged baggage",
            "delayed baggage",
        ],
        "Passenger, Baggage & Document Profilling": [
            "passenger profiling",
            "document",
            "passport",
            "visa",
            "identification",
            "profilling",
            "screening",
            "security check",
            "dokumen",
            "penumpang",
            "wrong pax",
            "passenger wrong",
            "incorrect passenger",
        ],
        "Boarding Management": [
            "boarding",
            "gate",
            "embarkation",
            "boarding pass",
            "boarding card",
            "gate closure",
            "late boarding",
            "boarding delay",
            "wrong gate",
            "naik pesawat",
            "pintu",
            "kapasitas",
        ],
        "Procedure Competencies": [
            "procedure",
            "prosedur",
            "sop",
            "standard operating",
            "competency",
            "training",
            "skill",
            "knowledge",
            "tidak sesuai prosedur",
            "kelalaian",
            "negligence",
            "mistake",
            "human error",
        ],
        "Accuracy & Completeness of Service": [
            "accuracy",
            "completeness",
            "kelengkapan",
            "ketepatan",
            "incomplete",
            "missing information",
            "wrong information",
            "data entry",
            "input error",
            "administrasi",
            "dokumentasi",
            "documentation",
        ],
        "Lack communication skills": [
            "communication",
            "komunikasi",
            "language",
            "bahasa",
            "english",
            "tidak bisa komunikasi",
            "miscommunication",
            "tidak mengerti",
            "tidak paham",
            "instruction",
            "penjelasan",
        ],
    }

    # Same structure as TERMINAL_CATEGORIES, for the apron area. The label
    # spelling "Accurancy" is kept because it is a runtime value.
    APRON_CATEGORIES: Dict[str, List[str]] = {
        "Procedure Competencies": [
            "procedure",
            "prosedur",
            "sop",
            "load sheet",
            "instruction",
            "loading instruction",
            "unloading",
            "bertentangan",
            "tidak sesuai",
            "mistake",
            "error",
            "human error",
            "kelalaian",
        ],
        "Accurancy & Completeness of Service (Apron)": [
            "accuracy",
            "completeness",
            "wrong",
            "salah",
            "incorrect",
            "mismatch",
            "tidak sesuai",
            "different",
            "beda",
            "not match",
            "count",
            "jumlah",
            "pieces",
            "pcs",
            "weight",
            "berat",
        ],
        "The Availability of GSE": [
            "gse",
            "ground support",
            "equipment",
            "forklift",
            "pallet",
            "container",
            "uld",
            "belt loader",
            "stairs",
            "towing",
            "peralatan",
            "alat",
            "broken",
            "rusak",
            "tidak tersedia",
        ],
        "Flight Document Handling": [
            "document",
            "dokumen",
            "manifest",
            "load sheet",
            "awb",
            "air waybill",
            "flight plan",
            "notoc",
            "dangerous goods",
            "paperwork",
            "dokumentasi",
            # Fixed: this keyword previously carried a stray leading space,
            # so it could never match at the very start of the text.
            "kelengkapan dokumen",
        ],
        "Preparation Before ETA": [
            "preparation",
            "persiapan",
            "before eta",
            "standby",
            "ready",
            "belum siap",
            "not ready",
            "late preparation",
            "tardiness",
            "timing",
            "schedule",
            "jadwal",
        ],
        "Safety Performance": [
            "safety",
            "keselamatan",
            "danger",
            "bahaya",
            "hazard",
            "incident",
            "accident",
            "injury",
            "cedera",
            "emergency",
            "unsafe",
            "risk",
            "potensi bahaya",
        ],
        "Officer Competencies": [
            "officer",
            "petugas",
            "staff",
            "staffing",
            "competency",
            "skill",
            "kemampuan",
            "pengalaman",
            "experience",
            "training",
            "pelatihan",
            "certification",
        ],
        "Qualified Competencies (Apron)": [
            "qualified",
            "certified",
            "bersertifikat",
            "license",
            "lisensi",
            "authorized",
            "terlatih",
            "trained",
            "qualification",
        ],
        "Cleanliness of GSE": [
            "cleanliness",
            "kebersihan",
            "dirty",
            "kotor",
            "clean",
            "bersih",
            "hygiene",
            "sanitation",
            "maintain",
        ],
        "Prompt service and certainty": [
            "prompt",
            "cepat",
            "quick",
            "fast",
            "slow",
            "lambat",
            "delay",
            "terlambat",
            "waiting time",
            "tunggu",
            "response time",
        ],
    }

    # Issue type -> category labels that receive a +2 score bonus when the
    # caller passes that issue type.
    ISSUE_TYPE_MAPPING: Dict[str, List[str]] = {
        "Pax Handling": [
            "Passenger, Baggage & Document Profilling",
            "Boarding Management",
        ],
        "Baggage Handling": ["Baggage/Special/Irregularities Handling"],
        "Cargo Problems": [
            "Accurancy & Completeness of Service (Apron)",
            "The Availability of GSE",
        ],
        "GSE": ["The Availability of GSE", "Cleanliness of GSE"],
        "Operation": ["Procedure Competencies", "Preparation Before ETA"],
        "Flight Document Handling": ["Flight Document Handling"],
        "Procedure Competencies": ["Procedure Competencies", "Officer Competencies"],
    }

    # Category returned (with confidence 0.5) when nothing scores above zero.
    _TERMINAL_DEFAULT = "Baggage/Special/Irregularities Handling"
    _APRON_DEFAULT = "Procedure Competencies"

    # Normalised forms of the two area names that select a fixed category set.
    _TERMINAL_AREA = "terminal area"
    _APRON_AREA = "apron area"

    def __init__(self):
        """Create the classifier and try to load an optional pickled model."""
        # These attributes are populated by _load_model() when a model file
        # exists. The keyword-based methods of this class do not read them.
        self.model = None
        self.vectorizer = None
        self.terminal_label_encoder = None
        self.apron_label_encoder = None
        self._load_model()

    def _load_model(self):
        """
        Load the trained model bundle if the file is present.

        Looks for ``models/subcategory/classifier.pkl`` two directory levels
        above this file. A missing file is silently ignored; any error while
        reading it is logged as a warning and the attributes stay ``None``.
        """
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        model_path = os.path.join(base_dir, "models", "subcategory", "classifier.pkl")
        if not os.path.exists(model_path):
            return
        try:
            with open(model_path, "rb") as f:
                # SECURITY: pickle.load executes arbitrary code embedded in
                # the file. Only load model files from a trusted source.
                model_data = pickle.load(f)
            self.model = model_data.get("model")
            self.vectorizer = model_data.get("vectorizer")
            self.terminal_label_encoder = model_data.get("terminal_encoder")
            self.apron_label_encoder = model_data.get("apron_encoder")
            logger.info("Subcategory classifier loaded")
        except Exception as e:
            # Deliberately broad: loading is best-effort and must not stop
            # the keyword-based classifier from being constructed.
            logger.warning("Failed to load subcategory model: %s", e)

    @staticmethod
    def _normalize_area(area: Optional[str]) -> str:
        """Return *area* trimmed, whitespace-collapsed and case-folded ('' if not a string)."""
        if not isinstance(area, str):
            return ""
        return " ".join(area.split()).casefold()

    def classify(
        self,
        report: Optional[str],
        area: Optional[str] = None,
        issue_type: Optional[str] = None,
        root_cause: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Classify a report into a subcategory.

        Args:
            report: Report text. ``None`` is treated as empty text.
            area: "Terminal Area" or "Apron Area" (compared ignoring case
                and surrounding/repeated whitespace). Any other value,
                including ``None``, triggers auto-detection.
            issue_type: Issue category; categories mapped to it in
                ``ISSUE_TYPE_MAPPING`` receive a score bonus.
            root_cause: Root cause text, appended to the report text.

        Returns:
            Dict with ``subcategory``, ``confidence`` and ``all_scores``.
            When the area is auto-detected the dict also carries
            ``detected_area``.
        """
        # `or ''` keeps a missing report/root cause from being matched as
        # the literal text "None".
        combined_text = f"{report or ''} {root_cause or ''}".lower()

        area_key = self._normalize_area(area)
        if area_key == self._TERMINAL_AREA:
            return self._classify_terminal(combined_text, issue_type)
        if area_key == self._APRON_AREA:
            return self._classify_apron(combined_text, issue_type)

        # Auto-detect: score against both category sets and keep the more
        # confident result. Ties go to the apron result.
        terminal_result = self._classify_terminal(combined_text, issue_type)
        apron_result = self._classify_apron(combined_text, issue_type)
        if terminal_result["confidence"] > apron_result["confidence"]:
            return {**terminal_result, "detected_area": "Terminal Area"}
        return {**apron_result, "detected_area": "Apron Area"}

    def _score_categories(
        self,
        categories: Dict[str, List[str]],
        default_category: str,
        text: str,
        issue_type: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Score *text* against *categories* and build the result dict.

        Args:
            categories: Mapping of category label to keyword list.
            default_category: Label returned when every score is zero.
            text: Lower-cased text to search.
            issue_type: Optional issue type used for the +2 bonus.

        Returns:
            Dict with ``subcategory``, ``confidence`` (0.5 to 0.95) and
            ``all_scores`` (each score divided by the score total).
        """
        boosted = self.ISSUE_TYPE_MAPPING.get(issue_type, ()) if issue_type else ()

        scores = {}
        for category, keywords in categories.items():
            score = sum(1 for kw in keywords if kw in text)
            if category in boosted:
                score += 2
            scores[category] = score

        # `or 1` avoids dividing by zero when nothing matched.
        total = sum(scores.values()) or 1
        all_scores = {k: round(v / total, 2) for k, v in scores.items()}

        # max() returns the first category in dict order on ties.
        best_category = max(scores, key=scores.get)
        best_score = scores[best_category]
        if best_score == 0:
            return {
                "subcategory": default_category,
                "confidence": 0.5,
                "all_scores": all_scores,
            }

        confidence = min(0.95, 0.5 + (best_score / total) * 0.45)
        return {
            "subcategory": best_category,
            "confidence": round(confidence, 2),
            "all_scores": all_scores,
        }

    def _classify_terminal(self, text: str, issue_type: Optional[str] = None) -> Dict[str, Any]:
        """Classify lower-cased *text* into a Terminal Area category."""
        return self._score_categories(
            self.TERMINAL_CATEGORIES, self._TERMINAL_DEFAULT, text, issue_type
        )

    def _classify_apron(self, text: str, issue_type: Optional[str] = None) -> Dict[str, Any]:
        """Classify lower-cased *text* into an Apron Area category."""
        return self._score_categories(
            self.APRON_CATEGORIES, self._APRON_DEFAULT, text, issue_type
        )

    def classify_batch(self, records: List[Dict]) -> List[Dict[str, Any]]:
        """
        Classify multiple records.

        Each record is read through the keys ``Report``, ``Area``,
        ``Irregularity_Complain_Category`` and ``Root_Caused``; missing keys
        are treated as absent values.

        Returns:
            One ``classify`` result per record, in input order.
        """
        return [
            self.classify(
                report=record.get("Report", ""),
                area=record.get("Area"),
                issue_type=record.get("Irregularity_Complain_Category"),
                root_cause=record.get("Root_Caused"),
            )
            for record in records
        ]

    def get_available_categories(self, area: Optional[str] = None) -> Dict[str, List[str]]:
        """
        Get the list of available category labels.

        Args:
            area: "Terminal Area" or "Apron Area" (compared ignoring case
                and extra whitespace) to get one list; anything else
                returns both.

        Returns:
            Dict keyed by ``terminal`` and/or ``apron``.
        """
        area_key = self._normalize_area(area)
        if area_key == self._TERMINAL_AREA:
            return {"terminal": list(self.TERMINAL_CATEGORIES)}
        if area_key == self._APRON_AREA:
            return {"apron": list(self.APRON_CATEGORIES)}
        return {
            "terminal": list(self.TERMINAL_CATEGORIES),
            "apron": list(self.APRON_CATEGORIES),
        }
# Lazily created shared instance; stays None until the accessor is first called.
_subcategory_classifier: Optional[SubcategoryClassifier] = None


def get_subcategory_classifier() -> SubcategoryClassifier:
    """Return the shared SubcategoryClassifier, creating it on first use."""
    global _subcategory_classifier
    instance = _subcategory_classifier
    if instance is None:
        # First call: build the classifier and cache it at module level.
        instance = SubcategoryClassifier()
        _subcategory_classifier = instance
    return instance