Spaces:

gapura-dev
/

gapura-ai-api

Build error

gapura-ai-api / data /subcategory_service.py

Muhammad Ridzki Nugraha

Upload folder using huggingface_hub

13c3f2c verified about 2 months ago

13.5 kB

	"""
	Subcategory Classification Service for Gapura AI
	Predicts Terminal_Area_Category and Apron_Area_Category from report text
	"""

	import os
	import logging
	import pickle
	from typing import List, Dict, Any, Optional, Tuple
	import re

	# Module-level logger named after this module; used by SubcategoryClassifier
	# to report whether the optional pickled model could be loaded.
	logger = logging.getLogger(__name__)


	class SubcategoryClassifier:
	    """
	    Keyword-based classifier that maps a report to a service subcategory.

	    The report text (plus the optional root-cause text) is lower-cased and
	    every keyword of every category is looked up as a plain substring.  The
	    category with the most keyword hits wins; an optional issue type adds a
	    fixed bonus to the categories listed for it in ``ISSUE_TYPE_MAPPING``.

	    Terminal Area Categories:
	    - Baggage/Special/Irregularities Handling
	    - Passenger, Baggage & Document Profilling
	    - Boarding Management
	    - Procedure Competencies
	    - Accuracy & Completeness of Service
	    - Lack communication skills

	    Apron Area Categories:
	    - Procedure Competencies
	    - Accurancy & Completeness of Service (Apron)
	    - The Availability of GSE
	    - Flight Document Handling
	    - Preparation Before ETA
	    - Safety Performance
	    - Officer Competencies
	    - Qualified Competencies (Apron)
	    - Cleanliness of GSE
	    - Prompt service and certainty

	    The category names above (including their spelling) are the dictionary
	    keys returned to callers, so they are reproduced verbatim.
	    """

	    # Category name -> lower-case keywords searched for in Terminal Area reports.
	    TERMINAL_CATEGORIES = {
	        "Baggage/Special/Irregularities Handling": [
	            "bagasi",
	            "baggage",
	            "baggages",
	            "koper",
	            "lost baggage",
	            "missing baggage",
	            "baggage claim",
	            "baggage handling",
	            "special handling",
	            "irregularities",
	            "torn",
	            "dented",
	            "wet",
	            "damaged baggage",
	            "delayed baggage",
	        ],
	        "Passenger, Baggage & Document Profilling": [
	            "passenger profiling",
	            "document",
	            "passport",
	            "visa",
	            "identification",
	            "profilling",
	            "screening",
	            "security check",
	            "dokumen",
	            "penumpang",
	            "wrong pax",
	            "passenger wrong",
	            "incorrect passenger",
	        ],
	        "Boarding Management": [
	            "boarding",
	            "gate",
	            "embarkation",
	            "boarding pass",
	            "boarding card",
	            "gate closure",
	            "late boarding",
	            "boarding delay",
	            "wrong gate",
	            "naik pesawat",
	            "pintu",
	            "kapasitas",
	        ],
	        "Procedure Competencies": [
	            "procedure",
	            "prosedur",
	            "sop",
	            "standard operating",
	            "competency",
	            "training",
	            "skill",
	            "knowledge",
	            "tidak sesuai prosedur",
	            "kelalaian",
	            "negligence",
	            "mistake",
	            "human error",
	        ],
	        "Accuracy & Completeness of Service": [
	            "accuracy",
	            "completeness",
	            "kelengkapan",
	            "ketepatan",
	            "incomplete",
	            "missing information",
	            "wrong information",
	            "data entry",
	            "input error",
	            "administrasi",
	            "dokumentasi",
	            "documentation",
	        ],
	        "Lack communication skills": [
	            "communication",
	            "komunikasi",
	            "language",
	            "bahasa",
	            "english",
	            "tidak bisa komunikasi",
	            "miscommunication",
	            "tidak mengerti",
	            "tidak paham",
	            "instruction",
	            "penjelasan",
	        ],
	    }

	    # Category name -> lower-case keywords searched for in Apron Area reports.
	    APRON_CATEGORIES = {
	        "Procedure Competencies": [
	            "procedure",
	            "prosedur",
	            "sop",
	            "load sheet",
	            "instruction",
	            "loading instruction",
	            "unloading",
	            "bertentangan",
	            "tidak sesuai",
	            "mistake",
	            "error",
	            "human error",
	            "kelalaian",
	        ],
	        "Accurancy & Completeness of Service (Apron)": [
	            "accuracy",
	            "completeness",
	            "wrong",
	            "salah",
	            "incorrect",
	            "mismatch",
	            "tidak sesuai",
	            "different",
	            "beda",
	            "not match",
	            "count",
	            "jumlah",
	            "pieces",
	            "pcs",
	            "weight",
	            "berat",
	        ],
	        "The Availability of GSE": [
	            "gse",
	            "ground support",
	            "equipment",
	            "forklift",
	            "pallet",
	            "container",
	            "uld",
	            "belt loader",
	            "stairs",
	            "towing",
	            "peralatan",
	            "alat",
	            "broken",
	            "rusak",
	            "tidak tersedia",
	        ],
	        "Flight Document Handling": [
	            "document",
	            "dokumen",
	            "manifest",
	            "load sheet",
	            "awb",
	            "air waybill",
	            "flight plan",
	            "notoc",
	            "dangerous goods",
	            "paperwork",
	            "dokumentasi",
	            # Previously written with a stray leading space, which kept it
	            # from matching at the very start of the text.
	            "kelengkapan dokumen",
	        ],
	        "Preparation Before ETA": [
	            "preparation",
	            "persiapan",
	            "before eta",
	            "standby",
	            "ready",
	            "belum siap",
	            "not ready",
	            "late preparation",
	            "tardiness",
	            "timing",
	            "schedule",
	            "jadwal",
	        ],
	        "Safety Performance": [
	            "safety",
	            "keselamatan",
	            "danger",
	            "bahaya",
	            "hazard",
	            "incident",
	            "accident",
	            "injury",
	            "cedera",
	            "emergency",
	            "unsafe",
	            "risk",
	            "potensi bahaya",
	        ],
	        "Officer Competencies": [
	            "officer",
	            "petugas",
	            "staff",
	            "staffing",
	            "competency",
	            "skill",
	            "kemampuan",
	            "pengalaman",
	            "experience",
	            "training",
	            "pelatihan",
	            "certification",
	        ],
	        "Qualified Competencies (Apron)": [
	            "qualified",
	            "certified",
	            "bersertifikat",
	            "license",
	            "lisensi",
	            "authorized",
	            "terlatih",
	            "trained",
	            "qualification",
	        ],
	        "Cleanliness of GSE": [
	            "cleanliness",
	            "kebersihan",
	            "dirty",
	            "kotor",
	            "clean",
	            "bersih",
	            "hygiene",
	            "sanitation",
	            "maintain",
	        ],
	        "Prompt service and certainty": [
	            "prompt",
	            "cepat",
	            "quick",
	            "fast",
	            "slow",
	            "lambat",
	            "delay",
	            "terlambat",
	            "waiting time",
	            "tunggu",
	            "response time",
	        ],
	    }

	    # Issue type -> subcategories whose score is boosted when that issue
	    # type is supplied.  Names may refer to Terminal and/or Apron categories.
	    ISSUE_TYPE_MAPPING = {
	        "Pax Handling": [
	            "Passenger, Baggage & Document Profilling",
	            "Boarding Management",
	        ],
	        "Baggage Handling": ["Baggage/Special/Irregularities Handling"],
	        "Cargo Problems": [
	            "Accurancy & Completeness of Service (Apron)",
	            "The Availability of GSE",
	        ],
	        "GSE": ["The Availability of GSE", "Cleanliness of GSE"],
	        "Operation": ["Procedure Competencies", "Preparation Before ETA"],
	        "Flight Document Handling": ["Flight Document Handling"],
	        "Procedure Competencies": ["Procedure Competencies", "Officer Competencies"],
	    }

	    def __init__(self):
	        """Create the classifier and try to load the optional pickled model."""
	        self.model = None
	        self.vectorizer = None
	        self.terminal_label_encoder = None
	        self.apron_label_encoder = None
	        self._load_model()

	    def _load_model(self):
	        """
	        Load the trained model from ``<base>/models/subcategory/classifier.pkl``.

	        ``<base>`` is the parent of the directory containing this file.  A
	        missing file is silently ignored and a failing load only logs a
	        warning, so the keyword-based classification keeps working either
	        way.  The loaded objects are stored on the instance; the keyword
	        scoring in this class does not consult them.
	        """
	        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	        model_path = os.path.join(base_dir, "models", "subcategory", "classifier.pkl")

	        if not os.path.exists(model_path):
	            return

	        try:
	            with open(model_path, "rb") as f:
	                # SECURITY: unpickling executes arbitrary code embedded in the
	                # file.  Only ship classifier.pkl from a trusted build step.
	                model_data = pickle.load(f)
	            self.model = model_data.get("model")
	            self.vectorizer = model_data.get("vectorizer")
	            self.terminal_label_encoder = model_data.get("terminal_encoder")
	            self.apron_label_encoder = model_data.get("apron_encoder")
	            logger.info("Subcategory classifier loaded")
	        except Exception as e:
	            # Deliberately broad: loading is best-effort and must never
	            # prevent the service from starting.
	            logger.warning("Failed to load subcategory model: %s", e)

	    def classify(
	        self,
	        report: str,
	        area: Optional[str] = None,
	        issue_type: Optional[str] = None,
	        root_cause: Optional[str] = None,
	    ) -> Dict[str, Any]:
	        """
	        Classify report into subcategory

	        Args:
	            report: Report text; ``None`` or empty is treated as no text.
	            area: Area type (Terminal Area / Apron Area / General)
	            issue_type: Issue category
	            root_cause: Root cause text

	        Returns:
	            Dict with ``subcategory``, ``confidence`` and ``all_scores``.
	            When ``area`` is neither "Terminal Area" nor "Apron Area" both
	            keyword sets are scored and the result additionally carries
	            ``detected_area``.
	        """
	        # Guard both parts so a missing report is not scored as the literal
	        # text "none".
	        combined_text = f"{report or ''} {root_cause or ''}".lower()

	        if area == "Terminal Area":
	            return self._classify_terminal(combined_text, issue_type)
	        if area == "Apron Area":
	            return self._classify_apron(combined_text, issue_type)

	        # Auto-detect area and classify
	        terminal_result = self._classify_terminal(combined_text, issue_type)
	        apron_result = self._classify_apron(combined_text, issue_type)

	        # Ties (including "nothing matched anywhere") resolve to Apron Area.
	        if terminal_result["confidence"] > apron_result["confidence"]:
	            return {**terminal_result, "detected_area": "Terminal Area"}
	        return {**apron_result, "detected_area": "Apron Area"}

	    def _score_categories(
	        self,
	        text: str,
	        issue_type: Optional[str],
	        categories: Dict[str, List[str]],
	        default_category: str,
	    ) -> Dict[str, Any]:
	        """
	        Score ``text`` against one keyword table and pick the best category.

	        Args:
	            text: Lower-cased text to search.
	            issue_type: Optional issue type; categories mapped to it in
	                ``ISSUE_TYPE_MAPPING`` receive a +2 bonus.
	            categories: Category name -> keyword list.
	            default_category: Category reported when every score is zero.

	        Returns:
	            Dict with ``subcategory``, ``confidence`` (0.5 .. 0.95, rounded
	            to two decimals) and ``all_scores`` (each score divided by the
	            sum of all scores, rounded to two decimals).
	        """
	        boosted = self.ISSUE_TYPE_MAPPING.get(issue_type, ()) if issue_type else ()

	        scores = {}
	        for category, keywords in categories.items():
	            # NOTE(review): plain substring test, so a keyword also fires
	            # inside longer words (e.g. "uld" in "should").  Kept as is
	            # because it also catches affixed forms of the keywords.
	            score = sum(1 for kw in keywords if kw in text)

	            # Boost score if issue type matches
	            if category in boosted:
	                score += 2

	            scores[category] = score

	        # "or 1" avoids a division by zero when nothing matched.
	        total = sum(scores.values()) or 1
	        all_scores = {k: round(v / total, 2) for k, v in scores.items()}

	        # max() keeps the first category in table order on ties.
	        best_category = max(scores, key=scores.get)
	        best_score = scores[best_category]

	        if best_score == 0:
	            return {
	                "subcategory": default_category,
	                "confidence": 0.5,
	                "all_scores": all_scores,
	            }

	        # Map the winner's share of all hits onto the 0.5 .. 0.95 range.
	        confidence = min(0.95, 0.5 + (best_score / total) * 0.45)

	        return {
	            "subcategory": best_category,
	            "confidence": round(confidence, 2),
	            "all_scores": all_scores,
	        }

	    def _classify_terminal(
	        self, text: str, issue_type: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """Classify into Terminal Area category"""
	        return self._score_categories(
	            text,
	            issue_type,
	            self.TERMINAL_CATEGORIES,
	            "Baggage/Special/Irregularities Handling",
	        )

	    def _classify_apron(
	        self, text: str, issue_type: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """Classify into Apron Area category"""
	        return self._score_categories(
	            text,
	            issue_type,
	            self.APRON_CATEGORIES,
	            "Procedure Competencies",
	        )

	    def classify_batch(self, records: List[Dict]) -> List[Dict[str, Any]]:
	        """
	        Classify multiple records

	        Each record is read through the keys ``Report``, ``Area``,
	        ``Irregularity_Complain_Category`` and ``Root_Caused``; results are
	        returned in input order.
	        """
	        return [
	            self.classify(
	                report=record.get("Report", ""),
	                area=record.get("Area"),
	                issue_type=record.get("Irregularity_Complain_Category"),
	                root_cause=record.get("Root_Caused"),
	            )
	            for record in records
	        ]

	    def get_available_categories(
	        self, area: Optional[str] = None
	    ) -> Dict[str, List[str]]:
	        """
	        Get list of available categories

	        Returns only the ``terminal`` or only the ``apron`` list when
	        ``area`` names that area, and both lists otherwise.
	        """
	        if area == "Terminal Area":
	            return {"terminal": list(self.TERMINAL_CATEGORIES.keys())}
	        if area == "Apron Area":
	            return {"apron": list(self.APRON_CATEGORIES.keys())}
	        return {
	            "terminal": list(self.TERMINAL_CATEGORIES.keys()),
	            "apron": list(self.APRON_CATEGORIES.keys()),
	        }


	# Lazily created, process-wide classifier instance (None until first use).
	_subcategory_classifier: Optional[SubcategoryClassifier] = None


	def get_subcategory_classifier() -> SubcategoryClassifier:
	    """Return the shared SubcategoryClassifier, creating it on first call."""
	    global _subcategory_classifier
	    instance = _subcategory_classifier
	    if instance is None:
	        # First call: build the classifier once and remember it.
	        instance = _subcategory_classifier = SubcategoryClassifier()
	    return instance