# gapura-ai-api / data / advanced_ner_service.py
# Muhammad Ridzki Nugraha
# Upload folder using huggingface_hub
# 13c3f2c verified
"""
Advanced Named Entity Recognition Service for Gapura AI
Extracts specific entities from irregularity reports
"""
import os
import logging
import re
from typing import List, Dict, Any, Optional, Tuple
from collections import Counter
# Module-level logger, named after this module so log records can be filtered by origin.
logger = logging.getLogger(__name__)
class AdvancedNER:
    """
    Rule-based Named Entity Recognition for airport irregularity reports.

    Every entity type is backed by a list of ``(regex, label)`` pairs. All
    patterns are applied with ``re.IGNORECASE``; patterns that must stay
    case-sensitive (airline / airport codes) opt out locally with a scoped
    ``(?-i:...)`` group.

    Entity Types:
    - DAMAGE_TYPE: dented, torn, wet, broken, damaged
    - QUANTITY: 5 pcs, 2 bags, 10 kg
    - LOCATION: gate, warehouse, aircraft, apron
    - TIME: STD, ETA, delay duration
    - FLIGHT_INFO: flight number, route
    - EQUIPMENT: ULD, forklift, belt loader
    - CARGO_TYPE: DG, perishable, fragile
    - PERSONNEL: PIC, staff roles
    """

    # Confidence attached to every regex match (rule-based, so it is constant).
    DEFAULT_CONFIDENCE = 0.85

    DAMAGE_PATTERNS = {
        "DAMAGE_TYPE": [
            (r"\b(dented|dent|dents)\b", "dented"),
            (r"\b(torn|tear|tears|robek)\b", "torn"),
            (r"\b(wet|basah|lembab|moisture)\b", "wet"),
            (r"\b(broken|break|broke|pecah)\b", "broken"),
            (r"\b(damaged|damage|kerusakan|rusak)\b", "damaged"),
            (r"\b(crushed|crush|penyet)\b", "crushed"),
            (r"\b(scratched|scratch|gores|tergores)\b", "scratched"),
            (r"\b(missing|lost|hilang|kehabisan)\b", "missing"),
            (r"\b(soiled|dirty|kotor|stain)\b", "soiled"),
            (r"\b(pilfered|pencurian|stolen)\b", "pilfered"),
        ],
    }

    # Every unit is closed with ``\b``. Without it the first alternative that
    # matches a prefix wins ("5 bagasi" -> "5 bag", "3 pallets" -> "3 pallet")
    # and the longer alternatives can never match. "bags" and "baggage" are
    # kept disjoint so "2 bags" is reported once instead of twice.
    QUANTITY_PATTERNS = {
        "QUANTITY": [
            (r"(\d+)\s*(pcs?|pieces?)\b", "pieces"),
            (r"(\d+)\s*(bags?|bagasi)\b", "bags"),
            (r"(\d+)\s*(kgs?|kilograms?)\b", "kg"),
            (r"(\d+)\s*(ulds?|containers?)\b", "uld"),
            (r"(\d+)\s*(pallets?)\b", "pallets"),
            (r"(\d+)\s*(passengers?|pax|penumpang)\b", "passengers"),
            (r"(\d+)\s*(baggage)\b", "baggage"),
            (r"(\d+)\s*(hours?|jam)\b", "hours"),
            (r"(\d+)\s*(minutes?|menit)\b", "minutes"),
        ],
    }

    LOCATION_PATTERNS = {
        "LOCATION": [
            (r"\b(gate\s*[A-Z0-9]+|gate\s+\d+)", "gate"),
            (r"\b(warehouse|gudang|WH)\b", "warehouse"),
            (r"\b(aircraft|pesawat|plane)\b", "aircraft"),
            (r"\b(apron|apron area)\b", "apron"),
            (r"\b(terminal\s*[0-9]?|T[0-9])\b", "terminal"),
            (r"\b(baggage\s*(area|claim|hall)?)\b", "baggage_area"),
            (r"\b(check[- ]?in)\b", "check_in"),
            (r"\b(boarding\s*(gate|area)?)\b", "boarding"),
            (r"\b(cargo\s*(area|terminal|warehouse)?)\b", "cargo_area"),
            (r"\b(ramp|lapangan)\b", "ramp"),
        ],
    }

    TIME_PATTERNS = {
        "TIME": [
            (r"\b(STD|ETD|ATD)[:\s]*(\d{1,2}[:\.]?\d{2})", "std"),
            (r"\b(ETA|STA|ATA)[:\s]*(\d{1,2}[:\.]?\d{2})", "eta"),
            (
                r"\b(delay|terlambat)\s*(\d+)\s*(hours?|jam|minutes?|menit)",
                "delay_duration",
            ),
            (r"\b(\d{1,2}[:\.]\d{2})\s*(LT|UTC|WIB|WITA|WIT)", "time_with_zone"),
            (r"\b(arrival|keberangkatan)\b", "arrival_departure"),
        ],
    }

    # The bare flight-number and route patterns require UPPERCASE letters via
    # ``(?-i:...)``. Under plain IGNORECASE any two-letter word followed by a
    # number ("at 10", "on 2023") was reported as a flight number and any
    # "abc/def" word pair ("him/her") as a route. The keyword-anchored
    # "flight ..." pattern stays case-insensitive.
    FLIGHT_PATTERNS = {
        "FLIGHT_INFO": [
            (r"\b((?-i:[A-Z]{2}))\s*(\d{2,4})\b", "flight_number"),
            (r"\b(flight\s*[A-Z]{2}\s*\d{2,4})\b", "flight"),
            (r"\b((?-i:[A-Z]{3}))[-/]((?-i:[A-Z]{3}))\b", "route"),
            (r"\b(CGK|DPS|SUB|UPG|KNO|YIA|BKS|MDC|TKG|PKU)\b", "airport_code"),
        ],
    }

    EQUIPMENT_PATTERNS = {
        "EQUIPMENT": [
            (r"\b(ULD|Unit Load Device)\b", "uld"),
            (r"\b(forklift|fork lift)\b", "forklift"),
            (r"\b(belt\s*loader|beltloader)\b", "belt_loader"),
            (r"\b(pallet\s*loader|palletloader)\b", "pallet_loader"),
            (r"\b(tow\s*tractor|towtractor)\b", "tow_tractor"),
            (r"\b(stairs|air stairs|airstairs)\b", "stairs"),
            (r"\b(container|kontainer)\b", "container"),
            (r"\b(pallet)\b", "pallet"),
            (r"\b(chute)\b", "chute"),
            (r"\b(conveyor)\b", "conveyor"),
        ],
    }

    CARGO_PATTERNS = {
        "CARGO_TYPE": [
            (
                r"\b(DG|Dangerous Goods|dangerous\s*goods|barang\s*berbahaya)\b",
                "dangerous_goods",
            ),
            (r"\b(perishable|fresh|frozen|segar)\b", "perishable"),
            (r"\b(fragile|breakable|mudah\s*pecah)\b", "fragile"),
            (r"\b(live\s*animals?|AVI|hewan)\b", "live_animals"),
            (r"\b(valuable|berharga)\b", "valuable"),
            (r"\b(pharmaceutical|pharma|farmasi)\b", "pharmaceutical"),
            (r"\b(human\s*remains|AHU)\b", "human_remains"),
            (r"\b(mail|pos)\b", "mail"),
        ],
    }

    PERSONNEL_PATTERNS = {
        "PERSONNEL": [
            (r"\b(PIC|Person In Charge)\b", "pic"),
            (r"\b(Load\s*Master|LM)\b", "load_master"),
            (r"\b(Ground\s*Handler|GH)\b", "ground_handler"),
            (r"\b(Supervisor|Spv)\b", "supervisor"),
            (r"\b(Staff|Petugas|Officer)\b", "staff"),
            (r"\b(Crew|Cabin\s*Crew)\b", "crew"),
            (r"\b(Engineer|Teknisi)\b", "engineer"),
        ],
    }

    # How each entity type is folded into the summary:
    #   "unique_label"    -> append the label once (no duplicates)
    #   "text"            -> append the matched text
    #   "text_with_label" -> append "<matched text> (<label>)"
    _SUMMARY_ROUTES = {
        "DAMAGE_TYPE": ("damages", "unique_label"),
        "QUANTITY": ("quantities", "text_with_label"),
        "LOCATION": ("locations", "unique_label"),
        "TIME": ("times", "text"),
        "FLIGHT_INFO": ("flight_info", "text_with_label"),
        "EQUIPMENT": ("equipment", "unique_label"),
        "CARGO_TYPE": ("cargo_types", "unique_label"),
        "PERSONNEL": ("personnel", "unique_label"),
    }

    def __init__(self):
        """Merge the per-category pattern tables into ``self.all_patterns``."""
        self.all_patterns = {
            **self.DAMAGE_PATTERNS,
            **self.QUANTITY_PATTERNS,
            **self.LOCATION_PATTERNS,
            **self.TIME_PATTERNS,
            **self.FLIGHT_PATTERNS,
            **self.EQUIPMENT_PATTERNS,
            **self.CARGO_PATTERNS,
            **self.PERSONNEL_PATTERNS,
        }

    def extract(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract all entities from text.

        Args:
            text: Report text. Empty, ``None`` or non-string input yields ``{}``.

        Returns:
            Dict keyed by entity type. Each value is a list of dicts with the
            keys ``text``, ``label``, ``start``, ``end`` and ``confidence``.
            Within a type, matches are ordered by pattern, then by position,
            and de-duplicated on (lower-cased text, label); the first
            occurrence is kept. Types without any match are omitted.
        """
        # Non-string values (e.g. NaN from a data frame) cannot be scanned by
        # ``re`` and are treated like an empty report instead of raising.
        if not text or not isinstance(text, str):
            return {}

        entities: Dict[str, List[Dict[str, Any]]] = {}
        for entity_type, patterns in self.all_patterns.items():
            seen = set()
            unique = []
            for pattern, label in patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    matched = match.group(0)
                    key = (matched.lower(), label)
                    if key in seen:
                        continue
                    seen.add(key)
                    unique.append(
                        {
                            "text": matched,
                            "label": label,
                            "start": match.start(),
                            "end": match.end(),
                            "confidence": self.DEFAULT_CONFIDENCE,
                        }
                    )
            if unique:
                entities[entity_type] = unique
        return entities

    def _summarize_entities(
        self, entities: Dict[str, List[Dict[str, Any]]]
    ) -> Dict[str, Any]:
        """Fold the output of ``extract`` into the flat summary structure."""
        summary: Dict[str, Any] = {
            "damages": [],
            "quantities": [],
            "locations": [],
            "times": [],
            "flight_info": [],
            "equipment": [],
            "cargo_types": [],
            "personnel": [],
            "entity_count": 0,
        }
        for entity_type, entity_list in entities.items():
            route = self._SUMMARY_ROUTES.get(entity_type)
            for entity in entity_list:
                if route is not None:
                    bucket_name, mode = route
                    bucket = summary[bucket_name]
                    label = entity["label"]
                    if mode == "unique_label":
                        if label not in bucket:
                            bucket.append(label)
                    elif mode == "text":
                        bucket.append(entity["text"])
                    else:  # "text_with_label"
                        bucket.append(f"{entity['text']} ({label})")
                # Every entity counts, including ones of an unrouted type.
                summary["entity_count"] += 1
        return summary

    def extract_summary(self, text: str) -> Dict[str, Any]:
        """
        Extract entity summary from text.

        Args:
            text: Report text

        Returns:
            Dict with the list-valued keys ``damages``, ``quantities``,
            ``locations``, ``times``, ``flight_info``, ``equipment``,
            ``cargo_types`` and ``personnel`` plus the integer
            ``entity_count`` (total number of extracted entities).
        """
        return self._summarize_entities(self.extract(text))

    @staticmethod
    def _join_fields(record: Dict, fields: Tuple[str, ...]) -> str:
        """Join the given record fields with single spaces; ``None`` -> ''."""
        parts = []
        for field in fields:
            value = record.get(field)
            # A missing/None field must not leak the literal text "None"
            # into the string that is scanned for entities.
            parts.append("" if value is None else str(value))
        return " ".join(parts)

    def extract_batch(self, records: List[Dict]) -> List[Dict[str, Any]]:
        """
        Extract entities from multiple records.

        The ``Report``, ``Root_Caused`` and ``Action_Taken`` fields of each
        record are joined with spaces and scanned once.

        Returns:
            One ``{"entities": ..., "summary": ...}`` dict per record, in
            input order.
        """
        results = []
        for record in records:
            text = self._join_fields(
                record, ("Report", "Root_Caused", "Action_Taken")
            )
            # Run the regex pass once and derive the summary from its result.
            entities = self.extract(text)
            results.append(
                {
                    "entities": entities,
                    "summary": self._summarize_entities(entities),
                }
            )
        return results

    def get_damage_statistics(self, records: List[Dict]) -> Dict[str, int]:
        """
        Get statistics on damage types.

        Only the ``Report`` field is scanned. Returns a dict mapping damage
        label to the number of extracted entities, most frequent first.
        """
        damage_counts: Counter = Counter()
        for record in records:
            entities = self.extract(record.get("Report", ""))
            damage_counts.update(
                entity["label"] for entity in entities.get("DAMAGE_TYPE", [])
            )
        return dict(damage_counts.most_common())

    def get_equipment_statistics(self, records: List[Dict]) -> Dict[str, int]:
        """
        Get statistics on equipment mentioned.

        The ``Report`` and ``Root_Caused`` fields are scanned together.
        Returns a dict mapping equipment label to the number of extracted
        entities, most frequent first.
        """
        equipment_counts: Counter = Counter()
        for record in records:
            text = self._join_fields(record, ("Report", "Root_Caused"))
            entities = self.extract(text)
            equipment_counts.update(
                entity["label"] for entity in entities.get("EQUIPMENT", [])
            )
        return dict(equipment_counts.most_common())
# Process-wide cached instance; created on the first call to get_advanced_ner().
_advanced_ner: Optional[AdvancedNER] = None


def get_advanced_ner() -> AdvancedNER:
    """Return the shared AdvancedNER instance, creating it on first use."""
    global _advanced_ner
    instance = _advanced_ner
    if instance is None:
        instance = AdvancedNER()
        _advanced_ner = instance
    return instance