Muhammad Ridzki Nugraha committed on
Commit c5af9d3 · verified
1 Parent(s): 1a8dcfc

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ api/__pycache__/main.cpython-314.pyc filter=lfs diff=lfs merge=lfs -text
api/__pycache__/main.cpython-311.pyc ADDED
Binary file (90.2 kB).
 
api/__pycache__/main.cpython-314.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53709ecd5752a6badcc55a0acd9d0e272484ff29709f03772a45619047fff8f0
+ size 126576
api/main.py ADDED
@@ -0,0 +1,2713 @@
+ """
+ Gapura AI Analysis API
+ FastAPI server for regression and NLP analysis of irregularity reports.
+ Uses the real trained models from ai-model/models/.
+ """
+
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, Request, Body
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.middleware.gzip import GZipMiddleware
+ from fastapi.responses import JSONResponse
+ from pydantic import BaseModel, Field, field_validator
+ from pydantic_core import ValidationError
+ from typing import List, Optional, Dict, Any, Tuple
+ from collections import Counter
+ import os
+ import json
+ import logging
+ from datetime import datetime
+ import numpy as np
+ import pickle
+ import pandas as pd
+ import sys
+
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ from data.cache_service import get_cache, CacheService
+ from data.nlp_service import NLPModelService
+ from data.shap_service import get_shap_explainer
+ from data.anomaly_service import get_anomaly_detector
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ tags_metadata = [
+     {
+         "name": "Analysis",
+         "description": "Core AI analysis endpoints for irregularity reports.",
+     },
+     {
+         "name": "Health",
+         "description": "System health and model status checks.",
+     },
+     {
+         "name": "Jobs",
+         "description": "Asynchronous job management.",
+     },
+     {
+         "name": "Training",
+         "description": "Model retraining and lifecycle management.",
+     },
+ ]
+
+ app = FastAPI(
+     title="Gapura AI Analysis API",
+     description="""
+ The Gapura AI Analysis API provides advanced machine-learning capabilities for analyzing irregularity reports.
+
+ ## Features
+
+ * **Regression Analysis**: Predict resolution time (days) based on report details.
+ * **NLP Classification**: Determine severity (Critical, High, Medium, Low) and categorize issues.
+ * **Entity Extraction**: Extract key entities such as Airlines, Flight Numbers, and Dates.
+ * **Summarization**: Generate executive summaries and key points from long reports.
+ * **Trend Analysis**: Analyze trends by Airline, Hub, and Category.
+ * **Anomaly Detection**: Identify unusual patterns in resolution times.
+
+ ## Models
+
+ * **Regression**: Random Forest Regressor (v1.0.0-trained)
+ * **NLP**: Hybrid Transformer + Rule-based System (v4.0.0-onnx)
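+
+ ## Example
+
+ An illustrative request body for `POST /api/ai/analyze` (field names follow the `IrregularityReport` schema; the values are placeholders):
+
+     {"data": [{"Report": "Baggage torn on arrival", "Airlines": "Garuda", "HUB": "CGK"}], "options": {"classifySeverity": true, "predictResolutionTime": true}}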
+ """,
+     version="2.1.0",
+     openapi_tags=tags_metadata,
+     docs_url="/docs",
+     redoc_url="/redoc",
+ )
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ app.add_middleware(GZipMiddleware, minimum_size=500)
+
+
+ @app.exception_handler(ValidationError)
+ async def validation_exception_handler(request: Request, exc: ValidationError):
+     return JSONResponse(
+         status_code=422,
+         content={
+             "detail": "Validation error",
+             "errors": exc.errors(),
+             "body": exc.json(),
+         },
+     )
+
+
+ # ============== Pydantic Models ==============
+
+ from enum import Enum
+ from datetime import date as date_type
+
+
+ class ReportCategoryEnum(str, Enum):
+     IRREGULARITY = "Irregularity"
+     COMPLAINT = "Complaint"
+
+
+ class AreaEnum(str, Enum):
+     APRON = "Apron Area"
+     TERMINAL = "Terminal Area"
+     GENERAL = "General"
+
+
+ class StatusEnum(str, Enum):
+     OPEN = "Open"
+     CLOSED = "Closed"
+     IN_PROGRESS = "In Progress"
+
+
+ class IrregularityReport(BaseModel):
+     Date_of_Event: Optional[str] = Field(None, description="Date of the event")
+     Airlines: Optional[str] = Field(None, max_length=100)
+     Flight_Number: Optional[str] = Field(None, max_length=20)
+     Branch: Optional[str] = Field(None, max_length=10)
+     HUB: Optional[str] = Field(None, max_length=20)
+     Route: Optional[str] = Field(None, max_length=50)
+     Report_Category: Optional[str] = Field(None, max_length=50)
+     Irregularity_Complain_Category: Optional[str] = Field(None, max_length=100)
+     Report: Optional[str] = Field(None, max_length=2000)
+     Root_Caused: Optional[str] = Field(None, max_length=2000)
+     Action_Taken: Optional[str] = Field(None, max_length=2000)
+     Area: Optional[str] = Field(None, max_length=50)
+     Status: Optional[str] = Field(None, max_length=50)
+     Reported_By: Optional[str] = Field(None, max_length=100)
+     Upload_Irregularity_Photo: Optional[str] = Field(None)
+
+     model_config = {"extra": "allow"}
+
+
+ class AnalysisOptions(BaseModel):
+     predictResolutionTime: bool = Field(
+         default=True, description="Run regression model"
+     )
+     classifySeverity: bool = Field(
+         default=True, description="Classify severity using NLP"
+     )
+     extractEntities: bool = Field(
+         default=True, description="Extract entities using NER"
+     )
+     generateSummary: bool = Field(default=True, description="Generate text summaries")
+     analyzeTrends: bool = Field(default=True, description="Analyze trends")
+     bypassCache: bool = Field(
+         default=False, description="Bypass cache and fetch fresh data"
+     )
+     includeRisk: bool = Field(default=False, description="Include risk assessment in analysis")
+
+
+ class AnalysisRequest(BaseModel):
+     sheetId: Optional[str] = Field(None, description="Google Sheet ID")
+     sheetName: Optional[str] = Field(None, description="Sheet name (NON CARGO or CGO)")
+     rowRange: Optional[str] = Field(None, description="Row range (e.g., A2:Z100)")
+     data: Optional[List[IrregularityReport]] = Field(
+         None, description="Direct data upload"
+     )
+     options: AnalysisOptions = Field(default_factory=AnalysisOptions)
+
+     @field_validator("data")
+     @classmethod
+     def validate_data(cls, v):
+         if v is not None and len(v) == 0:
+             raise ValueError("data array cannot be empty")
+         return v
+
+
+ class ShapExplanation(BaseModel):
+     baseValue: float = Field(description="Base/expected value from model")
+     predictionExplained: bool = Field(
+         description="Whether SHAP explanation is available"
+     )
+     topFactors: List[Dict[str, Any]] = Field(
+         default_factory=list, description="Top contributing features"
+     )
+     explanation: str = Field(default="", description="Human-readable explanation")
+
+
+ class AnomalyResult(BaseModel):
+     isAnomaly: bool = Field(description="Whether prediction is anomalous")
+     anomalyScore: float = Field(description="Anomaly score (0-1)")
+     anomalies: List[Dict[str, Any]] = Field(
+         default_factory=list, description="List of detected anomalies"
+     )
+
+
+ class RegressionPrediction(BaseModel):
+     reportId: str
+     predictedDays: float
+     confidenceInterval: Tuple[float, float]
+     featureImportance: Dict[str, float]
+     hasUnknownCategories: bool = Field(
+         default=False, description="True if unknown categories were used in prediction"
+     )
+     shapExplanation: Optional[ShapExplanation] = Field(
+         default=None, description="SHAP-based explanation for prediction"
+     )
+     anomalyDetection: Optional[AnomalyResult] = Field(
+         default=None, description="Anomaly detection results"
+     )
+
+
+ class RegressionResult(BaseModel):
+     predictions: List[RegressionPrediction]
+     modelMetrics: Dict[str, Any]
+
+
+ class ClassificationResult(BaseModel):
+     reportId: str
+     severity: str
+     severityConfidence: float
+     areaType: str
+     issueType: str
+     issueTypeConfidence: float
+
+
+ class Entity(BaseModel):
+     text: str
+     label: str
+     start: int
+     end: int
+     confidence: float
+
+
+ class EntityResult(BaseModel):
+     reportId: str
+     entities: List[Entity]
+
+
+ class SummaryResult(BaseModel):
+     reportId: str
+     executiveSummary: str
+     keyPoints: List[str]
+
+
+ class SentimentResult(BaseModel):
+     reportId: str
+     urgencyScore: float
+     sentiment: str
+     keywords: List[str]
+
+
+ class NLPResult(BaseModel):
+     classifications: List[ClassificationResult]
+     entities: List[EntityResult]
+     summaries: List[SummaryResult]
+     sentiment: List[SentimentResult]
+
+
+ class TrendData(BaseModel):
+     count: int
+     avgResolutionDays: Optional[float]
+     topIssues: List[str]
+
+
+ class TrendResult(BaseModel):
+     byAirline: Dict[str, TrendData]
+     byHub: Dict[str, TrendData]
+     byCategory: Dict[str, Dict[str, Any]]
+     timeSeries: List[Dict[str, Any]]
+
+
+ class Metadata(BaseModel):
+     totalRecords: int
+     processingTime: float
+     modelVersions: Dict[str, str]
+
+
+ class AnalysisResponse(BaseModel):
+     regression: Optional[RegressionResult] = None
+     nlp: Optional[NLPResult] = None
+     trends: Optional[TrendResult] = None
+     # RiskAssessmentResponse is defined below, so it must be referenced by name
+     # here and resolved via model_rebuild() once it exists.
+     risk: Optional["RiskAssessmentResponse"] = None
+     metadata: Metadata
+
+
+ class RiskItem(BaseModel):
+     reportId: str
+     severity: str
+     severityConfidence: float
+     predictedDays: float
+     anomalyScore: float
+     category: str
+     hub: str
+     area: str
+     riskScore: float
+     priority: str
+     recommendedActions: List[Dict[str, Any]] = Field(default_factory=list)
+     preventiveSuggestions: List[str] = Field(default_factory=list)
+
+
+ class RiskAssessmentResponse(BaseModel):
+     items: List[RiskItem]
+     topPatterns: List[Dict[str, Any]]
+     metadata: Dict[str, Any]
+
+
+ AnalysisResponse.model_rebuild()
+
+
+ def _severity_to_score(level: str) -> float:
+     m = {"Critical": 1.0, "High": 0.8, "Medium": 0.5, "Low": 0.2}
+     return m.get(level, 0.3)
+
+
+ def _normalize_days(d: float) -> float:
+     return max(0.0, min(1.0, float(d) / 7.0))
+
+
+ def _priority_from_score(s: float) -> str:
+     if s >= 0.75:
+         return "HIGH"
+     if s >= 0.45:
+         return "MEDIUM"
+     return "LOW"
+
+
+ def _extract_prevention(texts: List[str]) -> List[str]:
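+     # Bilingual (English/Indonesian) keyword screen for prevention-oriented actions:
+     # "prosedur" = procedure, "pelatihan" = training, "pencegahan" = prevention,
+     # "inspeksi" = inspection, "verifikasi" = verification.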
+     kws = ["review", "prosedur", "procedure", "training", "pelatihan", "prevent", "pencegahan", "maintenance", "inspection", "inspeksi", "briefing", "supervision", "checklist", "verify", "verifikasi"]
+     out = []
+     seen = set()
+     for t in texts:
+         lt = t.lower()
+         for k in kws:
+             if k in lt:
+                 if t not in seen:
+                     seen.add(t)
+                     out.append(t)
+     return out[:5]
+
+
+ # ============== Real Model Service ==============
+
+
+ class ModelService:
+     """Service that loads and uses the real trained models"""
+
+     def __init__(self):
+         self.regression_version = "1.0.0-trained"
+         self.nlp_version = "1.0.0-mock"
+         self.regression_model = None
+         self.regression_onnx_session = None
+         self.label_encoders = {}
+         self.scaler = None
+         self.feature_names = []
+         self.model_metrics = {}
+         self.model_loaded = False
+         self.nlp_service = None
+
+         self._load_regression_model()
+         self._load_nlp_service()
+
+     def _load_nlp_service(self):
+         """Load the NLP service with trained models, or fall back"""
+         try:
+             from data.nlp_service import get_nlp_service
+             self.nlp_service = get_nlp_service()
+             self.nlp_version = self.nlp_service.version
+             logger.info(f"NLP service loaded (version: {self.nlp_version})")
+         except Exception as e:
+             logger.warning(f"Failed to load NLP service: {e}")
+
+     def _load_regression_model(self):
+         """Load the trained regression model from file"""
+         try:
+             model_path = os.path.join(
+                 os.path.dirname(__file__),
+                 "..",
+                 "models",
+                 "regression",
+                 "resolution_predictor_latest.pkl",
+             )
+
+             if not os.path.exists(model_path):
+                 logger.warning(f"Model file not found at {model_path}")
+                 return
+
+             logger.info(f"Loading regression model from {model_path}")
+
+             with open(model_path, "rb") as f:
+                 model_data = pickle.load(f)
+
+             self.regression_model = model_data.get("model")
+             self.label_encoders = model_data.get("label_encoders", {})
+             self.scaler = model_data.get("scaler")
+             self.feature_names = model_data.get("feature_names", [])
+             self.model_metrics = model_data.get("metrics", {})
+             self.model_loaded = True
+
+             logger.info("✓ Regression model loaded successfully")
+             logger.info(f"  - Features: {len(self.feature_names)}")
+             logger.info(f"  - Metrics: MAE={self.model_metrics.get('test_mae', 'N/A')}")
+
+             # Try to load the ONNX model for faster inference
+             onnx_path = os.path.join(
+                 os.path.dirname(__file__),
+                 "..",
+                 "models",
+                 "regression",
+                 "resolution_predictor.onnx",
+             )
+             if os.path.exists(onnx_path):
+                 try:
+                     import onnxruntime as ort
+                     sess_options = ort.SessionOptions()
+                     sess_options.intra_op_num_threads = 1
+                     sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+                     self.regression_onnx_session = ort.InferenceSession(onnx_path, sess_options)
+                     logger.info("✓ Regression ONNX model loaded successfully")
+                 except Exception as e:
+                     logger.warning(f"Failed to load Regression ONNX model: {e}")
+
+         except Exception as e:
+             logger.error(f"Failed to load regression model: {e}")
+             self.model_loaded = False
+
+     def _extract_features(self, report: Dict) -> Tuple[Optional[np.ndarray], bool]:
+         """Extract features from a single report, matching the training preprocessing"""
+         try:
+             # Parse the date (fall back to "now" as a pandas Timestamp so the
+             # .dayofweek/.dayofyear accessors below work)
+             date_str = report.get("Date_of_Event", "")
+             try:
+                 date_obj = pd.to_datetime(date_str, errors="coerce")
+                 if pd.isna(date_obj):
+                     date_obj = pd.Timestamp.now()
+                 day_of_week = date_obj.dayofweek
+                 month = date_obj.month
+                 is_weekend = day_of_week in [5, 6]
+                 week_of_year = date_obj.isocalendar().week
+                 day_of_year = date_obj.dayofyear
+             except Exception:
+                 day_of_week = 0
+                 month = 1
+                 is_weekend = False
+                 week_of_year = 1
+                 day_of_year = 1
+
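+             # Cyclical calendar encoding: project day/month/day-of-year onto the
+             # unit circle so boundary-adjacent values (Sunday/Monday, December/January)
+             # stay close in feature space.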
+             sin_day_of_week = np.sin(2 * np.pi * day_of_week / 7)
+             cos_day_of_week = np.cos(2 * np.pi * day_of_week / 7)
+             sin_month = np.sin(2 * np.pi * month / 12)
+             cos_month = np.cos(2 * np.pi * month / 12)
+             sin_day_of_year = np.sin(2 * np.pi * day_of_year / 365)
+             cos_day_of_year = np.cos(2 * np.pi * day_of_year / 365)
+
+             # Text features
+             report_text = report.get("Report", "")
+             root_cause = report.get("Root_Caused", "")
+             action_taken = report.get("Action_Taken", "")
+
+             # Categorical features
+             airline = report.get("Airlines", "Unknown")
+             hub = report.get("HUB", "Unknown")
+             branch = report.get("Branch", "Unknown")
+             category = report.get("Irregularity_Complain_Category", "Unknown")
+             area = report.get("Area", "Unknown")
+
+             # Binary features
+             has_photos = bool(report.get("Upload_Irregularity_Photo", ""))
+             is_complaint = report.get("Report_Category", "") == "Complaint"
+
+             # Encode categorical features
+             categorical_values = {
+                 "airline": airline,
+                 "hub": hub,
+                 "branch": branch,
+                 "category": category,
+                 "area": area,
+             }
+
+             encoded_values = {}
+             unknown_flags = {}
+             for col, value in categorical_values.items():
+                 if col in self.label_encoders:
+                     le = self.label_encoders[col]
+                     value_str = str(value)
+                     if value_str in le.classes_:
+                         encoded_values[f"{col}_encoded"] = le.transform([value_str])[0]
+                         unknown_flags[col] = False
+                     else:
+                         unknown_idx = (
+                             le.transform(["Unknown"])[0]
+                             if "Unknown" in le.classes_
+                             else 0
+                         )
+                         encoded_values[f"{col}_encoded"] = unknown_idx
+                         unknown_flags[col] = True
+                         logger.warning(
+                             f"Unknown {col} value: '{value_str}' - using the Unknown category"
+                         )
+                 else:
+                     encoded_values[f"{col}_encoded"] = 0
+                     unknown_flags[col] = True
+
+             # Build the feature vector
+             feature_dict = {
+                 "day_of_week": day_of_week,
+                 "month": month,
+                 "is_weekend": int(is_weekend),
+                 "week_of_year": week_of_year,
+                 "sin_day_of_week": sin_day_of_week,
+                 "cos_day_of_week": cos_day_of_week,
+                 "sin_month": sin_month,
+                 "cos_month": cos_month,
+                 "sin_day_of_year": sin_day_of_year,
+                 "cos_day_of_year": cos_day_of_year,
+                 "report_length": len(report_text),
+                 "report_word_count": len(report_text.split()) if report_text else 0,
+                 "root_cause_length": len(root_cause),
+                 "action_taken_length": len(action_taken),
+                 "has_photos": int(has_photos),
+                 "is_complaint": int(is_complaint),
+                 "text_complexity": (len(report_text) * len(report_text.split()) / 100)
+                 if report_text
+                 else 0,
+                 "has_root_cause": int(bool(root_cause)),
+                 "has_action_taken": int(bool(action_taken)),
+             }
+             feature_dict.update(encoded_values)
+
+             has_unknown_categories = any(unknown_flags.values())
+
+             # Create the feature array in the training order
+             features = []
+             for feature_name in self.feature_names:
+                 features.append(feature_dict.get(feature_name, 0))
+
+             X = np.array([features])
+
+             # Scale features
+             if self.scaler:
+                 X = self.scaler.transform(X)
+
+             return X, has_unknown_categories
+
+         except Exception as e:
+             logger.error(f"Feature extraction error: {e}")
+             return None, True
+
+     def _extract_features_batch(self, df: pd.DataFrame) -> Tuple[Optional[np.ndarray], np.ndarray]:
+         """Extract features from a dataframe, matching the training preprocessing (batch-optimized)"""
+         try:
+             # Copy first so the caller's dataframe is not mutated
+             df = df.copy()
+
+             # Ensure the required columns exist
+             required_cols = [
+                 "Date_of_Event", "Report", "Root_Caused", "Action_Taken",
+                 "Upload_Irregularity_Photo", "Report_Category",
+                 "Airlines", "HUB", "Branch", "Irregularity_Complain_Category", "Area"
+             ]
+             for col in required_cols:
+                 if col not in df.columns:
+                     df[col] = None
+
+             # Parse dates
+             df["Date_of_Event"] = pd.to_datetime(df["Date_of_Event"], errors="coerce")
+             now = datetime.now()
+             df["Date_of_Event"] = df["Date_of_Event"].fillna(now)
+
+             df["day_of_week"] = df["Date_of_Event"].dt.dayofweek
+             df["month"] = df["Date_of_Event"].dt.month
+             df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)
+             df["week_of_year"] = df["Date_of_Event"].dt.isocalendar().week.astype(int)
+             df["day_of_year"] = df["Date_of_Event"].dt.dayofyear
+
+             # Sin/cos transforms
+             df["sin_day_of_week"] = np.sin(2 * np.pi * df["day_of_week"] / 7)
+             df["cos_day_of_week"] = np.cos(2 * np.pi * df["day_of_week"] / 7)
+             df["sin_month"] = np.sin(2 * np.pi * df["month"] / 12)
+             df["cos_month"] = np.cos(2 * np.pi * df["month"] / 12)
+             df["sin_day_of_year"] = np.sin(2 * np.pi * df["day_of_year"] / 365)
+             df["cos_day_of_year"] = np.cos(2 * np.pi * df["day_of_year"] / 365)
+
+             # Text features
+             df["Report"] = df["Report"].fillna("").astype(str)
+             df["Root_Caused"] = df["Root_Caused"].fillna("").astype(str)
+             df["Action_Taken"] = df["Action_Taken"].fillna("").astype(str)
+
+             df["report_length"] = df["Report"].str.len()
+             df["report_word_count"] = df["Report"].apply(lambda x: len(x.split()) if x else 0)
+             df["root_cause_length"] = df["Root_Caused"].str.len()
+             df["action_taken_length"] = df["Action_Taken"].str.len()
+
+             df["has_photos"] = df["Upload_Irregularity_Photo"].fillna("").astype(bool).astype(int)
+             df["is_complaint"] = (df["Report_Category"] == "Complaint").astype(int)
+
+             df["text_complexity"] = np.where(
+                 df["Report"].str.len() > 0,
+                 (df["report_length"] * df["report_word_count"] / 100),
+                 0
+             )
+             df["has_root_cause"] = (df["Root_Caused"].str.len() > 0).astype(int)
+             df["has_action_taken"] = (df["Action_Taken"].str.len() > 0).astype(int)
+
+             # Categorical encoding
+             categorical_cols = {
+                 "airline": "Airlines",
+                 "hub": "HUB",
+                 "branch": "Branch",
+                 "category": "Irregularity_Complain_Category",
+                 "area": "Area"
+             }
+
+             unknown_flags = np.zeros(len(df), dtype=bool)
+
+             for feature_name, col_name in categorical_cols.items():
+                 df[col_name] = df[col_name].fillna("Unknown").astype(str)
+
+                 if feature_name in self.label_encoders:
+                     le = self.label_encoders[feature_name]
+                     # Build a dict for fast label-to-index lookup
+                     mapping = {label: idx for idx, label in enumerate(le.classes_)}
+                     unknown_idx = mapping.get("Unknown", 0)
+
+                     # Map values
+                     encoded_col = df[col_name].map(mapping)
+
+                     # Track unknowns (NaN after the map means unknown)
+                     is_unknown = encoded_col.isna()
+                     unknown_flags |= is_unknown.values
+
+                     # Fill unknowns
+                     df[f"{feature_name}_encoded"] = encoded_col.fillna(unknown_idx).astype(int)
+                 else:
+                     df[f"{feature_name}_encoded"] = 0
+                     unknown_flags[:] = True
+
+             # Select the features in the training order
+             for f in self.feature_names:
+                 if f not in df.columns:
+                     df[f] = 0
+
+             X = df[self.feature_names].values
+
+             # Scale
+             if self.scaler:
+                 X = self.scaler.transform(X)
+
+             return X, unknown_flags
+
+         except Exception as e:
+             logger.error(f"Batch feature extraction error: {e}")
+             return None, np.ones(len(df), dtype=bool)
+
+     def predict_regression(self, data: List[Dict]) -> List[RegressionPrediction]:
+         """Predict resolution time using the trained model"""
+         predictions = []
+         shap_explainer = get_shap_explainer()
+         anomaly_detector = get_anomaly_detector()
+
+         # Batch processing
+         try:
+             df = pd.DataFrame(data)
+             X_batch, unknown_flags_batch = self._extract_features_batch(df)
+
+             if X_batch is not None:
+                 if self.regression_onnx_session:
+                     # Use the ONNX model
+                     input_name = self.regression_onnx_session.get_inputs()[0].name
+                     predicted_batch = self.regression_onnx_session.run(None, {input_name: X_batch.astype(np.float32)})[0]
+                     predicted_batch = predicted_batch.ravel()  # Flatten to a 1D array
+                 elif self.regression_model is not None:
+                     # Use the pickled model
+                     predicted_batch = self.regression_model.predict(X_batch)
+                 else:
+                     predicted_batch = None
+                     unknown_flags_batch = [True] * len(data)
+             else:
+                 predicted_batch = None
+                 unknown_flags_batch = [True] * len(data)
+         except Exception as e:
+             logger.error(f"Batch prediction setup failed: {e}")
+             X_batch = None
+             predicted_batch = None
+             unknown_flags_batch = [True] * len(data)
+
+         for i, item in enumerate(data):
+             # Use the batch results
+             has_unknown = unknown_flags_batch[i]
+             features = X_batch[i:i+1] if X_batch is not None else None
+
+             category = item.get("Irregularity_Complain_Category", "Unknown")
+             hub = item.get("HUB", "Unknown")
+
+             if predicted_batch is not None:
+                 predicted = predicted_batch[i]
+                 mae = self.model_metrics.get("test_mae", 0.5)
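+                 # Heuristic interval: plus/minus the test-set MAE around the point
+                 # estimate; this is not a calibrated prediction interval.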
+                 lower = max(0.1, predicted - mae)
+                 upper = predicted + mae
+
+                 shap_exp = None
+                 if shap_explainer.explainer is not None and features is not None:
+                     try:
+                         shap_result = shap_explainer.explain_prediction(features)
+                         shap_exp = ShapExplanation(
+                             baseValue=shap_result.get("base_value", 0),
+                             predictionExplained=shap_result.get(
+                                 "prediction_explained", False
+                             ),
+                             topFactors=shap_result.get("top_factors", [])[:5],
+                             explanation=shap_result.get("explanation", ""),
+                         )
+                     except Exception as e:
+                         logger.debug(f"SHAP explanation failed: {e}")
+
+                 anomaly_result = None
+                 try:
+                     anomaly_data = anomaly_detector.detect_prediction_anomaly(
+                         predicted, category, hub
+                     )
+                     anomaly_result = AnomalyResult(
+                         isAnomaly=anomaly_data.get("is_anomaly", False),
+                         anomalyScore=anomaly_data.get("anomaly_score", 0),
+                         anomalies=anomaly_data.get("anomalies", []),
+                     )
+                 except Exception as e:
+                     logger.debug(f"Anomaly detection failed: {e}")
+             else:
+                 base_days = {
+                     "Cargo Problems": 2.5,
+                     "Pax Handling": 1.8,
+                     "GSE": 3.2,
+                     "Operation": 2.1,
+                     "Baggage Handling": 1.5,
+                 }.get(category, 2.0)
+                 predicted = base_days + np.random.normal(0, 0.3)
+                 lower = max(0.1, predicted - 0.5)
+                 upper = predicted + 0.5
+                 has_unknown = True
+                 shap_exp = None
+                 anomaly_result = None
+
+             if self.model_metrics and "feature_importance" in self.model_metrics:
+                 importance = self.model_metrics["feature_importance"]
+             else:
+                 importance = {
+                     "category": 0.35,
+                     "airline": 0.28,
+                     "hub": 0.15,
+                     "reportLength": 0.12,
+                     "hasPhotos": 0.10,
+                 }
+
+             predictions.append(
+                 RegressionPrediction(
+                     reportId=f"row_{i}",
+                     predictedDays=round(max(0.1, predicted), 2),
+                     confidenceInterval=(round(lower, 2), round(upper, 2)),
+                     featureImportance=importance,
+                     hasUnknownCategories=has_unknown,
+                     shapExplanation=shap_exp,
+                     anomalyDetection=anomaly_result,
+                 )
+             )
+
+         return predictions
+
+     def classify_text(self, data: List[Dict]) -> List[ClassificationResult]:
+         """Classify text using the trained NLP models or a rule-based fallback"""
+         results = []
+
+         texts = [
+             (item.get("Report") or "") + " " + (item.get("Root_Caused") or "")
+             for item in data
+         ]
+
+         # Get multi-task predictions if available
+         mt_results = None
+         if self.nlp_service:
+             mt_results = self.nlp_service.predict_multi_task(texts)
+             severity_results = self.nlp_service.classify_severity(texts)
+         else:
+             severity_results = self._classify_severity_fallback(texts)
+
+         for i, (item, sev_result) in enumerate(zip(data, severity_results)):
+             severity = sev_result.get("severity", "Low")
+             severity_conf = sev_result.get("confidence", 0.8)
+
+             # Use the multi-task predictions for area and issue type if available
+             if mt_results and i < len(mt_results):
+                 mt_res = mt_results[i]
+                 area = mt_res.get("area", {}).get("label", item.get("Area", "Unknown")).replace(" Area", "")
+                 area_conf = mt_res.get("area", {}).get("confidence", 0.85)
+                 issue = mt_res.get("irregularity", {}).get("label", item.get("Irregularity_Complain_Category", "Unknown"))
+                 issue_conf = mt_res.get("irregularity", {}).get("confidence", 0.85)
+             else:
+                 area = item.get("Area", "Unknown").replace(" Area", "")
+                 area_conf = 0.85
+                 issue = item.get("Irregularity_Complain_Category", "Unknown")
+                 issue_conf = 0.85
+
+             results.append(
+                 ClassificationResult(
+                     reportId=f"row_{i}",
+                     severity=severity,
+                     severityConfidence=severity_conf,
+                     areaType=area,
+                     issueType=issue,
+                     issueTypeConfidence=issue_conf,
+                 )
+             )
+         return results
+
+     def _classify_severity_fallback(self, texts: List[str]) -> List[Dict]:
+         """Fallback keyword-based severity classification"""
+         results = []
+         for text in texts:
+             report = text.lower()
+
+             if any(
+                 kw in report
+                 for kw in ["damage", "torn", "broken", "critical", "emergency"]
+             ):
+                 severity = "High"
+                 severity_conf = 0.89
+             elif any(kw in report for kw in ["delay", "late", "wrong", "error"]):
+                 severity = "Medium"
+                 severity_conf = 0.75
+             else:
+                 severity = "Low"
+                 severity_conf = 0.82
+
+             results.append({"severity": severity, "confidence": severity_conf})
+         return results
+
+     def extract_entities(self, data: List[Dict]) -> List[EntityResult]:
+         """Extract entities from reports"""
+         results = []
+         for i, item in enumerate(data):
+             entities = []
+             report_text = item.get("Report", "") + " " + item.get("Root_Caused", "")
+
+             # Extract the airline
+             airline = item.get("Airlines", "")
+             if airline and airline != "Unknown":
+                 # Find its position in the text
+                 idx = report_text.lower().find(airline.lower())
+                 start = max(0, idx) if idx >= 0 else 0
+                 entities.append(
+                     Entity(
+                         text=airline,
+                         label="AIRLINE",
+                         start=start,
+                         end=start + len(airline),
+                         confidence=0.95,
+                     )
+                 )
+
+             # Extract the flight number
+             flight = item.get("Flight_Number", "")
+             if flight and flight != "#N/A":
+                 entities.append(
+                     Entity(
+                         text=flight,
+                         label="FLIGHT_NUMBER",
+                         start=0,
+                         end=len(flight),
+                         confidence=0.92,
+                     )
+                 )
+
+             # Extract dates
+             date_str = item.get("Date_of_Event", "")
+             if date_str:
+                 entities.append(
+                     Entity(
+                         text=date_str,
+                         label="DATE",
+                         start=0,
+                         end=len(date_str),
+                         confidence=0.90,
+                     )
+                 )
+
+             results.append(EntityResult(reportId=f"row_{i}", entities=entities))
+         return results
+
+     def generate_summary(self, data: List[Dict]) -> List[SummaryResult]:
+         """Generate summaries using the NLP service or a template fallback"""
+         results = []
+         for i, item in enumerate(data):
+             combined_text = (
+                 item.get("Report", "")
+                 + " "
+                 + item.get("Root_Caused", "")
+                 + " "
+                 + item.get("Action_Taken", "")
+             )
+
+             if self.nlp_service and len(combined_text) > 100:
+                 summary_result = self.nlp_service.summarize(combined_text)
+                 executive_summary = summary_result.get("executiveSummary", "")
+                 key_points = summary_result.get("keyPoints", [])
+             else:
+                 category = item.get("Irregularity_Complain_Category", "Issue")
+                 report = item.get("Report", "")[:120]
+                 root_cause = item.get("Root_Caused", "")[:80]
+                 action = item.get("Action_Taken", "")[:80]
+
+                 executive_summary = f"{category}: {report}"
+                 if root_cause:
+                     executive_summary += f" Root cause: {root_cause}."
+
+                 key_points = [
+                     f"Category: {category}",
+                     f"Status: {item.get('Status', 'Unknown')}",
+                     f"Area: {item.get('Area', 'Unknown')}",
+                 ]
+
+                 if action:
+                     key_points.append(f"Action: {action[:50]}...")
+
+             results.append(
+                 SummaryResult(
+                     reportId=f"row_{i}",
+                     executiveSummary=executive_summary,
+                     keyPoints=key_points,
+                 )
+             )
+         return results
+
+     def analyze_sentiment(self, data: List[Dict]) -> List[SentimentResult]:
+         """Analyze sentiment/urgency using the NLP service or a keyword fallback"""
+         results = []
+
+         texts = [
+             item.get("Report", "") + " " + item.get("Root_Caused", "") for item in data
+         ]
+
+         if self.nlp_service:
+             urgency_results = self.nlp_service.analyze_urgency(texts)
+         else:
+             urgency_results = self._analyze_urgency_fallback(texts)
+
+         for i, (item, urg_result) in enumerate(zip(data, urgency_results)):
+             results.append(
+                 SentimentResult(
+                     reportId=f"row_{i}",
+                     urgencyScore=urg_result.get("urgency_score", 0.0),
+                     sentiment=urg_result.get("sentiment", "Neutral"),
+                     keywords=urg_result.get("keywords", []),
+                 )
+             )
+         return results
+
+     def _analyze_urgency_fallback(self, texts: List[str]) -> List[Dict]:
+         """Fallback urgency analysis"""
+         urgency_keywords = [
+             "damage",
+             "broken",
+             "emergency",
+             "critical",
+             "urgent",
+             "torn",
+             "severe",
+         ]
+
+         results = []
+         for text in texts:
+             report = text.lower()
+             keyword_matches = [kw for kw in urgency_keywords if kw in report]
+             urgency_count = len(keyword_matches)
+             urgency_score = min(1.0, urgency_count / 3.0)
+
+             results.append(
+                 {
+                     "urgency_score": round(urgency_score, 2),
+                     "sentiment": "Negative" if urgency_score > 0.3 else "Neutral",
+                     "keywords": keyword_matches,
+                 }
+             )
+         return results
+
+
+ # Initialize the model service
+ model_service = ModelService()
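+ # Note: instantiated at import time, so the models are loaded once per worker process.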
+
+ # ============== API Endpoints ==============
+
+
+ @app.get(
+     "/",
+     tags=["Health"],
+     summary="API Root & Status",
+ )
+ async def root():
+     """Returns basic API status, version, and model availability."""
+     return {
+         "status": "healthy",
+         "service": "Gapura AI Analysis API",
+         "version": "2.1.0",
+         "models": {
+             "regression": "loaded" if model_service.model_loaded else "unavailable",
+             "nlp": model_service.nlp_service.version if model_service.nlp_service and model_service.nlp_service.models_loaded else "unavailable",
+         },
+         "timestamp": datetime.now().isoformat(),
+     }
+
+
+ @app.get(
+     "/health",
+     tags=["Health"],
+     summary="Detailed Health Check",
+ )
+ async def health_check():
+     """
+     Returns detailed health status including:
+     - **Models**: Version and load status of the Regression and NLP models.
+     - **Cache**: Redis/local cache connectivity.
+     - **Metrics**: Current model performance metrics (MAE, RMSE, R2).
+     """
+     cache = get_cache()
+     cache_health = cache.health_check()
+
+     return {
+         "status": "healthy",
+         "models": {
+             "regression": {
+                 "version": model_service.regression_version,
+                 "loaded": model_service.model_loaded,
+                 "metrics": model_service.model_metrics
+                 if model_service.model_loaded
+                 else None,
+             },
+             "nlp": {
+                 "version": model_service.nlp_version,
+                 "status": "rule_based",
+             },
+         },
+         "cache": cache_health,
+         "timestamp": datetime.now().isoformat(),
+     }
+
+
+ @app.post("/api/ai/risk/assess", response_model=RiskAssessmentResponse, tags=["Analysis"])
+ async def assess_risk(
+     request: Optional[AnalysisRequest] = Body(None),
+     sheetId: Optional[str] = None,
+     sheetName: Optional[str] = None,
+     rowRange: Optional[str] = None,
+     bypass_cache: bool = False,
+     top_k_patterns: int = 5,
+ ):
+     from data.sheets_service import GoogleSheetsService
+     from data.action_service import get_action_service
+
+     items_data: List[Dict[str, Any]] = []
+     if request and request.data:
+         items_data = [r.model_dump(exclude_none=True) for r in request.data]
+     elif sheetId and sheetName and rowRange:
+         cache = get_cache() if not bypass_cache else None
+         sheets_service = GoogleSheetsService(cache=cache)
+         items_data = sheets_service.fetch_sheet_data(sheetId, sheetName, rowRange, bypass_cache=bypass_cache)
+     else:
+         raise HTTPException(status_code=400, detail="sheetId, sheetName, and rowRange are required, or provide data in the body")
+
+     if len(items_data) == 0:
+         return RiskAssessmentResponse(items=[], topPatterns=[], metadata={"count": 0})
+
+     preds = model_service.predict_regression(items_data)
+     classes = model_service.classify_text(items_data)
+     try:
+         action_service = get_action_service()
+         eff = action_service.action_effectiveness or {}
+     except Exception:
+         action_service = None
+         eff = {}
+
+     items: List[RiskItem] = []
+     for i, item in enumerate(items_data):
+         cat = item.get("Irregularity_Complain_Category", "Unknown") or "Unknown"
+         hub = item.get("HUB", "Unknown") or "Unknown"
+         area = (item.get("Area", "Unknown") or "Unknown").replace(" Area", "")
+         pr = preds[i]
+         cl = classes[i]
+         sev = cl.severity
+         sev_conf = cl.severityConfidence
+         pdays = pr.predictedDays
+         anom = 0.0
+         if pr.anomalyDetection:
+             anom = pr.anomalyDetection.anomalyScore
+         sev_s = _severity_to_score(sev)
+         d_s = _normalize_days(pdays)
+         cat_w = 1.0 - float(eff.get(cat, 0.8))
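+         # Weighted risk blend (weights sum to 1.0): 50% severity, 25% normalized
+         # resolution time, 15% anomaly score, 10% category weakness (1 - action
+         # effectiveness). Worked example: Critical severity (1.0), 3 predicted days
+         # (3/7 ≈ 0.43), anomaly 0.2, effectiveness 0.8 gives
+         # 0.5*1.0 + 0.25*0.43 + 0.15*0.2 + 0.10*0.2 ≈ 0.66, i.e. MEDIUM priority.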
+         risk = min(1.0, 0.5 * sev_s + 0.25 * d_s + 0.15 * anom + 0.10 * cat_w)
+         recs: List[Dict[str, Any]] = []
+         if action_service:
+             try:
+                 recs_resp = action_service.recommend(
+                     report=item.get("Report", "") or "",
+                     issue_type=cat,
+                     severity=sev,
+                     area=area if area else None,
+                     airline=item.get("Airlines") or None,
+                     top_n=5,
+                 )
+                 recs = recs_resp.get("recommendations", [])
+             except Exception:
+                 recs = []
+         prev = _extract_prevention([r.get("action", "") for r in recs])
+         items.append(
+             RiskItem(
+                 reportId=f"row_{i}",
+                 severity=sev,
+                 severityConfidence=sev_conf,
+                 predictedDays=pdays,
+                 anomalyScore=anom,
+                 category=cat,
+                 hub=hub,
+                 area=area,
+                 riskScore=round(risk, 3),
+                 priority=_priority_from_score(risk),
+                 recommendedActions=recs[:5],
+                 preventiveSuggestions=prev,
+             )
+         )
+
+     groups: Dict[str, Dict[str, Any]] = {}
+     for it, raw in zip(items, items_data):
+         key = f"{it.category}|{it.hub}|{it.area}"
+         g = groups.get(key) or {"key": key, "category": it.category, "hub": it.hub, "area": it.area, "count": 0, "avgRisk": 0.0, "avgDays": 0.0, "highSeverityShare": 0.0}
+         g["count"] += 1
+         g["avgRisk"] += it.riskScore
+         g["avgDays"] += it.predictedDays
+         g["highSeverityShare"] += 1.0 if it.severity in ("Critical", "High") else 0.0
+         groups[key] = g
+     patterns = []
+     for g in groups.values():
+         c = g["count"]
+         g["avgRisk"] = round(g["avgRisk"] / max(1, c), 3)
+         g["avgDays"] = round(g["avgDays"] / max(1, c), 2)
+         g["highSeverityShare"] = round(g["highSeverityShare"] / max(1, c), 3)
+         patterns.append(g)
+     patterns.sort(key=lambda x: (-x["avgRisk"], -x["highSeverityShare"], -x["avgDays"], -x["count"]))
+     return RiskAssessmentResponse(
+         items=sorted(items, key=lambda x: -x.riskScore),
+         topPatterns=patterns[:top_k_patterns],
+         metadata={"count": len(items)},
+     )
+
+
+ from data.job_service import JobService, JobStatus
+
+ # Initialize the job service
+ job_service = JobService()
+
+
+ def perform_analysis(data: List[Dict], options: AnalysisOptions, compact: bool) -> AnalysisResponse:
+     """Core analysis logic reused by the sync and async endpoints"""
+     start_time = datetime.now()
+     total_records = len(data)
+
+     logger.info(f"Analyzing {total_records} records...")
+
+     # Initialize the response
+     response = AnalysisResponse(
+         metadata=Metadata(
+             totalRecords=total_records,
+             processingTime=0.0,
+             modelVersions={
+                 "regression": model_service.regression_version,
+                 "nlp": model_service.nlp_version,
+             },
+         )
+     )
+
+     # Regression analysis
+     predictions: List[RegressionPrediction] = []
+     if options.predictResolutionTime or options.includeRisk:
+         logger.info("Running regression analysis...")
+         predictions = model_service.predict_regression(data)
+
+         # Use the real metrics if available
+         if model_service.model_loaded and model_service.model_metrics:
+             metrics = {
+                 "mae": round(model_service.model_metrics.get("test_mae", 1.2), 3),
+                 "rmse": round(model_service.model_metrics.get("test_rmse", 1.8), 3),
+                 "r2": round(model_service.model_metrics.get("test_r2", 0.78), 3),
+                 "model_loaded": True,
+                 "note": "Using trained model"
+                 if model_service.model_loaded
+                 else "Using fallback",
+             }
+         else:
+             metrics = {
+                 "mae": None,
+                 "rmse": None,
+                 "r2": None,
+                 "model_loaded": False,
+                 "note": "Model not available - using fallback predictions",
+             }
+
+         if options.predictResolutionTime:
+             response.regression = RegressionResult(
+                 predictions=predictions,
+                 modelMetrics=metrics,
+             )
+
+     # NLP analysis
+     classifications: List[ClassificationResult] = []
+     if any(
+         [
+             options.classifySeverity,
+             options.extractEntities,
+             options.generateSummary,
+             options.includeRisk,
+         ]
+     ):
+         logger.info("Running NLP analysis...")
+
+         entities = []
+         summaries = []
+         sentiment = []
+
+         if options.classifySeverity or options.includeRisk:
+             classifications = model_service.classify_text(data)
+
+         if options.extractEntities:
+             entities = model_service.extract_entities(data)
+
+         if options.generateSummary:
+             summaries = model_service.generate_summary(data)
+
+         sentiment = model_service.analyze_sentiment(data)
+
+         response.nlp = NLPResult(
+             classifications=classifications,
+             entities=entities,
+             summaries=summaries,
+             sentiment=sentiment,
+         )
+
+     # Trend analysis
+     if options.analyzeTrends:
+         logger.info("Running trend analysis...")
+
+         by_airline = {}
+         by_hub = {}
+         by_category = {}
+
+         for item in data:
+             airline = item.get("Airlines", "Unknown")
+             hub = item.get("HUB", "Unknown")
+             category = item.get("Irregularity_Complain_Category", "Unknown")
+
+             # Airline aggregation
+             if airline not in by_airline:
+                 by_airline[airline] = {"count": 0, "issues": []}
+             by_airline[airline]["count"] += 1
+             by_airline[airline]["issues"].append(category)
+
+             # Hub aggregation
+             if hub not in by_hub:
+                 by_hub[hub] = {"count": 0, "issues": []}
+             by_hub[hub]["count"] += 1
+             by_hub[hub]["issues"].append(category)
+
+             # Category aggregation
+             if category not in by_category:
+                 by_category[category] = {"count": 0}
+             by_category[category]["count"] += 1
+
+         # Convert to the TrendData format
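+         # NOTE: avgResolutionDays below is a placeholder (2.0 plus random noise),
+         # not a value computed from the analyzed records.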
+         by_airline_trend = {
+             k: TrendData(
+                 count=v["count"],
+                 avgResolutionDays=2.0 + np.random.random(),
+                 topIssues=list(set(v["issues"]))[:3],
+             )
+             for k, v in by_airline.items()
+         }
+
+         by_hub_trend = {
+             k: TrendData(
+                 count=v["count"],
+                 avgResolutionDays=2.0 + np.random.random(),
+                 topIssues=list(set(v["issues"]))[:3],
+             )
+             for k, v in by_hub.items()
+         }
+
+         by_category_trend = {
+             k: {"count": v["count"], "trend": "stable"}
+             for k, v in by_category.items()
+         }
+
+         response.trends = TrendResult(
+             byAirline=by_airline_trend,
+             byHub=by_hub_trend,
+             byCategory=by_category_trend,
+             timeSeries=[],
+         )
+
+     # Risk assessment
+     if options.includeRisk:
+         try:
+             from data.action_service import get_action_service
+             action_service = get_action_service()
+             eff = action_service.action_effectiveness or {}
+         except Exception:
+             eff = {}
+             action_service = None
+
+         items: List[RiskItem] = []
+         for i, item in enumerate(data):
+             cat = item.get("Irregularity_Complain_Category", "Unknown") or "Unknown"
+             hub = item.get("HUB", "Unknown") or "Unknown"
+             area = (item.get("Area", "Unknown") or "Unknown").replace(" Area", "")
+             pr = predictions[i] if i < len(predictions) else None
+             cl = classifications[i] if i < len(classifications) else None
+             sev = cl.severity if cl else "Low"
+             sev_conf = cl.severityConfidence if cl else 0.6
+             pdays = pr.predictedDays if pr else 0.0
+             anom = pr.anomalyDetection.anomalyScore if pr and pr.anomalyDetection else 0.0
+             sev_s = _severity_to_score(sev)
+             d_s = _normalize_days(pdays)
+             cat_w = 1.0 - float(eff.get(cat, 0.8))
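+             # Same weighted risk blend as /api/ai/risk/assess: severity 50%,
+             # resolution time 25%, anomaly 15%, category weakness 10%.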
+             risk = min(1.0, 0.5 * sev_s + 0.25 * d_s + 0.15 * anom + 0.10 * cat_w)
+             recs: List[Dict[str, Any]] = []
+             if action_service:
+                 try:
+                     recs_resp = action_service.recommend(
+                         report=item.get("Report", "") or "",
+                         issue_type=cat,
+                         severity=sev,
+                         area=area if area else None,
+                         airline=item.get("Airlines") or None,
+                         top_n=5,
+                     )
+                     recs = recs_resp.get("recommendations", [])
+                 except Exception:
+                     recs = []
+             prev = _extract_prevention([r.get("action", "") for r in recs])
+             items.append(
+                 RiskItem(
+                     reportId=f"row_{i}",
+                     severity=sev,
+                     severityConfidence=sev_conf,
+                     predictedDays=pdays,
+                     anomalyScore=anom,
+                     category=cat,
+                     hub=hub,
+                     area=area,
+                     riskScore=round(risk, 3),
+                     priority=_priority_from_score(risk),
+                     recommendedActions=recs[:5],
+                     preventiveSuggestions=prev,
+                 )
+             )
+
+         groups: Dict[str, Dict[str, Any]] = {}
+         for it, raw in zip(items, data):
+             key = f"{it.category}|{it.hub}|{it.area}"
+             g = groups.get(key) or {"key": key, "category": it.category, "hub": it.hub, "area": it.area, "count": 0, "avgRisk": 0.0, "avgDays": 0.0, "highSeverityShare": 0.0}
+             g["count"] += 1
+             g["avgRisk"] += it.riskScore
+             g["avgDays"] += it.predictedDays
+             g["highSeverityShare"] += 1.0 if it.severity in ("Critical", "High") else 0.0
+             groups[key] = g
+         patterns = []
+         for g in groups.values():
+             c = g["count"]
+             g["avgRisk"] = round(g["avgRisk"] / max(1, c), 3)
+             g["avgDays"] = round(g["avgDays"] / max(1, c), 2)
+             g["highSeverityShare"] = round(g["highSeverityShare"] / max(1, c), 3)
+             patterns.append(g)
+         patterns.sort(key=lambda x: (-x["avgRisk"], -x["highSeverityShare"], -x["avgDays"], -x["count"]))
+         response.risk = RiskAssessmentResponse(
+             items=sorted(items, key=lambda x: -x.riskScore),
+             topPatterns=patterns[:5],
+             metadata={"count": len(items)},
+         )
+
+     if compact:
+         if response.regression and response.regression.predictions:
+             for p in response.regression.predictions:
+                 p.shapExplanation = None
+                 p.anomalyDetection = None
+         if response.nlp:
+             response.nlp.entities = []
+             response.nlp.summaries = []
+
+     # Calculate the processing time
+     processing_time = (datetime.now() - start_time).total_seconds() * 1000
+     response.metadata.processingTime = round(processing_time, 2)
+
+     logger.info(f"Analysis completed in {processing_time:.2f}ms")
+     return response
+
+
+ @app.post(
+     "/api/ai/analyze",
+     response_model=AnalysisResponse,
+     tags=["Analysis"],
+     summary="Analyze Irregularity Reports",
+     response_description="Analysis results including predictions, severity, and entities.",
+ )
+ async def analyze_reports(request: AnalysisRequest, compact: bool = False):
+     """
+     Perform comprehensive AI analysis on a batch of irregularity reports.
+
+     - **Regression**: Predicts days to resolve based on category and description.
+     - **NLP**: Classifies severity, extracts entities (Flight No, Airline), and summarizes text.
+     - **Trends**: Aggregates data by Airline, Hub, and Category.
+
+     The endpoint accepts a list of `IrregularityReport` objects.
+     """
+     try:
+         # Use the directly supplied data
+         if not request.data:
+             raise HTTPException(
+                 status_code=400,
+                 detail="No data provided. Either sheetId or data must be specified.",
+             )
+
+         # Convert the IrregularityReport objects to dicts
+         data = [report.model_dump(exclude_none=True) for report in request.data]
+
+         return perform_analysis(data, request.options, compact)
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Analysis error: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ def background_analysis_task(job_id: str, data: List[Dict], options: AnalysisOptions, compact: bool):
+     """Background task wrapper for the analysis"""
+     try:
+         job_service.update_job(job_id, JobStatus.PROCESSING)
+         response = perform_analysis(data, options, compact)
+         job_service.update_job(job_id, JobStatus.COMPLETED, result=response.model_dump())
+     except Exception as e:
+         logger.error(f"Job {job_id} failed: {e}")
+         job_service.update_job(job_id, JobStatus.FAILED, error=str(e))
+
+
+ @app.post(
+     "/api/ai/analyze-async",
+     response_model=Dict[str, str],
+     tags=["Analysis", "Jobs"],
+     summary="Start Async Analysis Job",
+ )
+ async def analyze_async(
+     request: AnalysisRequest, background_tasks: BackgroundTasks, compact: bool = False
+ ):
+     """
+     Start a background analysis job for large datasets.
+     Returns a `job_id` immediately, which can be used to poll status via `/api/ai/jobs/{job_id}`.
+     """
+     if not request.data:
+         raise HTTPException(status_code=400, detail="No data provided")
+
+     data = [report.model_dump(exclude_none=True) for report in request.data]
+     job_id = job_service.create_job()
+
+     background_tasks.add_task(background_analysis_task, job_id, data, request.options, compact)
+
+     return {"job_id": job_id, "status": "queued"}
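+
+ # Illustrative client flow for the async API (assumes a local server and the
+ # `requests` package; the paths are the ones defined in this file):
+ #   job = requests.post("http://localhost:8000/api/ai/analyze-async", json=payload).json()
+ #   result = requests.get(f"http://localhost:8000/api/ai/jobs/{job['job_id']}").json()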
1467
+
1468
+
1469
+ @app.get(
1470
+ "/api/ai/jobs/{job_id}",
1471
+ tags=["Jobs"],
1472
+ summary="Get Job Status",
1473
+ )
1474
+ async def get_job_status(job_id: str):
1475
+ """
1476
+ Retrieve the status and results of a background analysis job.
1477
+ Possible statuses: `queued`, `processing`, `completed`, `failed`.
1478
+ """
1479
+ job = job_service.get_job(job_id)
1480
+ if not job:
1481
+ raise HTTPException(status_code=404, detail="Job not found")
1482
+ return job
1483
+
1484
+
1485
+ @app.post(
+     "/api/ai/predict-single",
+     tags=["Analysis"],
+     summary="Real-time Single Prediction",
+ )
+ async def predict_single(report: IrregularityReport):
+     """
+     Get immediate AI predictions for a single irregularity report.
+     Useful for real-time validation or "what-if" analysis in the UI.
+     """
+     try:
+         report_dict = report.model_dump(exclude_none=True)
+         predictions = model_service.predict_regression([report_dict])
+         classifications = model_service.classify_text([report_dict])
+         entities = model_service.extract_entities([report_dict])
+         summaries = model_service.generate_summary([report_dict])
+         sentiments = model_service.analyze_sentiment([report_dict])
+ 
+         return {
+             "prediction": predictions[0],
+             "classification": classifications[0],
+             "entities": entities[0],
+             "summary": summaries[0],
+             "sentiment": sentiments[0],
+             "modelLoaded": model_service.model_loaded,
+         }
+ 
+     except Exception as e:
+         logger.error(f"Single prediction error: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @app.post(
+     "/api/ai/train",
+     tags=["Training"],
+     summary="Trigger Model Retraining",
+ )
+ async def train_models(background_tasks: BackgroundTasks, force: bool = False):
+     """
+     Trigger a background task to retrain AI models.
+     Checks if new data is available in Google Sheets before training, unless `force=True`.
+     """
+     from scripts.scheduled_training import TrainingScheduler
+ 
+     def run_training_task():
+         scheduler = TrainingScheduler()
+         result = scheduler.run_training(force=force)
+         logger.info(f"Training completed: {result}")
+ 
+     background_tasks.add_task(run_training_task)
+ 
+     return {
+         "status": "training_queued",
+         "message": "Model retraining has been started in the background",
+         "force": force,
+         "timestamp": datetime.now().isoformat(),
+     }
+ 
+ 
+ @app.get(
+     "/api/ai/train/status",
+     tags=["Training"],
+     summary="Get Training Status",
+ )
+ async def training_status():
+     """
+     Get the status of the latest training job and training history.
+     """
+     from scripts.scheduled_training import TrainingScheduler
+ 
+     scheduler = TrainingScheduler()
+     status = scheduler.get_status()
+ 
+     return {
+         "status": "success",
+         "data": status,
+         "timestamp": datetime.now().isoformat(),
+     }
+ 
+ 
+ @app.get("/api/ai/model-info")
+ async def model_info():
+     """Get current model information"""
+     return {
+         "regression": {
+             "version": model_service.regression_version,
+             "type": "GradientBoostingRegressor",
+             "status": "loaded" if model_service.model_loaded else "unavailable",
+             "last_trained": "2025-01-15",
+             "metrics": model_service.model_metrics
+             if model_service.model_loaded
+             else None,
+         },
+         "nlp": {
+             "version": model_service.nlp_version,
+             "type": "Rule-based + Keyword extraction",
+             "status": "active",
+             "tasks": ["classification", "ner", "summarization", "sentiment"],
+             "note": "Full ML NLP models coming soon",
+         },
+     }
+ 
+ 
+ @app.post("/api/ai/cache/invalidate")
+ async def invalidate_cache(sheet_name: Optional[str] = None):
+     """Invalidate cache for sheets data"""
+     cache = get_cache()
+ 
+     if sheet_name:
+         pattern = f"sheets:*{sheet_name}*"
+         deleted = cache.delete_pattern(pattern)
+         return {
+             "status": "success",
+             "message": f"Invalidated cache for sheet: {sheet_name}",
+             "keys_deleted": deleted,
+         }
+     else:
+         deleted = cache.delete_pattern("sheets:*")
+         return {
+             "status": "success",
+             "message": "Invalidated all sheets cache",
+             "keys_deleted": deleted,
+         }
+ 
+ 
+ @app.get("/api/ai/cache/status")
+ async def cache_status():
+     """Get cache status and statistics"""
+     cache = get_cache()
+     return cache.health_check()
+ 
+ 
+ class AnalyzeAllResponse(BaseModel):
+     status: str
+     metadata: Dict[str, Any]
+     sheets: Dict[str, Any]
+     results: List[Dict[str, Any]]
+     summary: Dict[str, Any]
+     timestamp: str
+ 
+ 
+ @app.get("/api/ai/analyze-all", response_model=AnalyzeAllResponse)
+ async def analyze_all_sheets(
+     bypass_cache: bool = False,
+     include_regression: bool = True,
+     include_nlp: bool = True,
+     include_trends: bool = True,
+     max_rows_per_sheet: int = 10000,
+     compact: bool = False,
+ ):
+     """
+     Analyze ALL rows from all Google Sheets.
+ 
+     Fetches data from both NON CARGO and CGO sheets, analyzes each row,
+     and returns comprehensive results.
+ 
+     Args:
+         bypass_cache: Skip cache and fetch fresh data
+         include_regression: Include regression predictions
+         include_nlp: Include NLP analysis (severity, entities, summary)
+         include_trends: Include trend analysis (accepted but not yet used by this endpoint)
+         max_rows_per_sheet: Maximum rows to process per sheet
+         compact: Omit verbose per-row fields (SHAP, anomaly, entities, summary, sentiment)
+     """
+     start_time = datetime.now()
+ 
+     try:
+         from data.sheets_service import GoogleSheetsService
+ 
+         cache = get_cache() if not bypass_cache else None
+         sheets_service = GoogleSheetsService(cache=cache)
+ 
+         spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+         if not spreadsheet_id:
+             raise HTTPException(
+                 status_code=500, detail="GOOGLE_SHEET_ID not configured"
+             )
+ 
+         all_data = []
+         sheet_info = {}
+ 
+         sheets_to_fetch = [
+             {"name": "NON CARGO", "range": f"A1:AA{max_rows_per_sheet + 1}"},
+             {"name": "CGO", "range": f"A1:Z{max_rows_per_sheet + 1}"},
+         ]
+ 
+         for sheet in sheets_to_fetch:
+             try:
+                 sheet_name = sheet["name"]
+                 range_str = sheet["range"]
+ 
+                 logger.info(f"Fetching {sheet_name}...")
+                 data = sheets_service.fetch_sheet_data(
+                     spreadsheet_id, sheet_name, range_str, bypass_cache=bypass_cache
+                 )
+ 
+                 for row in data:
+                     row["_source_sheet"] = sheet_name
+                     all_data.append(row)
+ 
+                 sheet_info[sheet_name] = {
+                     "rows_fetched": len(data),
+                     "status": "success",
+                 }
+ 
+             except Exception as e:
+                 logger.error(f"Failed to fetch {sheet['name']}: {e}")
+                 sheet_info[sheet["name"]] = {
+                     "rows_fetched": 0,
+                     "status": "error",
+                     "error": str(e),
+                 }
+ 
+         total_records = len(all_data)
+ 
+         if total_records == 0:
+             raise HTTPException(status_code=404, detail="No data found in any sheet")
+ 
+         logger.info(f"Analyzing {total_records} total records...")
+ 
+         results = []
+         batch_size = 100
+ 
+         for i in range(0, total_records, batch_size):
+             batch = all_data[i : i + batch_size]
+ 
+             if include_regression:
+                 regression_preds = model_service.predict_regression(batch)
+             else:
+                 regression_preds = [None] * len(batch)
+ 
+             if include_nlp:
+                 classifications = model_service.classify_text(batch)
+                 entities = model_service.extract_entities(batch)
+                 summaries = model_service.generate_summary(batch)
+                 sentiments = model_service.analyze_sentiment(batch)
+             else:
+                 classifications = [None] * len(batch)
+                 entities = [None] * len(batch)
+                 summaries = [None] * len(batch)
+                 sentiments = [None] * len(batch)
+ 
+             for j, row in enumerate(batch):
+                 result = {
+                     "rowId": row.get("_row_id", f"row_{i + j}"),
+                     "sourceSheet": row.get("_source_sheet", "Unknown"),
+                     "originalData": {
+                         "date": row.get("Date_of_Event"),
+                         "airline": row.get("Airlines"),
+                         "flightNumber": row.get("Flight_Number"),
+                         "branch": row.get("Branch"),
+                         "hub": row.get("HUB"),
+                         "route": row.get("Route"),
+                         "category": row.get("Report_Category"),
+                         "issueType": row.get("Irregularity_Complain_Category"),
+                         "report": row.get("Report"),
+                         "status": row.get("Status"),
+                     },
+                 }
+ 
+                 if regression_preds[j]:
+                     pred = {
+                         "predictedDays": regression_preds[j].predictedDays,
+                         "confidenceInterval": regression_preds[j].confidenceInterval,
+                         "hasUnknownCategories": regression_preds[j].hasUnknownCategories,
+                     }
+                     if not compact:
+                         pred["shapExplanation"] = (
+                             regression_preds[j].shapExplanation.model_dump()
+                             if regression_preds[j].shapExplanation
+                             else None
+                         )
+                         pred["anomalyDetection"] = (
+                             regression_preds[j].anomalyDetection.model_dump()
+                             if regression_preds[j].anomalyDetection
+                             else None
+                         )
+                     result["prediction"] = pred
+ 
+                 if classifications[j]:
+                     result["classification"] = classifications[j].model_dump()
+ 
+                 if entities[j] and not compact:
+                     result["entities"] = entities[j].model_dump()
+ 
+                 if summaries[j] and not compact:
+                     result["summary"] = summaries[j].model_dump()
+ 
+                 if sentiments[j] and not compact:
+                     result["sentiment"] = sentiments[j].model_dump()
+ 
+                 results.append(result)
+ 
+         summary = {
+             "totalRecords": total_records,
+             "sheetsProcessed": len(
+                 [s for s in sheet_info.values() if s["status"] == "success"]
+             ),
+             "regressionEnabled": include_regression,
+             "nlpEnabled": include_nlp,
+         }
+ 
+         if include_nlp and results:
+             severity_counts = {}
+             for r in results:
+                 sev = r.get("classification", {}).get("severity", "Unknown")
+                 severity_counts[sev] = severity_counts.get(sev, 0) + 1
+             summary["severityDistribution"] = severity_counts
+ 
+         if include_regression and results:
+             predictions = [
+                 r["prediction"]["predictedDays"] for r in results if r.get("prediction")
+             ]
+             if predictions:
+                 summary["predictionStats"] = {
+                     "min": round(min(predictions), 2),
+                     "max": round(max(predictions), 2),
+                     "mean": round(sum(predictions) / len(predictions), 2),
+                 }
+ 
+         processing_time = (datetime.now() - start_time).total_seconds()
+ 
+         return AnalyzeAllResponse(
+             status="success",
+             metadata={
+                 "totalRecords": total_records,
+                 "processingTimeSeconds": round(processing_time, 2),
+                 "recordsPerSecond": round(total_records / processing_time, 2)
+                 if processing_time > 0
+                 else 0,
+                 "modelVersions": {
+                     "regression": model_service.regression_version,
+                     "nlp": model_service.nlp_version,
+                 },
+             },
+             sheets=sheet_info,
+             results=results,
+             summary=summary,
+             timestamp=datetime.now().isoformat(),
+         )
+ 
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Analyze all error: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
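+ # Usage sketch (illustrative): /api/ai/analyze-all is a GET whose switches are
+ # query parameters; `compact=true` drops the SHAP, anomaly, entity, summary,
+ # and sentiment payloads per row. Host/port is an assumption.
+ #
+ #   import requests
+ #
+ #   resp = requests.get(
+ #       "http://localhost:8000/api/ai/analyze-all",
+ #       params={"compact": "true", "max_rows_per_sheet": 500},
+ #   )
+ #   print(resp.json()["summary"])
+ 
+ 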
+ # ============== Risk Scoring Endpoints ==============
+ 
+ 
+ @app.get("/api/ai/risk/summary")
+ async def risk_summary():
+     """Get overall risk summary for all entities"""
+     from data.risk_service import get_risk_service
+ 
+     risk_service = get_risk_service()
+     return risk_service.get_risk_summary()
+ 
+ 
+ @app.get("/api/ai/risk/airlines")
+ async def airline_risks():
+     """Get risk scores for all airlines"""
+     from data.risk_service import get_risk_service
+ 
+     risk_service = get_risk_service()
+     return risk_service.get_all_airline_risks()
+ 
+ 
+ @app.get("/api/ai/risk/airlines/{airline_name}")
+ async def airline_risk(airline_name: str):
+     """Get risk score for a specific airline"""
+     from data.risk_service import get_risk_service
+ 
+     risk_service = get_risk_service()
+     risk_data = risk_service.get_airline_risk(airline_name)
+ 
+     if not risk_data:
+         raise HTTPException(
+             status_code=404, detail=f"Airline '{airline_name}' not found"
+         )
+ 
+     recommendations = risk_service.get_risk_recommendations("airline", airline_name)
+ 
+     return {
+         "airline": airline_name,
+         "risk_data": risk_data,
+         "recommendations": recommendations,
+     }
+ 
+ 
+ @app.get("/api/ai/risk/branches")
+ async def branch_risks():
+     """Get risk scores for all branches"""
+     from data.risk_service import get_risk_service
+ 
+     risk_service = get_risk_service()
+     return risk_service.get_all_branch_risks()
+ 
+ 
+ @app.get("/api/ai/risk/hubs")
+ async def hub_risks():
+     """Get risk scores for all hubs"""
+     from data.risk_service import get_risk_service
+ 
+     risk_service = get_risk_service()
+     return risk_service.get_all_hub_risks()
+ 
+ 
+ @app.post("/api/ai/risk/calculate")
+ async def calculate_risk_scores(bypass_cache: bool = False):
+     """Calculate risk scores from current Google Sheets data"""
+     from data.risk_service import get_risk_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     # Fetch all data
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA2000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z2000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     risk_service = get_risk_service()
+     risk_service.calculate_all_risk_scores(all_data)
+ 
+     return {
+         "status": "success",
+         "records_processed": len(all_data),
+         "risk_summary": risk_service.get_risk_summary(),
+     }
+ 
+ 
+ # ============== Subcategory Classification Endpoints ==============
+ 
+ 
+ @app.post("/api/ai/subcategory")
+ async def classify_subcategory(
+     report: str,
+     area: Optional[str] = None,
+     issue_type: Optional[str] = None,
+     root_cause: Optional[str] = None,
+ ):
+     """Classify a report into a subcategory"""
+     from data.subcategory_service import get_subcategory_classifier
+ 
+     classifier = get_subcategory_classifier()
+     result = classifier.classify(report, area, issue_type, root_cause)
+ 
+     return result
+ 
+ 
+ @app.get("/api/ai/subcategory/categories")
+ async def get_subcategories(area: Optional[str] = None):
+     """Get the list of available subcategories"""
+     from data.subcategory_service import get_subcategory_classifier
+ 
+     classifier = get_subcategory_classifier()
+     return classifier.get_available_categories(area)
+ 
+ 
+ # ============== Action Recommendation Endpoints ==============
+ 
+ 
+ @app.post("/api/ai/action/recommend")
+ async def recommend_actions(
+     report: str,
+     issue_type: str,
+     severity: str = "Medium",
+     area: Optional[str] = None,
+     airline: Optional[str] = None,
+     top_n: int = 5,
+ ):
+     """Get action recommendations for an issue"""
+     from data.action_service import get_action_service
+ 
+     action_service = get_action_service()
+     recommendations = action_service.recommend(
+         report=report,
+         issue_type=issue_type,
+         severity=severity,
+         area=area,
+         airline=airline,
+         top_n=top_n,
+     )
+ 
+     return recommendations
+ 
+ 
+ @app.post("/api/ai/action/train")
+ async def train_action_recommender(bypass_cache: bool = False):
+     """Train the action recommender from historical data (runs synchronously)"""
+     from data.action_service import get_action_service
+     from data.sheets_service import GoogleSheetsService
+     from data.similarity_service import get_similarity_service
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA2000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z2000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     similarity_service = get_similarity_service()
+     similarity_service.build_index(all_data)
+ 
+     action_service = get_action_service()
+     action_service.train_from_data(all_data)
+ 
+     return {
+         "status": "success",
+         "records_processed": len(all_data),
+     }
+ 
+ 
+ # ============== Advanced NER Endpoints ==============
+ 
+ 
+ @app.post("/api/ai/ner/extract")
+ async def extract_entities(text: str):
+     """Extract entities from text"""
+     from data.advanced_ner_service import get_advanced_ner
+ 
+     ner = get_advanced_ner()
+     entities = ner.extract(text)
+     summary = ner.extract_summary(text)
+ 
+     return {
+         "entities": entities,
+         "summary": summary,
+     }
+ 
+ 
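+ # Usage sketch (illustrative): `text` is a scalar parameter, so it travels in
+ # the query string. Host and sample text are placeholders.
+ #
+ #   import requests
+ #
+ #   resp = requests.post(
+ #       "http://localhost:8000/api/ai/ner/extract",
+ #       params={"text": "GA123 delayed at CGK due to ground handling"},
+ #   )
+ #   print(resp.json()["entities"])
+ 
+ 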
+ # ============== Similarity Endpoints ==============
+ 
+ 
+ @app.post("/api/ai/similar")
+ async def find_similar_reports(
+     text: str,
+     top_k: int = 5,
+     threshold: float = 0.3,
+ ):
+     """Find similar reports"""
+     from data.similarity_service import get_similarity_service
+ 
+     similarity_service = get_similarity_service()
+     similar = similarity_service.find_similar(text, top_k, threshold)
+ 
+     return {
+         "query_preview": text[:100],
+         "similar_reports": similar,
+     }
+ 
+ 
+ @app.post("/api/ai/similar/build-index")
+ async def build_similarity_index(bypass_cache: bool = False):
+     """Build similarity index from Google Sheets data"""
+     from data.similarity_service import get_similarity_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA2000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z2000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     similarity_service = get_similarity_service()
+     similarity_service.build_index(all_data)
+ 
+     return {
+         "status": "success",
+         "records_indexed": len(all_data),
+     }
+ 
+ 
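+ # Usage sketch (illustrative): build the index before querying; since it is
+ # held by the service singleton, it likely needs rebuilding after a restart.
+ # Host and query text are placeholders.
+ #
+ #   import requests
+ #
+ #   base = "http://localhost:8000"
+ #   requests.post(f"{base}/api/ai/similar/build-index")
+ #   resp = requests.post(
+ #       f"{base}/api/ai/similar",
+ #       params={"text": "Damaged cargo on arrival", "top_k": 3},
+ #   )
+ #   print(resp.json()["similar_reports"])
+ 
+ 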
+ # ============== Forecasting Endpoints ==============
+ 
+ 
+ @app.get("/api/ai/forecast/issues")
+ async def forecast_issues(periods: int = 4):
+     """Forecast issue volume for the next N periods"""
+     from data.forecast_service import get_forecast_service
+ 
+     forecast_service = get_forecast_service()
+     forecast = forecast_service.forecast_issues(periods)
+ 
+     return forecast
+ 
+ 
+ @app.get("/api/ai/forecast/trends")
+ async def predict_trends():
+     """Predict category trends"""
+     from data.forecast_service import get_forecast_service
+ 
+     forecast_service = get_forecast_service()
+     trends = forecast_service.predict_category_trends()
+ 
+     return trends
+ 
+ 
+ @app.get("/api/ai/forecast/seasonal")
+ async def get_seasonal_patterns():
+     """Get seasonal patterns"""
+     from data.forecast_service import get_forecast_service
+ 
+     forecast_service = get_forecast_service()
+     patterns = forecast_service.get_seasonal_patterns()
+ 
+     return patterns
+ 
+ 
+ @app.post("/api/ai/forecast/build")
+ async def build_forecast_data(bypass_cache: bool = False):
+     """Build the forecaster's historical dataset from Google Sheets"""
+     from data.forecast_service import get_forecast_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA2000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z2000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     forecast_service = get_forecast_service()
+     forecast_service.build_historical_data(all_data)
+ 
+     return {
+         "status": "success",
+         "records_processed": len(all_data),
+         "forecast_summary": forecast_service.get_forecast_summary(),
+     }
+ 
+ 
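+ # Usage sketch (illustrative): a typical flow is to populate the historical
+ # data first, then query the forecast endpoints. Host is a placeholder.
+ #
+ #   import requests
+ #
+ #   base = "http://localhost:8000"
+ #   requests.post(f"{base}/api/ai/forecast/build")
+ #   print(requests.get(f"{base}/api/ai/forecast/issues", params={"periods": 4}).json())
+ 
+ 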
+ # ============== Report Generation Endpoints ==============
+ 
+ 
+ @app.post("/api/ai/report/generate")
+ async def generate_report(
+     row_id: str,
+     bypass_cache: bool = False,
+ ):
+     """Generate formal incident report"""
+     from data.report_generator_service import get_report_generator
+     from data.sheets_service import GoogleSheetsService
+     from data.risk_service import get_risk_service
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     # Fetch all data and find the record
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA2000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z2000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     record = None
+     for r in all_data:
+         if r.get("_row_id") == row_id:
+             record = r
+             break
+ 
+     if not record:
+         raise HTTPException(status_code=404, detail=f"Record '{row_id}' not found")
+ 
+     # Generate analysis
+     report_text = record.get("Report", "") + " " + record.get("Root_Caused", "")
+     analysis = {
+         "severity": model_service._classify_severity_fallback([report_text])[0].get(
+             "severity", "Medium"
+         ),
+         "issueType": record.get("Irregularity_Complain_Category", ""),
+     }
+ 
+     # Get risk data
+     risk_service = get_risk_service()
+     airline = record.get("Airlines", "")
+     risk_data = risk_service.get_airline_risk(airline)
+ 
+     # Generate report
+     report_gen = get_report_generator()
+     formal_report = report_gen.generate_incident_report(record, analysis, risk_data)
+     exec_summary = report_gen.generate_executive_summary(record, analysis)
+     json_report = report_gen.generate_json_report(record, analysis, risk_data)
+ 
+     return {
+         "row_id": row_id,
+         "formal_report": formal_report,
+         "executive_summary": exec_summary,
+         "structured_report": json_report,
+     }
+ 
+ 
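+ # Usage sketch (illustrative): `row_id` must match a `_row_id` value present in
+ # the sheet data; the value below is a placeholder.
+ #
+ #   import requests
+ #
+ #   resp = requests.post(
+ #       "http://localhost:8000/api/ai/report/generate",
+ #       params={"row_id": "row_42"},
+ #   )
+ #   print(resp.json()["executive_summary"])
+ 
+ 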
+ # ============== Dashboard Endpoints ==============
+ 
+ 
+ @app.get("/api/ai/dashboard/summary")
+ async def dashboard_summary(bypass_cache: bool = False):
+     """Get comprehensive dashboard summary"""
+     from data.risk_service import get_risk_service
+     from data.forecast_service import get_forecast_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     # Fetch data
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA2000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z2000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     # Get risk summary
+     risk_service = get_risk_service()
+     risk_summary = risk_service.get_risk_summary()
+ 
+     # Get forecast summary
+     forecast_service = get_forecast_service()
+     forecast_summary = forecast_service.get_forecast_summary()
+ 
+     # Calculate statistics
+     severity_dist = Counter()
+     category_dist = Counter()
+     airline_dist = Counter()
+ 
+     for record in all_data:
+         report_text = record.get("Report", "") + " " + record.get("Root_Caused", "")
+         sev = model_service._classify_severity_fallback([report_text])[0].get(
+             "severity", "Low"
+         )
+         severity_dist[sev] += 1
+         category_dist[record.get("Irregularity_Complain_Category", "Unknown")] += 1
+         airline_dist[record.get("Airlines", "Unknown")] += 1
+ 
+     return {
+         "total_records": len(all_data),
+         "sheets": {
+             "non_cargo": len(non_cargo),
+             "cargo": len(cargo),
+         },
+         "severity_distribution": dict(severity_dist),
+         "category_distribution": dict(category_dist.most_common(10)),
+         "top_airlines": dict(airline_dist.most_common(10)),
+         "risk_summary": risk_summary,
+         "forecast_summary": forecast_summary,
+         "model_status": {
+             "regression": model_service.model_loaded,
+             "nlp": model_service.nlp_service is not None,
+         },
+         "last_updated": datetime.now().isoformat(),
+     }
+ 
+ 
+ # ============== Seasonality Endpoints ==============
+ 
+ 
+ @app.get("/api/ai/seasonality/summary")
+ async def seasonality_summary(category_type: Optional[str] = None):
+     """
+     Get seasonality summary and patterns
+ 
+     Args:
+         category_type: "landside_airside", "cgo", or None for both
+     """
+     from data.seasonality_service import get_seasonality_service
+ 
+     service = get_seasonality_service()
+     return service.get_seasonality_summary(category_type)
+ 
+ 
+ @app.get("/api/ai/seasonality/forecast")
+ async def seasonality_forecast(
+     category_type: Optional[str] = None,
+     periods: int = 4,
+     granularity: str = "weekly",
+ ):
+     """
+     Forecast issue volumes
+ 
+     Args:
+         category_type: "landside_airside", "cgo", or None for both
+         periods: Number of periods to forecast
+         granularity: "daily", "weekly", or "monthly"
+     """
+     from data.seasonality_service import get_seasonality_service
+ 
+     service = get_seasonality_service()
+     return service.forecast(category_type, periods, granularity)
+ 
+ 
+ @app.get("/api/ai/seasonality/peaks")
+ async def seasonality_peaks(
+     category_type: Optional[str] = None, threshold: float = 1.2
+ ):
+     """
+     Identify peak periods
+ 
+     Args:
+         category_type: "landside_airside", "cgo", or None for both
+         threshold: Multiplier above average (1.2 = 20% above)
+     """
+     from data.seasonality_service import get_seasonality_service
+ 
+     service = get_seasonality_service()
+     return service.get_peak_periods(category_type, threshold)
+ 
+ 
+ @app.post("/api/ai/seasonality/build")
+ async def build_seasonality_patterns(bypass_cache: bool = False):
+     """Build seasonality patterns from Google Sheets data"""
+     from data.seasonality_service import get_seasonality_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA5000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z5000", bypass_cache=bypass_cache
+     )
+ 
+     for row in non_cargo:
+         row["_sheet_name"] = "NON CARGO"
+     for row in cargo:
+         row["_sheet_name"] = "CGO"
+ 
+     all_data = non_cargo + cargo
+ 
+     service = get_seasonality_service()
+     result = service.build_patterns(all_data)
+ 
+     return {
+         "status": "success",
+         "records_processed": len(all_data),
+         "patterns": result,
+     }
+ 
+ 
+ # ============== Root Cause Endpoints ==============
+ 
+ 
+ @app.post("/api/ai/root-cause/classify")
+ async def classify_root_cause(
+     root_cause: str,
+     report: Optional[str] = None,
+     area: Optional[str] = None,
+     category: Optional[str] = None,
+ ):
+     """
+     Classify a root cause text into categories
+ 
+     Categories: Equipment Failure, Staff Competency, Process/Procedure,
+     Communication, External Factors, Documentation, Training Gap, Resource/Manpower
+     """
+     from data.root_cause_service import get_root_cause_service
+ 
+     service = get_root_cause_service()
+     context = {"area": area, "category": category}
+     result = service.classify(root_cause, report or "", context)
+ 
+     return result
+ 
+ 
+ @app.post("/api/ai/root-cause/classify-batch")
+ async def classify_root_cause_batch(bypass_cache: bool = False):
+     """Classify root causes for all records (the response includes the first 100 classifications)"""
+     from data.root_cause_service import get_root_cause_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA5000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z5000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     service = get_root_cause_service()
+     results = service.classify_batch(all_data)
+ 
+     return {
+         "status": "success",
+         "records_processed": len(all_data),
+         # Cap the response payload at the first 100 classifications
+         "classifications": results[:100],
+         "total_classified": len(
+             [r for r in results if r["primary_category"] != "Unknown"]
+         ),
+     }
+ 
+ 
+ @app.get("/api/ai/root-cause/categories")
+ async def get_root_cause_categories():
+     """Get all available root cause categories"""
+     from data.root_cause_service import get_root_cause_service
+ 
+     service = get_root_cause_service()
+     return service.get_categories()
+ 
+ 
+ @app.get("/api/ai/root-cause/stats")
+ async def get_root_cause_stats(source: Optional[str] = None, bypass_cache: bool = False):
+     """
+     Get root cause statistics from data
+ 
+     Args:
+         source: "NON CARGO", "CGO", or None for both
+         bypass_cache: Skip cache and fetch fresh data
+     """
+     from data.root_cause_service import get_root_cause_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     all_data = []
+ 
+     # Conditional fetching based on source to reduce I/O and processing
+     if not source or source.upper() == "NON CARGO":
+         non_cargo = sheets_service.fetch_sheet_data(
+             spreadsheet_id, "NON CARGO", "A1:AA5000", bypass_cache=bypass_cache
+         )
+         all_data.extend(non_cargo)
+ 
+     if not source or source.upper() == "CGO":
+         cargo = sheets_service.fetch_sheet_data(
+             spreadsheet_id, "CGO", "A1:Z5000", bypass_cache=bypass_cache
+         )
+         all_data.extend(cargo)
+ 
+     service = get_root_cause_service()
+     stats = service.get_statistics(all_data)
+ 
+     return stats
+ 
+ 
+ @app.post("/api/ai/root-cause/train")
+ async def train_root_cause_classifier(background_tasks: BackgroundTasks, bypass_cache: bool = False):
+     """Train root cause classifier from historical data"""
+     from data.root_cause_service import get_root_cause_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA5000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z5000", bypass_cache=bypass_cache
+     )
+     all_data = non_cargo + cargo
+ 
+     service = get_root_cause_service()
+ 
+     # Offload the intensive training process to the background
+     background_tasks.add_task(service.train_from_data, all_data)
+ 
+     return {
+         "status": "training_started",
+         "records_fetched": len(all_data),
+         "message": "Classification training is now running in the background. The model will be automatically updated once complete.",
+     }
+ 
+ 
+ # ============== Category Summarization Endpoints ==============
+ 
+ 
+ class CategorySummaryResponse(BaseModel):
+     status: str
+     category_type: str
+     summary: Dict[str, Any]
+     timestamp: str
+ 
+ 
+ @app.get("/api/ai/summarize", response_model=CategorySummaryResponse)
+ async def summarize_by_category(category: str = "all", bypass_cache: bool = False):
+     """
+     Get summarized insights for Non-cargo and/or CGO categories
+ 
+     Query Parameters:
+         category: "non_cargo", "cgo", or "all" (default: "all")
+         bypass_cache: Skip cache and fetch fresh data (default: false)
+ 
+     Returns aggregated summary including:
+     - Severity distribution
+     - Top categories, airlines, hubs, branches
+     - Key insights and recommendations
+     - Common issues
+     - Monthly trends
+     """
+     from data.category_summarization_service import get_category_summarization_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     valid_categories = ["non_cargo", "cgo", "all"]
+     if category.lower() not in valid_categories:
+         raise HTTPException(
+             status_code=400,
+             detail=f"Invalid category. Must be one of: {valid_categories}",
+         )
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA5000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z5000", bypass_cache=bypass_cache
+     )
+ 
+     for row in non_cargo:
+         row["_sheet_name"] = "NON CARGO"
+     for row in cargo:
+         row["_sheet_name"] = "CGO"
+ 
+     all_data = non_cargo + cargo
+ 
+     summarization_service = get_category_summarization_service()
+     summary = summarization_service.summarize_category(all_data, category.lower())
+ 
+     return CategorySummaryResponse(
+         status="success",
+         category_type=category.lower(),
+         summary=summary,
+         timestamp=datetime.now().isoformat(),
+     )
+ 
+ 
+ @app.get("/api/ai/summarize/non-cargo")
+ async def summarize_non_cargo(bypass_cache: bool = False):
+     """Quick endpoint for Non-cargo summary"""
+     return await summarize_by_category(category="non_cargo", bypass_cache=bypass_cache)
+ 
+ 
+ @app.get("/api/ai/summarize/cgo")
+ async def summarize_cgo(bypass_cache: bool = False):
+     """Quick endpoint for CGO (Cargo) summary"""
+     return await summarize_by_category(category="cgo", bypass_cache=bypass_cache)
+ 
+ 
+ @app.get("/api/ai/summarize/compare")
+ async def compare_categories(bypass_cache: bool = False):
+     """Compare Non-cargo and CGO categories side by side"""
+     return await summarize_by_category(category="all", bypass_cache=bypass_cache)
+ 
+ 
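+ # Usage sketch (illustrative): the three shortcut routes above simply delegate
+ # to /api/ai/summarize with category=non_cargo, cgo, or all respectively.
+ #
+ #   import requests
+ #
+ #   resp = requests.get(
+ #       "http://localhost:8000/api/ai/summarize",
+ #       params={"category": "cgo", "bypass_cache": "true"},
+ #   )
+ #   print(resp.json()["summary"])
+ 
+ 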
+ # ============== Branch Analytics Endpoints ==============
+ 
+ # NOTE: FastAPI matches routes in declaration order, so the static /ranking and
+ # /comparison paths are registered before the parametrized /{branch_name} route;
+ # declaring them afterwards would make them resolve as branch names.
+ 
+ 
+ @app.get("/api/ai/branch/summary")
+ async def branch_analytics_summary(category_type: Optional[str] = None):
+     """
+     Get branch analytics summary
+ 
+     Args:
+         category_type: "landside_airside", "cgo", or None for both
+     """
+     from data.branch_analytics_service import get_branch_analytics_service
+ 
+     service = get_branch_analytics_service()
+     return service.get_summary(category_type)
+ 
+ 
+ @app.get("/api/ai/branch/ranking")
+ async def branch_ranking(
+     category_type: Optional[str] = None,
+     sort_by: str = "risk_score",
+     limit: int = 20,
+ ):
+     """
+     Get branch ranking
+ 
+     Args:
+         category_type: "landside_airside", "cgo", or None for both
+         sort_by: Field to sort by (risk_score, total_issues, critical_high_count)
+         limit: Maximum branches to return
+     """
+     from data.branch_analytics_service import get_branch_analytics_service
+ 
+     service = get_branch_analytics_service()
+     return service.get_ranking(category_type, sort_by, limit)
+ 
+ 
+ @app.get("/api/ai/branch/comparison")
+ async def branch_comparison():
+     """Compare all branches across category types"""
+     from data.branch_analytics_service import get_branch_analytics_service
+ 
+     service = get_branch_analytics_service()
+     return service.get_comparison()
+ 
+ 
+ @app.get("/api/ai/branch/{branch_name}")
+ async def get_branch_metrics(branch_name: str, category_type: Optional[str] = None):
+     """
+     Get metrics for a specific branch
+ 
+     Args:
+         branch_name: Branch name
+         category_type: "landside_airside", "cgo", or None for combined
+     """
+     from data.branch_analytics_service import get_branch_analytics_service
+ 
+     service = get_branch_analytics_service()
+     data = service.get_branch(branch_name, category_type)
+ 
+     if not data:
+         raise HTTPException(status_code=404, detail=f"Branch '{branch_name}' not found")
+ 
+     return data
+ 
+ 
+ @app.post("/api/ai/branch/calculate")
+ async def calculate_branch_metrics(bypass_cache: bool = False):
+     """Calculate branch metrics from Google Sheets data"""
+     from data.branch_analytics_service import get_branch_analytics_service
+     from data.sheets_service import GoogleSheetsService
+ 
+     cache = get_cache() if not bypass_cache else None
+     sheets_service = GoogleSheetsService(cache=cache)
+ 
+     spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
+     if not spreadsheet_id:
+         raise HTTPException(status_code=500, detail="GOOGLE_SHEET_ID not configured")
+ 
+     non_cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "NON CARGO", "A1:AA5000", bypass_cache=bypass_cache
+     )
+     cargo = sheets_service.fetch_sheet_data(
+         spreadsheet_id, "CGO", "A1:Z5000", bypass_cache=bypass_cache
+     )
+ 
+     for row in non_cargo:
+         row["_sheet_name"] = "NON CARGO"
+     for row in cargo:
+         row["_sheet_name"] = "CGO"
+ 
+     all_data = non_cargo + cargo
+ 
+     service = get_branch_analytics_service()
+     result = service.calculate_branch_metrics(all_data)
+ 
+     return {
+         "status": "success",
+         "records_processed": len(all_data),
+         "metrics": result,
+     }
+ 
+ 
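+ # Usage sketch (illustrative): a typical flow is to calculate metrics first,
+ # then query the ranking or a single branch. The branch name is a placeholder.
+ #
+ #   import requests
+ #
+ #   base = "http://localhost:8000"
+ #   requests.post(f"{base}/api/ai/branch/calculate")
+ #   print(requests.get(f"{base}/api/ai/branch/ranking", params={"limit": 5}).json())
+ #   print(requests.get(f"{base}/api/ai/branch/CGK").json())
+ 
+ 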
+ # ============== Main ==============
+ 
+ if __name__ == "__main__":
+     import uvicorn
+ 
+     port = int(os.getenv("API_PORT", "8000"))
+     uvicorn.run(app, host="0.0.0.0", port=port)