agentsay committed on
Commit
9a4a869
·
verified ·
1 Parent(s): b2d9b23

Update modelLoanAPI.py

Browse files
Files changed (1) hide show
  1. modelLoanAPI.py +236 -300
modelLoanAPI.py CHANGED
@@ -1,318 +1,254 @@
1
- # ```python
2
  from fastapi import FastAPI, HTTPException
3
- from pydantic import BaseModel
4
  import pandas as pd
5
  import numpy as np
6
- from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
7
- from sklearn.preprocessing import LabelEncoder, StandardScaler
8
  from sklearn.metrics import accuracy_score
 
9
  import matplotlib.pyplot as plt
10
  import json
11
  import base64
12
  from io import BytesIO
 
13
  import warnings
14
- import logging
15
-
16
- # Set up logging
17
- logging.basicConfig(level=logging.INFO)
18
- logger = logging.getLogger(__name__)
19
 
20
  warnings.filterwarnings("ignore")
21
 
22
  app = FastAPI()
23
 
24
- class WorkerIdRequest(BaseModel):
25
  worker_id: int
26
 
27
- @app.post("/predict_worker_earnings/")
28
- async def predict_worker_earnings(request: WorkerIdRequest):
 
 
 
 
 
 
 
 
 
 
 
29
  try:
30
- worker_id = request.worker_id
31
- logger.info(f"Processing request for worker_id: {worker_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # Initialize result dictionary
34
- results = {
35
- 'worker_id': worker_id,
36
- 'classification_metrics': {},
37
- 'worker_profile': {},
38
- 'plot': ''
39
- }
40
-
41
- # Load data
42
- try:
43
- df = pd.read_csv('/app/extended_worker_dataset.csv')
44
- logger.info(f"CSV loaded successfully. Columns: {list(df.columns)}")
45
- except FileNotFoundError:
46
- logger.error("CSV file not found at /app/extended_worker_dataset.csv")
47
- raise HTTPException(status_code=500, detail="CSV file not found at /app/data/extended_worker_dataset.csv")
48
- except Exception as e:
49
- logger.error(f"Error reading CSV file: {str(e)}")
50
- raise HTTPException(status_code=500, detail=f"Error reading CSV file: {str(e)}")
51
-
52
- # Verify expected columns
53
- expected_columns = ['worker_id', 'state', 'labour_category', 'contracted_wage', 'age', 'gender',
54
- 'migration_status', 'years_of_experience', 'feedback_score', 'job_type', 'timestamp']
55
- if not all(col in df.columns for col in expected_columns):
56
- missing_cols = [col for col in expected_columns if col not in df.columns]
57
- logger.error(f"Missing columns in CSV: {missing_cols}")
58
- raise HTTPException(status_code=500, detail=f"Missing columns in CSV: {missing_cols}")
59
-
60
- # Filter for one worker_id
61
- df = df[df['worker_id'] == worker_id].copy()
62
- if df.empty:
63
- logger.warning(f"No data found for worker_id {worker_id}")
64
- raise HTTPException(status_code=404, detail=f"No data found for worker_id {worker_id}")
65
-
66
- logger.info(f"Filtered data for worker_id {worker_id}: {len(df)} rows")
67
-
68
- # Data preprocessing
69
- try:
70
- df['timestamp'] = pd.to_datetime(df['timestamp'])
71
- except Exception as e:
72
- logger.error(f"Error converting timestamp: {str(e)}")
73
- raise HTTPException(status_code=500, detail=f"Error converting timestamp: {str(e)}")
74
-
75
- df['has_job'] = (df['job_type'] != "No Job").astype(int)
76
-
77
- try:
78
- wage_cap = df[df['contracted_wage'] > 0]['contracted_wage'].quantile(0.90)
79
- if np.isnan(wage_cap) or wage_cap <= 500:
80
- logger.error("Invalid wage cap calculated")
81
- raise ValueError("Invalid wage cap calculated")
82
- df['contracted_wage'] = df['contracted_wage'].clip(lower=500, upper=wage_cap)
83
- except Exception as e:
84
- logger.error(f"Error processing wage data: {str(e)}")
85
- raise HTTPException(status_code=500, detail=f"Error processing wage data: {str(e)}")
86
-
87
- # Encode job_type
88
- le = LabelEncoder()
89
- try:
90
- df['job_type_encoded'] = le.fit_transform(df['job_type'])
91
- except Exception as e:
92
- logger.error(f"Error encoding job_type: {str(e)}")
93
- raise HTTPException(status_code=500, detail=f"Error encoding job_type: {str(e)}")
94
-
95
- # Split data
96
- if len(df) < 2:
97
- logger.warning("Insufficient data points for training and testing")
98
- raise HTTPException(status_code=400, detail="Insufficient data points for training and testing")
99
- split_point = int(len(df) * 0.8)
100
- train_df = df.iloc[:split_point].copy()
101
- test_df = df.iloc[split_point:].copy()
102
-
103
- # Scale features
104
- scaler = StandardScaler()
105
- try:
106
- train_df[['job_type_scaled', 'years_exp_scaled']] = scaler.fit_transform(
107
- train_df[['job_type_encoded', 'years_of_experience']]
108
- )
109
- train_df['job_exp_interaction'] = train_df['job_type_scaled'] * train_df['years_exp_scaled']
110
- except Exception as e:
111
- logger.error(f"Error scaling features: {str(e)}")
112
- raise HTTPException(status_code=500, detail=f"Error scaling features: {str(e)}")
113
-
114
- for subset in [train_df, test_df]:
115
- subset['dayofweek'] = subset['timestamp'].dt.dayofweek
116
- subset['month'] = subset['timestamp'].dt.month
117
- subset['year'] = subset['timestamp'].dt.year
118
- subset['dayofyear'] = subset['timestamp'].dt.dayofyear
119
- subset['is_weekend'] = subset['dayofweek'].isin([5, 6]).astype(int)
120
-
121
- # Train classifier
122
- X_train_class = train_df[['dayofweek', 'month', 'year', 'dayofyear',
123
- 'is_weekend', 'job_type_encoded', 'feedback_score',
124
- 'years_of_experience']]
125
- y_train_class = train_df['has_job']
126
-
127
- try:
128
- classifier = RandomForestClassifier(
129
- n_estimators=500, max_depth=12, min_samples_split=5, random_state=42
130
- )
131
- classifier.fit(X_train_class, y_train_class)
132
- except Exception as e:
133
- logger.error(f"Error training classifier: {str(e)}")
134
- raise HTTPException(status_code=500, detail=f"Error training classifier: {str(e)}")
135
-
136
- # Train regressor
137
- train_df_reg = train_df[train_df['has_job'] == 1].copy()
138
- if train_df_reg.empty:
139
- logger.warning("No data available for regression (all has_job == 0)")
140
- raise HTTPException(status_code=404, detail="No data available for regression (all has_job == 0)")
141
-
142
- X_train_reg = train_df_reg[['dayofweek', 'month', 'year', 'dayofyear',
143
- 'is_weekend', 'job_type_scaled', 'feedback_score',
144
- 'years_exp_scaled', 'job_exp_interaction']]
145
- y_train_reg = train_df_reg['contracted_wage']
146
-
147
- try:
148
- regressor = RandomForestRegressor(
149
- n_estimators=300, max_depth=10, min_samples_split=4, random_state=42
150
- )
151
- regressor.fit(X_train_reg, y_train_reg)
152
- except Exception as e:
153
- logger.error(f"Error training regressor: {str(e)}")
154
- raise HTTPException(status_code=500, detail=f"Error training regressor: {str(e)}")
155
-
156
- # Prepare future dataframe
157
- future_df = test_df[['timestamp', 'job_type', 'job_type_encoded',
158
- 'feedback_score', 'years_of_experience']].rename(columns={'timestamp': 'ds'})
159
-
160
- future_df['dayofweek'] = future_df['ds'].dt.dayofweek
161
- future_df['month'] = future_df['ds'].dt.month
162
- future_df['year'] = future_df['ds'].dt.year
163
- future_df['dayofyear'] = future_df['ds'].dt.dayofyear
164
- future_df['is_weekend'] = future_df['dayofweek'].isin([5, 6]).astype(int)
165
-
166
- try:
167
- future_df[['job_type_scaled', 'years_exp_scaled']] = scaler.transform(
168
- future_df[['job_type_encoded', 'years_of_experience']]
169
- )
170
- future_df['job_exp_interaction'] = future_df['job_type_scaled'] * future_df['years_exp_scaled']
171
- except Exception as e:
172
- logger.error(f"Error transforming future dataframe: {str(e)}")
173
- raise HTTPException(status_code=500, detail=f"Error transforming future dataframe: {str(e)}")
174
-
175
- # Predict job/no-job
176
- try:
177
- future_df['has_job_predicted'] = classifier.predict(
178
- future_df[['dayofweek', 'month', 'year', 'dayofyear',
179
- 'is_weekend', 'job_type_encoded', 'feedback_score',
180
- 'years_of_experience']]
181
- )
182
- except Exception as e:
183
- logger.error(f"Error predicting has_job: {str(e)}")
184
- raise HTTPException(status_code=500, detail=f"Error predicting has_job: {str(e)}")
185
-
186
- # Evaluate classifier accuracy
187
- test_df['has_job'] = (test_df['job_type'] != "No Job").astype(int)
188
- try:
189
- acc = accuracy_score(test_df['has_job'], future_df['has_job_predicted'])
190
- results['classification_metrics']['accuracy'] = round(acc * 100, 2)
191
- except Exception as e:
192
- logger.error(f"Error calculating accuracy: {str(e)}")
193
- raise HTTPException(status_code=500, detail=f"Error calculating accuracy: {str(e)}")
194
-
195
- # Predict wages
196
- try:
197
- future_df['yhat'] = regressor.predict(
198
- future_df[['dayofweek', 'month', 'year', 'dayofyear',
199
- 'is_weekend', 'job_type_scaled', 'feedback_score',
200
- 'years_exp_scaled', 'job_exp_interaction']]
201
- )
202
- except Exception as e:
203
- logger.error(f"Error predicting wages: {str(e)}")
204
- raise HTTPException(status_code=500, detail=f"Error predicting wages: {str(e)}")
205
-
206
- # Apply job prediction mask
207
- final_forecast_df = future_df.copy()
208
- final_forecast_df['yhat'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, final_forecast_df['yhat'])
209
- final_forecast_df['yhat'] = np.minimum(final_forecast_df['yhat'], wage_cap)
210
-
211
- # Uncertainty intervals
212
- try:
213
- predictions = regressor.predict(X_train_reg)
214
- std_dev = np.std([tree.predict(X_train_reg) for tree in regressor.estimators_], axis=0)
215
- future_df['yhat_lower'] = np.maximum(final_forecast_df['yhat'] - 1.96 * std_dev.mean(), 0)
216
- future_df['yhat_upper'] = final_forecast_df['yhat'] + 1.96 * std_dev.mean()
217
- final_forecast_df['yhat_lower'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_lower'])
218
- final_forecast_df['yhat_upper'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_upper'])
219
- except Exception as e:
220
- logger.error(f"Error calculating uncertainty intervals: {str(e)}")
221
- raise HTTPException(status_code=500, detail=f"Error calculating uncertainty intervals: {str(e)}")
222
-
223
- # Evaluation
224
- try:
225
- comparison_df = pd.merge(
226
- test_df[['timestamp', 'contracted_wage']].rename(columns={'timestamp': 'ds', 'contracted_wage': 'y'}),
227
- final_forecast_df[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds'
228
- )
229
-
230
- valid_comparison_df = comparison_df[comparison_df['y'] > 0]
231
- if not valid_comparison_df.empty:
232
- weights = valid_comparison_df['y'] / valid_comparison_df['y'].mean()
233
- mae = np.average([abs(a - p) for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
234
- mape = np.average([abs((a - p) / a) * 100 for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
235
- else:
236
- mae = np.nan
237
- mape = np.nan
238
-
239
- results['classification_metrics']['mae'] = round(mae, 2) if not np.isnan(mae) else None
240
- results['classification_metrics']['mape'] = round(mape, 2) if not np.isnan(mape) else None
241
- except Exception as e:
242
- logger.error(f"Error evaluating predictions: {str(e)}")
243
- raise HTTPException(status_code=500, detail=f"Error evaluating predictions: {str(e)}")
244
-
245
- # Plot results
246
- try:
247
- plt.figure(figsize=(12, 6))
248
- plt.plot(comparison_df['ds'], comparison_df['y'], 'o-', label='Actual Values', markersize=4)
249
- plt.plot(comparison_df['ds'], comparison_df['yhat'], '-', label='Forecasted Values')
250
- plt.fill_between(comparison_df['ds'], comparison_df['yhat_lower'], comparison_df['yhat_upper'],
251
- color='gray', alpha=0.2, label='Uncertainty Interval')
252
- plt.title('Actual vs. Forecasted Daily Earnings (Last 20% of Dataset)')
253
- plt.xlabel('Date')
254
- plt.ylabel('Contracted Wage')
255
- plt.legend()
256
- plt.grid(True)
257
- plt.xticks(rotation=45)
258
- plt.tight_layout()
259
-
260
- buffer = BytesIO()
261
- plt.savefig(buffer, format='png')
262
- buffer.seek(0)
263
- plot_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
264
- results['plot'] = f'data:image/png;base64,{plot_base64}'
265
- plt.close()
266
- except Exception as e:
267
- logger.error(f"Error generating plot: {str(e)}")
268
- raise HTTPException(status_code=500, detail=f"Error generating plot: {str(e)}")
269
-
270
- # Worker Profile for Microfinance
271
- try:
272
- worker_data = df.copy()
273
-
274
- avg_daily_earning = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].mean()
275
- avg_monthly_earning = avg_daily_earning * 30 if not np.isnan(avg_daily_earning) else 0
276
-
277
- job_distribution = worker_data['job_type'].value_counts(normalize=True) * 100
278
-
279
- avg_feedback = worker_data['feedback_score'].mean()
280
-
281
- workholic_index = job_distribution.drop(labels=['No Job'], errors='ignore').sum() / 100
282
-
283
- if avg_daily_earning > 0:
284
- earning_stability = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].std() / avg_daily_earning
285
- else:
286
- earning_stability = np.nan
287
-
288
- results['worker_profile'] = {
289
- 'average_daily_earning': round(avg_daily_earning, 2) if not np.isnan(avg_daily_earning) else None,
290
- 'estimated_monthly_earning': round(avg_monthly_earning, 2) if not np.isnan(avg_monthly_earning) else None,
291
- 'job_distribution': job_distribution.round(2).to_dict(),
292
- 'average_feedback_score': round(avg_feedback, 2) if not np.isnan(avg_feedback) else None,
293
- 'workholic_index': round(workholic_index, 2) if not np.isnan(workholic_index) else None,
294
- 'earning_stability': round(earning_stability, 2) if not np.isnan(earning_stability) else None
295
- }
296
- except Exception as e:
297
- logger.error(f"Error generating worker profile: {str(e)}")
298
- raise HTTPException(status_code=500, detail=f"Error generating worker profile: {str(e)}")
299
-
300
- def convert_to_serializable(obj):
301
- if isinstance(obj, np.floating):
302
- return float(obj)
303
- if isinstance(obj, np.integer):
304
- return int(obj)
305
- if isinstance(obj, np.ndarray):
306
- return obj.tolist()
307
- return obj
308
-
309
- logger.info("Request processed successfully")
310
- return json.loads(json.dumps(results, default=convert_to_serializable))
311
-
312
- except Exception as e:
313
- logger.error(f"Error processing request: {str(e)}")
314
- raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
315
-
316
- if __name__ == "__main__":
317
- import uvicorn
318
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
1
  from fastapi import FastAPI, HTTPException
2
+ from fastapi.responses import JSONResponse, FileResponse
3
  import pandas as pd
4
  import numpy as np
5
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
6
+ from sklearn.preprocessing import RobustScaler, LabelEncoder
7
  from sklearn.metrics import accuracy_score
8
+ from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
9
  import matplotlib.pyplot as plt
10
  import json
11
  import base64
12
  from io import BytesIO
13
+ from PIL import Image
14
  import warnings
15
+ import os
16
+ from pydantic import BaseModel
 
 
 
17
 
18
  warnings.filterwarnings("ignore")
19
 
20
  app = FastAPI()
21
 
22
class WorkerInput(BaseModel):
    """Request body for POST /worker_forecast/."""

    # ID of the worker whose history is loaded from the CSV and forecast
    worker_id: int
24
 
25
@app.post("/worker_forecast/")
async def worker_forecast(input_data: WorkerInput):
    """Forecast daily earnings for one worker and build a microfinance profile.

    Pipeline:
      1. Load the worker's rows from the CSV, cap wages at the 98th percentile.
      2. Engineer lag/rolling wage features and calendar features.
      3. Train a RandomForest job/no-job classifier and GradientBoosting wage
         regressors (point estimate tuned via TimeSeriesSplit GridSearchCV,
         plus 2.5%/97.5% quantile models for an uncertainty band).
      4. Forecast the chronologically last 20% of rows, zeroing days the
         classifier predicts as jobless.
      5. Compute wage-weighted MAE/MAPE, render a JPEG plot (inlined as a
         base64 data URI and saved to disk for the companion GET endpoint),
         and summarise the worker's earning profile.

    Returns a JSON body with keys: worker_id, metrics, worker_profile, plot.

    Raises:
        HTTPException 404: worker has no rows in the dataset.
        HTTPException 400: too little (or unusable) data to train the models.
        HTTPException 500: dataset file missing.

    NOTE(review): this handler is `async` but does blocking CPU/file work
    (CSV read, grid search, plotting) — consider `def` or a threadpool so the
    event loop is not starved. TODO confirm deployment constraints.
    """
    worker_id = input_data.worker_id

    # Result skeleton returned to the caller.
    results = {
        'worker_id': worker_id,
        'metrics': {},
        'worker_profile': {},
        'plot': ''
    }

    # Load dataset
    try:
        df = pd.read_csv('extended_worker_dataset_random_reduced.csv')
    except FileNotFoundError:
        raise HTTPException(status_code=500, detail="Dataset file not found")

    # Filter for one worker_id
    df = df[df['worker_id'] == worker_id].copy()
    if df.empty:
        raise HTTPException(status_code=404, detail=f"No data found for worker_id {worker_id}")

    # Preprocessing. errors='coerce' turns unparseable stamps into NaT; drop
    # those rows, otherwise the NaN calendar features crash model fitting.
    df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True, errors='coerce')
    df = df.dropna(subset=['timestamp']).sort_values('timestamp')
    df['has_job'] = (df['job_type'] != "No Job").astype(int)
    if len(df) < 2:
        raise HTTPException(status_code=400, detail="Insufficient data points for training and testing")

    # Wage capping. quantile() is NaN when the worker has no positive wage —
    # fail fast instead of silently training a degenerate regressor.
    wage_cap = df[df['contracted_wage'] > 0]['contracted_wage'].quantile(0.98)
    if np.isnan(wage_cap):
        raise HTTPException(status_code=400, detail="Worker has no positive wage records")
    df['contracted_wage'] = df['contracted_wage'].clip(lower=500, upper=wage_cap)

    # Encode categoricals (per-worker fit is fine: encodings are only used
    # within this request).
    le_job = LabelEncoder()
    df['job_type_encoded'] = le_job.fit_transform(df['job_type'])
    le_labour = LabelEncoder()
    df['labour_category_encoded'] = le_labour.fit_transform(df['labour_category'])

    # Lagged and rolling wage features; shift(1) ensures only past information
    # feeds each prediction.
    df['prev_wage'] = df['contracted_wage'].shift(1).fillna(0)
    df['prev_wage2'] = df['contracted_wage'].shift(2).fillna(0)
    df['prev_wage3'] = df['contracted_wage'].shift(3).fillna(0)
    df['rolling_mean_3'] = df['contracted_wage'].rolling(3, min_periods=1).mean().shift(1).fillna(0)
    df['rolling_std_3'] = df['contracted_wage'].rolling(3, min_periods=1).std().shift(1).fillna(0)
    df['rolling_mean_7'] = df['contracted_wage'].rolling(7, min_periods=1).mean().shift(1).fillna(0)

    # Chronological 80/20 train/test split.
    split_point = int(len(df) * 0.8)
    train_df, test_df = df.iloc[:split_point].copy(), df.iloc[split_point:].copy()

    # Single source of truth for the feature lists (previously duplicated
    # four times, inviting drift).
    raw_scale_cols = ['job_type_encoded', 'years_of_experience', 'prev_wage', 'prev_wage2',
                      'prev_wage3', 'rolling_mean_3', 'rolling_std_3', 'rolling_mean_7',
                      'labour_category_encoded']
    scaled_cols = ['job_type_scaled', 'years_exp_scaled', 'prev_wage_scaled', 'prev_wage2_scaled',
                   'prev_wage3_scaled', 'rolling_mean_3_scaled', 'rolling_std_3_scaled',
                   'rolling_mean_7_scaled', 'labour_category_scaled']
    class_features = ['dayofweek', 'month', 'year', 'dayofyear', 'is_weekend',
                      'job_type_encoded', 'feedback_score', 'years_of_experience']
    reg_features = ['dayofweek', 'month', 'year', 'dayofyear', 'is_weekend', 'job_type_scaled',
                    'feedback_score', 'years_exp_scaled', 'job_exp_interaction', 'prev_wage_scaled',
                    'prev_wage2_scaled', 'prev_wage3_scaled', 'rolling_mean_3_scaled',
                    'rolling_std_3_scaled', 'rolling_mean_7_scaled', 'labour_category_scaled']

    # Scaling (RobustScaler: resilient to wage outliers).
    scaler = RobustScaler()
    train_df[scaled_cols] = scaler.fit_transform(train_df[raw_scale_cols])
    train_df['job_exp_interaction'] = train_df['job_type_scaled'] * train_df['years_exp_scaled']

    # Calendar features.
    for subset in [train_df, test_df]:
        subset['dayofweek'] = subset['timestamp'].dt.dayofweek
        subset['month'] = subset['timestamp'].dt.month
        subset['year'] = subset['timestamp'].dt.year
        subset['dayofyear'] = subset['timestamp'].dt.dayofyear
        subset['is_weekend'] = subset['dayofweek'].isin([5, 6]).astype(int)

    # Job / no-job classifier.
    classifier = RandomForestClassifier(n_estimators=500, max_depth=12, min_samples_split=5,
                                        random_state=42)
    classifier.fit(train_df[class_features], train_df['has_job'])

    # Wage regression uses working days only. TimeSeriesSplit(n_splits=k)
    # needs at least k+1 samples, so guard instead of letting GridSearchCV
    # blow up with an opaque 500.
    train_df_reg = train_df[train_df['has_job'] == 1].copy()
    if len(train_df_reg) < 2:
        raise HTTPException(status_code=400, detail="Not enough working days to train the wage model")
    X_train_reg = train_df_reg[reg_features]
    y_train_reg = train_df_reg['contracted_wage']

    # Hyperparameter tuning (point-estimate model). n_splits adapts down for
    # short histories while keeping the original 5-fold behaviour otherwise.
    tscv = TimeSeriesSplit(n_splits=min(5, len(train_df_reg) - 1))
    param_grid = {
        'n_estimators': [200, 300, 400],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4, 5],
        'min_samples_split': [3, 4],
        'min_samples_leaf': [2, 3]
    }
    grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42),
                               param_grid, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train_reg, y_train_reg)
    best_reg = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Quantile regressors for the 95% uncertainty band, reusing the tuned
    # hyperparameters.
    reg_lower = GradientBoostingRegressor(loss='quantile', alpha=0.025, **best_params, random_state=42)
    reg_upper = GradientBoostingRegressor(loss='quantile', alpha=0.975, **best_params, random_state=42)
    reg_lower.fit(X_train_reg, y_train_reg)
    reg_upper.fit(X_train_reg, y_train_reg)

    # Build the forecasting frame from the held-out rows (row-for-row aligned
    # with test_df).
    future_df = test_df[['timestamp', 'job_type', 'job_type_encoded', 'feedback_score',
                         'years_of_experience', 'prev_wage', 'prev_wage2', 'prev_wage3',
                         'rolling_mean_3', 'rolling_std_3', 'rolling_mean_7',
                         'labour_category_encoded']].rename(columns={'timestamp': 'ds'})
    future_df['dayofweek'] = future_df['ds'].dt.dayofweek
    future_df['month'] = future_df['ds'].dt.month
    future_df['year'] = future_df['ds'].dt.year
    future_df['dayofyear'] = future_df['ds'].dt.dayofyear
    future_df['is_weekend'] = future_df['dayofweek'].isin([5, 6]).astype(int)
    future_df[scaled_cols] = scaler.transform(future_df[raw_scale_cols])
    future_df['job_exp_interaction'] = future_df['job_type_scaled'] * future_df['years_exp_scaled']

    # Predictions.
    future_df['has_job_predicted'] = classifier.predict(future_df[class_features])
    future_df['yhat'] = best_reg.predict(future_df[reg_features])
    future_df['yhat_lower'] = reg_lower.predict(future_df[reg_features])
    future_df['yhat_upper'] = reg_upper.predict(future_df[reg_features])

    # Zero out days predicted jobless; cap the point forecast; clamp the
    # lower band at zero.
    final_forecast_df = future_df.copy()
    no_job = final_forecast_df['has_job_predicted'] == 0
    final_forecast_df['yhat'] = np.where(no_job, 0, final_forecast_df['yhat'])
    final_forecast_df['yhat'] = np.minimum(final_forecast_df['yhat'], wage_cap)
    final_forecast_df['yhat_lower'] = np.where(no_job, 0, final_forecast_df['yhat_lower'])
    final_forecast_df['yhat_upper'] = np.where(no_job, 0, final_forecast_df['yhat_upper'])
    final_forecast_df['yhat_lower'] = np.maximum(final_forecast_df['yhat_lower'], 0)

    # Evaluation. future_df was built row-for-row from test_df, so attach the
    # actuals positionally instead of merging on timestamp — a timestamp merge
    # duplicates rows (and broke index re-alignment) whenever a worker has two
    # records on the same day.
    comparison_df = final_forecast_df[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
    comparison_df['y'] = test_df['contracted_wage'].to_numpy()
    valid_comparison_df = comparison_df[comparison_df['y'] > 0]

    if not valid_comparison_df.empty:
        valid_y = valid_comparison_df['y'].to_numpy()
        valid_yhat = valid_comparison_df['yhat'].to_numpy()
        # Weight errors by relative wage so high-earning days dominate.
        weights = valid_y / valid_y.mean()
        mae = float(np.average(np.abs(valid_y - valid_yhat), weights=weights))
        mape = float(np.average(np.abs((valid_y - valid_yhat) / valid_y) * 100, weights=weights))
    else:
        mae, mape = np.nan, np.nan

    results['metrics']['mae'] = round(mae, 2) if not np.isnan(mae) else None
    results['metrics']['mape'] = round(mape, 2) if not np.isnan(mape) else None

    # Plot results. try/finally guarantees the figure is released even if a
    # plotting call raises (matplotlib figures otherwise accumulate).
    plt.figure(figsize=(12, 6))
    try:
        plt.plot(final_forecast_df['ds'], final_forecast_df['yhat'], '-', label='Forecasted', color='blue')
        plt.fill_between(final_forecast_df['ds'], final_forecast_df['yhat_lower'],
                         final_forecast_df['yhat_upper'], color='gray', alpha=0.2, label='Uncertainty')
        plt.title('Forecasted Daily Earnings (Last 20%)')
        plt.xlabel('Date')
        plt.ylabel('Contracted Wage')
        plt.legend()
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()

        # Render to PNG first, then re-encode as JPEG for a smaller payload.
        buf_png = BytesIO()
        plt.savefig(buf_png, format="png", dpi=80, bbox_inches="tight")
    finally:
        plt.close()
    buf_png.seek(0)
    img = Image.open(buf_png).convert("RGB")
    buf_jpg = BytesIO()
    img.save(buf_jpg, format="JPEG", quality=70, optimize=True)
    jpg_bytes = buf_jpg.getvalue()
    results['plot'] = f"data:image/jpeg;base64,{base64.b64encode(jpg_bytes).decode('utf-8')}"

    # Persist the JPEG for GET /worker_forecast/plot/{worker_id}. Write the
    # raw bytes directly (previously the code base64-decoded its own
    # base64-encoding — a needless round trip).
    plot_filename = f"worker_{worker_id}_forecast.jpg"
    with open(plot_filename, "wb") as f:
        f.write(jpg_bytes)

    # Worker profile for microfinance scoring.
    worker_data = df.copy()
    avg_daily = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].mean()
    avg_monthly = avg_daily * 30 if not np.isnan(avg_daily) else 0
    job_dist = worker_data['job_type'].value_counts(normalize=True) * 100
    avg_feedback = worker_data['feedback_score'].mean()
    # Share of days with any job at all.
    work_index = job_dist.drop(labels=['No Job'], errors='ignore').sum() / 100
    # Coefficient of variation of positive wages (lower = steadier income).
    earn_stability = (worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].std() / avg_daily
                      if avg_daily > 0 else np.nan)

    results['worker_profile'] = {
        'average_daily_earning': round(avg_daily, 2) if not np.isnan(avg_daily) else None,
        'estimated_monthly_earning': round(avg_monthly, 2) if not np.isnan(avg_monthly) else None,
        'job_distribution': job_dist.round(2).to_dict(),
        'average_feedback_score': round(avg_feedback, 2) if not np.isnan(avg_feedback) else None,
        'work_index': round(work_index, 2) if not np.isnan(work_index) else None,
        'earning_stability': round(earn_stability, 2) if not np.isnan(earn_stability) else None
    }

    def convert_to_serializable(obj):
        """Coerce numpy scalars/arrays to plain Python for json.dumps."""
        if isinstance(obj, (np.floating, np.float32, np.float64)):
            return float(obj)
        if isinstance(obj, (np.integer, np.int32, np.int64)):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return obj

    # Round-trip through json to flush every numpy type out of the payload.
    return JSONResponse(content=json.loads(json.dumps(results, default=convert_to_serializable)))
247
+
248
@app.get("/worker_forecast/plot/{worker_id}")
async def get_forecast_plot(worker_id: int):
    """Serve the saved forecast plot for *worker_id* as a JPEG download.

    The file is written by POST /worker_forecast/; a 404 is returned when no
    forecast has been generated for this worker yet.
    """
    plot_filename = f"worker_{worker_id}_forecast.jpg"
    # Guard clause: nothing to serve if the forecast was never generated.
    if not os.path.exists(plot_filename):
        raise HTTPException(status_code=404, detail=f"Plot for worker_id {worker_id} not found")
    return FileResponse(plot_filename, media_type="image/jpeg",
                        filename=f"worker_{worker_id}_forecast.jpg")