agentsay committed on
Commit
25488f2
·
verified ·
1 Parent(s): 307aeff

Update modelLoanAPI.py

Browse files
Files changed (1) hide show
  1. modelLoanAPI.py +158 -97
modelLoanAPI.py CHANGED
@@ -1,4 +1,6 @@
1
- from fastapi import FastAPI, HTTPException, Query
 
 
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
@@ -14,9 +16,13 @@ warnings.filterwarnings("ignore")
14
 
15
  app = FastAPI()
16
 
 
 
 
17
  @app.post("/predict_worker_earnings/")
18
- async def predict_worker_earnings(worker_id: int = Query(...)):
19
  try:
 
20
  # Initialize result dictionary
21
  results = {
22
  'worker_id': worker_id,
@@ -26,7 +32,12 @@ async def predict_worker_earnings(worker_id: int = Query(...)):
26
  }
27
 
28
  # Load data
29
- df = pd.read_csv('/app/data/extended_worker_dataset.csv')
 
 
 
 
 
30
 
31
  # Filter for one worker_id
32
  df = df[df['worker_id'] == worker_id].copy()
@@ -34,31 +45,48 @@ async def predict_worker_earnings(worker_id: int = Query(...)):
34
  raise HTTPException(status_code=404, detail=f"No data found for worker_id {worker_id}")
35
 
36
  # Data preprocessing
37
- df['timestamp'] = pd.to_datetime(df['timestamp'])
 
 
 
 
38
  df['has_job'] = (df['job_type'] != "No Job").astype(int)
39
 
40
- wage_cap = df[df['contracted_wage'] > 0]['contracted_wage'].quantile(0.90)
41
- df['contracted_wage'] = df['contracted_wage'].clip(lower=500, upper=wage_cap)
 
 
 
 
 
42
 
43
  # Encode job_type
44
  le = LabelEncoder()
45
- df['job_type_encoded'] = le.fit_transform(df['job_type'])
46
-
 
 
 
 
 
 
47
  split_point = int(len(df) * 0.8)
48
  train_df = df.iloc[:split_point].copy()
49
  test_df = df.iloc[split_point:].copy()
50
 
51
  # Scale features
52
  scaler = StandardScaler()
53
- train_df[['job_type_scaled', 'years_exp_scaled']] = scaler.fit_transform(
54
- train_df[['job_type_encoded', 'years_of_experience']]
55
- )
56
- train_df['job_exp_interaction'] = train_df['job_type_scaled'] * train_df['years_exp_scaled']
 
 
 
57
 
58
  for subset in [train_df, test_df]:
59
  subset['dayofweek'] = subset['timestamp'].dt.dayofweek
60
  subset['month'] = subset['timestamp'].dt.month
61
-
62
  subset['year'] = subset['timestamp'].dt.year
63
  subset['dayofyear'] = subset['timestamp'].dt.dayofyear
64
  subset['is_weekend'] = subset['dayofweek'].isin([5, 6]).astype(int)
@@ -69,22 +97,31 @@ async def predict_worker_earnings(worker_id: int = Query(...)):
69
  'years_of_experience']]
70
  y_train_class = train_df['has_job']
71
 
72
- classifier = RandomForestClassifier(
73
- n_estimators=500, max_depth=12, min_samples_split=5, random_state=42
74
- )
75
- classifier.fit(X_train_class, y_train_class)
 
 
 
76
 
77
  # Train regressor
78
  train_df_reg = train_df[train_df['has_job'] == 1].copy()
 
 
 
79
  X_train_reg = train_df_reg[['dayofweek', 'month', 'year', 'dayofyear',
80
  'is_weekend', 'job_type_scaled', 'feedback_score',
81
  'years_exp_scaled', 'job_exp_interaction']]
82
  y_train_reg = train_df_reg['contracted_wage']
83
 
84
- regressor = RandomForestRegressor(
85
- n_estimators=300, max_depth=10, min_samples_split=4, random_state=42
86
- )
87
- regressor.fit(X_train_reg, y_train_reg)
 
 
 
88
 
89
  # Prepare future dataframe
90
  future_df = test_df[['timestamp', 'job_type', 'job_type_encoded',
@@ -96,29 +133,41 @@ async def predict_worker_earnings(worker_id: int = Query(...)):
96
  future_df['dayofyear'] = future_df['ds'].dt.dayofyear
97
  future_df['is_weekend'] = future_df['dayofweek'].isin([5, 6]).astype(int)
98
 
99
- future_df[['job_type_scaled', 'years_exp_scaled']] = scaler.transform(
100
- future_df[['job_type_encoded', 'years_of_experience']]
101
- )
102
- future_df['job_exp_interaction'] = future_df['job_type_scaled'] * future_df['years_exp_scaled']
 
 
 
103
 
104
  # Predict job/no-job
105
- future_df['has_job_predicted'] = classifier.predict(
106
- future_df[['dayofweek', 'month', 'year', 'dayofyear',
107
- 'is_weekend', 'job_type_encoded', 'feedback_score',
108
- 'years_of_experience']]
109
- )
 
 
 
110
 
111
  # Evaluate classifier accuracy
112
  test_df['has_job'] = (test_df['job_type'] != "No Job").astype(int)
113
- acc = accuracy_score(test_df['has_job'], future_df['has_job_predicted'])
114
- results['classification_metrics']['accuracy'] = round(acc * 100, 2)
 
 
 
115
 
116
  # Predict wages
117
- future_df['yhat'] = regressor.predict(
118
- future_df[['dayofweek', 'month', 'year', 'dayofyear',
119
- 'is_weekend', 'job_type_scaled', 'feedback_score',
120
- 'years_exp_scaled', 'job_exp_interaction']]
121
- )
 
 
 
122
 
123
  # Apply job prediction mask
124
  final_forecast_df = future_df.copy()
@@ -126,77 +175,89 @@ async def predict_worker_earnings(worker_id: int = Query(...)):
126
  final_forecast_df['yhat'] = np.minimum(final_forecast_df['yhat'], wage_cap)
127
 
128
  # Uncertainty intervals
129
- predictions = regressor.predict(X_train_reg)
130
- std_dev = np.std([tree.predict(X_train_reg) for tree in regressor.estimators_], axis=0)
131
- future_df['yhat_lower'] = np.maximum(final_forecast_df['yhat'] - 1.96 * std_dev.mean(), 0)
132
- future_df['yhat_upper'] = final_forecast_df['yhat'] + 1.96 * std_dev.mean()
133
- final_forecast_df['yhat_lower'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_lower'])
134
- final_forecast_df['yhat_upper'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_upper'])
 
 
 
135
 
136
  # Evaluation
137
- comparison_df = pd.merge(
138
- test_df[['timestamp', 'contracted_wage']].rename(columns={'timestamp': 'ds', 'contracted_wage': 'y'}),
139
- final_forecast_df[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds'
140
- )
141
-
142
- valid_comparison_df = comparison_df[comparison_df['y'] > 0]
143
- if not valid_comparison_df.empty:
144
- weights = valid_comparison_df['y'] / valid_comparison_df['y'].mean()
145
- mae = np.average([abs(a - p) for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
146
- mape = np.average([abs((a - p) / a) * 100 for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
147
- else:
148
- mae = np.nan
149
- mape = np.nan
150
-
151
- results['classification_metrics']['mae'] = round(mae, 2) if not np.isnan(mae) else None
152
- results['classification_metrics']['mape'] = round(mape, 2) if not np.isnan(mape) else None
 
 
 
153
 
154
  # Plot results
155
- plt.figure(figsize=(12, 6))
156
- plt.plot(comparison_df['ds'], comparison_df['y'], 'o-', label='Actual Values', markersize=4)
157
- plt.plot(comparison_df['ds'], comparison_df['yhat'], '-', label='Forecasted Values')
158
- plt.fill_between(comparison_df['ds'], comparison_df['yhat_lower'], comparison_df['yhat_upper'],
159
- color='gray', alpha=0.2, label='Uncertainty Interval')
160
- plt.title('Actual vs. Forecasted Daily Earnings (Last 20% of Dataset)')
161
- plt.xlabel('Date')
162
- plt.ylabel('Contracted Wage')
163
- plt.legend()
164
- plt.grid(True)
165
- plt.xticks(rotation=45)
166
- plt.tight_layout()
167
-
168
- buffer = BytesIO()
169
- plt.savefig(buffer, format='png')
170
- buffer.seek(0)
171
- plot_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
172
- results['plot'] = f'data:image/png;base64,{plot_base64}'
173
- plt.close()
 
 
 
174
 
175
  # Worker Profile for Microfinance
176
- worker_data = df.copy()
 
177
 
178
- avg_daily_earning = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].mean()
179
- avg_monthly_earning = avg_daily_earning * 30 if not np.isnan(avg_daily_earning) else 0
180
 
181
- job_distribution = worker_data['job_type'].value_counts(normalize=True) * 100
182
 
183
- avg_feedback = worker_data['feedback_score'].mean()
184
 
185
- workholic_index = job_distribution.drop(labels=['No Job'], errors='ignore').sum() / 100
186
 
187
- if avg_daily_earning > 0:
188
- earning_stability = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].std() / avg_daily_earning
189
- else:
190
- earning_stability = np.nan
191
 
192
- results['worker_profile'] = {
193
- 'average_daily_earning': round(avg_daily_earning, 2) if not np.isnan(avg_daily_earning) else None,
194
- 'estimated_monthly_earning': round(avg_monthly_earning, 2) if not np.isnan(avg_monthly_earning) else None,
195
- 'job_distribution': job_distribution.round(2).to_dict(),
196
- 'average_feedback_score': round(avg_feedback, 2) if not np.isnan(avg_feedback) else None,
197
- 'workholic_index': round(workholic_index, 2) if not np.isnan(workholic_index) else None,
198
- 'earning_stability': round(earning_stability, 2) if not np.isnan(earning_stability) else None
199
- }
 
 
200
 
201
  def convert_to_serializable(obj):
202
  if isinstance(obj, np.floating):
@@ -214,4 +275,4 @@ async def predict_worker_earnings(worker_id: int = Query(...)):
214
 
215
  if __name__ == "__main__":
216
  import uvicorn
217
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ # ```python
2
+ from fastapi import FastAPI, HTTPException
3
+ from pydantic import BaseModel
4
  import pandas as pd
5
  import numpy as np
6
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
16
 
17
  app = FastAPI()
18
 
19
+ class WorkerIdRequest(BaseModel):
20
+ worker_id: int
21
+
22
  @app.post("/predict_worker_earnings/")
23
+ async def predict_worker_earnings(request: WorkerIdRequest):
24
  try:
25
+ worker_id = request.worker_id
26
  # Initialize result dictionary
27
  results = {
28
  'worker_id': worker_id,
 
32
  }
33
 
34
  # Load data
35
+ try:
36
+ df = pd.read_csv('/app/data/extended_worker_dataset.csv')
37
+ except FileNotFoundError:
38
+ raise HTTPException(status_code=500, detail="CSV file not found at /app/data/extended_worker_dataset.csv")
39
+ except Exception as e:
40
+ raise HTTPException(status_code=500, detail=f"Error reading CSV file: {str(e)}")
41
 
42
  # Filter for one worker_id
43
  df = df[df['worker_id'] == worker_id].copy()
 
45
  raise HTTPException(status_code=404, detail=f"No data found for worker_id {worker_id}")
46
 
47
  # Data preprocessing
48
+ try:
49
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
50
+ except Exception as e:
51
+ raise HTTPException(status_code=500, detail=f"Error converting timestamp: {str(e)}")
52
+
53
  df['has_job'] = (df['job_type'] != "No Job").astype(int)
54
 
55
+ try:
56
+ wage_cap = df[df['contracted_wage'] > 0]['contracted_wage'].quantile(0.90)
57
+ if np.isnan(wage_cap) or wage_cap <= 500:
58
+ raise ValueError("Invalid wage cap calculated")
59
+ df['contracted_wage'] = df['contracted_wage'].clip(lower=500, upper=wage_cap)
60
+ except Exception as e:
61
+ raise HTTPException(status_code=500, detail=f"Error processing wage data: {str(e)}")
62
 
63
  # Encode job_type
64
  le = LabelEncoder()
65
+ try:
66
+ df['job_type_encoded'] = le.fit_transform(df['job_type'])
67
+ except Exception as e:
68
+ raise HTTPException(status_code=500, detail=f"Error encoding job_type: {str(e)}")
69
+
70
+ # Split data
71
+ if len(df) < 2:
72
+ raise HTTPException(status_code=400, detail="Insufficient data points for training and testing")
73
  split_point = int(len(df) * 0.8)
74
  train_df = df.iloc[:split_point].copy()
75
  test_df = df.iloc[split_point:].copy()
76
 
77
  # Scale features
78
  scaler = StandardScaler()
79
+ try:
80
+ train_df[['job_type_scaled', 'years_exp_scaled']] = scaler.fit_transform(
81
+ train_df[['job_type_encoded', 'years_of_experience']]
82
+ )
83
+ train_df['job_exp_interaction'] = train_df['job_type_scaled'] * train_df['years_exp_scaled']
84
+ except Exception as e:
85
+ raise HTTPException(status_code=500, detail=f"Error scaling features: {str(e)}")
86
 
87
  for subset in [train_df, test_df]:
88
  subset['dayofweek'] = subset['timestamp'].dt.dayofweek
89
  subset['month'] = subset['timestamp'].dt.month
 
90
  subset['year'] = subset['timestamp'].dt.year
91
  subset['dayofyear'] = subset['timestamp'].dt.dayofyear
92
  subset['is_weekend'] = subset['dayofweek'].isin([5, 6]).astype(int)
 
97
  'years_of_experience']]
98
  y_train_class = train_df['has_job']
99
 
100
+ try:
101
+ classifier = RandomForestClassifier(
102
+ n_estimators=500, max_depth=12, min_samples_split=5, random_state=42
103
+ )
104
+ classifier.fit(X_train_class, y_train_class)
105
+ except Exception as e:
106
+ raise HTTPException(status_code=500, detail=f"Error training classifier: {str(e)}")
107
 
108
  # Train regressor
109
  train_df_reg = train_df[train_df['has_job'] == 1].copy()
110
+ if train_df_reg.empty:
111
+ raise HTTPException(status_code=404, detail="No data available for regression (all has_job == 0)")
112
+
113
  X_train_reg = train_df_reg[['dayofweek', 'month', 'year', 'dayofyear',
114
  'is_weekend', 'job_type_scaled', 'feedback_score',
115
  'years_exp_scaled', 'job_exp_interaction']]
116
  y_train_reg = train_df_reg['contracted_wage']
117
 
118
+ try:
119
+ regressor = RandomForestRegressor(
120
+ n_estimators=300, max_depth=10, min_samples_split=4, random_state=42
121
+ )
122
+ regressor.fit(X_train_reg, y_train_reg)
123
+ except Exception as e:
124
+ raise HTTPException(status_code=500, detail=f"Error training regressor: {str(e)}")
125
 
126
  # Prepare future dataframe
127
  future_df = test_df[['timestamp', 'job_type', 'job_type_encoded',
 
133
  future_df['dayofyear'] = future_df['ds'].dt.dayofyear
134
  future_df['is_weekend'] = future_df['dayofweek'].isin([5, 6]).astype(int)
135
 
136
+ try:
137
+ future_df[['job_type_scaled', 'years_exp_scaled']] = scaler.transform(
138
+ future_df[['job_type_encoded', 'years_of_experience']]
139
+ )
140
+ future_df['job_exp_interaction'] = future_df['job_type_scaled'] * future_df['years_exp_scaled']
141
+ except Exception as e:
142
+ raise HTTPException(status_code=500, detail=f"Error transforming future dataframe: {str(e)}")
143
 
144
  # Predict job/no-job
145
+ try:
146
+ future_df['has_job_predicted'] = classifier.predict(
147
+ future_df[['dayofweek', 'month', 'year', 'dayofyear',
148
+ 'is_weekend', 'job_type_encoded', 'feedback_score',
149
+ 'years_of_experience']]
150
+ )
151
+ except Exception as e:
152
+ raise HTTPException(status_code=500, detail=f"Error predicting has_job: {str(e)}")
153
 
154
  # Evaluate classifier accuracy
155
  test_df['has_job'] = (test_df['job_type'] != "No Job").astype(int)
156
+ try:
157
+ acc = accuracy_score(test_df['has_job'], future_df['has_job_predicted'])
158
+ results['classification_metrics']['accuracy'] = round(acc * 100, 2)
159
+ except Exception as e:
160
+ raise HTTPException(status_code=500, detail=f"Error calculating accuracy: {str(e)}")
161
 
162
  # Predict wages
163
+ try:
164
+ future_df['yhat'] = regressor.predict(
165
+ future_df[['dayofweek', 'month', 'year', 'dayofyear',
166
+ 'is_weekend', 'job_type_scaled', 'feedback_score',
167
+ 'years_exp_scaled', 'job_exp_interaction']]
168
+ )
169
+ except Exception as e:
170
+ raise HTTPException(status_code=500, detail=f"Error predicting wages: {str(e)}")
171
 
172
  # Apply job prediction mask
173
  final_forecast_df = future_df.copy()
 
175
  final_forecast_df['yhat'] = np.minimum(final_forecast_df['yhat'], wage_cap)
176
 
177
  # Uncertainty intervals
178
+ try:
179
+ predictions = regressor.predict(X_train_reg)
180
+ std_dev = np.std([tree.predict(X_train_reg) for tree in regressor.estimators_], axis=0)
181
+ future_df['yhat_lower'] = np.maximum(final_forecast_df['yhat'] - 1.96 * std_dev.mean(), 0)
182
+ future_df['yhat_upper'] = final_forecast_df['yhat'] + 1.96 * std_dev.mean()
183
+ final_forecast_df['yhat_lower'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_lower'])
184
+ final_forecast_df['yhat_upper'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_upper'])
185
+ except Exception as e:
186
+ raise HTTPException(status_code=500, detail=f"Error calculating uncertainty intervals: {str(e)}")
187
 
188
  # Evaluation
189
+ try:
190
+ comparison_df = pd.merge(
191
+ test_df[['timestamp', 'contracted_wage']].rename(columns={'timestamp': 'ds', 'contracted_wage': 'y'}),
192
+ final_forecast_df[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds'
193
+ )
194
+
195
+ valid_comparison_df = comparison_df[comparison_df['y'] > 0]
196
+ if not valid_comparison_df.empty:
197
+ weights = valid_comparison_df['y'] / valid_comparison_df['y'].mean()
198
+ mae = np.average([abs(a - p) for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
199
+ mape = np.average([abs((a - p) / a) * 100 for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
200
+ else:
201
+ mae = np.nan
202
+ mape = np.nan
203
+
204
+ results['classification_metrics']['mae'] = round(mae, 2) if not np.isnan(mae) else None
205
+ results['classification_metrics']['mape'] = round(mape, 2) if not np.isnan(mape) else None
206
+ except Exception as e:
207
+ raise HTTPException(status_code=500, detail=f"Error evaluating predictions: {str(e)}")
208
 
209
  # Plot results
210
+ try:
211
+ plt.figure(figsize=(12, 6))
212
+ plt.plot(comparison_df['ds'], comparison_df['y'], 'o-', label='Actual Values', markersize=4)
213
+ plt.plot(comparison_df['ds'], comparison_df['yhat'], '-', label='Forecasted Values')
214
+ plt.fill_between(comparison_df['ds'], comparison_df['yhat_lower'], comparison_df['yhat_upper'],
215
+ color='gray', alpha=0.2, label='Uncertainty Interval')
216
+ plt.title('Actual vs. Forecasted Daily Earnings (Last 20% of Dataset)')
217
+ plt.xlabel('Date')
218
+ plt.ylabel('Contracted Wage')
219
+ plt.legend()
220
+ plt.grid(True)
221
+ plt.xticks(rotation=45)
222
+ plt.tight_layout()
223
+
224
+ buffer = BytesIO()
225
+ plt.savefig(buffer, format='png')
226
+ buffer.seek(0)
227
+ plot_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
228
+ results['plot'] = f'data:image/png;base64,{plot_base64}'
229
+ plt.close()
230
+ except Exception as e:
231
+ raise HTTPException(status_code=500, detail=f"Error generating plot: {str(e)}")
232
 
233
  # Worker Profile for Microfinance
234
+ try:
235
+ worker_data = df.copy()
236
 
237
+ avg_daily_earning = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].mean()
238
+ avg_monthly_earning = avg_daily_earning * 30 if not np.isnan(avg_daily_earning) else 0
239
 
240
+ job_distribution = worker_data['job_type'].value_counts(normalize=True) * 100
241
 
242
+ avg_feedback = worker_data['feedback_score'].mean()
243
 
244
+ workholic_index = job_distribution.drop(labels=['No Job'], errors='ignore').sum() / 100
245
 
246
+ if avg_daily_earning > 0:
247
+ earning_stability = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].std() / avg_daily_earning
248
+ else:
249
+ earning_stability = np.nan
250
 
251
+ results['worker_profile'] = {
252
+ 'average_daily_earning': round(avg_daily_earning, 2) if not np.isnan(avg_daily_earning) else None,
253
+ 'estimated_monthly_earning': round(avg_monthly_earning, 2) if not np.isnan(avg_monthly_earning) else None,
254
+ 'job_distribution': job_distribution.round(2).to_dict(),
255
+ 'average_feedback_score': round(avg_feedback, 2) if not np.isnan(avg_feedback) else None,
256
+ 'workholic_index': round(workholic_index, 2) if not np.isnan(workholic_index) else None,
257
+ 'earning_stability': round(earning_stability, 2) if not np.isnan(earning_stability) else None
258
+ }
259
+ except Exception as e:
260
+ raise HTTPException(status_code=500, detail=f"Error generating worker profile: {str(e)}")
261
 
262
  def convert_to_serializable(obj):
263
  if isinstance(obj, np.floating):
 
275
 
276
  if __name__ == "__main__":
277
  import uvicorn
278
+ uvicorn.run(app, host="0.0.0.0", port=8000)