agentsay committed
Commit 19dcae0 · verified · 1 parent: 85393ff

Update modelLoanAPI.py

Files changed (1): modelLoanAPI.py (+298 -30)
modelLoanAPI.py CHANGED
@@ -1,50 +1,318 @@
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  import pandas as pd
- import traceback
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ import json
+ import base64
+ from io import BytesIO
+ import warnings
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Suppress library warnings in the server logs
+ warnings.filterwarnings("ignore")
 
  app = FastAPI()
 
- # Request model
  class WorkerIdRequest(BaseModel):
      worker_id: int
 
- # Load dataset once on startup
- CSV_PATH = "/app/extended_worker_dataset.csv"  # adjust if you use /app/data
-
- try:
-     df = pd.read_csv(CSV_PATH)
- except Exception as e:
-     print("==== ERROR LOADING CSV ====")
-     print(str(e))
-     traceback.print_exc()
-     raise RuntimeError(f"Failed to load dataset from {CSV_PATH}: {str(e)}")
-
  @app.post("/predict_worker_earnings/")
  async def predict_worker_earnings(request: WorkerIdRequest):
      try:
          worker_id = request.worker_id
+         logger.info(f"Processing request for worker_id: {worker_id}")
 
-         # Ensure worker exists
-         if worker_id not in df['worker_id'].values:
-             raise HTTPException(status_code=404, detail=f"Worker ID {worker_id} not found")
-
-         # Dummy earnings calculation (replace with ML model later)
-         worker_data = df[df['worker_id'] == worker_id].iloc[0]
-         base_salary = worker_data.get("base_salary", 10000)
-         experience_years = worker_data.get("experience_years", 1)
-         predicted_earnings = base_salary + (experience_years * 500)
-
-         return {
-             "worker_id": worker_id,
-             "predicted_earnings": predicted_earnings
-         }
+         # Initialize result dictionary
+         results = {
+             'worker_id': worker_id,
+             'classification_metrics': {},
+             'worker_profile': {},
+             'plot': ''
+         }
+
+         # Load data
+         try:
+             df = pd.read_csv('/app/extended_worker_dataset.csv')
+             logger.info(f"CSV loaded successfully. Columns: {list(df.columns)}")
+         except FileNotFoundError:
+             logger.error("CSV file not found at /app/extended_worker_dataset.csv")
+             raise HTTPException(status_code=500, detail="CSV file not found at /app/extended_worker_dataset.csv")
+         except Exception as e:
+             logger.error(f"Error reading CSV file: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error reading CSV file: {str(e)}")
+
+         # Verify expected columns
+         expected_columns = ['worker_id', 'state', 'labour_category', 'contracted_wage', 'age', 'gender',
+                             'migration_status', 'years_of_experience', 'feedback_score', 'job_type', 'timestamp']
+         if not all(col in df.columns for col in expected_columns):
+             missing_cols = [col for col in expected_columns if col not in df.columns]
+             logger.error(f"Missing columns in CSV: {missing_cols}")
+             raise HTTPException(status_code=500, detail=f"Missing columns in CSV: {missing_cols}")
+
+         # Filter for one worker_id
+         df = df[df['worker_id'] == worker_id].copy()
+         if df.empty:
+             logger.warning(f"No data found for worker_id {worker_id}")
+             raise HTTPException(status_code=404, detail=f"No data found for worker_id {worker_id}")
+
+         logger.info(f"Filtered data for worker_id {worker_id}: {len(df)} rows")
+
+         # Data preprocessing
+         try:
+             df['timestamp'] = pd.to_datetime(df['timestamp'])
+         except Exception as e:
+             logger.error(f"Error converting timestamp: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error converting timestamp: {str(e)}")
+
+         df['has_job'] = (df['job_type'] != "No Job").astype(int)
+
+         try:
+             # Cap wages at the 90th percentile of positive wages to damp outliers
+             wage_cap = df[df['contracted_wage'] > 0]['contracted_wage'].quantile(0.90)
+             if np.isnan(wage_cap) or wage_cap <= 500:
+                 logger.error("Invalid wage cap calculated")
+                 raise ValueError("Invalid wage cap calculated")
+             df['contracted_wage'] = df['contracted_wage'].clip(lower=500, upper=wage_cap)
+         except Exception as e:
+             logger.error(f"Error processing wage data: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error processing wage data: {str(e)}")
+
+         # Encode job_type
+         le = LabelEncoder()
+         try:
+             df['job_type_encoded'] = le.fit_transform(df['job_type'])
+         except Exception as e:
+             logger.error(f"Error encoding job_type: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error encoding job_type: {str(e)}")
+
+         # Split data: 80/20 by row order, so the last 20% serves as the test window
+         if len(df) < 2:
+             logger.warning("Insufficient data points for training and testing")
+             raise HTTPException(status_code=400, detail="Insufficient data points for training and testing")
+         split_point = int(len(df) * 0.8)
+         train_df = df.iloc[:split_point].copy()
+         test_df = df.iloc[split_point:].copy()
+
+         # Scale features
+         scaler = StandardScaler()
+         try:
+             train_df[['job_type_scaled', 'years_exp_scaled']] = scaler.fit_transform(
+                 train_df[['job_type_encoded', 'years_of_experience']]
+             )
+             train_df['job_exp_interaction'] = train_df['job_type_scaled'] * train_df['years_exp_scaled']
+         except Exception as e:
+             logger.error(f"Error scaling features: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error scaling features: {str(e)}")
+
+         # Calendar features for both splits
+         for subset in [train_df, test_df]:
+             subset['dayofweek'] = subset['timestamp'].dt.dayofweek
+             subset['month'] = subset['timestamp'].dt.month
+             subset['year'] = subset['timestamp'].dt.year
+             subset['dayofyear'] = subset['timestamp'].dt.dayofyear
+             subset['is_weekend'] = subset['dayofweek'].isin([5, 6]).astype(int)
+
+         # Train classifier: does the worker have a job on a given day?
+         X_train_class = train_df[['dayofweek', 'month', 'year', 'dayofyear',
+                                   'is_weekend', 'job_type_encoded', 'feedback_score',
+                                   'years_of_experience']]
+         y_train_class = train_df['has_job']
+
+         try:
+             classifier = RandomForestClassifier(
+                 n_estimators=500, max_depth=12, min_samples_split=5, random_state=42
+             )
+             classifier.fit(X_train_class, y_train_class)
+         except Exception as e:
+             logger.error(f"Error training classifier: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error training classifier: {str(e)}")
+
+         # Train regressor: what is the wage on days with a job?
+         train_df_reg = train_df[train_df['has_job'] == 1].copy()
+         if train_df_reg.empty:
+             logger.warning("No data available for regression (all has_job == 0)")
+             raise HTTPException(status_code=404, detail="No data available for regression (all has_job == 0)")
+
+         X_train_reg = train_df_reg[['dayofweek', 'month', 'year', 'dayofyear',
+                                     'is_weekend', 'job_type_scaled', 'feedback_score',
+                                     'years_exp_scaled', 'job_exp_interaction']]
+         y_train_reg = train_df_reg['contracted_wage']
+
+         try:
+             regressor = RandomForestRegressor(
+                 n_estimators=300, max_depth=10, min_samples_split=4, random_state=42
+             )
+             regressor.fit(X_train_reg, y_train_reg)
+         except Exception as e:
+             logger.error(f"Error training regressor: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error training regressor: {str(e)}")
+
+         # Prepare future dataframe from the test window
+         future_df = test_df[['timestamp', 'job_type', 'job_type_encoded',
+                              'feedback_score', 'years_of_experience']].rename(columns={'timestamp': 'ds'})
+
+         future_df['dayofweek'] = future_df['ds'].dt.dayofweek
+         future_df['month'] = future_df['ds'].dt.month
+         future_df['year'] = future_df['ds'].dt.year
+         future_df['dayofyear'] = future_df['ds'].dt.dayofyear
+         future_df['is_weekend'] = future_df['dayofweek'].isin([5, 6]).astype(int)
+
+         try:
+             future_df[['job_type_scaled', 'years_exp_scaled']] = scaler.transform(
+                 future_df[['job_type_encoded', 'years_of_experience']]
+             )
+             future_df['job_exp_interaction'] = future_df['job_type_scaled'] * future_df['years_exp_scaled']
+         except Exception as e:
+             logger.error(f"Error transforming future dataframe: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error transforming future dataframe: {str(e)}")
+
+         # Predict job/no-job
+         try:
+             future_df['has_job_predicted'] = classifier.predict(
+                 future_df[['dayofweek', 'month', 'year', 'dayofyear',
+                            'is_weekend', 'job_type_encoded', 'feedback_score',
+                            'years_of_experience']]
+             )
+         except Exception as e:
+             logger.error(f"Error predicting has_job: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error predicting has_job: {str(e)}")
+
+         # Evaluate classifier accuracy
+         test_df['has_job'] = (test_df['job_type'] != "No Job").astype(int)
+         try:
+             acc = accuracy_score(test_df['has_job'], future_df['has_job_predicted'])
+             results['classification_metrics']['accuracy'] = round(acc * 100, 2)
+         except Exception as e:
+             logger.error(f"Error calculating accuracy: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error calculating accuracy: {str(e)}")
+
+         # Predict wages
+         try:
+             future_df['yhat'] = regressor.predict(
+                 future_df[['dayofweek', 'month', 'year', 'dayofyear',
+                            'is_weekend', 'job_type_scaled', 'feedback_score',
+                            'years_exp_scaled', 'job_exp_interaction']]
+             )
+         except Exception as e:
+             logger.error(f"Error predicting wages: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error predicting wages: {str(e)}")
+
+         # Apply job prediction mask: a day with no predicted job earns zero
+         final_forecast_df = future_df.copy()
+         final_forecast_df['yhat'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, final_forecast_df['yhat'])
+         final_forecast_df['yhat'] = np.minimum(final_forecast_df['yhat'], wage_cap)
+
+         # Uncertainty intervals from the spread of per-tree predictions on the training set
+         try:
+             std_dev = np.std([tree.predict(X_train_reg) for tree in regressor.estimators_], axis=0)
+             future_df['yhat_lower'] = np.maximum(final_forecast_df['yhat'] - 1.96 * std_dev.mean(), 0)
+             future_df['yhat_upper'] = final_forecast_df['yhat'] + 1.96 * std_dev.mean()
+             final_forecast_df['yhat_lower'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_lower'])
+             final_forecast_df['yhat_upper'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_upper'])
+         except Exception as e:
+             logger.error(f"Error calculating uncertainty intervals: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error calculating uncertainty intervals: {str(e)}")
+
+         # Evaluation: wage-weighted MAE/MAPE over days with positive actual wages
+         try:
+             comparison_df = pd.merge(
+                 test_df[['timestamp', 'contracted_wage']].rename(columns={'timestamp': 'ds', 'contracted_wage': 'y'}),
+                 final_forecast_df[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds'
+             )
+
+             valid_comparison_df = comparison_df[comparison_df['y'] > 0]
+             if not valid_comparison_df.empty:
+                 weights = valid_comparison_df['y'] / valid_comparison_df['y'].mean()
+                 mae = np.average([abs(a - p) for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
+                 mape = np.average([abs((a - p) / a) * 100 for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
+             else:
+                 mae = np.nan
+                 mape = np.nan
+
+             results['classification_metrics']['mae'] = round(mae, 2) if not np.isnan(mae) else None
+             results['classification_metrics']['mape'] = round(mape, 2) if not np.isnan(mape) else None
+         except Exception as e:
+             logger.error(f"Error evaluating predictions: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error evaluating predictions: {str(e)}")
+
+         # Plot results and return the figure as a base64-encoded PNG
+         try:
+             plt.figure(figsize=(12, 6))
+             plt.plot(comparison_df['ds'], comparison_df['y'], 'o-', label='Actual Values', markersize=4)
+             plt.plot(comparison_df['ds'], comparison_df['yhat'], '-', label='Forecasted Values')
+             plt.fill_between(comparison_df['ds'], comparison_df['yhat_lower'], comparison_df['yhat_upper'],
+                              color='gray', alpha=0.2, label='Uncertainty Interval')
+             plt.title('Actual vs. Forecasted Daily Earnings (Last 20% of Dataset)')
+             plt.xlabel('Date')
+             plt.ylabel('Contracted Wage')
+             plt.legend()
+             plt.grid(True)
+             plt.xticks(rotation=45)
+             plt.tight_layout()
+
+             buffer = BytesIO()
+             plt.savefig(buffer, format='png')
+             buffer.seek(0)
+             plot_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+             results['plot'] = f'data:image/png;base64,{plot_base64}'
+             plt.close()
+         except Exception as e:
+             logger.error(f"Error generating plot: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error generating plot: {str(e)}")
+
+         # Worker profile for microfinance
+         try:
+             worker_data = df.copy()
+
+             avg_daily_earning = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].mean()
+             avg_monthly_earning = avg_daily_earning * 30 if not np.isnan(avg_daily_earning) else 0
+
+             job_distribution = worker_data['job_type'].value_counts(normalize=True) * 100
+
+             avg_feedback = worker_data['feedback_score'].mean()
+
+             # Fraction of recorded days on which the worker held any job
+             workholic_index = job_distribution.drop(labels=['No Job'], errors='ignore').sum() / 100
+
+             # Coefficient of variation of positive wages (lower means steadier earnings)
+             if avg_daily_earning > 0:
+                 earning_stability = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].std() / avg_daily_earning
+             else:
+                 earning_stability = np.nan
+
+             results['worker_profile'] = {
+                 'average_daily_earning': round(avg_daily_earning, 2) if not np.isnan(avg_daily_earning) else None,
+                 'estimated_monthly_earning': round(avg_monthly_earning, 2) if not np.isnan(avg_monthly_earning) else None,
+                 'job_distribution': job_distribution.round(2).to_dict(),
+                 'average_feedback_score': round(avg_feedback, 2) if not np.isnan(avg_feedback) else None,
+                 'workholic_index': round(workholic_index, 2) if not np.isnan(workholic_index) else None,
+                 'earning_stability': round(earning_stability, 2) if not np.isnan(earning_stability) else None
+             }
+         except Exception as e:
+             logger.error(f"Error generating worker profile: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error generating worker profile: {str(e)}")
+
+         # Convert NumPy scalars and arrays to plain Python types for JSON serialization
+         def convert_to_serializable(obj):
+             if isinstance(obj, np.floating):
+                 return float(obj)
+             if isinstance(obj, np.integer):
+                 return int(obj)
+             if isinstance(obj, np.ndarray):
+                 return obj.tolist()
+             return obj
+
+         logger.info("Request processed successfully")
+         return json.loads(json.dumps(results, default=convert_to_serializable))
 
      except HTTPException:
          raise  # let FastAPI handle cleanly
 
      except Exception as e:
-         print("==== SERVER ERROR ====")
-         print(str(e))
-         traceback.print_exc()
+         logger.error(f"Error processing request: {str(e)}")
          raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
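A sketch of how a client might call the endpoint and save the returned plot, assuming the service runs locally on port 8000 as in the `__main__` block above; the worker id and output filename are placeholders:

```python
import base64
import requests  # third-party HTTP client

resp = requests.post(
    "http://localhost:8000/predict_worker_earnings/",
    json={"worker_id": 101},  # hypothetical worker id
    timeout=120,              # both models are trained per request, so allow time
)
resp.raise_for_status()
body = resp.json()

print(body["classification_metrics"])  # accuracy, mae, mape
print(body["worker_profile"])          # earnings profile for microfinance

# The plot arrives as a data URI; strip the prefix and decode the PNG
_, b64_png = body["plot"].split(",", 1)
with open("forecast.png", "wb") as f:
    f.write(base64.b64decode(b64_png))
```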
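The uncertainty interval in the endpoint comes from the disagreement among the individual trees of the random forest: each tree is a separate estimate, and the standard deviation of their predictions approximates predictive uncertainty. A standalone sketch of the idea on synthetic data (the endpoint averages this spread over the training set into a single interval width; the per-point variant below is the more common form):

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))  # synthetic features
y = X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.3, size=200)

forest = RandomForestRegressor(n_estimators=300, random_state=42).fit(X, y)

X_new = rng.normal(size=(5, 3))
point = forest.predict(X_new)  # ensemble mean prediction

# Stack each tree's prediction and measure their spread per point
per_tree = np.stack([tree.predict(X_new) for tree in forest.estimators_])
std = per_tree.std(axis=0)
lower, upper = point - 1.96 * std, point + 1.96 * std
print(np.c_[lower, point, upper])
```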
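One note on the handler order at the bottom of the endpoint: FastAPI's `HTTPException` is itself a subclass of `Exception`, so a bare `except Exception` without a preceding `except HTTPException: raise` would swallow the deliberate 404/400 responses raised earlier and re-surface them as generic 500s. A minimal illustration of the pattern (the route and check are hypothetical):

```python
from fastapi import FastAPI, HTTPException

app = FastAPI()

@app.get("/items/{item_id}")
async def read_item(item_id: int):
    try:
        if item_id == 0:
            # A deliberate, client-facing error
            raise HTTPException(status_code=404, detail="Item not found")
        return {"item_id": item_id}
    except HTTPException:
        raise  # let FastAPI render the intended status code
    except Exception as e:
        # Only truly unexpected failures become 500s
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
```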