agentsay committed on
Commit
f89686b
·
verified ·
1 Parent(s): f3be0c5

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +29 -0
  2. extended_worker_dataset.csv +0 -0
  3. modelLoanAPI.py +220 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.10-slim
3
+
4
+ # Set working directory in the container
5
+ WORKDIR /app
6
+
7
+ # Copy the FastAPI application file
8
+ COPY modelLoanAPI.py /app/modelLoanAPI.py
9
+
10
+ # Copy requirements file (created below)
11
+ COPY requirements.txt /app/requirements.txt
12
+
13
+ COPY requirements.txt /app/extended_worker_dataset.csv
14
+
15
+ # Install system dependencies required for matplotlib and other libraries
16
+ RUN apt-get update && apt-get install -y \
17
+ gcc \
18
+ python3-dev \
19
+ libpq-dev \
20
+ && rm -rf /var/lib/apt/lists/*
21
+
22
+ # Install Python dependencies
23
+ RUN pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Expose the port the app runs on
26
+ EXPOSE 7860
27
+
28
+ # Command to run the FastAPI application
29
+ CMD ["uvicorn", "modelLoanAPI:app", "--host", "0.0.0.0", "--port", "7860"]
extended_worker_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
modelLoanAPI.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from fastapi import FastAPI, HTTPException
3
+ from fastapi.responses import JSONResponse
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
7
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
8
+ from sklearn.metrics import accuracy_score
9
+ import matplotlib.pyplot as plt
10
+ import json
11
+ import base64
12
+ from io import BytesIO
13
+ import warnings
14
+
15
+ warnings.filterwarnings("ignore")
16
+
17
+ app = FastAPI()
18
+
19
@app.post("/predict_worker_earnings/")
async def predict_worker_earnings(worker_id: int):
    """Forecast daily earnings for one worker and build a microfinance profile.

    Pipeline: load the worker's rows from the CSV bundled with the app, train
    a RandomForest classifier (job / no-job) and a RandomForest regressor
    (contracted wage) on the first 80% of rows, forecast the remaining 20%,
    and return classification accuracy plus weighted MAE/MAPE, a
    base64-encoded PNG of actual-vs-forecast earnings, and summary profile
    statistics (average earnings, job mix, feedback, stability).

    Args:
        worker_id: Identifier matched against the CSV's ``worker_id`` column.

    Raises:
        HTTPException: 404 when the CSV has no rows for ``worker_id``;
            500 for any other processing failure.
    """
    try:
        # Result skeleton; filled in as each stage completes.
        results = {
            'worker_id': worker_id,
            'classification_metrics': {},
            'worker_profile': {},
            'plot': ''
        }

        # Load the dataset shipped next to the app (the Dockerfile copies it
        # into /app, which is the WORKDIR). The previous absolute
        # Google-Drive/Colab path cannot exist inside the container.
        df = pd.read_csv('extended_worker_dataset.csv')

        # Keep only this worker's history.
        df = df[df['worker_id'] == worker_id].copy()
        if df.empty:
            raise HTTPException(status_code=404, detail=f"No data found for worker_id {worker_id}")

        # --- Preprocessing ---
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        # Binary target for the classifier: did the worker have a job that day?
        df['has_job'] = (df['job_type'] != "No Job").astype(int)

        # Winsorize wages: cap at the 90th percentile of positive wages and
        # floor at 500 to tame outliers before training the regressor.
        wage_cap = df[df['contracted_wage'] > 0]['contracted_wage'].quantile(0.90)
        df['contracted_wage'] = df['contracted_wage'].clip(lower=500, upper=wage_cap)

        # Encode job_type as integers for the tree models.
        le = LabelEncoder()
        df['job_type_encoded'] = le.fit_transform(df['job_type'])

        # Chronological 80/20 split (rows are assumed time-ordered).
        split_point = int(len(df) * 0.8)
        train_df = df.iloc[:split_point].copy()
        test_df = df.iloc[split_point:].copy()

        # Scale features on the training set only; the same fitted scaler is
        # reused for the forecast frame below to avoid leakage.
        scaler = StandardScaler()
        train_df[['job_type_scaled', 'years_exp_scaled']] = scaler.fit_transform(
            train_df[['job_type_encoded', 'years_of_experience']]
        )
        train_df['job_exp_interaction'] = train_df['job_type_scaled'] * train_df['years_exp_scaled']

        # Calendar features for both splits.
        for subset in [train_df, test_df]:
            subset['dayofweek'] = subset['timestamp'].dt.dayofweek
            subset['month'] = subset['timestamp'].dt.month
            subset['year'] = subset['timestamp'].dt.year
            subset['dayofyear'] = subset['timestamp'].dt.dayofyear
            subset['is_weekend'] = subset['dayofweek'].isin([5, 6]).astype(int)

        # --- Classifier: predict whether the worker has a job on a day ---
        X_train_class = train_df[['dayofweek', 'month', 'year', 'dayofyear',
                                  'is_weekend', 'job_type_encoded', 'feedback_score',
                                  'years_of_experience']]
        y_train_class = train_df['has_job']

        classifier = RandomForestClassifier(
            n_estimators=500, max_depth=12, min_samples_split=5, random_state=42
        )
        classifier.fit(X_train_class, y_train_class)

        # --- Regressor: predict the wage, trained on job days only ---
        train_df_reg = train_df[train_df['has_job'] == 1].copy()
        X_train_reg = train_df_reg[['dayofweek', 'month', 'year', 'dayofyear',
                                    'is_weekend', 'job_type_scaled', 'feedback_score',
                                    'years_exp_scaled', 'job_exp_interaction']]
        y_train_reg = train_df_reg['contracted_wage']

        regressor = RandomForestRegressor(
            n_estimators=300, max_depth=10, min_samples_split=4, random_state=42
        )
        regressor.fit(X_train_reg, y_train_reg)

        # --- Build the forecast frame from the held-out 20% ---
        future_df = test_df[['timestamp', 'job_type', 'job_type_encoded',
                             'feedback_score', 'years_of_experience']].rename(columns={'timestamp': 'ds'})

        future_df['dayofweek'] = future_df['ds'].dt.dayofweek
        future_df['month'] = future_df['ds'].dt.month
        future_df['year'] = future_df['ds'].dt.year
        future_df['dayofyear'] = future_df['ds'].dt.dayofyear
        future_df['is_weekend'] = future_df['dayofweek'].isin([5, 6]).astype(int)

        # Apply the scaler fitted on training data (transform, not fit).
        future_df[['job_type_scaled', 'years_exp_scaled']] = scaler.transform(
            future_df[['job_type_encoded', 'years_of_experience']]
        )
        future_df['job_exp_interaction'] = future_df['job_type_scaled'] * future_df['years_exp_scaled']

        # Predict job/no-job for each held-out day.
        future_df['has_job_predicted'] = classifier.predict(
            future_df[['dayofweek', 'month', 'year', 'dayofyear',
                       'is_weekend', 'job_type_encoded', 'feedback_score',
                       'years_of_experience']]
        )

        # Classifier accuracy against the actual has-job labels.
        test_df['has_job'] = (test_df['job_type'] != "No Job").astype(int)
        acc = accuracy_score(test_df['has_job'], future_df['has_job_predicted'])
        results['classification_metrics']['accuracy'] = round(acc * 100, 2)

        # Predict wages for every held-out day.
        future_df['yhat'] = regressor.predict(
            future_df[['dayofweek', 'month', 'year', 'dayofyear',
                       'is_weekend', 'job_type_scaled', 'feedback_score',
                       'years_exp_scaled', 'job_exp_interaction']]
        )

        # Zero out wage predictions on predicted no-job days and re-apply
        # the training wage cap.
        final_forecast_df = future_df.copy()
        final_forecast_df['yhat'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, final_forecast_df['yhat'])
        final_forecast_df['yhat'] = np.minimum(final_forecast_df['yhat'], wage_cap)

        # Uncertainty intervals from the spread across the forest's trees
        # (mean per-sample std over the training set, scaled to ~95% CI).
        std_dev = np.std([tree.predict(X_train_reg) for tree in regressor.estimators_], axis=0)
        future_df['yhat_lower'] = np.maximum(final_forecast_df['yhat'] - 1.96 * std_dev.mean(), 0)
        future_df['yhat_upper'] = final_forecast_df['yhat'] + 1.96 * std_dev.mean()
        final_forecast_df['yhat_lower'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_lower'])
        final_forecast_df['yhat_upper'] = np.where(final_forecast_df['has_job_predicted'] == 0, 0, future_df['yhat_upper'])

        # --- Evaluation on days with actual positive earnings ---
        comparison_df = pd.merge(
            test_df[['timestamp', 'contracted_wage']].rename(columns={'timestamp': 'ds', 'contracted_wage': 'y'}),
            final_forecast_df[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds'
        )

        valid_comparison_df = comparison_df[comparison_df['y'] > 0]
        if not valid_comparison_df.empty:
            # Weight errors by relative earnings so high-wage days count more.
            weights = valid_comparison_df['y'] / valid_comparison_df['y'].mean()
            mae = np.average([abs(a - p) for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
            mape = np.average([abs((a - p) / a) * 100 for a, p in zip(valid_comparison_df['y'], valid_comparison_df['yhat'])], weights=weights)
        else:
            mae = np.nan
            mape = np.nan

        results['classification_metrics']['mae'] = round(mae, 2) if not np.isnan(mae) else None
        results['classification_metrics']['mape'] = round(mape, 2) if not np.isnan(mape) else None

        # --- Plot actual vs. forecast and embed it as a data URI ---
        plt.figure(figsize=(12, 6))
        plt.plot(comparison_df['ds'], comparison_df['y'], 'o-', label='Actual Values', markersize=4)
        plt.plot(comparison_df['ds'], comparison_df['yhat'], '-', label='Forecasted Values')
        plt.fill_between(comparison_df['ds'], comparison_df['yhat_lower'], comparison_df['yhat_upper'],
                         color='gray', alpha=0.2, label='Uncertainty Interval')
        plt.title('Actual vs. Forecasted Daily Earnings (Last 20% of Dataset)')
        plt.xlabel('Date')
        plt.ylabel('Contracted Wage')
        plt.legend()
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()

        buffer = BytesIO()
        plt.savefig(buffer, format='png')
        buffer.seek(0)
        plot_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        results['plot'] = f'data:image/png;base64,{plot_base64}'
        plt.close()

        # --- Worker profile for microfinance scoring ---
        worker_data = df.copy()

        avg_daily_earning = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].mean()
        avg_monthly_earning = avg_daily_earning * 30 if not np.isnan(avg_daily_earning) else 0

        job_distribution = worker_data['job_type'].value_counts(normalize=True) * 100

        avg_feedback = worker_data['feedback_score'].mean()

        # Fraction of days with any job at all (1.0 == always employed).
        workholic_index = job_distribution.drop(labels=['No Job'], errors='ignore').sum() / 100

        # Coefficient of variation of positive wages; lower is more stable.
        if avg_daily_earning > 0:
            earning_stability = worker_data[worker_data['contracted_wage'] > 0]['contracted_wage'].std() / avg_daily_earning
        else:
            earning_stability = np.nan

        results['worker_profile'] = {
            'average_daily_earning': round(avg_daily_earning, 2) if not np.isnan(avg_daily_earning) else None,
            'estimated_monthly_earning': round(avg_monthly_earning, 2) if not np.isnan(avg_monthly_earning) else None,
            'job_distribution': job_distribution.round(2).to_dict(),
            'average_feedback_score': round(avg_feedback, 2) if not np.isnan(avg_feedback) else None,
            'workholic_index': round(workholic_index, 2) if not np.isnan(workholic_index) else None,
            'earning_stability': round(earning_stability, 2) if not np.isnan(earning_stability) else None
        }

        # Coerce numpy scalar/array leaves to JSON-native types.
        def convert_to_serializable(obj):
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return obj

        return JSONResponse(content=json.loads(json.dumps(results, default=convert_to_serializable)))

    except HTTPException:
        # Bug fix: let deliberate HTTP errors (e.g. the 404 above) pass
        # through instead of being re-wrapped as a 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
217
+
218
# Local development entry point; in the container uvicorn is started by the
# Dockerfile CMD instead.
# NOTE(review): this runs on port 8000 while the Dockerfile exposes and
# serves 7860 — confirm which port downstream tooling expects.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.30.6
3
+ pandas==2.2.2
4
+ numpy==1.26.4
5
+ scikit-learn==1.5.1
6
+ matplotlib==3.9.2