import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import os
class GDPPredictor:
    """Fit a family of linear models (OLS, Ridge, Lasso) on macroeconomic
    indicators and predict real GDP via an R^2-weighted ensemble.

    Typical usage::

        p = GDPPredictor()
        p.load_data('Consolidated.csv')
        p.train_models()
        p.predict_gdp({feature: value, ...})
    """

    def __init__(self):
        self.df = None                  # raw DataFrame as read from disk
        self.cleaned_df = None          # working copy used for modeling
        self.features_to_use = []       # every column except target and 'Year'
        self.target = 'Real GDP (USD bn)'
        self.scaler = StandardScaler()  # fitted on the full feature matrix
        self.all_results = {}           # model name -> {'model', 'rmse', 'r2'}
        self.ensemble_weights = {}      # model name -> weight (weights sum to 1)
        self.feature_ranges = {}        # feature -> (min, max, mean, latest value)
        self.is_trained = False

    def load_data(self, file_path='Consolidated.csv'):
        """Load the dataset and record per-feature summary statistics.

        Every column other than the target and 'Year' becomes a predictor.
        Each ``feature_ranges`` entry is ``(min, max, mean, value in the
        most recent year)``.

        Returns:
            int: the number of predictor features found.
        """
        self.df = pd.read_csv(file_path)
        self.cleaned_df = self.df.copy()
        self.features_to_use = [
            col for col in self.cleaned_df.columns
            if col != self.target and col != 'Year'
        ]
        # Hoist the latest-year row lookup out of the loop; the previous
        # version re-filtered the entire DataFrame once per feature.
        latest_row = self.cleaned_df.loc[
            self.cleaned_df['Year'] == self.cleaned_df['Year'].max()
        ].iloc[0]
        for feature in self.features_to_use:
            series = self.cleaned_df[feature]
            self.feature_ranges[feature] = (
                series.min(), series.max(), series.mean(), latest_row[feature],
            )
        return len(self.features_to_use)

    def _fit_and_score(self, model, X, y):
        """Fit *model* on (X, y) and return its result record.

        Returns a dict with keys 'model', 'rmse', 'r2'.  Metrics are
        in-sample (computed on the training data itself).
        """
        model.fit(X, y)
        y_pred = model.predict(X)
        return {
            'model': model,
            'rmse': np.sqrt(mean_squared_error(y, y_pred)),
            'r2': r2_score(y, y_pred),
        }

    def train_models(self):
        """Train OLS plus Ridge/Lasso over a small alpha grid.

        NOTE(review): metrics are in-sample (no train/test split), so
        RMSE/R^2 are optimistic and the ensemble weights inherit that bias.

        Returns:
            dict: ``all_results`` mapping model name to its record.
        Raises:
            ValueError: if :meth:`load_data` has not been called.
        """
        if self.cleaned_df is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        X = self.cleaned_df[self.features_to_use]
        y = self.cleaned_df[self.target]
        X_scaled = self.scaler.fit_transform(X)
        alphas = [0.001, 0.01, 0.1, 1.0]

        self.all_results['Linear Regression'] = self._fit_and_score(
            LinearRegression(), X_scaled, y)
        for alpha in alphas:
            self.all_results[f"Ridge (alpha={alpha})"] = self._fit_and_score(
                Ridge(alpha=alpha), X_scaled, y)
        for alpha in alphas:
            model_name = f"Lasso (alpha={alpha})"
            try:
                self.all_results[model_name] = self._fit_and_score(
                    Lasso(alpha=alpha, max_iter=10000), X_scaled, y)
            except Exception as e:
                # Best-effort: a Lasso that fails to fit is skipped, not fatal.
                print(f"Error training {model_name}: {e}")

        # Ensemble weights proportional to R^2, clamped at 0.01 so a model
        # with a negative R^2 still gets a tiny positive weight.
        total_r2 = sum(max(0.01, res['r2']) for res in self.all_results.values())
        for name, res in self.all_results.items():
            self.ensemble_weights[name] = max(0.01, res['r2']) / total_r2
        self.is_trained = True
        return self.all_results

    def get_feature_info(self):
        """Return features grouped by category, with their range tuples.

        Features not in any predefined category fall into "Other Indicators";
        only categories containing at least one feature present in the
        dataset are returned.

        Returns:
            dict: category -> {feature: (min, max, mean, current)}.
        Raises:
            ValueError: if :meth:`load_data` has not been called.
        """
        if not self.features_to_use:
            raise ValueError("Data not loaded. Call load_data() first.")
        feature_categories = {
            "Foreign Investment & Aid": ['Foreign Aid (USD bn)', 'FDI (bn USD)'],
            "Economic Indicators": ['BOP (UDS bn)', 'Forex Reserves (in US $ Billion)',
                                    'Money Supply (M3) in billion rupees', 'Inflation (in % of GDP)',
                                    'CPI', 'Debt to GDP ratio'],
            "Financial Markets": ['Sensex', 'IIP Index (Base year 1980)'],
            "Resources & Savings": ['Savings (billion dollars)', 'Oil Prices', 'Bank Rate'],
            "Agriculture": ['Agriculture Production(Food Grains) in Lakhs Tonnes'],
            "Trade": ['With US(in USD Bn)', 'With China(in USD bn)']
        }
        # Anything not covered above goes into a catch-all bucket.
        categorized = []
        for feat_list in feature_categories.values():
            categorized.extend(feat_list)
        uncategorized = [f for f in self.features_to_use if f not in categorized]
        if uncategorized:
            feature_categories["Other Indicators"] = uncategorized
        # Keep only features that actually exist in the loaded dataset.
        result = {}
        for category, category_features in feature_categories.items():
            valid_features = [f for f in category_features if f in self.features_to_use]
            if valid_features:
                result[category] = {
                    feature: self.feature_ranges[feature] for feature in valid_features
                }
        return result

    def predict_gdp(self, input_values):
        """Predict GDP with every trained model plus the weighted ensemble.

        Args:
            input_values: mapping of feature name -> value; must contain an
                entry for every feature in ``features_to_use``.
        Returns:
            dict: model name -> prediction, plus an 'Ensemble' entry holding
            the R^2-weighted average of the individual predictions.
        Raises:
            ValueError: if models are untrained, input features are missing,
                or prediction itself fails.
        """
        if not self.is_trained:
            raise ValueError("Models not trained. Call train_models() first.")
        # Validate OUTSIDE the try: the previous version raised this inside
        # its own except-clause scope, so the message got re-wrapped as
        # "Error predicting GDP: Missing values ...".
        missing_features = [f for f in self.features_to_use if f not in input_values]
        if missing_features:
            raise ValueError(f"Missing values for features: {missing_features}")
        try:
            input_data = np.array(
                [input_values[feature] for feature in self.features_to_use]
            ).reshape(1, -1)
            input_scaled = self.scaler.transform(input_data)
            predictions = {
                name: info['model'].predict(input_scaled)[0]
                for name, info in self.all_results.items()
            }
            # Weighted average; RHS is fully evaluated before 'Ensemble'
            # is inserted, so the ensemble never weights itself.
            predictions['Ensemble'] = sum(
                pred * self.ensemble_weights[name]
                for name, pred in predictions.items()
            )
            return predictions
        except Exception as e:
            raise ValueError(f"Error predicting GDP: {e}") from e

    def get_latest_gdp(self):
        """Return ``(latest_year, gdp_in_that_year)`` from the loaded data.

        Raises:
            ValueError: if :meth:`load_data` has not been called.
        """
        if self.cleaned_df is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        latest_year = self.cleaned_df['Year'].max()
        latest_gdp = self.cleaned_df[self.cleaned_df['Year'] == latest_year][self.target].values[0]
        return latest_year, latest_gdp

    def save_models(self, filepath='gdp_models.pkl'):
        """Pickle the trained models, weights, feature metadata and scaler.

        Raises:
            ValueError: if :meth:`train_models` has not been called.
        """
        if not self.is_trained:
            raise ValueError("Models not trained. Call train_models() first.")
        save_data = {
            'all_results': self.all_results,
            'ensemble_weights': self.ensemble_weights,
            'features_to_use': self.features_to_use,
            'feature_ranges': self.feature_ranges,
            'scaler': self.scaler
        }
        with open(filepath, 'wb') as f:
            pickle.dump(save_data, f)

    def load_models(self, filepath='gdp_models.pkl'):
        """Restore a model bundle previously written by :meth:`save_models`.

        SECURITY: ``pickle.load`` executes arbitrary code from the file —
        only load model files from trusted sources.

        Raises:
            FileNotFoundError: if *filepath* does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Model file {filepath} not found.")
        with open(filepath, 'rb') as f:
            save_data = pickle.load(f)
        self.all_results = save_data['all_results']
        self.ensemble_weights = save_data['ensemble_weights']
        self.features_to_use = save_data['features_to_use']
        self.feature_ranges = save_data['feature_ranges']
        self.scaler = save_data['scaler']
        self.is_trained = True