hardin009 committed on
Commit
572f22e
·
verified ·
1 Parent(s): eb9d2da

Upload market_ai.py

Browse files
Files changed (1) hide show
  1. market_ai.py +313 -0
market_ai.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Market AI.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1g_-stp3TgQo9X3UgKIAki9NSdkp_OiV1
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.feature_selection import SelectKBest, f_regression
16
+ from sklearn.linear_model import Ridge
17
+ from sklearn.metrics import mean_squared_error, r2_score
18
+ from sklearn.pipeline import Pipeline
19
+ from transformers import pipeline
20
+ from datetime import timedelta
21
+ import traceback
22
+
23
def load_and_preprocess_data(file_path):
    """Load a whitespace-delimited commodity price file and coerce types.

    The raw file is read as a single un-headed CSV column; each line is then
    split on whitespace into the 14 expected fields. NOTE(review): this
    assumes no individual field value contains a space — confirm against the
    real data files.

    Parameters
    ----------
    file_path : str
        Path to the data file.

    Returns
    -------
    pandas.DataFrame
        Frame with named columns, parsed dates, numeric price/weather
        columns, sorted ascending by Date.
    """
    # Read without headers; lines with no commas land in one column each.
    df = pd.read_csv(file_path, encoding='utf-8', header=None)

    columns = ['Date', 'Commodity', 'Price', 'Growing Months', 'Harvesting Months',
               'Cold Storage Availability', 'Cold Storage Capacity', 'ArrivalQuantity',
               'Temperature', 'Humidity', 'Wind direction', 'Festivals', 'Events', 'Impacts']

    # Split each raw line on whitespace into the 14 named columns.
    df = pd.DataFrame([row[0].split() for row in df.values], columns=columns)

    # Dates arrive as day/month/year strings.
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

    # Coerce the model-facing columns to numbers (invalid entries -> NaN).
    # FIX: 'Wind direction' is consumed downstream as a numeric model
    # feature (scaled, averaged), so it must be converted here as well;
    # the original left it as an object column.
    df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
    numerical_columns = ['ArrivalQuantity', 'Temperature', 'Humidity', 'Wind direction']
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Chronological order is required by the lag/rolling features built later.
    df = df.sort_values('Date')

    return df
47
+
48
# Load the train/test datasets; abort the whole script if either file is
# missing or malformed (this runs at module import time).
try:
    train_df = load_and_preprocess_data('Potato Dataset - Train Data.csv')
    test_df = load_and_preprocess_data('Potato Dataset - Test Data.csv')

    # Sanity report: schema, dtypes, sizes and a preview of each frame.
    print("Train data columns:", train_df.columns)
    print("Train data types:\n", train_df.dtypes)
    print("\nTest data columns:", test_df.columns)
    print("Test data types:\n", test_df.dtypes)

    print("\nTrain data shape:", train_df.shape)
    print("Test data shape:", test_df.shape)

    print("\nFirst few rows of train_df:")
    print(train_df.head())
    print("\nFirst few rows of test_df:")
    print(test_df.head())

except FileNotFoundError as e:
    print(f"Error: {e}. Please make sure the CSV files are in the correct location.")
    exit(1)
except Exception as e:
    # Any other failure: print the traceback, then stop.
    print(f"An error occurred: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Hugging Face sentiment pipeline. NOTE(review): with no model argument
# this downloads a default English sentiment model on first use — requires
# network access the first time it runs.
sentiment_analyzer = pipeline("sentiment-analysis")
80
+
81
def analyze_sentiment(df):
    """Attach sentiment-score columns for the Events and Impacts text.

    FIX: the original used the raw pipeline confidence score, which is
    always positive regardless of the predicted label, so strongly negative
    and strongly positive texts produced the same feature value. The score
    is now signed by the label (negative label -> negative score; presumably
    the default model emits 'POSITIVE'/'NEGATIVE' — TODO confirm).
    FIX: the old `if x` guard let NaN cells through (NaN is truthy), which
    would crash the pipeline; non-string/empty values now map to 0.0.

    Mutates `df` in place and returns it.
    """
    def _signed_score(text):
        # Only score real, non-empty strings; everything else is neutral.
        if not isinstance(text, str) or not text:
            return 0.0
        result = sentiment_analyzer(text)[0]
        score = result['score']
        return score if result['label'] == 'POSITIVE' else -score

    df['Events_Sentiment'] = df['Events'].apply(_signed_score)
    df['Impacts_Sentiment'] = df['Impacts'].apply(_signed_score)
    return df
85
+
86
# Score the free-text columns on both splits (one pipeline call per row,
# so this can be slow on large frames).
train_df = analyze_sentiment(train_df)
test_df = analyze_sentiment(test_df)

# Feature engineering
90
def engineer_features(df):
    """Add calendar and lag/rolling price features.

    Mutates `df` in place and returns it. The frame must already be sorted
    by Date for the lag/rolling columns to be meaningful; the first rows of
    those columns are NaN until a full window is available.
    """
    # Calendar features derived from the timestamp.
    dates = df['Date'].dt
    df['DayOfWeek'] = dates.dayofweek
    df['Month'] = dates.month
    df['Quarter'] = dates.quarter
    df['Year'] = dates.year

    # Lagged and rolling statistics of the price series.
    price = df['Price']
    weekly = price.rolling(window=7)
    df['PriceLag1'] = price.shift(1)
    df['PriceLag7'] = price.shift(7)
    df['PriceRollingMean7'] = weekly.mean()
    df['PriceRollingStd7'] = weekly.std()
    # Previous week's average: the 7-day mean as of the day before.
    df['PrevWeekAvgPrice'] = weekly.mean().shift(1)
    return df
101
+
102
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Model inputs: arrivals, weather, sentiment, calendar and lagged-price columns.
# NOTE(review): 'Wind direction' is never converted to numeric in
# load_and_preprocess_data, so it is still an object column here — verify the
# raw values are numeric strings, otherwise scaling below will fail.
features = ['ArrivalQuantity', 'Temperature', 'Humidity', 'Wind direction',
            'Events_Sentiment', 'Impacts_Sentiment', 'DayOfWeek', 'Month', 'Quarter', 'Year',
            'PriceLag1', 'PriceLag7', 'PriceRollingMean7', 'PriceRollingStd7', 'PrevWeekAvgPrice']

# Preview the engineered features alongside the target.
print("\nFirst few rows of train_df:")
print(train_df[features + ['Price']].head())

print("\nFirst few rows of test_df:")
print(test_df[features + ['Price']].head())

# Drop rows with any missing feature (the earliest rows always drop out
# because the lag/rolling features are NaN there), and align targets by index.
X = train_df[features].dropna()
y = train_df['Price'].loc[X.index]

X_test = test_df[features].dropna()
y_test = test_df['Price'].loc[X_test.index]

# Report sizes after the NaN filtering.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Hold out 20% of the training rows for validation.
# NOTE(review): this is a random split of time-series rows, so validation
# rows can precede training rows — confirm leakage is acceptable here.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Modeling pipeline: scale -> univariate feature selection -> ridge regression.
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(f_regression)),
    ('model', Ridge())
])

# Search space: number of selected features and ridge regularization strength.
param_grid = {
    'feature_selection__k': [5, 10, 15],
    'model__alpha': np.logspace(-4, 1, 50)
}
144
+
145
+ # Function to train and evaluate the model
146
def train_and_evaluate_model(pipeline, param_grid, X_train, y_train, X_val, y_val):
    """Tune the pipeline with randomized search and score the best fit.

    Runs 50 sampled configurations with 5-fold CV on the training split,
    then evaluates the refitted best estimator on the validation split.

    Returns
    -------
    tuple
        (best_estimator, validation MSE, validation R^2, best params dict).
    """
    search = RandomizedSearchCV(pipeline, param_grid, n_iter=50, cv=5,
                                n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)

    best_estimator = search.best_estimator_
    predictions = best_estimator.predict(X_val)
    return (best_estimator,
            mean_squared_error(y_val, predictions),
            r2_score(y_val, predictions),
            search.best_params_)
156
+
157
# Train and evaluate the Ridge model
print("Training Ridge model...")
best_model, mse, r2, best_params = train_and_evaluate_model(
    ridge_pipeline, param_grid, X_train, y_train, X_val, y_val
)
print(f"Ridge - MSE: {mse:.4f}, R2: {r2:.4f}")
print(f"Best parameters: {best_params}\n")

# Evaluate the tuned model on the held-out test file.
y_pred_test = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

print(f"\nTest MSE: {test_mse:.4f}")
print(f"Test R2: {test_r2:.4f}")

# Scatter of actual vs predicted with a y = x reference line.
plt.figure(figsize=(12, 6))
plt.scatter(y_test, y_pred_test, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Potato Prices')
plt.tight_layout()
plt.show()

# Residuals vs predictions (should scatter evenly around zero).
residuals = y_test - y_pred_test
plt.figure(figsize=(12, 6))
plt.scatter(y_pred_test, residuals, alpha=0.5)
plt.hlines(y=0, xmin=y_pred_test.min(), xmax=y_pred_test.max(), colors='r', linestyles='--')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.show()

# Persist the fitted pipeline for reuse.
import joblib
joblib.dump(best_model, 'best_potato_price_model_Ridge.joblib')
print("Best model saved as 'best_potato_price_model_Ridge.joblib'")
198
+
199
+ # Function to generate future features
200
def generate_future_features(last_date, num_days, last_known_price, arrival_quantities):
    """Build a model-input frame for the num_days days after last_date.

    Calendar fields come from the projected dates; weather fields fall back
    to the test-set column means (reads the module-level X_test); sentiment
    is neutral; price-derived fields are seeded with last_known_price and
    refined iteratively by the caller. `arrival_quantities` must have
    exactly num_days entries. Returns the columns in the module-level
    `features` order.
    """
    start = last_date + timedelta(days=1)
    future_df = pd.DataFrame({
        'Date': [start + timedelta(days=offset) for offset in range(num_days)]
    })

    # Calendar features derived from the projected dates.
    future_df['DayOfWeek'] = future_df['Date'].dt.dayofweek
    future_df['Month'] = future_df['Date'].dt.month
    future_df['Quarter'] = future_df['Date'].dt.quarter
    future_df['Year'] = future_df['Date'].dt.year

    # Caller-supplied arrival scenario, one value per future day.
    future_df['ArrivalQuantity'] = arrival_quantities

    # No forecasts available: reuse average observed weather, neutral sentiment.
    for column in ('Temperature', 'Humidity', 'Wind direction'):
        future_df[column] = X_test[column].mean()
    future_df['Events_Sentiment'] = 0
    future_df['Impacts_Sentiment'] = 0

    # Seed every price-derived feature from the last observed price;
    # no spread is assumed, so the rolling std starts at zero.
    for column in ('PriceLag1', 'PriceLag7', 'PriceRollingMean7', 'PrevWeekAvgPrice'):
        future_df[column] = last_known_price
    future_df['PriceRollingStd7'] = 0

    return future_df[features]
228
+
229
+ # Function to predict future prices
230
def predict_future_prices(model, last_date, num_days, last_known_price, arrival_quantities):
    """Iteratively forecast prices for num_days days after last_date.

    Each day's prediction is fed back into the lag/rolling features of the
    next day's row, so errors compound over the horizon. Depends on
    generate_future_features (and, through it, the module-level X_test and
    features).

    Returns
    -------
    numpy.ndarray
        Predicted prices, one per future day.

    Raises
    ------
    ValueError
        If the generated feature frame width does not match what the
        fitted pipeline was trained on.
    """
    future_features = generate_future_features(last_date, num_days, last_known_price, arrival_quantities)

    # Sanity-check the frame width against the fitted pipeline.
    # FIX: the original "adjusted" a mismatch by calling
    # feature_selection.transform(), which returns a plain ndarray — the
    # .iloc/.columns updates below would then raise AttributeError. A width
    # mismatch means the feature list is out of sync with the model, so
    # fail loudly and early instead.
    n_features_model = model.named_steps['feature_selection'].n_features_in_
    if future_features.shape[1] != n_features_model:
        raise ValueError(
            f"Feature frame has {future_features.shape[1]} columns but the "
            f"model was fitted on {n_features_model}; regenerate features "
            "with the same feature list used for training."
        )

    future_prices = []
    col = future_features.columns.get_loc  # column name -> position

    for i in range(num_days):
        price = model.predict(future_features.iloc[[i]])[0]
        future_prices.append(price)

        # Feed the new prediction into the next day's lag/rolling features.
        if i < num_days - 1:
            nxt = i + 1
            future_features.iloc[nxt, col('PriceLag1')] = price
            if i >= 6:
                # Seven predictions exist, so the 7-day lag can come from them;
                # earlier rows keep their last_known_price seed.
                future_features.iloc[nxt, col('PriceLag7')] = future_prices[i - 6]
            # Rolling stats over the last up-to-7 predicted prices.
            window = future_prices[max(0, i - 6):i + 1]
            future_features.iloc[nxt, col('PriceRollingMean7')] = np.mean(window)
            future_features.iloc[nxt, col('PriceRollingStd7')] = np.std(window)
            future_features.iloc[nxt, col('PrevWeekAvgPrice')] = np.mean(window)

    return np.array(future_prices)
263
+
264
# Forecast the next 30 days from the end of the test data and plot it.
try:
    last_date = test_df['Date'].max()
    print("Debug: last_date retrieved successfully")

    last_known_price = test_df['Price'].iloc[-1]
    print("Debug: last_known_price retrieved successfully")

    num_days_to_predict = 30

    print("Last date:", last_date)
    print("Last known price:", last_known_price)

    # Show which features the tuned pipeline actually kept.
    print("Best model steps:", best_model.named_steps.keys())
    print("Feature selection k:", best_model.named_steps['feature_selection'].k)
    print("Selected features:", [features[i] for i in best_model.named_steps['feature_selection'].get_support(indices=True)])

    # Placeholder arrival scenario: uniform random within the observed test
    # range. NOTE(review): unseeded randomness makes each run's forecast
    # different — confirm this is intended.
    future_arrival_quantities = np.random.randint(
        low=X_test['ArrivalQuantity'].min(),
        high=X_test['ArrivalQuantity'].max(),
        size=num_days_to_predict
    )

    future_prices = predict_future_prices(best_model, last_date, num_days_to_predict, last_known_price, future_arrival_quantities)
    print("Debug: future_prices calculated successfully")

    # Historical series plus the 30-day forecast on one chart.
    future_dates = [last_date + timedelta(days=i) for i in range(1, num_days_to_predict + 1)]
    plt.figure(figsize=(12, 6))
    plt.plot(test_df['Date'], test_df['Price'], label='Historical Prices')
    plt.plot(future_dates, future_prices, label='Predicted Prices', color='red')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Historical and Predicted Potato Prices')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Tabular dump of the forecast alongside the assumed arrivals.
    print("Future price predictions:")
    for date, price, quantity in zip(future_dates, future_prices, future_arrival_quantities):
        print(f"{date.date()}: Price: {price:.2f}, Arrival Quantity: {quantity}")

except KeyError as e:
    print(f"Error: {e}. Please check if the 'Price' column exists in your CSV file.")
    print("Columns in test_df:", test_df.columns)
except Exception as e:
    # Best-effort diagnostics; the script ends here either way.
    print(f"An error occurred: {e}")
    print("Error location:", traceback.format_exc())