Upload market_ai.py
Browse files- market_ai.py +313 -0
market_ai.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Market AI.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1g_-stp3TgQo9X3UgKIAki9NSdkp_OiV1
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import matplotlib.pyplot as plt
|
| 13 |
+
from sklearn.model_selection import train_test_split, RandomizedSearchCV
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.feature_selection import SelectKBest, f_regression
|
| 16 |
+
from sklearn.linear_model import Ridge
|
| 17 |
+
from sklearn.metrics import mean_squared_error, r2_score
|
| 18 |
+
from sklearn.pipeline import Pipeline
|
| 19 |
+
from transformers import pipeline
|
| 20 |
+
from datetime import timedelta
|
| 21 |
+
import traceback
|
| 22 |
+
|
| 23 |
+
def load_and_preprocess_data(file_path):
    """Load a potato-market data file and return it sorted by date.

    The file is read without a header row. The first column of every record
    is treated as one raw string and split on whitespace into the 14 fields
    listed in ``columns`` below.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A DataFrame with ``Date`` parsed as ``%d/%m/%Y``; ``Price``,
        ``ArrivalQuantity``, ``Temperature``, ``Humidity`` and
        ``Wind direction`` converted to numbers (unparseable values become
        NaN); all other fields left as strings. Rows are sorted by ``Date``
        and keep their original index labels.

    Raises:
        ValueError: If a record does not split into exactly 14 fields, or a
            date does not match ``%d/%m/%Y``.
    """
    columns = ['Date', 'Commodity', 'Price', 'Growing Months', 'Harvesting Months',
               'Cold Storage Availability', 'Cold Storage Capacity', 'ArrivalQuantity',
               'Temperature', 'Humidity', 'Wind direction', 'Festivals', 'Events', 'Impacts']

    raw = pd.read_csv(file_path, encoding='utf-8', header=None)

    # Split each raw record ourselves so a malformed one is reported with its
    # position instead of an opaque DataFrame-constructor error.
    records = []
    for record_no, value in enumerate(raw.iloc[:, 0], start=1):
        fields = str(value).split()
        if len(fields) != len(columns):
            raise ValueError(
                f"{file_path}: record {record_no} has {len(fields)} fields, "
                f"expected {len(columns)}"
            )
        records.append(fields)
    df = pd.DataFrame(records, columns=columns)

    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

    # 'Wind direction' is averaged and fed to the model as a numeric feature
    # later in this script, so it is converted alongside the other numbers.
    numerical_columns = ['Price', 'ArrivalQuantity', 'Temperature', 'Humidity',
                         'Wind direction']
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return df.sort_values('Date')
|
| 47 |
+
|
| 48 |
+
# Load the train and test sets and echo a short summary of each. Any failure
# here is fatal for the rest of the script, so the process exits with status 1.
try:
    train_df = load_and_preprocess_data('Potato Dataset - Train Data.csv')
    test_df = load_and_preprocess_data('Potato Dataset - Test Data.csv')

    # Print column names and data types
    print("Train data columns:", train_df.columns)
    print("Train data types:\n", train_df.dtypes)
    print("\nTest data columns:", test_df.columns)
    print("Test data types:\n", test_df.dtypes)

    # Print shapes
    print("\nTrain data shape:", train_df.shape)
    print("Test data shape:", test_df.shape)

    # Print first few rows
    print("\nFirst few rows of train_df:")
    print(train_df.head())
    print("\nFirst few rows of test_df:")
    print(test_df.head())

except FileNotFoundError as e:
    print(f"Error: {e}. Please make sure the CSV files are in the correct location.")
    # `exit()` is injected by the `site` module for interactive use and is not
    # guaranteed to exist; raising SystemExit is the portable equivalent.
    raise SystemExit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    traceback.print_exc()
    raise SystemExit(1)

# Sentiment analysis: default Hugging Face "sentiment-analysis" pipeline,
# created once and reused for every text cell.
sentiment_analyzer = pipeline("sentiment-analysis")
|
| 80 |
+
|
| 81 |
+
def analyze_sentiment(df, analyzer=None):
    """Add signed sentiment scores for the ``Events`` and ``Impacts`` text.

    Two columns are written onto ``df`` in place: ``Events_Sentiment`` and
    ``Impacts_Sentiment``. Each value is the classifier's confidence,
    negated when the predicted label starts with "NEG", so negative and
    positive text no longer map to the same number. Cells that are not a
    non-blank string (empty text, None, NaN) get 0.0.

    Args:
        df: DataFrame containing ``Events`` and ``Impacts`` columns.
        analyzer: Optional callable taking a string and returning a list
            whose first item is a mapping with ``'label'`` and ``'score'``.
            Defaults to the module-level ``sentiment_analyzer``.

    Returns:
        The same ``df`` object with the two sentiment columns added.
    """
    if analyzer is None:
        analyzer = sentiment_analyzer

    def signed_score(text):
        # Only real, non-blank text is worth sending to the classifier.
        if not isinstance(text, str) or not text.strip():
            return 0.0
        result = analyzer(text)[0]
        score = result['score']
        # 'score' is the confidence in the predicted label, not a polarity,
        # so the label supplies the sign.
        if str(result.get('label', '')).upper().startswith('NEG'):
            return -score
        return score

    for source in ('Events', 'Impacts'):
        df[f'{source}_Sentiment'] = df[source].apply(signed_score)
    return df
|
| 85 |
+
|
| 86 |
+
# Attach the sentiment columns to both splits: training data first, then test.
train_df, test_df = [analyze_sentiment(frame) for frame in (train_df, test_df)]
|
| 88 |
+
|
| 89 |
+
# Feature engineering
|
| 90 |
+
def engineer_features(df):
    """Add calendar and price-history feature columns to ``df`` in place.

    Calendar columns (``DayOfWeek``, ``Month``, ``Quarter``, ``Year``) come
    from ``Date``. Price-history columns are built only from rows *before*
    the current one, because the row's own ``Price`` is the prediction
    target: a rolling window that included it would leak the answer into
    the features. Rows are assumed to already be in chronological order.

    ``PriceRollingMean7`` and ``PrevWeekAvgPrice`` therefore hold the same
    value (mean of the previous seven prices). The forecasting loop in this
    script fills them the same way.

    Args:
        df: DataFrame with a datetime ``Date`` column and a numeric
            ``Price`` column.

    Returns:
        The same ``df`` object with the feature columns added. The first
        seven rows have NaN in the seven-day features.
    """
    dates = df['Date'].dt
    df['DayOfWeek'] = dates.dayofweek
    df['Month'] = dates.month
    df['Quarter'] = dates.quarter
    df['Year'] = dates.year

    price = df['Price']
    weekly = price.rolling(window=7)
    # Shift by one row so each window ends on the previous row, not this one.
    prev_week_mean = weekly.mean().shift(1)
    prev_week_std = weekly.std().shift(1)

    df['PriceLag1'] = price.shift(1)
    df['PriceLag7'] = price.shift(7)
    df['PriceRollingMean7'] = prev_week_mean
    df['PriceRollingStd7'] = prev_week_std
    df['PrevWeekAvgPrice'] = prev_week_mean
    return df
|
| 101 |
+
|
| 102 |
+
# Derive the calendar and price-history columns on each split independently.
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Model inputs; the target is always 'Price'.
features = [
    # market and weather readings
    'ArrivalQuantity', 'Temperature', 'Humidity', 'Wind direction',
    # text-derived sentiment
    'Events_Sentiment', 'Impacts_Sentiment',
    # calendar parts
    'DayOfWeek', 'Month', 'Quarter', 'Year',
    # price history
    'PriceLag1', 'PriceLag7', 'PriceRollingMean7', 'PriceRollingStd7', 'PrevWeekAvgPrice',
]

# Quick visual check of the engineered columns next to the target.
print("\nFirst few rows of train_df:")
print(train_df[[*features, 'Price']].head())

print("\nFirst few rows of test_df:")
print(test_df[[*features, 'Price']].head())

# Keep only rows where every feature is present, then pick the matching targets
# by index label.
X = train_df[features].dropna()
y = train_df.loc[X.index, 'Price']

X_test = test_df[features].dropna()
y_test = test_df.loc[X_test.index, 'Price']

print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
|
| 131 |
+
|
| 132 |
+
# Ridge regression pipeline: standardize, keep the k best features by
# univariate F-test, then fit the regularized linear model.
ridge_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_regression)),
    ('model', Ridge()),
])

# Search space sampled by RandomizedSearchCV: number of selected features and
# 50 log-spaced regularization strengths between 1e-4 and 1e1.
param_grid = dict(
    feature_selection__k=[5, 10, 15],
    model__alpha=np.logspace(start=-4, stop=1, num=50),
)
|
| 144 |
+
|
| 145 |
+
# Function to train and evaluate the model
|
| 146 |
+
def train_and_evaluate_model(pipeline, param_grid, X_train, y_train, X_val, y_val):
    """Tune ``pipeline`` by randomized search and score the winner on validation data.

    The search samples 50 parameter settings from ``param_grid``, uses
    5-fold cross-validation on the training data, runs with ``n_jobs=-1``
    and a fixed ``random_state`` of 42.

    Args:
        pipeline: Estimator to tune.
        param_grid: Parameter distributions passed to the search.
        X_train, y_train: Data the search is fitted on.
        X_val, y_val: Held-out data used only for the returned metrics.

    Returns:
        Tuple ``(best_model, mse, r2, best_params)`` where ``mse`` and ``r2``
        are computed from ``best_model``'s predictions on ``X_val``.
    """
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        n_iter=50,
        cv=5,
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    tuned_model = search.best_estimator_
    val_predictions = tuned_model.predict(X_val)

    return (
        tuned_model,
        mean_squared_error(y_val, val_predictions),
        r2_score(y_val, val_predictions),
        search.best_params_,
    )
|
| 156 |
+
|
| 157 |
+
# Tune the Ridge pipeline and report its validation metrics.
print("Training Ridge model...")
best_model, mse, r2, best_params = train_and_evaluate_model(
    pipeline=ridge_pipeline,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print(f"Ridge - MSE: {mse:.4f}, R2: {r2:.4f}")
print(f"Best parameters: {best_params}\n")

# Score the tuned model on the separate test file.
y_pred_test = best_model.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

print(f"\nTest MSE: {test_mse:.4f}")
print(f"Test R2: {test_r2:.4f}")
|
| 172 |
+
|
| 173 |
+
# Scatter of actual vs predicted test prices with the y = x reference line.
actual_bounds = [y_test.min(), y_test.max()]
plt.figure(figsize=(12, 6))
plt.scatter(y_test, y_pred_test, alpha=0.5)
plt.plot(actual_bounds, actual_bounds, 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Potato Prices')
plt.tight_layout()
plt.show()

# Residuals (actual minus predicted) against the predicted price, with a
# dashed zero line spanning the predicted range.
residuals = y_test - y_pred_test
plt.figure(figsize=(12, 6))
plt.scatter(y_pred_test, residuals, alpha=0.5)
plt.hlines(
    y=0,
    xmin=y_pred_test.min(),
    xmax=y_pred_test.max(),
    colors='r',
    linestyles='--',
)
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.show()

# Persist the tuned pipeline to disk.
import joblib
joblib.dump(best_model, 'best_potato_price_model_Ridge.joblib')
print("Best model saved as 'best_potato_price_model_Ridge.joblib'")
|
| 198 |
+
|
| 199 |
+
# Function to generate future features
|
| 200 |
+
def generate_future_features(last_date, num_days, last_known_price, arrival_quantities):
    """Build the feature frame for the ``num_days`` days after ``last_date``.

    Reads the module-level ``features`` list (column order of the result)
    and ``X_test`` (source of the weather averages).

    Args:
        last_date: Last date with a known price; forecasting starts the day
            after.
        num_days: Number of consecutive future days to generate.
        last_known_price: Price used to seed every price-history feature.
        arrival_quantities: Arrival quantity for each future day (assigned
            directly to the ``ArrivalQuantity`` column).

    Returns:
        A new DataFrame (not a view) with one row per future day and the
        columns in ``features``. Temperature, humidity and wind direction
        are the ``X_test`` means, both sentiment columns are 0, and the
        price-history columns are float-typed placeholders that the caller
        is expected to overwrite as predictions become available.
    """
    future_dates = [last_date + timedelta(days=i) for i in range(1, num_days + 1)]
    # to_datetime keeps the column datetime-typed even when num_days is 0,
    # so the `.dt` accessor below always works.
    future_df = pd.DataFrame({'Date': pd.to_datetime(future_dates)})

    # Calendar features are fully determined by the date.
    future_df['DayOfWeek'] = future_df['Date'].dt.dayofweek
    future_df['Month'] = future_df['Date'].dt.month
    future_df['Quarter'] = future_df['Date'].dt.quarter
    future_df['Year'] = future_df['Date'].dt.year

    # Use provided arrival quantities
    future_df['ArrivalQuantity'] = arrival_quantities

    # Weather is unknown for the future, so fall back to the test-set average.
    # Coerce to numbers first: the mean of a string-typed column is an error.
    for column in ('Temperature', 'Humidity', 'Wind direction'):
        future_df[column] = pd.to_numeric(X_test[column], errors='coerce').mean()
    future_df['Events_Sentiment'] = 0  # Neutral sentiment
    future_df['Impacts_Sentiment'] = 0  # Neutral sentiment

    # Seed the price-history features with the last known price. They are
    # created as floats so later predictions can be written into them without
    # a dtype change.
    seed_price = float(last_known_price)
    future_df['PriceLag1'] = seed_price
    future_df['PriceLag7'] = seed_price
    future_df['PriceRollingMean7'] = seed_price
    future_df['PriceRollingStd7'] = 0.0
    future_df['PrevWeekAvgPrice'] = seed_price

    # Return an independent copy: the caller updates rows in place.
    return future_df[features].copy()
|
| 228 |
+
|
| 229 |
+
# Function to predict future prices
|
| 230 |
+
def predict_future_prices(model, last_date, num_days, last_known_price, arrival_quantities):
    """Forecast ``num_days`` prices one day at a time, feeding predictions back in.

    Day ``i`` is predicted from row ``i`` of the frame built by
    ``generate_future_features``. After each prediction the price-history
    features of the next row are rewritten from the prices predicted so far.
    Only predicted prices enter the rolling window, so it is shorter than
    seven values during the first week.

    Args:
        model: Fitted pipeline exposing ``named_steps['feature_selection']``
            and ``predict``.
        last_date: Last date with a known price.
        num_days: Number of days to forecast.
        last_known_price: Price used to seed the price-history features.
        arrival_quantities: Arrival quantity for each forecast day.

    Returns:
        NumPy array of the ``num_days`` predicted prices, in date order.

    Raises:
        ValueError: If the generated frame does not have the number of
            columns the model was fitted on.
    """
    print("Debug: last_date =", last_date)
    print("Debug: last_known_price =", last_known_price)

    price_columns = ['PriceLag1', 'PriceLag7', 'PriceRollingMean7',
                     'PriceRollingStd7', 'PrevWeekAvgPrice']

    # Work on our own float-typed copy: rows are rewritten in place below and
    # predictions are floats.
    future_features = generate_future_features(
        last_date, num_days, last_known_price, arrival_quantities
    ).astype({name: float for name in price_columns})
    print("Debug: future_features.columns =", future_features.columns)
    print("Debug: future_features shape =", future_features.shape)
    print("Debug: future_features head =", future_features.head())

    # Check if the model expects the same number of features
    n_features_model = model.named_steps['feature_selection'].n_features_in_
    print("Number of features expected by the model:", n_features_model)

    if future_features.shape[1] != n_features_model:
        # The pipeline does its own feature selection inside predict(), so a
        # width mismatch cannot be repaired here; fail with a clear message.
        raise ValueError(
            f"Model expects {n_features_model} features but "
            f"{future_features.shape[1]} were generated"
        )

    # Column positions never change, so look them up once.
    position = {name: future_features.columns.get_loc(name) for name in price_columns}

    future_prices = []
    for i in range(num_days):
        price = model.predict(future_features.iloc[[i]])[0]
        future_prices.append(price)

        if i == num_days - 1:
            break  # no following row to update

        # Update price-related features for the next prediction
        next_row = i + 1
        window = future_prices[-7:]  # up to the seven most recent predictions
        window_mean = np.mean(window)
        # Sample standard deviation (ddof=1) to match the pandas rolling std
        # used for the training features; undefined for one value, so use 0.
        window_std = np.std(window, ddof=1) if len(window) > 1 else 0.0

        future_features.iloc[next_row, position['PriceLag1']] = price
        if i >= 6:
            future_features.iloc[next_row, position['PriceLag7']] = future_prices[i - 6]
        future_features.iloc[next_row, position['PriceRollingMean7']] = window_mean
        future_features.iloc[next_row, position['PriceRollingStd7']] = window_std
        future_features.iloc[next_row, position['PrevWeekAvgPrice']] = window_mean

    return np.array(future_prices)
|
| 263 |
+
|
| 264 |
+
# Forecast the 30 days after the last test date, plot them after the historical
# test prices, and print one line per forecast day.
try:
    last_date = test_df['Date'].max()
    print("Debug: last_date retrieved successfully")

    # test_df is sorted by date, so the last row holds the most recent price.
    last_known_price = test_df['Price'].iloc[-1]
    print("Debug: last_known_price retrieved successfully")

    num_days_to_predict = 30

    print("Last date:", last_date)
    print("Last known price:", last_known_price)

    # Print information about the best model
    feature_selector = best_model.named_steps['feature_selection']
    print("Best model steps:", best_model.named_steps.keys())
    print("Feature selection k:", feature_selector.k)
    print("Selected features:", [features[i] for i in feature_selector.get_support(indices=True)])

    # Generate future arrival quantities (you can modify this based on your requirements).
    # randint needs integer bounds and excludes `high`; the +1 makes the observed
    # maximum reachable and avoids an error when every observed quantity is equal.
    future_arrival_quantities = np.random.randint(
        low=int(X_test['ArrivalQuantity'].min()),
        high=int(X_test['ArrivalQuantity'].max()) + 1,
        size=num_days_to_predict
    )

    future_prices = predict_future_prices(best_model, last_date, num_days_to_predict, last_known_price, future_arrival_quantities)
    print("Debug: future_prices calculated successfully")

    # Plot the predictions
    future_dates = [last_date + timedelta(days=i) for i in range(1, num_days_to_predict + 1)]
    plt.figure(figsize=(12, 6))
    plt.plot(test_df['Date'], test_df['Price'], label='Historical Prices')
    plt.plot(future_dates, future_prices, label='Predicted Prices', color='red')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Historical and Predicted Potato Prices')
    plt.legend()
    plt.tight_layout()
    plt.show()

    print("Future price predictions:")
    for date, price, quantity in zip(future_dates, future_prices, future_arrival_quantities):
        print(f"{date.date()}: Price: {price:.2f}, Arrival Quantity: {quantity}")

except KeyError as e:
    print(f"Error: {e}. Please check if the 'Price' column exists in your CSV file.")
    print("Columns in test_df:", test_df.columns)
except Exception as e:
    print(f"An error occurred: {e}")
    print("Error location:", traceback.format_exc())
|