hardin009 committed on
Commit
572f22e
·
verified ·
1 Parent(s): eb9d2da

Upload market_ai.py

Browse files
Files changed (1) hide show
  1. market_ai.py +313 -0
market_ai.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Market AI.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1g_-stp3TgQo9X3UgKIAki9NSdkp_OiV1
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.feature_selection import SelectKBest, f_regression
16
+ from sklearn.linear_model import Ridge
17
+ from sklearn.metrics import mean_squared_error, r2_score
18
+ from sklearn.pipeline import Pipeline
19
+ from transformers import pipeline
20
+ from datetime import timedelta
21
+ import traceback
22
+
23
def load_and_preprocess_data(file_path):
    """Load a whitespace-delimited commodity price file and coerce types.

    The raw file is read as a single un-headed CSV column; each line is then
    split on whitespace into the 14 expected fields. NOTE(review): this
    assumes no individual field value contains a space — confirm against the
    real data files.

    Parameters
    ----------
    file_path : str
        Path to the data file.

    Returns
    -------
    pandas.DataFrame
        Frame with named columns, parsed dates, numeric price/weather
        columns, sorted ascending by Date.
    """
    # Read without headers; lines with no commas land in one column each.
    df = pd.read_csv(file_path, encoding='utf-8', header=None)

    columns = ['Date', 'Commodity', 'Price', 'Growing Months', 'Harvesting Months',
               'Cold Storage Availability', 'Cold Storage Capacity', 'ArrivalQuantity',
               'Temperature', 'Humidity', 'Wind direction', 'Festivals', 'Events', 'Impacts']

    # Split each raw line on whitespace into the 14 named columns.
    df = pd.DataFrame([row[0].split() for row in df.values], columns=columns)

    # Dates arrive as day/month/year strings.
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

    # Coerce the model-facing columns to numbers (invalid entries -> NaN).
    # FIX: 'Wind direction' is consumed downstream as a numeric model
    # feature (scaled, averaged), so it must be converted here as well;
    # the original left it as an object column.
    df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
    numerical_columns = ['ArrivalQuantity', 'Temperature', 'Humidity', 'Wind direction']
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Chronological order is required by the lag/rolling features built later.
    df = df.sort_values('Date')

    return df
47
+
48
# Load the train/test datasets; abort the whole script if either file is
# missing or malformed (this runs at module import time).
try:
    train_df = load_and_preprocess_data('Potato Dataset - Train Data.csv')
    test_df = load_and_preprocess_data('Potato Dataset - Test Data.csv')

    # Sanity report: schema, dtypes, sizes and a preview of each frame.
    print("Train data columns:", train_df.columns)
    print("Train data types:\n", train_df.dtypes)
    print("\nTest data columns:", test_df.columns)
    print("Test data types:\n", test_df.dtypes)

    print("\nTrain data shape:", train_df.shape)
    print("Test data shape:", test_df.shape)

    print("\nFirst few rows of train_df:")
    print(train_df.head())
    print("\nFirst few rows of test_df:")
    print(test_df.head())

except FileNotFoundError as e:
    print(f"Error: {e}. Please make sure the CSV files are in the correct location.")
    exit(1)
except Exception as e:
    # Any other failure: print the traceback, then stop.
    print(f"An error occurred: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Hugging Face sentiment pipeline. NOTE(review): with no model argument
# this downloads a default English sentiment model on first use — requires
# network access the first time it runs.
sentiment_analyzer = pipeline("sentiment-analysis")
80
+
81
def analyze_sentiment(df):
    """Attach sentiment-score columns for the Events and Impacts text.

    FIX: the original used the raw pipeline confidence score, which is
    always positive regardless of the predicted label, so strongly negative
    and strongly positive texts produced the same feature value. The score
    is now signed by the label (negative label -> negative score; presumably
    the default model emits 'POSITIVE'/'NEGATIVE' — TODO confirm).
    FIX: the old `if x` guard let NaN cells through (NaN is truthy), which
    would crash the pipeline; non-string/empty values now map to 0.0.

    Mutates `df` in place and returns it.
    """
    def _signed_score(text):
        # Only score real, non-empty strings; everything else is neutral.
        if not isinstance(text, str) or not text:
            return 0.0
        result = sentiment_analyzer(text)[0]
        score = result['score']
        return score if result['label'] == 'POSITIVE' else -score

    df['Events_Sentiment'] = df['Events'].apply(_signed_score)
    df['Impacts_Sentiment'] = df['Impacts'].apply(_signed_score)
    return df
85
+
86
# Score the free-text columns on both splits (one pipeline call per row,
# so this can be slow on large frames).
train_df = analyze_sentiment(train_df)
test_df = analyze_sentiment(test_df)

# Feature engineering
90
def engineer_features(df):
    """Add calendar and lag/rolling price features.

    Mutates `df` in place and returns it. The frame must already be sorted
    by Date for the lag/rolling columns to be meaningful; the first rows of
    those columns are NaN until a full window is available.
    """
    # Calendar features derived from the timestamp.
    dates = df['Date'].dt
    df['DayOfWeek'] = dates.dayofweek
    df['Month'] = dates.month
    df['Quarter'] = dates.quarter
    df['Year'] = dates.year

    # Lagged and rolling statistics of the price series.
    price = df['Price']
    weekly = price.rolling(window=7)
    df['PriceLag1'] = price.shift(1)
    df['PriceLag7'] = price.shift(7)
    df['PriceRollingMean7'] = weekly.mean()
    df['PriceRollingStd7'] = weekly.std()
    # Previous week's average: the 7-day mean as of the day before.
    df['PrevWeekAvgPrice'] = weekly.mean().shift(1)
    return df
101
+
102
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Model inputs: arrivals, weather, sentiment, calendar and lagged-price columns.
# NOTE(review): 'Wind direction' is never converted to numeric in
# load_and_preprocess_data, so it is still an object column here — verify the
# raw values are numeric strings, otherwise scaling below will fail.
features = ['ArrivalQuantity', 'Temperature', 'Humidity', 'Wind direction',
            'Events_Sentiment', 'Impacts_Sentiment', 'DayOfWeek', 'Month', 'Quarter', 'Year',
            'PriceLag1', 'PriceLag7', 'PriceRollingMean7', 'PriceRollingStd7', 'PrevWeekAvgPrice']

# Preview the engineered features alongside the target.
print("\nFirst few rows of train_df:")
print(train_df[features + ['Price']].head())

print("\nFirst few rows of test_df:")
print(test_df[features + ['Price']].head())

# Drop rows with any missing feature (the earliest rows always drop out
# because the lag/rolling features are NaN there), and align targets by index.
X = train_df[features].dropna()
y = train_df['Price'].loc[X.index]

X_test = test_df[features].dropna()
y_test = test_df['Price'].loc[X_test.index]

# Report sizes after the NaN filtering.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Hold out 20% of the training rows for validation.
# NOTE(review): this is a random split of time-series rows, so validation
# rows can precede training rows — confirm leakage is acceptable here.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Modeling pipeline: scale -> univariate feature selection -> ridge regression.
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(f_regression)),
    ('model', Ridge())
])

# Search space: number of selected features and ridge regularization strength.
param_grid = {
    'feature_selection__k': [5, 10, 15],
    'model__alpha': np.logspace(-4, 1, 50)
}
144
+
145
+ # Function to train and evaluate the model
146
def train_and_evaluate_model(pipeline, param_grid, X_train, y_train, X_val, y_val):
    """Tune the pipeline with randomized search and score the best fit.

    Runs 50 sampled configurations with 5-fold CV on the training split,
    then evaluates the refitted best estimator on the validation split.

    Returns
    -------
    tuple
        (best_estimator, validation MSE, validation R^2, best params dict).
    """
    search = RandomizedSearchCV(pipeline, param_grid, n_iter=50, cv=5,
                                n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)

    best_estimator = search.best_estimator_
    predictions = best_estimator.predict(X_val)
    return (best_estimator,
            mean_squared_error(y_val, predictions),
            r2_score(y_val, predictions),
            search.best_params_)
156
+
157
# Train and evaluate the Ridge model
print("Training Ridge model...")
best_model, mse, r2, best_params = train_and_evaluate_model(
    ridge_pipeline, param_grid, X_train, y_train, X_val, y_val
)
print(f"Ridge - MSE: {mse:.4f}, R2: {r2:.4f}")
print(f"Best parameters: {best_params}\n")

# Evaluate the tuned model on the held-out test file.
y_pred_test = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

print(f"\nTest MSE: {test_mse:.4f}")
print(f"Test R2: {test_r2:.4f}")

# Scatter of actual vs predicted with a y = x reference line.
plt.figure(figsize=(12, 6))
plt.scatter(y_test, y_pred_test, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Potato Prices')
plt.tight_layout()
plt.show()

# Residuals vs predictions (should scatter evenly around zero).
residuals = y_test - y_pred_test
plt.figure(figsize=(12, 6))
plt.scatter(y_pred_test, residuals, alpha=0.5)
plt.hlines(y=0, xmin=y_pred_test.min(), xmax=y_pred_test.max(), colors='r', linestyles='--')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.show()

# Persist the fitted pipeline for reuse.
import joblib
joblib.dump(best_model, 'best_potato_price_model_Ridge.joblib')
print("Best model saved as 'best_potato_price_model_Ridge.joblib'")
198
+
199
+ # Function to generate future features
200
def generate_future_features(last_date, num_days, last_known_price, arrival_quantities):
    """Build a model-input frame for the num_days days after last_date.

    Calendar fields come from the projected dates; weather fields fall back
    to the test-set column means (reads the module-level X_test); sentiment
    is neutral; price-derived fields are seeded with last_known_price and
    refined iteratively by the caller. `arrival_quantities` must have
    exactly num_days entries. Returns the columns in the module-level
    `features` order.
    """
    start = last_date + timedelta(days=1)
    future_df = pd.DataFrame({
        'Date': [start + timedelta(days=offset) for offset in range(num_days)]
    })

    # Calendar features derived from the projected dates.
    future_df['DayOfWeek'] = future_df['Date'].dt.dayofweek
    future_df['Month'] = future_df['Date'].dt.month
    future_df['Quarter'] = future_df['Date'].dt.quarter
    future_df['Year'] = future_df['Date'].dt.year

    # Caller-supplied arrival scenario, one value per future day.
    future_df['ArrivalQuantity'] = arrival_quantities

    # No forecasts available: reuse average observed weather, neutral sentiment.
    for column in ('Temperature', 'Humidity', 'Wind direction'):
        future_df[column] = X_test[column].mean()
    future_df['Events_Sentiment'] = 0
    future_df['Impacts_Sentiment'] = 0

    # Seed every price-derived feature from the last observed price;
    # no spread is assumed, so the rolling std starts at zero.
    for column in ('PriceLag1', 'PriceLag7', 'PriceRollingMean7', 'PrevWeekAvgPrice'):
        future_df[column] = last_known_price
    future_df['PriceRollingStd7'] = 0

    return future_df[features]
228
+
229
+ # Function to predict future prices
230
def predict_future_prices(model, last_date, num_days, last_known_price, arrival_quantities):
    """Iteratively forecast prices for num_days days after last_date.

    Each day's prediction is fed back into the lag/rolling features of the
    next day's row, so errors compound over the horizon. Depends on
    generate_future_features (and, through it, the module-level X_test and
    features).

    Returns
    -------
    numpy.ndarray
        Predicted prices, one per future day.

    Raises
    ------
    ValueError
        If the generated feature frame width does not match what the
        fitted pipeline was trained on.
    """
    future_features = generate_future_features(last_date, num_days, last_known_price, arrival_quantities)

    # Sanity-check the frame width against the fitted pipeline.
    # FIX: the original "adjusted" a mismatch by calling
    # feature_selection.transform(), which returns a plain ndarray — the
    # .iloc/.columns updates below would then raise AttributeError. A width
    # mismatch means the feature list is out of sync with the model, so
    # fail loudly and early instead.
    n_features_model = model.named_steps['feature_selection'].n_features_in_
    if future_features.shape[1] != n_features_model:
        raise ValueError(
            f"Feature frame has {future_features.shape[1]} columns but the "
            f"model was fitted on {n_features_model}; regenerate features "
            "with the same feature list used for training."
        )

    future_prices = []
    col = future_features.columns.get_loc  # column name -> position

    for i in range(num_days):
        price = model.predict(future_features.iloc[[i]])[0]
        future_prices.append(price)

        # Feed the new prediction into the next day's lag/rolling features.
        if i < num_days - 1:
            nxt = i + 1
            future_features.iloc[nxt, col('PriceLag1')] = price
            if i >= 6:
                # Seven predictions exist, so the 7-day lag can come from them;
                # earlier rows keep their last_known_price seed.
                future_features.iloc[nxt, col('PriceLag7')] = future_prices[i - 6]
            # Rolling stats over the last up-to-7 predicted prices.
            window = future_prices[max(0, i - 6):i + 1]
            future_features.iloc[nxt, col('PriceRollingMean7')] = np.mean(window)
            future_features.iloc[nxt, col('PriceRollingStd7')] = np.std(window)
            future_features.iloc[nxt, col('PrevWeekAvgPrice')] = np.mean(window)

    return np.array(future_prices)
263
+
264
# Forecast the next 30 days from the end of the test data and plot it.
try:
    last_date = test_df['Date'].max()
    print("Debug: last_date retrieved successfully")

    last_known_price = test_df['Price'].iloc[-1]
    print("Debug: last_known_price retrieved successfully")

    num_days_to_predict = 30

    print("Last date:", last_date)
    print("Last known price:", last_known_price)

    # Show which features the tuned pipeline actually kept.
    print("Best model steps:", best_model.named_steps.keys())
    print("Feature selection k:", best_model.named_steps['feature_selection'].k)
    print("Selected features:", [features[i] for i in best_model.named_steps['feature_selection'].get_support(indices=True)])

    # Placeholder arrival scenario: uniform random within the observed test
    # range. NOTE(review): unseeded randomness makes each run's forecast
    # different — confirm this is intended.
    future_arrival_quantities = np.random.randint(
        low=X_test['ArrivalQuantity'].min(),
        high=X_test['ArrivalQuantity'].max(),
        size=num_days_to_predict
    )

    future_prices = predict_future_prices(best_model, last_date, num_days_to_predict, last_known_price, future_arrival_quantities)
    print("Debug: future_prices calculated successfully")

    # Historical series plus the 30-day forecast on one chart.
    future_dates = [last_date + timedelta(days=i) for i in range(1, num_days_to_predict + 1)]
    plt.figure(figsize=(12, 6))
    plt.plot(test_df['Date'], test_df['Price'], label='Historical Prices')
    plt.plot(future_dates, future_prices, label='Predicted Prices', color='red')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Historical and Predicted Potato Prices')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Tabular dump of the forecast alongside the assumed arrivals.
    print("Future price predictions:")
    for date, price, quantity in zip(future_dates, future_prices, future_arrival_quantities):
        print(f"{date.date()}: Price: {price:.2f}, Arrival Quantity: {quantity}")

except KeyError as e:
    print(f"Error: {e}. Please check if the 'Price' column exists in your CSV file.")
    print("Columns in test_df:", test_df.columns)
except Exception as e:
    # Best-effort diagnostics; the script ends here either way.
    print(f"An error occurred: {e}")
    print("Error location:", traceback.format_exc())