# NOTE(review): the three lines that were here ("Spaces:" header and two
# "Runtime error" banners) are Hugging Face Spaces page artifacts from the
# scrape, not source code; kept only as this comment so the file parses.
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.metrics import mean_squared_error, r2_score | |
| import tensorflow as tf | |
| from tensorflow.keras.models import Sequential | |
| from tensorflow.keras.layers import LSTM, Dense, Dropout | |
| import gradio as gr | |
| import plotly.graph_objects as go | |
| from datetime import datetime, timedelta | |
| import warnings | |
| import logging | |
| import traceback | |
| import yfinance as yf | |
# Set up logging
# Module-wide logger at INFO level so the pipeline's progress messages
# (preprocessing, training, etc.) are visible on the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PredictiveSystem:
    """End-to-end tabular forecasting pipeline.

    Combines a Random Forest and an LSTM (unweighted average ensemble) on a
    numeric feature matrix derived from an uploaded CSV.  Typical call
    order: validate_data -> preprocess_data -> feature_selection ->
    train_models -> generate_predictions.
    """

    def __init__(self):
        # Fitted on the training split in train_models(); reused unchanged
        # by generate_predictions() so inference sees the same scaling.
        self.scaler = StandardScaler()
        self.rf_model = None  # RandomForestRegressor, set by train_models()
        self.lstm_model = None  # keras Sequential, set by train_models()
        self.feature_importance = None  # DataFrame, set by feature_selection()

    def convert_dates(self, df):
        """Replace a 'date' column with numeric calendar features.

        Adds month, day, day_of_week (0 = Monday) and is_weekend (1 for
        Sat/Sun) columns, then drops 'date'.  Frames without a 'date'
        column are returned as an untouched copy.

        Raises:
            Exception: re-raised after logging if conversion fails.
        """
        try:
            df = df.copy()
            if 'date' in df.columns:
                # errors='coerce' maps unparseable values to NaT instead of
                # raising; resulting NaNs are filled later in preprocessing.
                df['date'] = pd.to_datetime(df['date'], errors='coerce')
                # Extract numeric datetime features.
                df['month'] = df['date'].dt.month
                df['day'] = df['date'].dt.day
                df['day_of_week'] = df['date'].dt.dayofweek
                df['is_weekend'] = df['date'].dt.dayofweek.isin([5, 6]).astype(int)
                # The raw date column is no longer needed (and would trip
                # the numeric-only checks downstream).
                df = df.drop('date', axis=1)
            return df
        except Exception as e:
            logger.error(f"Error converting dates: {str(e)}")
            raise

    def validate_data(self, df):
        """Check that *df* is usable by the pipeline; return True if so.

        Raises:
            ValueError: empty frame, fewer than 30 rows, fewer than 2
                columns, or non-numeric columns remaining after date
                processing.
        """
        try:
            if df.empty:
                raise ValueError("The uploaded file contains no data")
            if len(df) < 30:
                raise ValueError("Dataset must contain at least 30 rows of data")
            if len(df.columns) < 2:
                raise ValueError("Dataset must contain at least 2 columns (features and target)")
            # Convert dates first so a 'date' column does not trip the
            # non-numeric check below.  Validation works on a copy; callers
            # still pass the raw frame to preprocess_data().
            df = self.convert_dates(df)
            non_numeric_cols = df.select_dtypes(exclude=['number']).columns
            if len(non_numeric_cols) > 0:
                raise ValueError(f"Non-numeric columns found after date processing: {', '.join(non_numeric_cols)}. Please ensure all features are numeric.")
            return True
        except Exception as e:
            logger.error(f"Data validation error: {str(e)}")
            raise

    def preprocess_data(self, df):
        """Return a fully numeric frame: dates featurized, NaNs filled.

        Missing values are forward-filled then back-filled so leading gaps
        are covered too; any residual non-numeric columns are dropped.
        """
        try:
            logger.info("Starting data preprocessing...")
            df_processed = self.convert_dates(df)
            missing_count = df_processed.isnull().sum().sum()
            if missing_count > 0:
                logger.info(f"Handling {missing_count} missing values")
                # fillna(method=...) is deprecated (removed in pandas 2.x);
                # the dedicated ffill()/bfill() methods are the supported form.
                df_processed = df_processed.ffill().bfill()
            # Keep only numeric columns for modeling.
            numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
            df_processed = df_processed[numeric_cols]
            logger.info("Data preprocessing completed successfully")
            return df_processed
        except Exception as e:
            logger.error(f"Error in preprocessing data: {str(e)}")
            raise

    def feature_selection(self, X, y):
        """Keep (at most) the 10 most important features per a Random Forest.

        Side effect: stores the full importance ranking in
        self.feature_importance ('feature'/'importance' DataFrame, sorted
        descending).  Returns X restricted to the selected columns.
        """
        try:
            logger.info("Starting feature selection...")
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(X, y)
            self.feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': rf.feature_importances_
            }).sort_values('importance', ascending=False)
            # Cap at 10 features (or all of them when fewer exist); use a
            # plain list for unambiguous column indexing.
            selected_features = self.feature_importance['feature'].head(
                min(10, len(X.columns))
            ).tolist()
            logger.info(f"Selected {len(selected_features)} features")
            return X[selected_features]
        except Exception as e:
            logger.error(f"Error in feature selection: {str(e)}")
            raise

    def train_models(self, X, y):
        """Train the RF and LSTM models on *X*/*y*; return test metrics.

        Fits self.scaler on the training split, trains both models, and
        returns {'rf_rmse', 'rf_r2', 'lstm_rmse', 'lstm_r2'} computed on
        the held-out 20% split.
        """
        try:
            logger.info("Starting model training...")
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            # Scale data (fit on train only to avoid test leakage).
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            # Train Random Forest
            logger.info("Training Random Forest model...")
            self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            self.rf_model.fit(X_train_scaled, y_train)
            # Train LSTM: keras expects (samples, timesteps, features);
            # each row is treated as a single-step sequence.
            logger.info("Training LSTM model...")
            X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
            self.lstm_model = Sequential([
                LSTM(50, activation='relu', input_shape=(1, X_train_scaled.shape[1]), return_sequences=True),
                Dropout(0.2),
                LSTM(50, activation='relu'),
                Dense(1)
            ])
            self.lstm_model.compile(optimizer='adam', loss='mse')
            # Stop early once training loss plateaus to keep runtime bounded.
            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='loss',
                patience=5,
                restore_best_weights=True
            )
            self.lstm_model.fit(
                X_train_lstm,
                y_train,
                epochs=50,
                batch_size=32,
                verbose=0,
                callbacks=[early_stopping]
            )
            # Calculate held-out metrics.  Keras predicts shape (n, 1);
            # flatten so the metric helpers see a 1-D vector like y_test.
            rf_pred = self.rf_model.predict(X_test_scaled)
            lstm_pred = self.lstm_model.predict(
                X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))
            ).flatten()
            metrics = {
                'rf_rmse': np.sqrt(mean_squared_error(y_test, rf_pred)),
                'rf_r2': r2_score(y_test, rf_pred),
                'lstm_rmse': np.sqrt(mean_squared_error(y_test, lstm_pred)),
                'lstm_r2': r2_score(y_test, lstm_pred)
            }
            logger.info("Model training completed successfully")
            return metrics
        except Exception as e:
            logger.error(f"Error in model training: {str(e)}")
            raise

    def generate_predictions(self, X):
        """Return the averaged RF/LSTM prediction for feature matrix *X*.

        *X* must have the same columns, in the same order, as the matrix
        passed to train_models(), since self.scaler was fitted on them.
        """
        try:
            X_scaled = self.scaler.transform(X)
            rf_pred = self.rf_model.predict(X_scaled)
            lstm_pred = self.lstm_model.predict(
                X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
            )
            # Simple unweighted ensemble of the two models.
            final_pred = (rf_pred + lstm_pred.flatten()) / 2
            return final_pred
        except Exception as e:
            logger.error(f"Error generating predictions: {str(e)}")
            raise
def fetch_real_time_data(ticker):
    """Return today's price history for *ticker* via yfinance.

    Any yfinance failure is logged and then re-raised unchanged.
    """
    try:
        return yf.Ticker(ticker).history(period="1d")
    except Exception as e:
        logger.error(f"Error fetching real-time data for {ticker}: {str(e)}")
        raise
def create_gradio_interface(predictor):
    """Build and return the Gradio Interface wired to *predictor*.

    The UI accepts a CSV upload plus a stock ticker and produces a Plotly
    figure and a text report.  The Interface is returned unlaunched; the
    caller is responsible for .launch().
    """

    def process_and_predict(file, ticker):
        """Run the full pipeline on the upload; return (figure, report)."""
        try:
            # Read data.  Newer gradio versions pass the upload as a plain
            # filepath string; older ones pass a tempfile wrapper with a
            # .name attribute — accept both.
            logger.info("Reading uploaded file...")
            df = pd.read_csv(file if isinstance(file, str) else file.name)
            # Show initial data info
            logger.info(f"Columns in uploaded file: {', '.join(df.columns)}")
            logger.info(f"Data types: {df.dtypes}")
            # Validate and process data
            predictor.validate_data(df)
            df_processed = predictor.preprocess_data(df)
            # Separate features and target: the last column of the
            # processed frame is taken as the target.
            y = df_processed.iloc[:, -1]
            X = df_processed.iloc[:, :-1]
            # Feature selection and model training
            X_selected = predictor.feature_selection(X, y)
            metrics = predictor.train_models(X_selected, y)
            # Generate predictions
            predictions = predictor.generate_predictions(X_selected)
            # Overlay live quotes from yfinance for context.
            real_time_data = fetch_real_time_data(ticker)
            # Create visualization
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=y, name='Actual', line=dict(color='blue')))
            fig.add_trace(go.Scatter(y=predictions, name='Predicted', line=dict(color='red')))
            fig.add_trace(go.Scatter(y=real_time_data['Close'], name='Real-Time Data', line=dict(color='green')))
            fig.update_layout(
                title='Actual vs Predicted vs Real-Time Values',
                xaxis_title='Time',
                yaxis_title='Value',
                template='plotly_white'
            )
            # Format output
            output = f"""
Model Performance Metrics:
Random Forest RMSE: {metrics['rf_rmse']:.4f}
Random Forest R²: {metrics['rf_r2']:.4f}
LSTM RMSE: {metrics['lstm_rmse']:.4f}
LSTM R²: {metrics['lstm_r2']:.4f}

Data Processing Summary:
- Total records processed: {len(df)}
- Features selected: {len(X_selected.columns)}
- Date features created: month, day, day_of_week, is_weekend
- Training completed successfully

Real-Time Data Summary:
- Ticker: {ticker}
- Last Close Price: {real_time_data['Close'].iloc[-1]:.2f}
"""
            logger.info("Analysis completed successfully")
            return fig, output
        except Exception as e:
            # Any failure is reported back in the UI with guidance plus a
            # traceback for debugging.
            error_msg = f"""
Error occurred during processing:
{str(e)}

Please ensure your data:
1. Is in CSV format
2. Contains a 'date' column (will be automatically processed)
3. Contains numeric feature columns
4. Has at least 30 rows of data
5. Has both feature columns and a target column
6. Has no corrupted values

Technical details for debugging:
{traceback.format_exc()}
"""
            logger.error(f"Process failed: {str(e)}")
            return None, error_msg

    # NOTE: the original description string contained injected spam text
    # ("Furtur Any contact ... @gmail.com") in the "Required format" line;
    # removed so the user-facing help reads correctly.
    interface = gr.Interface(
        fn=process_and_predict,
        inputs=[
            gr.File(label="Upload CSV file"),
            gr.Textbox(label="Stock Ticker (e.g., AAPL)")
        ],
        outputs=[
            gr.Plot(label="Predictions Visualization"),
            gr.Textbox(label="Analysis Results", lines=10)
        ],
        title="Predictive & Prescriptive Analytics System",
        description="""
        Upload your CSV file containing historical data and enter a stock ticker to fetch real-time data.
        Required format:
        - A 'date' column in any standard date format
        - Numeric feature columns
        - A target column (last column)
        - At least 30 rows of data
        The system will automatically:
        - Process the date column into useful features
        - Handle any missing values
        - Select the most important features
        - Train and evaluate the models
        - Fetch and display real-time stock data
        """,
        examples=[["sample_sales_data.csv", "AAPL"]]
    )
    return interface
# Initialize and launch
if __name__ == "__main__":
    try:
        system = PredictiveSystem()
        app = create_gradio_interface(system)
        app.launch(share=True)
    except Exception as launch_err:
        logger.error(f"Failed to launch interface: {str(launch_err)}")
        raise