# app.py — by Anupam202224 (commit 08a171c, verified)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import gradio as gr
import plotly.graph_objects as go
from datetime import datetime, timedelta
import warnings
import logging
import traceback
import yfinance as yf
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PredictiveSystem:
    """Ensemble regressor that averages Random Forest and LSTM predictions.

    Pipeline: validate_data -> preprocess_data -> feature_selection ->
    train_models -> generate_predictions. Input is a DataFrame whose last
    column is the regression target; an optional 'date' column is expanded
    into numeric calendar features.
    """

    def __init__(self):
        # Fitted in train_models() and reused in generate_predictions().
        self.scaler = StandardScaler()
        self.rf_model = None
        self.lstm_model = None
        # Set by feature_selection(): DataFrame of (feature, importance).
        self.feature_importance = None

    def convert_dates(self, df):
        """Expand a 'date' column into month/day/day_of_week/is_weekend.

        Operates on a copy; the caller's DataFrame is untouched. Dates that
        fail to parse become NaT (errors='coerce'), producing NaN calendar
        features that preprocess_data() fills later. Returns the DataFrame
        with the raw 'date' column dropped (or unchanged if absent).
        """
        try:
            df = df.copy()
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df['date'], errors='coerce')
                df['month'] = df['date'].dt.month
                df['day'] = df['date'].dt.day
                df['day_of_week'] = df['date'].dt.dayofweek
                # pandas convention: Monday=0 ... Saturday=5, Sunday=6.
                df['is_weekend'] = df['date'].dt.dayofweek.isin([5, 6]).astype(int)
                # The raw datetime column is no longer needed.
                df = df.drop('date', axis=1)
            return df
        except Exception as e:
            logger.error(f"Error converting dates: {str(e)}")
            raise

    def validate_data(self, df):
        """Validate input data structure and contents.

        Returns True on success; raises ValueError describing the first
        problem found (empty data, fewer than 30 rows, fewer than 2
        columns, or non-numeric columns remaining after date expansion).
        """
        try:
            if df.empty:
                raise ValueError("The uploaded file contains no data")
            if len(df) < 30:
                raise ValueError("Dataset must contain at least 30 rows of data")
            if len(df.columns) < 2:
                raise ValueError("Dataset must contain at least 2 columns (features and target)")
            # Expand dates first so a 'date' column doesn't fail the
            # numeric-only check below.
            df = self.convert_dates(df)
            non_numeric_cols = df.select_dtypes(exclude=['number']).columns
            if len(non_numeric_cols) > 0:
                raise ValueError(f"Non-numeric columns found after date processing: {', '.join(non_numeric_cols)}. Please ensure all features are numeric.")
            return True
        except Exception as e:
            logger.error(f"Data validation error: {str(e)}")
            raise

    def preprocess_data(self, df):
        """Clean the data: expand dates, fill gaps, keep numeric columns only."""
        try:
            logger.info("Starting data preprocessing...")
            df_processed = self.convert_dates(df)
            missing_count = df_processed.isnull().sum().sum()
            if missing_count > 0:
                logger.info(f"Handling {missing_count} missing values")
                # ffill()/bfill() replace the deprecated fillna(method=...)
                # form: forward-fill first, then back-fill any leading gaps.
                df_processed = df_processed.ffill().bfill()
            # Drop any remaining non-numeric columns so the models receive
            # purely numeric input.
            numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
            df_processed = df_processed[numeric_cols]
            logger.info("Data preprocessing completed successfully")
            return df_processed
        except Exception as e:
            logger.error(f"Error in preprocessing data: {str(e)}")
            raise

    def feature_selection(self, X, y):
        """Keep the (up to) 10 most important features per a Random Forest.

        Side effect: stores the full importance ranking in
        self.feature_importance. Returns X restricted to the kept columns.
        """
        try:
            logger.info("Starting feature selection...")
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(X, y)
            self.feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': rf.feature_importances_
            }).sort_values('importance', ascending=False)
            # Cap at 10 features (or all of them when fewer exist).
            selected_features = self.feature_importance['feature'].head(
                min(10, len(X.columns))
            )
            logger.info(f"Selected {len(selected_features)} features")
            # Index with a plain list of column names for clarity.
            return X[selected_features.tolist()]
        except Exception as e:
            logger.error(f"Error in feature selection: {str(e)}")
            raise

    def train_models(self, X, y):
        """Train Random Forest + LSTM on an 80/20 split; return test metrics.

        Returns a dict with 'rf_rmse', 'rf_r2', 'lstm_rmse', 'lstm_r2'.
        Fits self.scaler, self.rf_model and self.lstm_model as side effects.
        """
        try:
            logger.info("Starting model training...")
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            # Train Random Forest
            logger.info("Training Random Forest model...")
            self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            self.rf_model.fit(X_train_scaled, y_train)
            # Train LSTM
            logger.info("Training LSTM model...")
            # LSTM expects 3-D input: (samples, timesteps=1, features).
            X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
            self.lstm_model = Sequential([
                LSTM(50, activation='relu', input_shape=(1, X_train_scaled.shape[1]), return_sequences=True),
                Dropout(0.2),
                LSTM(50, activation='relu'),
                Dense(1)
            ])
            self.lstm_model.compile(optimizer='adam', loss='mse')
            # Stop early once training loss plateaus; restore best weights.
            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='loss',
                patience=5,
                restore_best_weights=True
            )
            self.lstm_model.fit(
                X_train_lstm,
                y_train,
                epochs=50,
                batch_size=32,
                verbose=0,
                callbacks=[early_stopping]
            )
            # Calculate held-out metrics; flatten the LSTM's (n, 1) output
            # so both prediction arrays are 1-D for the metric functions.
            rf_pred = self.rf_model.predict(X_test_scaled)
            lstm_pred = self.lstm_model.predict(
                X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))
            ).flatten()
            metrics = {
                'rf_rmse': np.sqrt(mean_squared_error(y_test, rf_pred)),
                'rf_r2': r2_score(y_test, rf_pred),
                'lstm_rmse': np.sqrt(mean_squared_error(y_test, lstm_pred)),
                'lstm_r2': r2_score(y_test, lstm_pred)
            }
            logger.info("Model training completed successfully")
            return metrics
        except Exception as e:
            logger.error(f"Error in model training: {str(e)}")
            raise

    def generate_predictions(self, X):
        """Return the ensemble (mean of RF and LSTM) predictions for X.

        Requires train_models() to have been called first so the scaler
        and both models are fitted.
        """
        try:
            X_scaled = self.scaler.transform(X)
            rf_pred = self.rf_model.predict(X_scaled)
            lstm_pred = self.lstm_model.predict(
                X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
            )
            # Simple unweighted ensemble of the two models.
            final_pred = (rf_pred + lstm_pred.flatten()) / 2
            return final_pred
        except Exception as e:
            logger.error(f"Error generating predictions: {str(e)}")
            raise
def fetch_real_time_data(ticker):
    """Pull the most recent day of price history for *ticker* via yfinance."""
    try:
        return yf.Ticker(ticker).history(period="1d")
    except Exception as e:
        logger.error(f"Error fetching real-time data for {ticker}: {str(e)}")
        raise
def create_gradio_interface(predictor):
    """Build the Gradio app around *predictor*.

    The UI takes a CSV upload plus a stock ticker, runs the full
    validate/train/predict pipeline, and returns a Plotly figure and a
    text report. Returns the (not yet launched) gr.Interface.
    """
    def process_and_predict(file, ticker):
        # End-to-end handler. Any failure falls through to the except
        # block, which returns (None, diagnostic-text) instead of a plot.
        try:
            logger.info("Reading uploaded file...")
            df = pd.read_csv(file.name)
            logger.info(f"Columns in uploaded file: {', '.join(df.columns)}")
            logger.info(f"Data types: {df.dtypes}")
            # Validate and process data
            predictor.validate_data(df)
            df_processed = predictor.preprocess_data(df)
            # Convention: last column is the target, the rest are features.
            y = df_processed.iloc[:, -1]
            X = df_processed.iloc[:, :-1]
            # Feature selection and model training
            X_selected = predictor.feature_selection(X, y)
            metrics = predictor.train_models(X_selected, y)
            # Generate predictions over the full (selected) feature set.
            predictions = predictor.generate_predictions(X_selected)
            # Fetch real-time stock data for the requested ticker.
            real_time_data = fetch_real_time_data(ticker)
            # Overlay actual, predicted and live series on one chart.
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=y, name='Actual', line=dict(color='blue')))
            fig.add_trace(go.Scatter(y=predictions, name='Predicted', line=dict(color='red')))
            fig.add_trace(go.Scatter(y=real_time_data['Close'], name='Real-Time Data', line=dict(color='green')))
            fig.update_layout(
                title='Actual vs Predicted vs Real-Time Values',
                xaxis_title='Time',
                yaxis_title='Value',
                template='plotly_white'
            )
            # Format the human-readable report (flush-left so the text
            # box shows no stray leading whitespace).
            output = f"""
Model Performance Metrics:
Random Forest RMSE: {metrics['rf_rmse']:.4f}
Random Forest R²: {metrics['rf_r2']:.4f}
LSTM RMSE: {metrics['lstm_rmse']:.4f}
LSTM R²: {metrics['lstm_r2']:.4f}
Data Processing Summary:
- Total records processed: {len(df)}
- Features selected: {len(X_selected.columns)}
- Date features created: month, day, day_of_week, is_weekend
- Training completed successfully
Real-Time Data Summary:
- Ticker: {ticker}
- Last Close Price: {real_time_data['Close'].iloc[-1]:.2f}
"""
            logger.info("Analysis completed successfully")
            return fig, output
        except Exception as e:
            error_msg = f"""
Error occurred during processing:
{str(e)}
Please ensure your data:
1. Is in CSV format
2. Contains a 'date' column (will be automatically processed)
3. Contains numeric feature columns
4. Has at least 30 rows of data
5. Has both feature columns and a target column
6. Has no corrupted values
Technical details for debugging:
{traceback.format_exc()}
"""
            logger.error(f"Process failed: {str(e)}")
            return None, error_msg

    interface = gr.Interface(
        fn=process_and_predict,
        inputs=[
            gr.File(label="Upload CSV file"),
            gr.Textbox(label="Stock Ticker (e.g., AAPL)")
        ],
        outputs=[
            gr.Plot(label="Predictions Visualization"),
            gr.Textbox(label="Analysis Results", lines=10)
        ],
        title="Predictive & Prescriptive Analytics System",
        # NOTE: the original description contained garbled text plus the
        # author's personal contact details; cleaned to the format spec only.
        description="""
Upload your CSV file containing historical data and enter a stock ticker to fetch real-time data.
Required format:
- A 'date' column in any standard date format
- Numeric feature columns
- A target column (last column)
- At least 30 rows of data
The system will automatically:
- Process the date column into useful features
- Handle any missing values
- Select the most important features
- Train and evaluate the models
- Fetch and display real-time stock data
""",
        examples=[["sample_sales_data.csv", "AAPL"]]
    )
    return interface
# Script entry point: build the predictor, wire up the UI, and serve it.
if __name__ == "__main__":
    try:
        app = create_gradio_interface(PredictiveSystem())
        app.launch(share=True)
    except Exception as e:
        logger.error(f"Failed to launch interface: {str(e)}")
        raise