# project3/app.py — Streamlit housing-price prediction app
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Function Definitions
def analyze_data(data):
    """Render a basic exploratory analysis of *data* in the Streamlit app.

    Shows missing-value counts, the ``describe()`` summary, and — when the
    frame has at least one numeric column — an annotated correlation heatmap.

    Args:
        data: pandas DataFrame uploaded by the user.
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    # Correlation matrix over numeric columns only; skipped if none exist.
    numeric_data = data.select_dtypes(include=['number'])
    if not numeric_data.empty:
        st.write("**Correlation Matrix:**")
        # Use an explicit Figure: st.pyplot(plt) with the global pyplot
        # module is deprecated and leaks figures across Streamlit reruns.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm',
                    center=0, ax=ax)
        st.pyplot(fig)
        plt.close(fig)  # release the figure to avoid memory growth
def prepare_data(data):
    """Split *data* into a feature matrix and a target series.

    Convention (unchanged from the original): all numeric columns except the
    last are features; the last numeric column is the prediction target.
    Uses the generic 'number' dtype selector so int32/float32 columns are
    included as well, matching analyze_data.

    Args:
        data: pandas DataFrame.

    Returns:
        (X, y): feature DataFrame and target Series.

    Raises:
        ValueError: if the frame has fewer than two numeric columns, since
            then no feature/target split is possible.
    """
    numeric_columns = data.select_dtypes(include=['number']).columns
    if len(numeric_columns) < 2:
        raise ValueError(
            "Dataset must contain at least two numeric columns "
            "(features + target); found "
            f"{len(numeric_columns)}."
        )
    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y
def preprocess_data(X_train, X_test):
    """Standardize features to zero mean / unit variance.

    The scaler is fitted on the training split only, then the same
    transform is applied to the test split (no data leakage).

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.

    Returns:
        (X_train_scaled, X_test_scaled, scaler): the two scaled arrays and
        the fitted StandardScaler, kept for inverse transforms / reuse.
    """
    standardizer = StandardScaler()
    return (
        standardizer.fit_transform(X_train),
        standardizer.transform(X_test),
        standardizer,
    )
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Fit baseline regressors and report their metrics in the Streamlit app.

    Trains a Linear Regression and a Random Forest (100 trees, fixed seed)
    on the scaled training data, writes train/test RMSE and R² for each,
    and for the Random Forest additionally shows a feature-importance
    table and bar chart.

    Args:
        X_train_scaled, X_test_scaled: standardized feature matrices.
        y_train, y_test: target values for each split.
        feature_names: column labels matching the feature matrices.

    Returns:
        dict mapping model name -> {'model', 'train_rmse', 'test_rmse',
        'train_r2', 'test_r2'}.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }
    results = {}
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)
        results[name] = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred),
        }
        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {results[name]['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {results[name]['test_rmse']:.2f}")
        st.write(f"**Training R²:** {results[name]['train_r2']:.3f}")
        st.write(f"**Test R²:** {results[name]['test_r2']:.3f}")

        if name == 'Random Forest':
            # Tree ensembles expose per-feature importances; show them sorted.
            feature_importance = pd.DataFrame({
                'Feature': feature_names,
                'Importance': model.feature_importances_,
            }).sort_values('Importance', ascending=False)
            st.write("**Feature Importance:**")
            st.write(feature_importance)
            # Use an explicit Figure: st.pyplot(plt) with the global pyplot
            # module is deprecated and leaks figures across Streamlit reruns.
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
            ax.set_title('Feature Importance')
            st.pyplot(fig)
            plt.close(fig)  # release the figure to avoid memory growth
    return results
def main():
    """Streamlit entry point: upload a CSV, explore it, and fit models."""
    st.title("Housing Price Prediction")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if not uploaded_file:
        # Nothing to do until the user supplies a dataset.
        return

    data = pd.read_csv(uploaded_file)
    st.write("## Dataset Overview")
    st.write(data.head())

    # Exploratory analysis of the uploaded frame.
    analyze_data(data)

    # Feature/target split (last numeric column is the target).
    X, y = prepare_data(data)

    # Hold-out split; the test fraction is chosen interactively.
    test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Standardize features, fitting only on the training split.
    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    st.write("## Model Training and Evaluation")
    train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns
    )
# Run the app only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()