# File size: 3,873 Bytes
# (File-viewer residue removed: per-line commit hashes and the 1-117 line-number gutter.)
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Function Definitions
def analyze_data(data):
    """Render an exploratory summary of ``data`` on the Streamlit page.

    Writes the per-column missing-value counts and the ``describe()``
    summary, then, if ``data`` has any numeric columns, a heatmap of
    their pairwise correlations.

    Args:
        data: The pandas DataFrame to summarise.
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    # Correlation matrix (only meaningful for numeric columns).
    numeric_data = data.select_dtypes(include=['number'])
    if numeric_data.empty:
        return

    st.write("**Correlation Matrix:**")
    # Draw on an explicit figure instead of pyplot's implicit global one:
    # passing the ``plt`` module to ``st.pyplot`` is deprecated, and pyplot
    # keeps every figure alive until it is closed.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            numeric_data.corr(),
            annot=True,
            cmap='coolwarm',
            center=0,
            ax=ax,
        )
        st.pyplot(fig)
    finally:
        # Release the figure even if rendering fails.
        plt.close(fig)
def prepare_data(data):
    """Split ``data`` into a feature matrix and a target column.

    Only numeric columns are considered. The last numeric column is used
    as the target and all preceding numeric columns as features.

    Args:
        data: The pandas DataFrame to split.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a DataFrame with the feature
        columns and ``y`` is the target column.

    Raises:
        ValueError: If ``data`` has fewer than two numeric columns, i.e.
            there is no target or no feature left to learn from.
    """
    # Select every numeric dtype (not just int64/float64) so that e.g.
    # int32/float32 columns are not silently dropped. Timedeltas are
    # excluded explicitly because NumPy classifies them as integers.
    numeric_columns = data.select_dtypes(
        include="number", exclude=["timedelta"]
    ).columns
    if len(numeric_columns) < 2:
        raise ValueError(
            "At least two numeric columns are required (one or more features "
            f"plus a target); found {len(numeric_columns)}."
        )
    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y
def preprocess_data(X_train, X_test):
    """Standardise the train and test features with one shared scaler.

    The scaler is fitted on ``X_train`` only and then applied to both
    sets, so no statistics from the test set are used for fitting.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled, scaler)`` holding the
        transformed training data, the transformed test data and the
        fitted ``StandardScaler``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled, fitted_scaler
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Fit the regression models, report their metrics and return them.

    A linear regression and a 100-tree random forest (``random_state=42``)
    are trained on the scaled training data. For each model the train/test
    RMSE and R² are written to the Streamlit page; for the random forest
    the feature importances are additionally shown as a table and a bar
    chart.

    Args:
        X_train_scaled: Scaled training features.
        X_test_scaled: Scaled test features.
        y_train: Training targets.
        y_test: Test targets.
        feature_names: Names of the feature columns, in the same order as
            the columns of the scaled matrices.

    Returns:
        A dict mapping each model name to a dict with the keys ``'model'``
        (the fitted estimator), ``'train_rmse'``, ``'test_rmse'``,
        ``'train_r2'`` and ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }
    results = {}

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        results[name] = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred),
        }

        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {results[name]['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {results[name]['test_rmse']:.2f}")
        st.write(f"**Training R²:** {results[name]['train_r2']:.3f}")
        st.write(f"**Test R²:** {results[name]['test_r2']:.3f}")

        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                # Materialise the names so any iterable (e.g. a pandas
                # Index) is treated as plain column values.
                'Feature': list(feature_names),
                'Importance': model.feature_importances_,
            }).sort_values('Importance', ascending=False)
            st.write("**Feature Importance:**")
            st.write(feature_importance)

            # Use an explicit figure rather than pyplot's implicit global
            # one (passing ``plt`` to ``st.pyplot`` is deprecated) and
            # close it afterwards so figures do not accumulate.
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                sns.barplot(
                    x='Importance', y='Feature', data=feature_importance, ax=ax
                )
                ax.set_title('Feature Importance')
                st.pyplot(fig)
            finally:
                plt.close(fig)

    return results
def main():
    """Run the Streamlit page: upload a CSV, explore it and train models.

    After a CSV file is uploaded the page shows the first rows and the
    data analysis, splits the numeric columns into features and target,
    lets the user pick the test-set fraction, standardises the features
    and trains/evaluates the regression models. Problems with the input
    (unreadable file, too few numeric columns, too few rows) are reported
    on the page with ``st.error`` instead of an exception traceback.
    """
    st.title("Housing Price Prediction")
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        # Nothing uploaded yet; wait for the user.
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return

    st.write("## Dataset Overview")
    st.write(data.head())

    # Analyze the data
    analyze_data(data)

    # Prepare the data
    try:
        X, y = prepare_data(data)
    except (IndexError, ValueError) as exc:
        st.error(f"Could not select feature and target columns: {exc}")
        return
    if X.shape[1] == 0:
        st.error(
            "At least two numeric columns are required: "
            "one or more features plus a target."
        )
        return

    # The linear model cannot be fitted on missing values, so keep only
    # rows where every feature and the target are present.
    complete_rows = X.notna().all(axis=1) & y.notna()
    if not complete_rows.all():
        dropped = int((~complete_rows).sum())
        st.warning(
            f"Dropped {dropped} row(s) with missing numeric values before training."
        )
        X, y = X.loc[complete_rows], y.loc[complete_rows]

    # Train-test split
    test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
    except ValueError as exc:
        # Raised when there are too few rows for the requested split.
        st.error(f"Could not split the data into train and test sets: {exc}")
        return

    # Preprocess the data
    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    # Train and evaluate models
    st.write("## Model Training and Evaluation")
    train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns
    )
# Launch the app only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|