Spaces:

V8055
/

project3

Sleeping

App Files Files Community

V8055 committed on Jan 15, 2025

Commit

3e13577

verified ·

1 Parent(s): b1a1739

Create app.py

Browse files

Files changed (1) hide show

app.py +152 -0

app.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error, r2_score
+import matplotlib.pyplot as plt
+import seaborn as sns
def analyze_data(data):
    """
    Print a quick exploratory summary of ``data`` and plot its correlations.

    Prints the per-column count of missing values and the output of
    ``data.describe()``, then draws an annotated heatmap of the pairwise
    correlations between the numeric columns. When ``data`` has no numeric
    columns the heatmap is skipped and a short notice is printed instead.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        None. Output goes to stdout and to a matplotlib figure.
    """
    # Check for missing values
    print("\nMissing values:")
    print(data.isnull().sum())
    # Display statistical summary
    print("\nStatistical summary:")
    print(data.describe())
    # Correlations are only computed over the numeric columns
    numeric_data = data.select_dtypes(include=['number'])
    if numeric_data.shape[1] == 0:
        # An empty correlation matrix cannot be rendered as a heatmap: the
        # colour-range computation fails on a zero-size array.
        print("\nNo numeric columns found; skipping correlation matrix.")
        return
    # Create correlation matrix
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def prepare_data(data):
    """
    Split ``data`` into a feature matrix and a target vector.

    Only ``int64`` and ``float64`` columns are considered. The last such
    column is taken as the target; every earlier one becomes a feature.

    Args:
        data: pandas DataFrame holding the raw dataset.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a DataFrame of the feature columns
        and ``y`` is the Series of the target column.

    Raises:
        ValueError: if ``data`` has fewer than two int64/float64 columns,
            i.e. there is no target or no feature left to model.
    """
    # Identify numeric columns
    numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
    # Fail early with a clear message: zero numeric columns would otherwise
    # raise a bare IndexError, and a single one would silently yield an
    # empty feature matrix that only breaks later during scaling/fitting.
    if len(numeric_columns) < 2:
        raise ValueError(
            "prepare_data needs at least two int64/float64 columns "
            f"(features + target); found {len(numeric_columns)}."
        )
    # Separate features and target
    # Assuming the last numeric column is the price/target variable
    feature_columns = numeric_columns[:-1]
    target_column = numeric_columns[-1]
    return data[feature_columns], data[target_column]
def preprocess_data(X_train, X_test):
    """
    Standardize the train and test features with one shared StandardScaler.

    The scaler is fitted on ``X_train`` only and then applied to both
    splits, so the test split is transformed with the training statistics.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, scaler)``.
    """
    # Fit on the training split only
    standardizer = StandardScaler().fit(X_train)
    train_scaled = standardizer.transform(X_train)
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled, standardizer
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train):
    """
    Train and evaluate a linear regression and a random forest regressor.

    Each model is fitted on ``X_train_scaled`` / ``y_train`` and scored on
    both the training and the test split. For the random forest the feature
    importances are additionally printed and shown as a bar plot.

    Args:
        X_train_scaled: scaled training features used for fitting.
        X_test_scaled: scaled test features used for evaluation.
        y_train: training targets.
        y_test: test targets.
        X_train: unscaled training features; used only to label the feature
            importances. If it has no ``columns`` attribute (e.g. a NumPy
            array), generic ``feature_<i>`` labels are used instead.

    Returns:
        Dict mapping model name to a dict with the keys ``'model'``,
        ``'train_rmse'``, ``'test_rmse'``, ``'train_r2'`` and ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }
    results = {}
    for name, model in models.items():
        # Train model
        model.fit(X_train_scaled, y_train)
        # Make predictions
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)
        # Calculate metrics
        results[name] = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred)
        }
        # Feature importance for Random Forest
        if name == 'Random Forest':
            importances = model.feature_importances_
            # Fall back to positional labels when X_train carries no column
            # names, instead of failing with AttributeError.
            labels = getattr(X_train, 'columns', None)
            if labels is None:
                labels = [f"feature_{i}" for i in range(len(importances))]
            feature_importance = pd.DataFrame({
                'feature': labels,
                'importance': importances
            }).sort_values('importance', ascending=False)
            print("\nFeature Importance:")
            print(feature_importance)
            # Plot feature importance
            fig = plt.figure(figsize=(10, 6))
            sns.barplot(x='importance', y='feature', data=feature_importance)
            plt.title('Feature Importance (Random Forest)')
            plt.tight_layout()
            plt.show()
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)
    return results
def plot_predictions(model, X_test_scaled, y_test, title):
    """
    Draw a scatter plot of the model's predictions against the true values.

    A dashed red diagonal spanning the range of ``y_test`` marks where
    predicted and actual values would coincide.
    """
    predicted = model.predict(X_test_scaled)

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, predicted, alpha=0.5)

    # Reference line from (min, min) to (max, max) of the actual values
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal, 'r--', lw=2)

    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title(title)
    plt.tight_layout()
    plt.show()
def main(data):
    """
    Run the full pipeline on ``data`` and return the per-model results.

    Steps: exploratory analysis, feature/target separation, an 80/20
    train/test split (``random_state=42``), feature scaling, model training
    and evaluation, then a printed metric report and a prediction plot for
    each model.
    """
    analyze_data(data)

    features, target = prepare_data(data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    results = train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test, X_train
    )

    # Report the metrics, then plot predictions, one model at a time
    for model_name in results:
        scores = results[model_name]
        report_lines = (
            f"\n{model_name} Results:",
            f"Training RMSE: ${scores['train_rmse']:.2f}",
            f"Test RMSE: ${scores['test_rmse']:.2f}",
            f"Training R²: {scores['train_r2']:.3f}",
            f"Test R²: {scores['test_r2']:.3f}",
        )
        for line in report_lines:
            print(line)
        plot_title = f"{model_name} Predictions vs Actual Values"
        plot_predictions(scores['model'], X_test_scaled, y_test, plot_title)

    return results
# Run the analysis and modeling on a CSV file supplied through the Streamlit UI.
# The script previously called main(data) without `data` being defined
# anywhere in the file, which raised NameError on start-up.
# NOTE(review): main() reports via print()/plt.show(); presumably these do not
# render inside the Streamlit page - confirm and switch to st.* if needed.
results = None
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    results = main(data)