Spaces:

V8055
/

project3

Sleeping

App Files Community

V8055 committed on Jan 15, 2025

Commit

d99f5b5

verified ·

1 Parent(s): cc93377

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -126

app.py CHANGED Viewed

@@ -9,144 +9,89 @@ from sklearn.metrics import mean_squared_error, r2_score
 import matplotlib.pyplot as plt
 import seaborn as sns
-def analyze_data(data):
-    """
-    Perform initial data analysis
-    """
-    # Check for missing values
-    print("\nMissing values:")
-    print(data.isnull().sum())
-    # Display statistical summary
-    print("\nStatistical summary:")
-    print(data.describe())
-    # Visualize distribution of target variable
-    numeric_data = data.select_dtypes(include=['number'])
-    # Create correlation matrix
-    plt.figure(figsize=(10, 8))
-    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
-    plt.title('Correlation Matrix')
-    plt.tight_layout()
-    plt.show()
-def prepare_data(data):
-    """
-    Prepare the data for modeling
-    """
-    # Identify numeric columns
-    numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
-    # Separate features and target
-    # Assuming the last column is the price/target variable
-    X = data[numeric_columns[:-1]]
-    y = data[numeric_columns[-1]]
-    return X, y
-def preprocess_data(X_train, X_test):
-    """
-    Scale the features using StandardScaler
-    """
     scaler = StandardScaler()
     X_train_scaled = scaler.fit_transform(X_train)
     X_test_scaled = scaler.transform(X_test)
-    return X_train_scaled, X_test_scaled, scaler
-def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train):
-    """
-    Train and evaluate multiple models
-    """
-    models = {
-        'Linear Regression': LinearRegression(),
-        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
-    }
-    results = {}
-    for name, model in models.items():
-        # Train model
-        model.fit(X_train_scaled, y_train)
-        # Make predictions
-        train_pred = model.predict(X_train_scaled)
-        test_pred = model.predict(X_test_scaled)
-        # Calculate metrics
-        results[name] = {
-            'model': model,
-            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
-            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
-            'train_r2': r2_score(y_train, train_pred),
-            'test_r2': r2_score(y_test, test_pred)
-        }
-        # Feature importance for Random Forest
-        if name == 'Random Forest':
-            feature_importance = pd.DataFrame({
-                'feature': X_train.columns,
-                'importance': model.feature_importances_
-            }).sort_values('importance', ascending=False)
-            print(f"\nFeature Importance:")
-            print(feature_importance)
-            # Plot feature importance
-            plt.figure(figsize=(10, 6))
-            sns.barplot(x='importance', y='feature', data=feature_importance)
-            plt.title('Feature Importance (Random Forest)')
-            plt.tight_layout()
-            plt.show()
-    return results
-def plot_predictions(model, X_test_scaled, y_test, title):
-    """
-    Plot actual vs predicted values
-    """
-    predictions = model.predict(X_test_scaled)
-    plt.figure(figsize=(10, 6))
-    plt.scatter(y_test, predictions, alpha=0.5)
-    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
-    plt.xlabel('Actual Prices')
-    plt.ylabel('Predicted Prices')
-    plt.title(title)
-    plt.tight_layout()
-    plt.show()
-def main(data):
-    # Analyze the data
-    analyze_data(data)
-    # Prepare the data
-    X, y = prepare_data(data)
-    # Split the data
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-    # Preprocess the data
-    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)
-    # Train and evaluate models
-    results = train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train)
-    # Print results
-    for name, metrics in results.items():
-        print(f"\n{name} Results:")
-        print(f"Training RMSE: ${metrics['train_rmse']:.2f}")
-        print(f"Test RMSE: ${metrics['test_rmse']:.2f}")
-        print(f"Training R²: {metrics['train_r2']:.3f}")
-        print(f"Test R²: {metrics['test_r2']:.3f}")
-        # Plot predictions
-        plot_predictions(metrics['model'], X_test_scaled, y_test, f"{name} Predictions vs Actual Values")
-    return results
-# Run the analysis and modeling
-results = main(data)

 import matplotlib.pyplot as plt
 import seaborn as sns
+# Streamlit setup
+st.title("ML Model Training and Evaluation App")
+st.write("This app allows you to upload data, analyze it, train ML models, and visualize results.")
+# Upload dataset
+uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+# Sidebar settings
+test_size = st.sidebar.slider("Test Size (Train/Test Split)", 0.1, 0.5, 0.2)
+random_state = st.sidebar.number_input("Random State", min_value=0, max_value=100, value=42)
+models_to_train = st.sidebar.multiselect(
+    "Select Models to Train",
+    ["Linear Regression", "Random Forest"],
+    ["Linear Regression", "Random Forest"]
+)
+if uploaded_file:
+    # Load the dataset
+    data = pd.read_csv(uploaded_file)
+    st.write("Dataset Preview:")
+    st.dataframe(data.head())
+    # Analyze the data
+    if st.checkbox("Show Data Analysis"):
+        st.write("Missing Values:")
+        st.write(data.isnull().sum())
+        st.write("Statistical Summary:")
+        st.write(data.describe())
+        st.write("Correlation Matrix:")
+        numeric_data = data.select_dtypes(include=['number'])
+        plt.figure(figsize=(10, 8))
+        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
+        st.pyplot()
+    # Prepare the data
+    X, y = data.iloc[:, :-1], data.iloc[:, -1]
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
+    # Scale the data
     scaler = StandardScaler()
     X_train_scaled = scaler.fit_transform(X_train)
     X_test_scaled = scaler.transform(X_test)
+    # Train and evaluate models
+    if st.button("Train Models"):
+        results = {}
+        if "Linear Regression" in models_to_train:
+            lr = LinearRegression()
+            lr.fit(X_train_scaled, y_train)
+            y_pred_train = lr.predict(X_train_scaled)
+            y_pred_test = lr.predict(X_test_scaled)
+            results["Linear Regression"] = {
+                "Train RMSE": np.sqrt(mean_squared_error(y_train, y_pred_train)),
+                "Test RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
+                "Train R²": r2_score(y_train, y_pred_train),
+                "Test R²": r2_score(y_test, y_pred_test)
+            }
+        if "Random Forest" in models_to_train:
+            rf = RandomForestRegressor(random_state=random_state, n_estimators=100)
+            rf.fit(X_train_scaled, y_train)
+            y_pred_train = rf.predict(X_train_scaled)
+            y_pred_test = rf.predict(X_test_scaled)
+            results["Random Forest"] = {
+                "Train RMSE": np.sqrt(mean_squared_error(y_train, y_pred_train)),
+                "Test RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
+                "Train R²": r2_score(y_train, y_pred_train),
+                "Test R²": r2_score(y_test, y_pred_test)
+            }
+        st.write("Model Results:")
+        st.json(results)
+        # Optional: Plot actual vs predicted for Random Forest
+        if "Random Forest" in results:
+            plt.figure(figsize=(8, 6))
+            plt.scatter(y_test, rf.predict(X_test_scaled), alpha=0.5)
+            plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
+            plt.xlabel("Actual")
+            plt.ylabel("Predicted")
+            plt.title("Random Forest: Actual vs Predicted")
+            st.pyplot()