Spaces:
Build error
Build error
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "id": "cell-1", | |
| "source": [ | |
| "# Data Science Analysis Notebook\n", | |
| "\n", | |
| "This notebook contains some example Python code for data analysis." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "id": "cell-2", | |
| "source": [ | |
| "# Third-party libraries used throughout the analysis\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import seaborn as sns\n", | |
| "\n", | |
| "# Apply seaborn's whitegrid theme, then render figures inline\n", | |
| "sns.set(style='whitegrid')\n", | |
| "%matplotlib inline" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "id": "cell-3", | |
| "source": [ | |
| "# Read the housing CSV into a DataFrame\n", | |
| "df = pd.read_csv('housing_data.csv')\n", | |
| "\n", | |
| "# Report the (rows, columns) shape, then preview the first rows\n", | |
| "print(f\"Dataset shape: {df.shape}\")\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "id": "cell-4", | |
| "source": [ | |
| "# Perform data cleaning\n", | |
| "# Fill missing values: median for numeric columns, most frequent value otherwise.\n", | |
| "# The filled Series is assigned back to df; calling fillna(inplace=True) on\n", | |
| "# df[column] can act on a temporary copy and leave df unchanged (pandas Copy-on-Write).\n", | |
| "for column in df.columns:\n", | |
| "    if not df[column].isna().any():\n", | |
| "        continue  # nothing to fill\n", | |
| "    if pd.api.types.is_numeric_dtype(df[column]):\n", | |
| "        fill_value = df[column].median()\n", | |
| "    else:\n", | |
| "        modes = df[column].mode()\n", | |
| "        if modes.empty:\n", | |
| "            continue  # column is entirely missing, so there is no mode to fill with\n", | |
| "        fill_value = modes.iloc[0]\n", | |
| "    df[column] = df[column].fillna(fill_value)\n", | |
| "\n", | |
| "# Check for remaining missing values\n", | |
| "print(\"Missing values after cleaning:\")\n", | |
| "print(df.isnull().sum())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "id": "cell-5", | |
| "source": [ | |
| "# Exploratory data analysis\n", | |
| "# Pairwise correlations between the float64/int64 columns\n", | |
| "numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n", | |
| "correlation_matrix = df[numeric_columns].corr()\n", | |
| "\n", | |
| "# Draw the matrix as an annotated heatmap on an explicit Axes\n", | |
| "fig, ax = plt.subplots(figsize=(12, 10))\n", | |
| "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)\n", | |
| "ax.set_title('Correlation Matrix of Numeric Features', fontsize=18)\n", | |
| "plt.xticks(rotation=45, ha='right')\n", | |
| "plt.tight_layout()\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "id": "cell-6", | |
| "source": [ | |
| "# Feature engineering\n", | |
| "# NOTE: this cell overwrites df in place; re-run from the load cell before executing it again,\n", | |
| "# otherwise the ratios below would be computed from already scaled columns.\n", | |
| "from sklearn.preprocessing import StandardScaler\n", | |
| "\n", | |
| "\n", | |
| "def _safe_ratio(numerator, denominator):\n", | |
| "    \"\"\"Return numerator / denominator, using the median ratio where the denominator is zero.\"\"\"\n", | |
| "    # where() turns zero denominators into NaN, so the division cannot produce inf\n", | |
| "    ratio = numerator / denominator.where(denominator != 0)\n", | |
| "    return ratio.fillna(ratio.median())\n", | |
| "\n", | |
| "\n", | |
| "# Create new features\n", | |
| "if 'bedrooms' in df.columns and 'total_rooms' in df.columns:\n", | |
| "    df['bedrooms_ratio'] = _safe_ratio(df['bedrooms'], df['total_rooms'])\n", | |
| "\n", | |
| "if 'total_rooms' in df.columns and 'households' in df.columns:\n", | |
| "    df['rooms_per_household'] = _safe_ratio(df['total_rooms'], df['households'])\n", | |
| "\n", | |
| "# Scale numeric features\n", | |
| "# The column list is rebuilt here so the features engineered above are scaled too.\n", | |
| "# median_house_value is left unscaled so errors measured against it stay in its original units.\n", | |
| "columns_to_scale = [\n", | |
| "    column for column in df.select_dtypes(include='number').columns\n", | |
| "    if column != 'median_house_value'\n", | |
| "]\n", | |
| "if columns_to_scale:\n", | |
| "    scaler = StandardScaler()\n", | |
| "    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])\n", | |
| "\n", | |
| "# Display transformed data\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "id": "cell-7", | |
| "source": [ | |
| "# Build a simple prediction model\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "from sklearn.linear_model import LinearRegression\n", | |
| "from sklearn.metrics import mean_squared_error, r2_score\n", | |
| "\n", | |
| "# Assume we're predicting median_house_value\n", | |
| "if 'median_house_value' in df.columns:\n", | |
| "    # Prepare features and target\n", | |
| "    # LinearRegression cannot fit text columns, so keep only the numeric predictors\n", | |
| "    X = df.drop('median_house_value', axis=1).select_dtypes(include='number')\n", | |
| "    y = df['median_house_value']\n", | |
| "\n", | |
| "    # Split the data (fixed random_state keeps the split reproducible)\n", | |
| "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", | |
| "\n", | |
| "    # Train the model\n", | |
| "    model = LinearRegression()\n", | |
| "    model.fit(X_train, y_train)\n", | |
| "\n", | |
| "    # Make predictions\n", | |
| "    y_pred = model.predict(X_test)\n", | |
| "\n", | |
| "    # Evaluate the model\n", | |
| "    mse = mean_squared_error(y_test, y_pred)\n", | |
| "    r2 = r2_score(y_test, y_pred)\n", | |
| "\n", | |
| "    print(f\"Mean Squared Error: {mse:.2f}\")\n", | |
| "    print(f\"R² Score: {r2:.2f}\")\n", | |
| "\n", | |
| "    # Plot actual vs predicted values; the dashed diagonal marks perfect predictions\n", | |
| "    plt.figure(figsize=(10, 6))\n", | |
| "    plt.scatter(y_test, y_pred, alpha=0.5)\n", | |
| "    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n", | |
| "    plt.xlabel('Actual Values')\n", | |
| "    plt.ylabel('Predicted Values')\n", | |
| "    plt.title('Actual vs Predicted Values')\n", | |
| "    plt.tight_layout()\n", | |
| "    plt.show()\n", | |
| "else:\n", | |
| "    # Make the skip visible instead of leaving the cell silently empty\n", | |
| "    print(\"Column 'median_house_value' not found; skipping model training.\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } | |