Spaces:

asoria
/

auto-notebook-creator

Sleeping

App Files Files Community

asoria committed on Aug 30, 2024

Commit

4dc6cd8

1 Parent(s): e62a0e5

EDA template partially finished (need to filter numerical operations)

Browse files

Files changed (2) hide show

app.py +4 -6
utils/notebook_utils.py +72 -18

app.py CHANGED Viewed

@@ -15,8 +15,8 @@ from dotenv import load_dotenv
 import os
 # TODOS:
 # 2. Add template for RAG and embeddings
-# 3. Improve templates
 load_dotenv()
@@ -112,9 +112,6 @@ def _push_to_hub(
             repo_id=NOTEBOOKS_REPOSITORY,
             repo_type="dataset",
         )
-        link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
-        logging.info(f"Notebook pushed to hub: {link}")
-        return link
     except Exception as e:
         logging.info("Failed to push notebook", e)
         raise
@@ -165,7 +162,8 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
             break
     notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
     create_notebook_file(cells, notebook_name=notebook_name)
-    notebook_link = _push_to_hub(dataset_id, notebook_name)
     yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
@@ -185,7 +183,7 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
             dataset_samples = gr.Examples(
                 examples=[
                     [
-                        "infinite-dataset-hub/WorldPopCounts",
                         "Try this dataset for Exploratory Data Analysis",
                     ],
                     [

 import os
 # TODOS:
+# 1. Add cells by data types in EDA notebook
 # 2. Add template for RAG and embeddings
 load_dotenv()
             repo_id=NOTEBOOKS_REPOSITORY,
             repo_type="dataset",
         )
     except Exception as e:
         logging.info("Failed to push notebook", e)
         raise
             break
     notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
     create_notebook_file(cells, notebook_name=notebook_name)
+    _push_to_hub(dataset_id, notebook_name)
+    notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
     yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
             dataset_samples = gr.Examples(
                 examples=[
                     [
+                        "scikit-learn/iris",
                         "Try this dataset for Exploratory Data Analysis",
                     ],
                     [

utils/notebook_utils.py CHANGED Viewed

@@ -33,15 +33,16 @@ embeggins_cells = [
 eda_cells = [
     {
         "cell_type": "markdown",
-        "source": "# Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset",
-    },
-    {
-        "cell_type": "code",
         "source": """
-from IPython.display import HTML
-display(HTML("{html_code}"))
 """,
     },
     {
         "cell_type": "code",
         "source": """
@@ -60,14 +61,18 @@ import seaborn as sns
     {
         "cell_type": "code",
         "source": """
-# 2. Load the dataset as a DataFrame using the provided code
 {first_code}
 """,
     },
     {
         "cell_type": "code",
         "source": """
-# 3. Understand the dataset structure
 print(df.head())
 print(df.info())
 print(df.describe())
@@ -76,40 +81,89 @@ print(df.describe())
     {
         "cell_type": "code",
         "source": """
-# 4. Check for missing values
 print(df.isnull().sum())
 """,
     },
     {
         "cell_type": "code",
         "source": """
-# 5. Identify data types of each column
 print(df.dtypes)
 """,
     },
     {
         "cell_type": "code",
         "source": """
-# 6. Detect duplicated rows
 print(df.duplicated().sum())
 """,
     },
     {
         "cell_type": "code",
         "source": """
-# 7. Generate descriptive statistics
 print(df.describe())
 """,
     },
     {
         "cell_type": "code",
         "source": """
-# 8. Visualize the distribution of each column.
-# TODO: Add code to visualize the distribution of each column.
-# 9. Explore relationships between columns.
-# TODO: Add code to explore relationships between columns.
-# 10. Perform correlation analysis.
-# TODO: Add code to perform correlation analysis.
 """,
     },
 ]

 eda_cells = [
     {
         "cell_type": "markdown",
         "source": """
+---
+# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
+---
 """,
     },
+    {
+        "cell_type": "markdown",
+        "source": "## 1. Setup necessary libraries and load the dataset",
+    },
     {
         "cell_type": "code",
         "source": """
     {
         "cell_type": "code",
         "source": """
+# 2. Load the dataset as a DataFrame
 {first_code}
 """,
     },
+    {
+        "cell_type": "markdown",
+        "source": "## 2. Understanding the Dataset",
+    },
     {
         "cell_type": "code",
         "source": """
+# First rows of the dataset and info
 print(df.head())
 print(df.info())
 print(df.describe())
     {
         "cell_type": "code",
         "source": """
+# Check for missing values
 print(df.isnull().sum())
 """,
     },
     {
         "cell_type": "code",
         "source": """
+# Identify data types of each column
 print(df.dtypes)
 """,
     },
     {
         "cell_type": "code",
         "source": """
+# Detect duplicated rows
 print(df.duplicated().sum())
 """,
     },
     {
         "cell_type": "code",
         "source": """
+# Generate descriptive statistics
 print(df.describe())
 """,
     },
     {
         "cell_type": "code",
         "source": """
+# Unique values in categorical columns
+df.select_dtypes(include=['object']).nunique()
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 3. Data Visualization",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Correlation matrix for numerical columns
+corr_matrix = df.corr(numeric_only=True)
+plt.figure(figsize=(10, 8))
+sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
+plt.title('Correlation Matrix')
+plt.show()
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Distribution plots for numerical columns
+for column in df.select_dtypes(include=['int64', 'float64']).columns:
+    plt.figure(figsize=(8, 4))
+    sns.histplot(df[column], kde=True)
+    plt.title(f'Distribution of {column}')
+    plt.xlabel(column)
+    plt.ylabel('Frequency')
+    plt.show()
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Count plots for categorical columns
+for column in df.select_dtypes(include=['object']).columns:
+    plt.figure(figsize=(8, 4))
+    sns.countplot(x=column, data=df)
+    plt.title(f'Count Plot of {column}')
+    plt.xlabel(column)
+    plt.ylabel('Count')
+    plt.show()
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Box plots for detecting outliers in numerical columns
+for column in df.select_dtypes(include=['int64', 'float64']).columns:
+    plt.figure(figsize=(8, 4))
+    sns.boxplot(df[column])
+    plt.title(f'Box Plot of {column}')
+    plt.xlabel(column)
+    plt.show()
 """,
     },
 ]