GivingTuesday
/

localities_LNI

English

Model card Files Files and versions

xet

Community

hassaanulhaq01 committed on Dec 3, 2025

Commit

f6f26a4

verified ·

1 Parent(s): e6dcca9

Add interactive schedule_o notebook from Databricks

Browse files

Files changed (1) hide show

notebooks/NP03_schedule_I.ipynb +540 -266

notebooks/NP03_schedule_I.ipynb CHANGED Viewed

@@ -1,27 +1,90 @@
 {
  "cells": [
   {
-   "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "939230d3-02ed-43f2-a10a-e983c2c23964",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
    "source": [
-    "#Funding organizations - 990 Filers\n",
     "\n",
-    "Schedule I is completed by organizations who answer \"Yes\" on Form 990, Part IV, line 21 or 22.\n",
     "\n",
-    "Question 21:  \n",
-    "\"Did the organization report more than $5,000 of grants or other assistance to any domestic organization or domestic government on Part IX, column (A), line 1? If “Yes,” complete Schedule I, Parts I and II\"\n",
     "\n",
-    "Question 22:  \n",
-    "\"Did the organization report more than $5,000 of grants or other assistance to or for domestic individuals on Part IX, column (A), line 2? If “Yes,” complete Schedule I, Parts I and III\""
    ]
   },
   {
@@ -34,7 +97,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "9dc61c58-ae98-4c6a-b8a2-cfd67b848b61",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -43,7 +106,12 @@
    "outputs": [],
    "source": [
     "from pyspark.sql import functions as F\n",
-    "from pyspark.sql.window import Window"
    ]
   },
   {
@@ -56,7 +124,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "d052b039-2f34-48ce-851e-416a15697695",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -64,24 +132,62 @@
    },
    "outputs": [],
    "source": [
-    "scheduleigrantsp2 = spark.table(\"prod_curated.irs.scheduleipart2grants\")\n",
-    "form990cn120fields = spark.table(\"prod_curated.irs.990standardfields\")"
    ]
   },
   {
-   "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "71943bff-f143-4481-82b7-8dfc0c3aa965",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
    "source": [
-    "##Data checks"
    ]
   },
   {
@@ -89,28 +195,20 @@
    "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "29735c68-9c07-49b7-9a37-816d82de393b",
-     "showTitle": true,
      "tableResultSettingsMap": {},
-     "title": "Check for non-unique filer state records within a tax year"
     }
    },
    "outputs": [],
    "source": [
-    "# filer_states = (\n",
-    "#     form990cn120fields\n",
-    "#     .select(\n",
-    "#         'FILEREIN',\n",
-    "#         'TAXYEAR',\n",
-    "#         'FILERUSSTATE',\n",
-    "#     )\n",
-    "#     .distinct()\n",
-    "#     .groupBy('FILEREIN', 'TAXYEAR')\n",
-    "#     .agg(F.countDistinct('FILERUSSTATE').alias('state_count'))\n",
-    "#     .filter(F.col('state_count') > 1)\n",
-    "# )"
    ]
   },
   {
@@ -123,28 +221,25 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "e8dea51f-9016-4a6d-b199-52d5509c6f32",
      "showTitle": true,
      "tableResultSettingsMap": {},
-     "title": "Check for non-unique filer city records within a tax year"
     }
    },
    "outputs": [],
    "source": [
-    "# filer_cities = (\n",
-    "#     form990cn120fields\n",
-    "#     .select(\n",
-    "#         'FILEREIN',\n",
-    "#         'TAXYEAR',\n",
-    "#         'FILERUSCITY',\n",
-    "#     )\n",
-    "#     .distinct()\n",
-    "#     .groupBy('FILEREIN', 'TAXYEAR')\n",
-    "#     .agg(F.countDistinct('FILERUSCITY').alias('state_count'))\n",
-    "#     .filter(F.col('state_count') > 1)\n",
-    "# )\n",
     "\n",
-    "# display(filer_cities)"
    ]
   },
   {
@@ -157,29 +252,15 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "d928d099-3b12-4c27-af21-bb85f83245d4",
      "showTitle": true,
      "tableResultSettingsMap": {},
-     "title": "Check recipient state counts"
     }
    },
    "outputs": [],
    "source": [
-    "display(\n",
-    "    scheduleigrantsp2_cl\n",
-    "    .groupBy(\n",
-    "        'RECTABADDSTA'\n",
-    "    ).agg(\n",
-    "        F.count('*')\n",
-    "    )\n",
-    ")\n",
-    "\n",
-    "# display(\n",
-    "#     scheduleigrantsp2_cl\n",
-    "#     .filter(\n",
-    "#         F.col('RECTABADDSTA') == 'AA'\n",
-    "#     )\n",
-    "# )"
    ]
   },
   {
@@ -192,7 +273,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "29713546-0dcb-4b5f-8717-0232b4f617d0",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -200,7 +281,23 @@
    },
    "outputs": [],
    "source": [
-    "display(scheduleigrantsp2.filter(F.col('RECTABADDSTA').isNull()))"
    ]
   },
   {
@@ -208,46 +305,67 @@
    "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "9c2e202d-cb25-4753-8b44-0adfa0a97132",
      "showTitle": true,
      "tableResultSettingsMap": {},
-     "title": "Check overlap of null grant amounts versus null state codes"
     }
    },
    "outputs": [],
    "source": [
-    "# cross_section_counts = (\n",
-    "#     scheduleigrantsp2_cl\n",
-    "#     .select(\n",
-    "#         F.when(F.col('RECTABADDSTA').isNull(), 'NULL').otherwise('NON_NULL').alias('RECTABADDSTA_status'),\n",
-    "#         F.when(F.col('RETAAMOFCAGR').isNull(), 'NULL').otherwise('NON_NULL').alias('RETAAMOFCAGR_status')\n",
-    "#     )\n",
-    "#     .groupBy(\n",
-    "#         'RECTABADDSTA_status', 'RETAAMOFCAGR_status'\n",
-    "#     ).agg(\n",
-    "#         F.count('*').alias('count')\n",
-    "#     )\n",
     "# )\n",
     "\n",
-    "# display(cross_section_counts)"
    ]
   },
   {
-   "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "da3ea6fa-5d11-437e-8705-824ec6ea5d5a",
-     "showTitle": false,
      "tableResultSettingsMap": {},
-     "title": ""
     }
    },
    "source": [
-    "##Schedule I Filers"
    ]
   },
   {
@@ -260,7 +378,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "253b21d0-e9c6-4eea-b74a-942a96192e7f",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -268,14 +386,7 @@
    },
    "outputs": [],
    "source": [
-    "filer_states = (\n",
-    "    form990cn120fields\n",
-    "    .select(\n",
-    "        'FILEREIN',\n",
-    "        'TAXYEAR',\n",
-    "        'FILERUSSTATE',\n",
-    "    ).distinct()\n",
-    ")"
    ]
   },
   {
@@ -288,7 +399,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "ae49acb7-fc82-4b25-8644-2f1798ac70aa",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -296,14 +407,24 @@
    },
    "outputs": [],
    "source": [
-    "filer_cities = (\n",
-    "    form990cn120fields\n",
-    "    .select(\n",
-    "        'FILEREIN',\n",
-    "        'TAXYEAR',\n",
-    "        'FILERUSCITY',\n",
-    "    ).distinct()\n",
-    ")"
    ]
   },
   {
@@ -316,7 +437,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "952f0e50-716e-42cc-abb7-b88619c7fb86",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -324,18 +445,25 @@
    },
    "outputs": [],
    "source": [
-    "scheduleigrantsp2_cl = (\n",
-    "    scheduleigrantsp2\n",
-    "    .select(\n",
-    "        'FILEREIN',\n",
-    "        'TAXYEAR',\n",
-    "        'RTEINORECIPI',\n",
-    "        'RTRNBBNLINE11',\n",
-    "        'RECTABADDCIT',\n",
-    "        'RECTABADDSTA',\n",
-    "        'RETAAMOFCAGR',\n",
-    "    )\n",
-    ")"
    ]
   },
   {
@@ -348,72 +476,36 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "7f6df2c9-25a9-4eb0-9057-043e77d9b11d",
-     "showTitle": false,
      "tableResultSettingsMap": {},
-     "title": ""
     }
    },
    "outputs": [],
    "source": [
-    "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
-    "state_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('total_grant_value'))\n",
-    "\n",
-    "grants_per_state = (\n",
-    "    scheduleigrantsp2_cl\n",
-    "    .filter(\n",
-    "        F.col('RECTABADDSTA').isNotNull()\n",
-    "    )\n",
-    "    .groupBy(\n",
-    "        'FILEREIN', 'TAXYEAR', 'RECTABADDSTA'\n",
-    "    ).agg(\n",
-    "        F.sum('RETAAMOFCAGR').alias('total_grant_value'),\n",
-    "        F.count('*').alias('total_grant_count'),\n",
-    "    ).withColumn(\n",
-    "        'total',\n",
-    "        F.sum('total_grant_value').over(filer_window)\n",
-    "    ).filter(\n",
-    "        F.col('total') > 0\n",
-    "    ).withColumn(\n",
-    "        'proportion',\n",
-    "        F.col('total_grant_value') / F.col('total')\n",
-    "    ).withColumn(\n",
-    "        'total_states',\n",
-    "        F.count('*').over(filer_window)\n",
-    "    ).withColumn(\n",
-    "        'rank',\n",
-    "        F.rank().over(state_rank_window)  # Rank states within each filer-year based on grant value\n",
-    "    ).groupBy(\n",
-    "        'FILEREIN', 'TAXYEAR'\n",
-    "    ).agg(\n",
-    "        F.sum('total_grant_value').alias('total_grant_value'),\n",
-    "        F.sum('total_grant_count').alias('total_grant_count'),\n",
-    "        F.first('total_states').alias('total_recipient_states'),\n",
-    "        F.max('proportion').alias('max_recipient_state_percentage'),\n",
-    "        F.collect_set('RECTABADDSTA').alias('distinct_recipient_states'),  # Collect unique states into a list\n",
-    "        F.first(F.when(F.col('rank') == 1, F.col('RECTABADDSTA')), ignorenulls=True).alias('top_recipient_state'),  # Get state with highest grant value\n",
-    "    ).join(\n",
-    "        filer_states,\n",
-    "        on=['FILEREIN', 'TAXYEAR'],\n",
-    "        how='left'\n",
-    "    ).withColumn(\n",
-    "        'foreign_percentage',   # Schedule I is for domestic grants so assumed 0 foreign\n",
-    "        F.lit(0)                # Added column for combining datasets later on\n",
-    "    ).select(\n",
-    "        'FILEREIN',\n",
-    "        'TAXYEAR',\n",
-    "        'FILERUSSTATE',\n",
-    "        'total_grant_value',\n",
-    "        'total_grant_count',\n",
-    "        'total_recipient_states',\n",
-    "        'foreign_percentage',\n",
-    "        'max_recipient_state_percentage',\n",
-    "        'distinct_recipient_states',\n",
-    "        'top_recipient_state',\n",
     "    )\n",
     ")\n",
     "\n",
-    "display(grants_per_state)\n"
    ]
   },
   {
@@ -426,68 +518,24 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "9772faeb-640c-4bdb-9a99-8b0e41f3bbda",
      "showTitle": true,
      "tableResultSettingsMap": {},
-     "title": "Aggregation at city level could be used to identify funders with local activity"
     }
    },
    "outputs": [],
    "source": [
-    "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
-    "city_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('total_grant_value'))\n",
     "\n",
-    "grants_per_city = (\n",
-    "    scheduleigrantsp2_cl\n",
-    "    .filter(\n",
-    "        F.col('RECTABADDCIT').isNotNull()\n",
-    "    )\n",
-    "    .groupBy(\n",
-    "        'FILEREIN', 'TAXYEAR', 'RECTABADDCIT'\n",
-    "    ).agg(\n",
-    "        F.sum('RETAAMOFCAGR').alias('total_grant_value'),\n",
-    "        F.count('*').alias('total_grant_count'),\n",
-    "    ).withColumn(\n",
-    "        'total',\n",
-    "        F.sum('total_grant_value').over(filer_window)\n",
-    "    ).filter(\n",
-    "        F.col('total') > 0\n",
-    "    ).withColumn(\n",
-    "        'proportion',\n",
-    "        F.col('total_grant_value') / F.col('total')\n",
-    "    ).withColumn(\n",
-    "        'total_cities',\n",
-    "        F.count('*').over(filer_window)\n",
-    "    ).withColumn(\n",
-    "        'rank',\n",
-    "        F.rank().over(city_rank_window)  # Rank cities within each filer-year based on grant value\n",
-    "    ).groupBy(\n",
-    "        'FILEREIN', 'TAXYEAR'\n",
-    "    ).agg(\n",
-    "        F.sum('total_grant_value').alias('total_grant_value'),\n",
-    "        F.sum('total_grant_count').alias('total_grant_count'),\n",
-    "        F.first('total_cities').alias('total_recipient_cities'),\n",
-    "        F.max('proportion').alias('max_recipient_city_percentage'),\n",
-    "        F.collect_set('RECTABADDCIT').alias('distinct_recipient_cities'),  # Collect unique cities into a list\n",
-    "        F.first(F.when(F.col('rank') == 1, F.col('RECTABADDCIT')), ignorenulls=True).alias('top_recipient_city'),  # Get city with highest grant value\n",
-    "    ).join(\n",
-    "        filer_cities,\n",
-    "        on=['FILEREIN', 'TAXYEAR'],\n",
-    "        how='left'\n",
-    "    ).select(\n",
-    "        'FILEREIN',\n",
-    "        'TAXYEAR',\n",
-    "        'FILERUSCITY',\n",
-    "        'total_grant_value',\n",
-    "        'total_grant_count',\n",
-    "        'total_recipient_cities',\n",
-    "        'max_recipient_city_percentage',\n",
-    "        'distinct_recipient_cities',\n",
-    "        'top_recipient_city',\n",
-    "    )\n",
     ")\n",
-    "\n",
-    "display(grants_per_city)\n"
    ]
   },
   {
@@ -500,18 +548,50 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "74a921cc-2325-4a27-88a6-5cfa28f52ac0",
-     "showTitle": false,
      "tableResultSettingsMap": {},
-     "title": ""
     }
    },
    "outputs": [],
    "source": [
-    "grants_per_state = (\n",
-    "    grants_per_state.withColumn('source', F.lit('990 (domestic grants, schedule I)'))\n",
     ")\n",
-    "grants_per_state.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')"
    ]
   },
   {
@@ -520,26 +600,56 @@
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "78af447a-fd18-4f6d-8283-a431e2d82435",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
    "source": [
-    "##International activity\n",
-    "\n",
-    "Would ideally use data in Schedule F but this isn't in Databricks yet.  \n",
-    "\n",
-    "Form 990 questions 14a, 14b, 15, and 16 could be used to identify foregin activity but this information seems to be missing from prod_curated.irs.990cn120fields:  \n",
-    "F9_04_PC_FOREIGOFFICE (14a)  \n",
-    "F9_04_PC_FOREIGACTIVI (14b)  \n",
-    "F9_04_PC_MOTHKTTOORIN (15)  \n",
-    "F9_04_PC_MOTHKTTOORIN (15)  \n",
-    "F9_04_PC_MOTHKTTOININ (16)  \n",
-    "F9_04_PC_MOTHKTTOINND (16)  \n",
-    "\n",
-    "prod_curated.irs.990cn120fields does contain the field F9_09_PC_FOREGRANTOTA from part 9 of the form which totals the amounts given in foreign grants"
    ]
   },
   {
@@ -552,7 +662,7 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "30d7ddc0-df13-4683-98f5-7b24b918e01c",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -560,16 +670,28 @@
    },
    "outputs": [],
    "source": [
-    "foreign_activity =(\n",
-    "    form990cn120fields\n",
     "    .select(\n",
-    "        'FILEREIN',\n",
-    "        'TAXYEAR',\n",
-    "        'FOREGRANTOTA',\n",
-    "    ).filter(\n",
-    "        F.col('FOREGRANTOTA') > 0\n",
     "    )\n",
-    ")"
    ]
   },
   {
@@ -582,15 +704,161 @@
       "rowLimit": 10000
      },
      "inputWidgets": {},
-     "nuid": "582caba3-4d4d-4e64-b222-eb7af12ce6bd",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
    "outputs": [],
    "source": [
-    "display(foreign_activity)"
    ]
   },
   {
@@ -600,7 +868,7 @@
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {},
      "inputWidgets": {},
-     "nuid": "00451a40-f571-4bbc-820b-d6fd31b3537a",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
@@ -612,7 +880,13 @@
  ],
  "metadata": {
   "application/vnd.databricks.v1+notebook": {
-   "computePreferences": null,
    "dashboards": [],
    "environmentMetadata": {
     "base_environment": "",
@@ -623,7 +897,7 @@
    "notebookMetadata": {
     "pythonIndentUnit": 4
    },
-   "notebookName": "(Clone) NP03_schedule_I",
    "widgets": {}
   },
   "language_info": {

 {
  "cells": [
   {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
      "inputWidgets": {},
+     "nuid": "62d4799f-4935-4a2d-8f0a-5f6383b22cf7",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
+   "outputs": [],
+   "source": [
+    "df1 = spark.read.table(\"prod_curated.irs.990cn120fields\")\n",
+    "df2 = spark.read.table(\"prod_curated.irs.990standardfields\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "d5410f3d-7463-43f5-8bfb-528d36e80b42",
+     "showTitle": false,
+     "tableResultSettingsMap": {
+      "0": {
+       "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{\"column\":116},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1758734440525}",
+       "filterBlob": null,
+       "queryPlanFiltersBlob": null,
+       "tableResultIndex": 0
+      }
+     },
+     "title": ""
+    }
+   },
+   "outputs": [],
    "source": [
+    "from pyspark.sql import SparkSession\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Extract (col, dtype) as dicts\n",
+    "df1_schema = {f.name: f.dataType.simpleString() for f in df1.schema.fields}\n",
+    "df2_schema = {f.name: f.dataType.simpleString() for f in df2.schema.fields}\n",
+    "\n",
+    "# Union of all column names\n",
+    "all_cols = set(df1_schema.keys()).union(df2_schema.keys())\n",
     "\n",
+    "# Build comparison rows\n",
+    "rows = []\n",
+    "for col in sorted(all_cols):\n",
+    "    in_df1 = col in df1_schema\n",
+    "    in_df2 = col in df2_schema\n",
+    "    \n",
+    "    if in_df1 and in_df2:\n",
+    "        flag = \"both\"\n",
+    "    elif in_df1:\n",
+    "        flag = \"old\"\n",
+    "    else:\n",
+    "        flag = \"new\"\n",
+    "    \n",
+    "    rows.append({\n",
+    "        \"column\": col,\n",
+    "        \"in_df\": flag,\n",
+    "        \"dtype_old\": df1_schema.get(col),\n",
+    "        \"dtype_new\": df2_schema.get(col)\n",
+    "    })\n",
     "\n",
+    "# Convert to pandas for inspection\n",
+    "comparison_df = pd.DataFrame(rows)\n",
     "\n",
+    "# If you prefer it as a Spark DataFrame:\n",
+    "spark = SparkSession.builder.getOrCreate()\n",
+    "spark_comparison_df = spark.createDataFrame(comparison_df)\n",
+    "\n",
+    "display(comparison_df)\n"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "2634e810-1046-456f-a00e-34db0ca198a2",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    "outputs": [],
    "source": [
     "from pyspark.sql import functions as F\n",
+    "from pyspark.sql.window import Window\n",
+    "\n",
+    "from pyspark.ml.feature import VectorAssembler, StandardScaler\n",
+    "from pyspark.ml.clustering import KMeans\n",
+    "\n",
+    "import plotly.express as px"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "cc50ff8a-e01c-417d-b926-fecac95265d0",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    },
    "outputs": [],
    "source": [
+    "grants_per_state_990 = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')\n",
+    "grants_per_state_990pf = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990pf_filers')"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "79330be0-c72e-4670-b6bd-b95665af55c8",
+     "showTitle": true,
+     "tableResultSettingsMap": {},
+     "title": "check for EINs in both 990 and 990pf"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dual_filers = (\n",
+    "    grants_per_state_990.select(\n",
+    "        'FILEREIN', \n",
+    "        'TAXYEAR'\n",
+    "    )\n",
+    "    .join(\n",
+    "        grants_per_state_990pf.select('FILEREIN', 'TAXYEAR'), \n",
+    "        on=['FILEREIN', 'TAXYEAR'],\n",
+    "        how='inner'\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "display(dual_filers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
      "inputWidgets": {},
+     "nuid": "3cd453d0-0bac-42d7-b9d3-51a30be32e6b",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
+   "outputs": [],
    "source": [
+    "display(grants_per_state_990.filter(F.col('FILEREIN')=='85-0462315'))"
    ]
   },
   {
    "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
      "inputWidgets": {},
+     "nuid": "7232db59-693a-43d9-826b-9e6c2a271626",
+     "showTitle": false,
      "tableResultSettingsMap": {},
+     "title": ""
     }
    },
    "outputs": [],
    "source": [
+    "display(grants_per_state_990pf.filter(F.col('FILEREIN')=='85-0462315'))"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "64686496-b51f-4aad-ad34-f88e2b69cf61",
      "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "drop dual filers"
     }
    },
    "outputs": [],
    "source": [
+    "grants_per_state_990 = grants_per_state_990.join(\n",
+    "    dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
+    "    on=['FILEREIN', 'TAXYEAR'],\n",
+    "    how='left_anti'\n",
+    ")\n",
     "\n",
+    "grants_per_state_990pf = grants_per_state_990pf.join(\n",
+    "    dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
+    "    on=['FILEREIN', 'TAXYEAR'],\n",
+    "    how='left_anti'\n",
+    ")"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "f0f23c24-a091-49d6-bc7b-64596c89ed0a",
      "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "combine 990 & 990pf orgs into one df"
     }
    },
    "outputs": [],
    "source": [
+    "# Combine the 990 and 990-PF frames by column NAME. DataFrame.union() matches columns by\n",
+    "# position, so a difference in column order between the two source tables would silently\n",
+    "# misalign fields; unionByName() raises instead if the column sets do not match.\n",
+    "grants_per_state = (\n",
+    "    grants_per_state_990\n",
+    "    .unionByName(grants_per_state_990pf)\n",
+    "    .orderBy('FILEREIN', 'TAXYEAR')\n",
+    ")"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "ac11e27e-18a2-41e0-b8a2-2f6889990a02",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    },
    "outputs": [],
    "source": [
+    "display(grants_per_state)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "6b8aee01-6623-4556-a717-9f58d8af4b6e",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "source": [
+    "## KMeans Clustering"
    ]
   },
   {
    "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
      "inputWidgets": {},
+     "nuid": "013375d3-74b9-48a2-9db0-fe70b09c47f5",
      "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "feature engineering"
     }
    },
    "outputs": [],
    "source": [
+    "# Normalize/scale features\n",
+    "feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\"]\n",
+    "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
+    "df_features = assembler.transform(grants_per_state)\n",
+    "\n",
+    "scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
+    "df_scaled = scaler.fit(df_features).transform(df_features)\n",
+    "\n",
+    "# Create a composite score - optional, may not add value\n",
+    "# max_states = grants_per_state.select(F.max('total_recipient_states')).collect()[0][0]\n",
+    "# grants_per_state = grants_per_state.withColumn(\n",
+    "#     \"composite_score\",\n",
+    "#     0.5 * (1 - F.col(\"max_recipient_state_percentage\")/100) + \n",
+    "#     0.3 * (F.col(\"total_recipient_states\")/max_states) +   \n",
+    "#     0.2 * (F.col(\"foreign_percentage\")/100)\n",
     "# )\n",
+    "# feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\", \"composite_score\"]\n",
+    "# assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
+    "# df_features = assembler.transform(grants_per_state)\n",
     "\n",
+    "# scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
+    "# df_scaled = scaler.fit(df_features).transform(df_features)"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
      "inputWidgets": {},
+     "nuid": "9bedd689-bb2c-4beb-9f7b-1756ba0c99c5",
+     "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "clustering"
     }
    },
+   "outputs": [],
    "source": [
+    "# Clustering on all the scaled features\n",
+    "kmeans = KMeans(featuresCol=\"features\", predictionCol=\"cluster\", k=3, seed=42)\n",
+    "model = kmeans.fit(df_scaled)\n",
+    "\n",
+    "# Assign clusters\n",
+    "df_clustered = model.transform(df_scaled)"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "4f47c4f6-2143-41a7-b921-55cc3405be3a",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    },
    "outputs": [],
    "source": [
+    "display(df_clustered)"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "5cce9fe9-0c60-4c5d-971e-1460e813a0fc",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    },
    "outputs": [],
    "source": [
+    "# df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')\n",
+    "df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "86298a0c-3526-4749-84d8-33c4119da0d8",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "source": [
+    "## Cluster Summary - With Composite Feature"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "702ac066-0d69-47e9-9411-f080d3a541ea",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    },
    "outputs": [],
    "source": [
+    "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "3926601d-c18f-4fed-a87e-06b762d61c6e",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "source": [
+    "Cluster 0 = local/regional<br>\n",
+    "Cluster 1 = international<br>\n",
+    "Cluster 2 = national"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "5ae9099f-f596-4565-954a-40035f6eb880",
+     "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "summarize clusters by original features"
     }
    },
    "outputs": [],
    "source": [
+    "summary = (\n",
+    "    df_clustered\n",
+    "    .groupBy(\"cluster\")\n",
+    "    .agg(\n",
+    "        F.count(\"*\").alias(\"count\"),\n",
+    "        F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
+    "        F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
+    "        F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
+    "        F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
+    "        F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
+    "        F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
+    "        F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
+    "        F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
+    "        F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
+    "        F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
+    "        F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
+    "        F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
     "    )\n",
+    "    .orderBy(\"cluster\")\n",
     ")\n",
     "\n",
+    "display(summary)"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "def2f16f-982e-448e-9aa8-782aa01c2193",
      "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "create distribution plots for each cluster (feature: foreign percentage)"
     }
    },
    "outputs": [],
    "source": [
+    "pdf_clustered = df_clustered.toPandas()\n",
     "\n",
+    "fig_foreign = px.box(\n",
+    "    pdf_clustered,\n",
+    "    x=\"cluster\",\n",
+    "    y=\"foreign_percentage\",\n",
+    "    title=\"Foreign Percentage by Cluster\",\n",
+    "    labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
     ")\n",
+    "fig_foreign.show()"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "01020c54-f610-42e2-b43c-f2643f98a576",
+     "showTitle": true,
      "tableResultSettingsMap": {},
+     "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "fig_max_recipient = px.box(\n",
+    "    pdf_clustered,\n",
+    "    x=\"cluster\",\n",
+    "    y=\"max_recipient_state_percentage\",\n",
+    "    title=\"Max Recipient State Percentage by Cluster\",\n",
+    "    labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
+    ")\n",
+    "fig_max_recipient.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "170d71cf-cb3f-44d5-bf90-cc9746a3c1d3",
+     "showTitle": true,
+     "tableResultSettingsMap": {},
+     "title": "create distribution plots for each cluster (feature: number of states)"
     }
    },
    "outputs": [],
    "source": [
+    "fig_total_states = px.box(\n",
+    "    pdf_clustered,\n",
+    "    x=\"cluster\",\n",
+    "    y=\"total_recipient_states\",\n",
+    "    title=\"Total Recipient States by Cluster\",\n",
+    "    labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
     ")\n",
+    "fig_total_states.show()"
    ]
   },
   {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {},
      "inputWidgets": {},
+     "nuid": "12dc14fa-0066-4fe5-8a99-d9c6d05860aa",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
    "source": [
+    "## Cluster Summary - Without Composite Feature"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "1545cd83-3e0a-43f9-9719-14d0f12f5dcb",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "f4a8fe26-50d1-4d55-bd18-0313a1d55136",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "display(df_clustered)"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "6b2a98fa-d5ee-4c8f-8aad-208a83b2d145",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
    },
    "outputs": [],
    "source": [
+    "from pyspark.sql import functions as F\n",
+    "\n",
+    "df_2023 = (\n",
+    "    df_clustered\n",
+    "    .filter(F.col(\"TAXYEAR\") == 2023)\n",
+    "    .withColumn(\"locality\", F.when(F.col(\"cluster\")==0, \"local/regional\").otherwise(F.when(F.col(\"cluster\")==1, \"international\").otherwise(F.when(F.col(\"cluster\")==2, \"national\").otherwise(None))))\n",
     "    .select(\n",
+    "        \"FILEREIN\",\n",
+    "        \"TAXYEAR\",\n",
+    "        \"FILERUSSTATE\",\n",
+    "        F.col(\"total_grant_value\").alias(\"value_of_grants\"),\n",
+    "        F.col(\"total_grant_count\").alias(\"number_of_grants\"),\n",
+    "        F.col(\"total_recipient_states\").alias(\"number_of_recipient_states\"),\n",
+    "        F.col(\"foreign_percentage\").alias(\"pct_grant_value_foreign\"),\n",
+    "        F.col(\"max_recipient_state_percentage\").alias(\"pct_grant_value_top_state\"),\n",
+    "        F.col(\"top_recipient_state\").alias(\"top_state\"),\n",
+    "        F.col(\"distinct_recipient_states\").alias(\"recipient_states\"),\n",
+    "        \"locality\",\n",
+    "        \"source\",\n",
     "    )\n",
+    ")\n",
+    "display(df_2023)"
    ]
   },
   {
       "rowLimit": 10000
      },
      "inputWidgets": {},
+     "nuid": "2483cf1c-4bed-47f0-91c0-0364e0f0d5da",
+     "showTitle": false,
+     "tableResultSettingsMap": {},
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df_2023.write.mode(\"overwrite\").saveAsTable(\"sandbox_edward.nonprofit_mapping.locality_by_granting_activity_segmentation_funding_orgs_taxyear2023\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "0781ad05-93a0-46fe-9357-48fea2039b81",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
     }
    },
+   "source": [
+    "Cluster 0 = local/regional<br>\n",
+    "Cluster 1 = international<br>\n",
+    "Cluster 2 = national"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "840cdfdb-dbad-4264-b0fc-85bb060ac2aa",
+     "showTitle": true,
+     "tableResultSettingsMap": {},
+     "title": "summarize clusters by original features"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "summary = (\n",
+    "    df_clustered\n",
+    "    .groupBy(\"cluster\")\n",
+    "    .agg(\n",
+    "        F.count(\"*\").alias(\"count\"),\n",
+    "        F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
+    "        F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
+    "        F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
+    "        F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
+    "        F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
+    "        F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
+    "        F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
+    "        F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
+    "        F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
+    "        F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
+    "        F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
+    "        F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
+    "    )\n",
+    "    .orderBy(\"cluster\")\n",
+    ")\n",
+    "\n",
+    "display(summary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "3e9a4fb8-11e3-4734-8d8e-943e03e3b738",
+     "showTitle": true,
+     "tableResultSettingsMap": {},
+     "title": "create distribution plots for each cluster (feature: foreign percentage)"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "pdf_clustered = df_clustered.toPandas()\n",
+    "\n",
+    "fig_foreign = px.box(\n",
+    "    pdf_clustered,\n",
+    "    x=\"cluster\",\n",
+    "    y=\"foreign_percentage\",\n",
+    "    title=\"Foreign Percentage by Cluster\",\n",
+    "    labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
+    ")\n",
+    "fig_foreign.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "42bc6e79-cc9f-4745-9751-03c9223a3642",
+     "showTitle": true,
+     "tableResultSettingsMap": {},
+     "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "fig_max_recipient = px.box(\n",
+    "    pdf_clustered,\n",
+    "    x=\"cluster\",\n",
+    "    y=\"max_recipient_state_percentage\",\n",
+    "    title=\"Max Recipient State Percentage by Cluster\",\n",
+    "    labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
+    ")\n",
+    "fig_max_recipient.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "b05a45f8-4d81-4ac2-9cfb-b73e18d2051f",
+     "showTitle": true,
+     "tableResultSettingsMap": {},
+     "title": "create distribution plots for each cluster (feature: number of states)"
+    }
+   },
    "outputs": [],
    "source": [
+    "fig_total_states = px.box(\n",
+    "    pdf_clustered,\n",
+    "    x=\"cluster\",\n",
+    "    y=\"total_recipient_states\",\n",
+    "    title=\"Total Recipient States by Cluster\",\n",
+    "    labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
+    ")\n",
+    "fig_total_states.show()"
    ]
   },
   {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {},
      "inputWidgets": {},
+     "nuid": "3cf19964-c147-4cf4-b7b9-1f31a2e6a256",
      "showTitle": false,
      "tableResultSettingsMap": {},
      "title": ""
  ],
  "metadata": {
   "application/vnd.databricks.v1+notebook": {
+   "computePreferences": {
+    "hardware": {
+     "accelerator": null,
+     "gpuPoolId": null,
+     "memory": null
+    }
+   },
    "dashboards": [],
    "environmentMetadata": {
     "base_environment": "",
    "notebookMetadata": {
     "pythonIndentUnit": 4
    },
+   "notebookName": "(Clone) NP04_classification",
    "widgets": {}
   },
   "language_info": {