English
hassaanulhaq01 committed on
Commit
f6f26a4
·
verified ·
1 Parent(s): e6dcca9

Add interactive schedule_o notebook from Databricks

Browse files
Files changed (1) hide show
  1. notebooks/NP03_schedule_I.ipynb +540 -266
notebooks/NP03_schedule_I.ipynb CHANGED
@@ -1,27 +1,90 @@
1
  {
2
  "cells": [
3
  {
4
- "cell_type": "markdown",
 
5
  "metadata": {
6
  "application/vnd.databricks.v1+cell": {
7
- "cellMetadata": {},
 
 
 
8
  "inputWidgets": {},
9
- "nuid": "939230d3-02ed-43f2-a10a-e983c2c23964",
10
  "showTitle": false,
11
  "tableResultSettingsMap": {},
12
  "title": ""
13
  }
14
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "source": [
16
- "#Funding organizations - 990 Filers\n",
 
 
 
 
 
 
 
 
17
  "\n",
18
- "Schedule I is completed by organizations who answer \"Yes\" on Form 990, Part IV, line 21 or 22.\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  "\n",
20
- "Question 21: \n",
21
- "\"Did the organization report more than $5,000 of grants or other assistance to any domestic organization or domestic government on Part IX, column (A), line 1? If “Yes,” complete Schedule I, Parts I and II\"\n",
22
  "\n",
23
- "Question 22: \n",
24
- "\"Did the organization report more than $5,000 of grants or other assistance to or for domestic individuals on Part IX, column (A), line 2? If “Yes,” complete Schedule I, Parts I and III\""
 
 
 
25
  ]
26
  },
27
  {
@@ -34,7 +97,7 @@
34
  "rowLimit": 10000
35
  },
36
  "inputWidgets": {},
37
- "nuid": "9dc61c58-ae98-4c6a-b8a2-cfd67b848b61",
38
  "showTitle": false,
39
  "tableResultSettingsMap": {},
40
  "title": ""
@@ -43,7 +106,12 @@
43
  "outputs": [],
44
  "source": [
45
  "from pyspark.sql import functions as F\n",
46
- "from pyspark.sql.window import Window"
 
 
 
 
 
47
  ]
48
  },
49
  {
@@ -56,7 +124,7 @@
56
  "rowLimit": 10000
57
  },
58
  "inputWidgets": {},
59
- "nuid": "d052b039-2f34-48ce-851e-416a15697695",
60
  "showTitle": false,
61
  "tableResultSettingsMap": {},
62
  "title": ""
@@ -64,24 +132,62 @@
64
  },
65
  "outputs": [],
66
  "source": [
67
- "scheduleigrantsp2 = spark.table(\"prod_curated.irs.scheduleipart2grants\")\n",
68
- "form990cn120fields = spark.table(\"prod_curated.irs.990standardfields\")"
69
  ]
70
  },
71
  {
72
- "cell_type": "markdown",
 
73
  "metadata": {
74
  "application/vnd.databricks.v1+cell": {
75
- "cellMetadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "inputWidgets": {},
77
- "nuid": "71943bff-f143-4481-82b7-8dfc0c3aa965",
78
  "showTitle": false,
79
  "tableResultSettingsMap": {},
80
  "title": ""
81
  }
82
  },
 
83
  "source": [
84
- "##Data checks"
85
  ]
86
  },
87
  {
@@ -89,28 +195,20 @@
89
  "execution_count": null,
90
  "metadata": {
91
  "application/vnd.databricks.v1+cell": {
92
- "cellMetadata": {},
 
 
 
93
  "inputWidgets": {},
94
- "nuid": "29735c68-9c07-49b7-9a37-816d82de393b",
95
- "showTitle": true,
96
  "tableResultSettingsMap": {},
97
- "title": "Check for non-unique filer state records within a tax year"
98
  }
99
  },
100
  "outputs": [],
101
  "source": [
102
- "# filer_states = (\n",
103
- "# form990cn120fields\n",
104
- "# .select(\n",
105
- "# 'FILEREIN',\n",
106
- "# 'TAXYEAR',\n",
107
- "# 'FILERUSSTATE',\n",
108
- "# )\n",
109
- "# .distinct()\n",
110
- "# .groupBy('FILEREIN', 'TAXYEAR')\n",
111
- "# .agg(F.countDistinct('FILERUSSTATE').alias('state_count'))\n",
112
- "# .filter(F.col('state_count') > 1)\n",
113
- "# )"
114
  ]
115
  },
116
  {
@@ -123,28 +221,25 @@
123
  "rowLimit": 10000
124
  },
125
  "inputWidgets": {},
126
- "nuid": "e8dea51f-9016-4a6d-b199-52d5509c6f32",
127
  "showTitle": true,
128
  "tableResultSettingsMap": {},
129
- "title": "Check for non-unique filer city records within a tax year"
130
  }
131
  },
132
  "outputs": [],
133
  "source": [
134
- "# filer_cities = (\n",
135
- "# form990cn120fields\n",
136
- "# .select(\n",
137
- "# 'FILEREIN',\n",
138
- "# 'TAXYEAR',\n",
139
- "# 'FILERUSCITY',\n",
140
- "# )\n",
141
- "# .distinct()\n",
142
- "# .groupBy('FILEREIN', 'TAXYEAR')\n",
143
- "# .agg(F.countDistinct('FILERUSCITY').alias('state_count'))\n",
144
- "# .filter(F.col('state_count') > 1)\n",
145
- "# )\n",
146
  "\n",
147
- "# display(filer_cities)"
 
 
 
 
148
  ]
149
  },
150
  {
@@ -157,29 +252,15 @@
157
  "rowLimit": 10000
158
  },
159
  "inputWidgets": {},
160
- "nuid": "d928d099-3b12-4c27-af21-bb85f83245d4",
161
  "showTitle": true,
162
  "tableResultSettingsMap": {},
163
- "title": "Check recipient state counts"
164
  }
165
  },
166
  "outputs": [],
167
  "source": [
168
- "display(\n",
169
- " scheduleigrantsp2_cl\n",
170
- " .groupBy(\n",
171
- " 'RECTABADDSTA'\n",
172
- " ).agg(\n",
173
- " F.count('*')\n",
174
- " )\n",
175
- ")\n",
176
- "\n",
177
- "# display(\n",
178
- "# scheduleigrantsp2_cl\n",
179
- "# .filter(\n",
180
- "# F.col('RECTABADDSTA') == 'AA'\n",
181
- "# )\n",
182
- "# )"
183
  ]
184
  },
185
  {
@@ -192,7 +273,7 @@
192
  "rowLimit": 10000
193
  },
194
  "inputWidgets": {},
195
- "nuid": "29713546-0dcb-4b5f-8717-0232b4f617d0",
196
  "showTitle": false,
197
  "tableResultSettingsMap": {},
198
  "title": ""
@@ -200,7 +281,23 @@
200
  },
201
  "outputs": [],
202
  "source": [
203
- "display(scheduleigrantsp2.filter(F.col('RECTABADDSTA').isNull()))"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  ]
205
  },
206
  {
@@ -208,46 +305,67 @@
208
  "execution_count": null,
209
  "metadata": {
210
  "application/vnd.databricks.v1+cell": {
211
- "cellMetadata": {},
 
 
 
212
  "inputWidgets": {},
213
- "nuid": "9c2e202d-cb25-4753-8b44-0adfa0a97132",
214
  "showTitle": true,
215
  "tableResultSettingsMap": {},
216
- "title": "Check overlap of null grant amounts versus null state codes"
217
  }
218
  },
219
  "outputs": [],
220
  "source": [
221
- "# cross_section_counts = (\n",
222
- "# scheduleigrantsp2_cl\n",
223
- "# .select(\n",
224
- "# F.when(F.col('RECTABADDSTA').isNull(), 'NULL').otherwise('NON_NULL').alias('RECTABADDSTA_status'),\n",
225
- "# F.when(F.col('RETAAMOFCAGR').isNull(), 'NULL').otherwise('NON_NULL').alias('RETAAMOFCAGR_status')\n",
226
- "# )\n",
227
- "# .groupBy(\n",
228
- "# 'RECTABADDSTA_status', 'RETAAMOFCAGR_status'\n",
229
- "# ).agg(\n",
230
- "# F.count('*').alias('count')\n",
231
- "# )\n",
 
 
 
 
232
  "# )\n",
 
 
 
233
  "\n",
234
- "# display(cross_section_counts)"
 
235
  ]
236
  },
237
  {
238
- "cell_type": "markdown",
 
239
  "metadata": {
240
  "application/vnd.databricks.v1+cell": {
241
- "cellMetadata": {},
 
 
 
242
  "inputWidgets": {},
243
- "nuid": "da3ea6fa-5d11-437e-8705-824ec6ea5d5a",
244
- "showTitle": false,
245
  "tableResultSettingsMap": {},
246
- "title": ""
247
  }
248
  },
 
249
  "source": [
250
- "##Schedule I Filers"
 
 
 
 
 
251
  ]
252
  },
253
  {
@@ -260,7 +378,7 @@
260
  "rowLimit": 10000
261
  },
262
  "inputWidgets": {},
263
- "nuid": "253b21d0-e9c6-4eea-b74a-942a96192e7f",
264
  "showTitle": false,
265
  "tableResultSettingsMap": {},
266
  "title": ""
@@ -268,14 +386,7 @@
268
  },
269
  "outputs": [],
270
  "source": [
271
- "filer_states = (\n",
272
- " form990cn120fields\n",
273
- " .select(\n",
274
- " 'FILEREIN',\n",
275
- " 'TAXYEAR',\n",
276
- " 'FILERUSSTATE',\n",
277
- " ).distinct()\n",
278
- ")"
279
  ]
280
  },
281
  {
@@ -288,7 +399,7 @@
288
  "rowLimit": 10000
289
  },
290
  "inputWidgets": {},
291
- "nuid": "ae49acb7-fc82-4b25-8644-2f1798ac70aa",
292
  "showTitle": false,
293
  "tableResultSettingsMap": {},
294
  "title": ""
@@ -296,14 +407,24 @@
296
  },
297
  "outputs": [],
298
  "source": [
299
- "filer_cities = (\n",
300
- " form990cn120fields\n",
301
- " .select(\n",
302
- " 'FILEREIN',\n",
303
- " 'TAXYEAR',\n",
304
- " 'FILERUSCITY',\n",
305
- " ).distinct()\n",
306
- ")"
 
 
 
 
 
 
 
 
 
 
307
  ]
308
  },
309
  {
@@ -316,7 +437,7 @@
316
  "rowLimit": 10000
317
  },
318
  "inputWidgets": {},
319
- "nuid": "952f0e50-716e-42cc-abb7-b88619c7fb86",
320
  "showTitle": false,
321
  "tableResultSettingsMap": {},
322
  "title": ""
@@ -324,18 +445,25 @@
324
  },
325
  "outputs": [],
326
  "source": [
327
- "scheduleigrantsp2_cl = (\n",
328
- " scheduleigrantsp2\n",
329
- " .select(\n",
330
- " 'FILEREIN',\n",
331
- " 'TAXYEAR',\n",
332
- " 'RTEINORECIPI',\n",
333
- " 'RTRNBBNLINE11',\n",
334
- " 'RECTABADDCIT',\n",
335
- " 'RECTABADDSTA',\n",
336
- " 'RETAAMOFCAGR',\n",
337
- " )\n",
338
- ")"
 
 
 
 
 
 
 
339
  ]
340
  },
341
  {
@@ -348,72 +476,36 @@
348
  "rowLimit": 10000
349
  },
350
  "inputWidgets": {},
351
- "nuid": "7f6df2c9-25a9-4eb0-9057-043e77d9b11d",
352
- "showTitle": false,
353
  "tableResultSettingsMap": {},
354
- "title": ""
355
  }
356
  },
357
  "outputs": [],
358
  "source": [
359
- "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
360
- "state_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('total_grant_value'))\n",
361
- "\n",
362
- "grants_per_state = (\n",
363
- " scheduleigrantsp2_cl\n",
364
- " .filter(\n",
365
- " F.col('RECTABADDSTA').isNotNull()\n",
366
- " )\n",
367
- " .groupBy(\n",
368
- " 'FILEREIN', 'TAXYEAR', 'RECTABADDSTA'\n",
369
- " ).agg(\n",
370
- " F.sum('RETAAMOFCAGR').alias('total_grant_value'),\n",
371
- " F.count('*').alias('total_grant_count'),\n",
372
- " ).withColumn(\n",
373
- " 'total',\n",
374
- " F.sum('total_grant_value').over(filer_window)\n",
375
- " ).filter(\n",
376
- " F.col('total') > 0\n",
377
- " ).withColumn(\n",
378
- " 'proportion',\n",
379
- " F.col('total_grant_value') / F.col('total')\n",
380
- " ).withColumn(\n",
381
- " 'total_states',\n",
382
- " F.count('*').over(filer_window)\n",
383
- " ).withColumn(\n",
384
- " 'rank',\n",
385
- " F.rank().over(state_rank_window) # Rank states within each filer-year based on grant value\n",
386
- " ).groupBy(\n",
387
- " 'FILEREIN', 'TAXYEAR'\n",
388
- " ).agg(\n",
389
- " F.sum('total_grant_value').alias('total_grant_value'),\n",
390
- " F.sum('total_grant_count').alias('total_grant_count'),\n",
391
- " F.first('total_states').alias('total_recipient_states'),\n",
392
- " F.max('proportion').alias('max_recipient_state_percentage'),\n",
393
- " F.collect_set('RECTABADDSTA').alias('distinct_recipient_states'), # Collect unique states into a list\n",
394
- " F.first(F.when(F.col('rank') == 1, F.col('RECTABADDSTA')), ignorenulls=True).alias('top_recipient_state'), # Get state with highest grant value\n",
395
- " ).join(\n",
396
- " filer_states,\n",
397
- " on=['FILEREIN', 'TAXYEAR'],\n",
398
- " how='left'\n",
399
- " ).withColumn(\n",
400
- " 'foreign_percentage', # Schedule I is for domestic grants so assumed 0 foreign\n",
401
- " F.lit(0) # Added column for combining datasets later on\n",
402
- " ).select(\n",
403
- " 'FILEREIN',\n",
404
- " 'TAXYEAR',\n",
405
- " 'FILERUSSTATE',\n",
406
- " 'total_grant_value',\n",
407
- " 'total_grant_count',\n",
408
- " 'total_recipient_states',\n",
409
- " 'foreign_percentage',\n",
410
- " 'max_recipient_state_percentage',\n",
411
- " 'distinct_recipient_states',\n",
412
- " 'top_recipient_state',\n",
413
  " )\n",
 
414
  ")\n",
415
  "\n",
416
- "display(grants_per_state)\n"
417
  ]
418
  },
419
  {
@@ -426,68 +518,24 @@
426
  "rowLimit": 10000
427
  },
428
  "inputWidgets": {},
429
- "nuid": "9772faeb-640c-4bdb-9a99-8b0e41f3bbda",
430
  "showTitle": true,
431
  "tableResultSettingsMap": {},
432
- "title": "Aggregation at city level could be used to identify funders with local activity"
433
  }
434
  },
435
  "outputs": [],
436
  "source": [
437
- "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
438
- "city_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('total_grant_value'))\n",
439
  "\n",
440
- "grants_per_city = (\n",
441
- " scheduleigrantsp2_cl\n",
442
- " .filter(\n",
443
- " F.col('RECTABADDCIT').isNotNull()\n",
444
- " )\n",
445
- " .groupBy(\n",
446
- " 'FILEREIN', 'TAXYEAR', 'RECTABADDCIT'\n",
447
- " ).agg(\n",
448
- " F.sum('RETAAMOFCAGR').alias('total_grant_value'),\n",
449
- " F.count('*').alias('total_grant_count'),\n",
450
- " ).withColumn(\n",
451
- " 'total',\n",
452
- " F.sum('total_grant_value').over(filer_window)\n",
453
- " ).filter(\n",
454
- " F.col('total') > 0\n",
455
- " ).withColumn(\n",
456
- " 'proportion',\n",
457
- " F.col('total_grant_value') / F.col('total')\n",
458
- " ).withColumn(\n",
459
- " 'total_cities',\n",
460
- " F.count('*').over(filer_window)\n",
461
- " ).withColumn(\n",
462
- " 'rank',\n",
463
- " F.rank().over(city_rank_window) # Rank cities within each filer-year based on grant value\n",
464
- " ).groupBy(\n",
465
- " 'FILEREIN', 'TAXYEAR'\n",
466
- " ).agg(\n",
467
- " F.sum('total_grant_value').alias('total_grant_value'),\n",
468
- " F.sum('total_grant_count').alias('total_grant_count'),\n",
469
- " F.first('total_cities').alias('total_recipient_cities'),\n",
470
- " F.max('proportion').alias('max_recipient_city_percentage'),\n",
471
- " F.collect_set('RECTABADDCIT').alias('distinct_recipient_cities'), # Collect unique cities into a list\n",
472
- " F.first(F.when(F.col('rank') == 1, F.col('RECTABADDCIT')), ignorenulls=True).alias('top_recipient_city'), # Get city with highest grant value\n",
473
- " ).join(\n",
474
- " filer_cities,\n",
475
- " on=['FILEREIN', 'TAXYEAR'],\n",
476
- " how='left'\n",
477
- " ).select(\n",
478
- " 'FILEREIN',\n",
479
- " 'TAXYEAR',\n",
480
- " 'FILERUSCITY',\n",
481
- " 'total_grant_value',\n",
482
- " 'total_grant_count',\n",
483
- " 'total_recipient_cities',\n",
484
- " 'max_recipient_city_percentage',\n",
485
- " 'distinct_recipient_cities',\n",
486
- " 'top_recipient_city',\n",
487
- " )\n",
488
  ")\n",
489
- "\n",
490
- "display(grants_per_city)\n"
491
  ]
492
  },
493
  {
@@ -500,18 +548,50 @@
500
  "rowLimit": 10000
501
  },
502
  "inputWidgets": {},
503
- "nuid": "74a921cc-2325-4a27-88a6-5cfa28f52ac0",
504
- "showTitle": false,
505
  "tableResultSettingsMap": {},
506
- "title": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  }
508
  },
509
  "outputs": [],
510
  "source": [
511
- "grants_per_state = (\n",
512
- " grants_per_state.withColumn('source', F.lit('990 (domestic grants, schedule I)'))\n",
 
 
 
 
513
  ")\n",
514
- "grants_per_state.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')"
515
  ]
516
  },
517
  {
@@ -520,26 +600,56 @@
520
  "application/vnd.databricks.v1+cell": {
521
  "cellMetadata": {},
522
  "inputWidgets": {},
523
- "nuid": "78af447a-fd18-4f6d-8283-a431e2d82435",
524
  "showTitle": false,
525
  "tableResultSettingsMap": {},
526
  "title": ""
527
  }
528
  },
529
  "source": [
530
- "##International activity\n",
531
- "\n",
532
- "Would ideally use data in Schedule F but this isn't in Databricks yet. \n",
533
- "\n",
534
- "Form 990 questions 14a, 14b, 15, and 16 could be used to identify foreign activity but this information seems to be missing from prod_curated.irs.990cn120fields: \n",
535
- "F9_04_PC_FOREIGOFFICE (14a) \n",
536
- "F9_04_PC_FOREIGACTIVI (14b) \n",
537
- "F9_04_PC_MOTHKTTOORIN (15) \n",
538
- "F9_04_PC_MOTHKTTOORIN (15) \n",
539
- "F9_04_PC_MOTHKTTOININ (16) \n",
540
- "F9_04_PC_MOTHKTTOINND (16) \n",
541
- "\n",
542
- "prod_curated.irs.990cn120fields does contain the field F9_09_PC_FOREGRANTOTA from part 9 of the form which totals the amounts given in foreign grants"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  ]
544
  },
545
  {
@@ -552,7 +662,7 @@
552
  "rowLimit": 10000
553
  },
554
  "inputWidgets": {},
555
- "nuid": "30d7ddc0-df13-4683-98f5-7b24b918e01c",
556
  "showTitle": false,
557
  "tableResultSettingsMap": {},
558
  "title": ""
@@ -560,16 +670,28 @@
560
  },
561
  "outputs": [],
562
  "source": [
563
- "foreign_activity =(\n",
564
- " form990cn120fields\n",
 
 
 
 
565
  " .select(\n",
566
- " 'FILEREIN',\n",
567
- " 'TAXYEAR',\n",
568
- " 'FOREGRANTOTA',\n",
569
- " ).filter(\n",
570
- " F.col('FOREGRANTOTA') > 0\n",
 
 
 
 
 
 
 
571
  " )\n",
572
- ")"
 
573
  ]
574
  },
575
  {
@@ -582,15 +704,161 @@
582
  "rowLimit": 10000
583
  },
584
  "inputWidgets": {},
585
- "nuid": "582caba3-4d4d-4e64-b222-eb7af12ce6bd",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  "showTitle": false,
587
  "tableResultSettingsMap": {},
588
  "title": ""
589
  }
590
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
  "outputs": [],
592
  "source": [
593
- "display(foreign_activity)"
 
 
 
 
 
 
 
594
  ]
595
  },
596
  {
@@ -600,7 +868,7 @@
600
  "application/vnd.databricks.v1+cell": {
601
  "cellMetadata": {},
602
  "inputWidgets": {},
603
- "nuid": "00451a40-f571-4bbc-820b-d6fd31b3537a",
604
  "showTitle": false,
605
  "tableResultSettingsMap": {},
606
  "title": ""
@@ -612,7 +880,13 @@
612
  ],
613
  "metadata": {
614
  "application/vnd.databricks.v1+notebook": {
615
- "computePreferences": null,
 
 
 
 
 
 
616
  "dashboards": [],
617
  "environmentMetadata": {
618
  "base_environment": "",
@@ -623,7 +897,7 @@
623
  "notebookMetadata": {
624
  "pythonIndentUnit": 4
625
  },
626
- "notebookName": "(Clone) NP03_schedule_I",
627
  "widgets": {}
628
  },
629
  "language_info": {
 
1
  {
2
  "cells": [
3
  {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
  "metadata": {
7
  "application/vnd.databricks.v1+cell": {
8
+ "cellMetadata": {
9
+ "byteLimit": 2048000,
10
+ "rowLimit": 10000
11
+ },
12
  "inputWidgets": {},
13
+ "nuid": "62d4799f-4935-4a2d-8f0a-5f6383b22cf7",
14
  "showTitle": false,
15
  "tableResultSettingsMap": {},
16
  "title": ""
17
  }
18
  },
19
+ "outputs": [],
20
+ "source": [
21
+ "df1 = spark.read.table(\"prod_curated.irs.990cn120fields\")\n",
22
+ "df2 = spark.read.table(\"prod_curated.irs.990standardfields\")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {
29
+ "application/vnd.databricks.v1+cell": {
30
+ "cellMetadata": {
31
+ "byteLimit": 2048000,
32
+ "rowLimit": 10000
33
+ },
34
+ "inputWidgets": {},
35
+ "nuid": "d5410f3d-7463-43f5-8bfb-528d36e80b42",
36
+ "showTitle": false,
37
+ "tableResultSettingsMap": {
38
+ "0": {
39
+ "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{\"column\":116},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1758734440525}",
40
+ "filterBlob": null,
41
+ "queryPlanFiltersBlob": null,
42
+ "tableResultIndex": 0
43
+ }
44
+ },
45
+ "title": ""
46
+ }
47
+ },
48
+ "outputs": [],
49
  "source": [
50
+ "from pyspark.sql import SparkSession\n",
51
+ "import pandas as pd\n",
52
+ "\n",
53
+ "# Extract (col, dtype) as dicts\n",
54
+ "df1_schema = {f.name: f.dataType.simpleString() for f in df1.schema.fields}\n",
55
+ "df2_schema = {f.name: f.dataType.simpleString() for f in df2.schema.fields}\n",
56
+ "\n",
57
+ "# Union of all column names\n",
58
+ "all_cols = set(df1_schema.keys()).union(df2_schema.keys())\n",
59
  "\n",
60
+ "# Build comparison rows\n",
61
+ "rows = []\n",
62
+ "for col in sorted(all_cols):\n",
63
+ " in_df1 = col in df1_schema\n",
64
+ " in_df2 = col in df2_schema\n",
65
+ " \n",
66
+ " if in_df1 and in_df2:\n",
67
+ " flag = \"both\"\n",
68
+ " elif in_df1:\n",
69
+ " flag = \"old\"\n",
70
+ " else:\n",
71
+ " flag = \"new\"\n",
72
+ " \n",
73
+ " rows.append({\n",
74
+ " \"column\": col,\n",
75
+ " \"in_df\": flag,\n",
76
+ " \"dtype_old\": df1_schema.get(col),\n",
77
+ " \"dtype_new\": df2_schema.get(col)\n",
78
+ " })\n",
79
  "\n",
80
+ "# Convert to pandas for inspection\n",
81
+ "comparison_df = pd.DataFrame(rows)\n",
82
  "\n",
83
+ "# If you prefer it as a Spark DataFrame:\n",
84
+ "spark = SparkSession.builder.getOrCreate()\n",
85
+ "spark_comparison_df = spark.createDataFrame(comparison_df)\n",
86
+ "\n",
87
+ "display(comparison_df)\n"
88
  ]
89
  },
90
  {
 
97
  "rowLimit": 10000
98
  },
99
  "inputWidgets": {},
100
+ "nuid": "2634e810-1046-456f-a00e-34db0ca198a2",
101
  "showTitle": false,
102
  "tableResultSettingsMap": {},
103
  "title": ""
 
106
  "outputs": [],
107
  "source": [
108
  "from pyspark.sql import functions as F\n",
109
+ "from pyspark.sql.window import Window\n",
110
+ "\n",
111
+ "from pyspark.ml.feature import VectorAssembler, StandardScaler\n",
112
+ "from pyspark.ml.clustering import KMeans\n",
113
+ "\n",
114
+ "import plotly.express as px"
115
  ]
116
  },
117
  {
 
124
  "rowLimit": 10000
125
  },
126
  "inputWidgets": {},
127
+ "nuid": "cc50ff8a-e01c-417d-b926-fecac95265d0",
128
  "showTitle": false,
129
  "tableResultSettingsMap": {},
130
  "title": ""
 
132
  },
133
  "outputs": [],
134
  "source": [
135
+ "grants_per_state_990 = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')\n",
136
+ "grants_per_state_990pf = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990pf_filers')"
137
  ]
138
  },
139
  {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
  "metadata": {
143
  "application/vnd.databricks.v1+cell": {
144
+ "cellMetadata": {
145
+ "byteLimit": 2048000,
146
+ "rowLimit": 10000
147
+ },
148
+ "inputWidgets": {},
149
+ "nuid": "79330be0-c72e-4670-b6bd-b95665af55c8",
150
+ "showTitle": true,
151
+ "tableResultSettingsMap": {},
152
+ "title": "check for EINs in both 990 and 990pf"
153
+ }
154
+ },
155
+ "outputs": [],
156
+ "source": [
157
+ "dual_filers = (\n",
158
+ " grants_per_state_990.select(\n",
159
+ " 'FILEREIN', \n",
160
+ " 'TAXYEAR'\n",
161
+ " )\n",
162
+ " .join(\n",
163
+ " grants_per_state_990pf.select('FILEREIN', 'TAXYEAR'), \n",
164
+ " on=['FILEREIN', 'TAXYEAR'],\n",
165
+ " how='inner'\n",
166
+ " )\n",
167
+ ")\n",
168
+ "\n",
169
+ "display(dual_filers)"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "metadata": {
176
+ "application/vnd.databricks.v1+cell": {
177
+ "cellMetadata": {
178
+ "byteLimit": 2048000,
179
+ "rowLimit": 10000
180
+ },
181
  "inputWidgets": {},
182
+ "nuid": "3cd453d0-0bac-42d7-b9d3-51a30be32e6b",
183
  "showTitle": false,
184
  "tableResultSettingsMap": {},
185
  "title": ""
186
  }
187
  },
188
+ "outputs": [],
189
  "source": [
190
+ "display(grants_per_state_990.filter(F.col('FILEREIN')=='85-0462315'))"
191
  ]
192
  },
193
  {
 
195
  "execution_count": null,
196
  "metadata": {
197
  "application/vnd.databricks.v1+cell": {
198
+ "cellMetadata": {
199
+ "byteLimit": 2048000,
200
+ "rowLimit": 10000
201
+ },
202
  "inputWidgets": {},
203
+ "nuid": "7232db59-693a-43d9-826b-9e6c2a271626",
204
+ "showTitle": false,
205
  "tableResultSettingsMap": {},
206
+ "title": ""
207
  }
208
  },
209
  "outputs": [],
210
  "source": [
211
+ "display(grants_per_state_990pf.filter(F.col('FILEREIN')=='85-0462315'))"
 
 
 
 
 
 
 
 
 
 
 
212
  ]
213
  },
214
  {
 
221
  "rowLimit": 10000
222
  },
223
  "inputWidgets": {},
224
+ "nuid": "64686496-b51f-4aad-ad34-f88e2b69cf61",
225
  "showTitle": true,
226
  "tableResultSettingsMap": {},
227
+ "title": "drop dual filers"
228
  }
229
  },
230
  "outputs": [],
231
  "source": [
232
+ "grants_per_state_990 = grants_per_state_990.join(\n",
233
+ " dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
234
+ " on=['FILEREIN', 'TAXYEAR'],\n",
235
+ " how='left_anti'\n",
236
+ ")\n",
 
 
 
 
 
 
 
237
  "\n",
238
+ "grants_per_state_990pf = grants_per_state_990pf.join(\n",
239
+ " dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
240
+ " on=['FILEREIN', 'TAXYEAR'],\n",
241
+ " how='left_anti'\n",
242
+ ")"
243
  ]
244
  },
245
  {
 
252
  "rowLimit": 10000
253
  },
254
  "inputWidgets": {},
255
+ "nuid": "f0f23c24-a091-49d6-bc7b-64596c89ed0a",
256
  "showTitle": true,
257
  "tableResultSettingsMap": {},
258
+ "title": "combine 990 & 990pf orgs into one df"
259
  }
260
  },
261
  "outputs": [],
262
  "source": [
263
+ "grants_per_state = grants_per_state_990.union(grants_per_state_990pf).orderBy('FILEREIN', 'TAXYEAR')"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ]
265
  },
266
  {
 
273
  "rowLimit": 10000
274
  },
275
  "inputWidgets": {},
276
+ "nuid": "ac11e27e-18a2-41e0-b8a2-2f6889990a02",
277
  "showTitle": false,
278
  "tableResultSettingsMap": {},
279
  "title": ""
 
281
  },
282
  "outputs": [],
283
  "source": [
284
+ "display(grants_per_state)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "markdown",
289
+ "metadata": {
290
+ "application/vnd.databricks.v1+cell": {
291
+ "cellMetadata": {},
292
+ "inputWidgets": {},
293
+ "nuid": "6b8aee01-6623-4556-a717-9f58d8af4b6e",
294
+ "showTitle": false,
295
+ "tableResultSettingsMap": {},
296
+ "title": ""
297
+ }
298
+ },
299
+ "source": [
300
+ "## KMeans Clustering"
301
  ]
302
  },
303
  {
 
305
  "execution_count": null,
306
  "metadata": {
307
  "application/vnd.databricks.v1+cell": {
308
+ "cellMetadata": {
309
+ "byteLimit": 2048000,
310
+ "rowLimit": 10000
311
+ },
312
  "inputWidgets": {},
313
+ "nuid": "013375d3-74b9-48a2-9db0-fe70b09c47f5",
314
  "showTitle": true,
315
  "tableResultSettingsMap": {},
316
+ "title": "feature engineering"
317
  }
318
  },
319
  "outputs": [],
320
  "source": [
321
+ "# Normalize/scale features\n",
322
+ "feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\"]\n",
323
+ "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
324
+ "df_features = assembler.transform(grants_per_state)\n",
325
+ "\n",
326
+ "scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
327
+ "df_scaled = scaler.fit(df_features).transform(df_features)\n",
328
+ "\n",
329
+ "# Create a composite score - optional, may not add value\n",
330
+ "# max_states = grants_per_state.select(F.max('total_recipient_states')).collect()[0][0]\n",
331
+ "# grants_per_state = grants_per_state.withColumn(\n",
332
+ "# \"composite_score\",\n",
333
+ "# 0.5 * (1 - F.col(\"max_recipient_state_percentage\")/100) + \n",
334
+ "# 0.3 * (F.col(\"total_recipient_states\")/max_states) + \n",
335
+ "# 0.2 * (F.col(\"foreign_percentage\")/100)\n",
336
  "# )\n",
337
+ "# feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\", \"composite_score\"]\n",
338
+ "# assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
339
+ "# df_features = assembler.transform(grants_per_state)\n",
340
  "\n",
341
+ "# scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
342
+ "# df_scaled = scaler.fit(df_features).transform(df_features)"
343
  ]
344
  },
345
  {
346
+ "cell_type": "code",
347
+ "execution_count": null,
348
  "metadata": {
349
  "application/vnd.databricks.v1+cell": {
350
+ "cellMetadata": {
351
+ "byteLimit": 2048000,
352
+ "rowLimit": 10000
353
+ },
354
  "inputWidgets": {},
355
+ "nuid": "9bedd689-bb2c-4beb-9f7b-1756ba0c99c5",
356
+ "showTitle": true,
357
  "tableResultSettingsMap": {},
358
+ "title": "clustering"
359
  }
360
  },
361
+ "outputs": [],
362
  "source": [
363
+ "# Clustering on all the scaled features\n",
364
+ "kmeans = KMeans(featuresCol=\"features\", predictionCol=\"cluster\", k=3, seed=42)\n",
365
+ "model = kmeans.fit(df_scaled)\n",
366
+ "\n",
367
+ "# Assign clusters\n",
368
+ "df_clustered = model.transform(df_scaled)"
369
  ]
370
  },
371
  {
 
378
  "rowLimit": 10000
379
  },
380
  "inputWidgets": {},
381
+ "nuid": "4f47c4f6-2143-41a7-b921-55cc3405be3a",
382
  "showTitle": false,
383
  "tableResultSettingsMap": {},
384
  "title": ""
 
386
  },
387
  "outputs": [],
388
  "source": [
389
+ "display(df_clustered)"
 
 
 
 
 
 
 
390
  ]
391
  },
392
  {
 
399
  "rowLimit": 10000
400
  },
401
  "inputWidgets": {},
402
+ "nuid": "5cce9fe9-0c60-4c5d-971e-1460e813a0fc",
403
  "showTitle": false,
404
  "tableResultSettingsMap": {},
405
  "title": ""
 
407
  },
408
  "outputs": [],
409
  "source": [
410
+ "# df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')\n",
411
+ "df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "markdown",
416
+ "metadata": {
417
+ "application/vnd.databricks.v1+cell": {
418
+ "cellMetadata": {},
419
+ "inputWidgets": {},
420
+ "nuid": "86298a0c-3526-4749-84d8-33c4119da0d8",
421
+ "showTitle": false,
422
+ "tableResultSettingsMap": {},
423
+ "title": ""
424
+ }
425
+ },
426
+ "source": [
427
+ "## Cluster Summary - With Composite Feature"
428
  ]
429
  },
430
  {
 
437
  "rowLimit": 10000
438
  },
439
  "inputWidgets": {},
440
+ "nuid": "702ac066-0d69-47e9-9411-f080d3a541ea",
441
  "showTitle": false,
442
  "tableResultSettingsMap": {},
443
  "title": ""
 
445
  },
446
  "outputs": [],
447
  "source": [
448
+ "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "markdown",
453
+ "metadata": {
454
+ "application/vnd.databricks.v1+cell": {
455
+ "cellMetadata": {},
456
+ "inputWidgets": {},
457
+ "nuid": "3926601d-c18f-4fed-a87e-06b762d61c6e",
458
+ "showTitle": false,
459
+ "tableResultSettingsMap": {},
460
+ "title": ""
461
+ }
462
+ },
463
+ "source": [
464
+ "Cluster 0 = local/regional<br>\n",
465
+ "Cluster 1 = international<br>\n",
466
+ "Cluster 2 = national"
467
  ]
468
  },
469
  {
 
476
  "rowLimit": 10000
477
  },
478
  "inputWidgets": {},
479
+ "nuid": "5ae9099f-f596-4565-954a-40035f6eb880",
480
+ "showTitle": true,
481
  "tableResultSettingsMap": {},
482
+ "title": "summarize clusters by original features"
483
  }
484
  },
485
  "outputs": [],
486
  "source": [
487
+ "summary = (\n",
488
+ " df_clustered\n",
489
+ " .groupBy(\"cluster\")\n",
490
+ " .agg(\n",
491
+ " F.count(\"*\").alias(\"count\"),\n",
492
+ " F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
493
+ " F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
494
+ " F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
495
+ " F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
496
+ " F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
497
+ " F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
498
+ " F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
499
+ " F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
500
+ " F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
501
+ " F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
502
+ " F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
503
+ " F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  " )\n",
505
+ " .orderBy(\"cluster\")\n",
506
  ")\n",
507
  "\n",
508
+ "display(summary)"
509
  ]
510
  },
511
  {
 
518
  "rowLimit": 10000
519
  },
520
  "inputWidgets": {},
521
+ "nuid": "def2f16f-982e-448e-9aa8-782aa01c2193",
522
  "showTitle": true,
523
  "tableResultSettingsMap": {},
524
+ "title": "create distribution plots for each cluster (feature: foreign percentage)"
525
  }
526
  },
527
  "outputs": [],
528
  "source": [
529
+ "pdf_clustered = df_clustered.toPandas()\n",
 
530
  "\n",
531
+ "fig_foreign = px.box(\n",
532
+ " pdf_clustered,\n",
533
+ " x=\"cluster\",\n",
534
+ " y=\"foreign_percentage\",\n",
535
+ " title=\"Foreign Percentage by Cluster\",\n",
536
+ " labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  ")\n",
538
+ "fig_foreign.show()"
 
539
  ]
540
  },
541
  {
 
548
  "rowLimit": 10000
549
  },
550
  "inputWidgets": {},
551
+ "nuid": "01020c54-f610-42e2-b43c-f2643f98a576",
552
+ "showTitle": true,
553
  "tableResultSettingsMap": {},
554
+ "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
555
+ }
556
+ },
557
+ "outputs": [],
558
+ "source": [
559
+ "fig_max_recipient = px.box(\n",
560
+ " pdf_clustered,\n",
561
+ " x=\"cluster\",\n",
562
+ " y=\"max_recipient_state_percentage\",\n",
563
+ " title=\"Max Recipient State Percentage by Cluster\",\n",
564
+ " labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
565
+ ")\n",
566
+ "fig_max_recipient.show()"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": null,
572
+ "metadata": {
573
+ "application/vnd.databricks.v1+cell": {
574
+ "cellMetadata": {
575
+ "byteLimit": 2048000,
576
+ "rowLimit": 10000
577
+ },
578
+ "inputWidgets": {},
579
+ "nuid": "170d71cf-cb3f-44d5-bf90-cc9746a3c1d3",
580
+ "showTitle": true,
581
+ "tableResultSettingsMap": {},
582
+ "title": "create distribution plots for each cluster (feature: number of states)"
583
  }
584
  },
585
  "outputs": [],
586
  "source": [
587
+ "fig_total_states = px.box(\n",
588
+ " pdf_clustered,\n",
589
+ " x=\"cluster\",\n",
590
+ " y=\"total_recipient_states\",\n",
591
+ " title=\"Total Recipient States by Cluster\",\n",
592
+ " labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
593
  ")\n",
594
+ "fig_total_states.show()"
595
  ]
596
  },
597
  {
 
600
  "application/vnd.databricks.v1+cell": {
601
  "cellMetadata": {},
602
  "inputWidgets": {},
603
+ "nuid": "12dc14fa-0066-4fe5-8a99-d9c6d05860aa",
604
  "showTitle": false,
605
  "tableResultSettingsMap": {},
606
  "title": ""
607
  }
608
  },
609
  "source": [
610
+ "## Cluster Summary - Without Composite Feature"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": null,
616
+ "metadata": {
617
+ "application/vnd.databricks.v1+cell": {
618
+ "cellMetadata": {
619
+ "byteLimit": 2048000,
620
+ "rowLimit": 10000
621
+ },
622
+ "inputWidgets": {},
623
+ "nuid": "1545cd83-3e0a-43f9-9719-14d0f12f5dcb",
624
+ "showTitle": false,
625
+ "tableResultSettingsMap": {},
626
+ "title": ""
627
+ }
628
+ },
629
+ "outputs": [],
630
+ "source": [
631
+ "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": null,
637
+ "metadata": {
638
+ "application/vnd.databricks.v1+cell": {
639
+ "cellMetadata": {
640
+ "byteLimit": 2048000,
641
+ "rowLimit": 10000
642
+ },
643
+ "inputWidgets": {},
644
+ "nuid": "f4a8fe26-50d1-4d55-bd18-0313a1d55136",
645
+ "showTitle": false,
646
+ "tableResultSettingsMap": {},
647
+ "title": ""
648
+ }
649
+ },
650
+ "outputs": [],
651
+ "source": [
652
+ "display(df_clustered)"
653
  ]
654
  },
655
  {
 
662
  "rowLimit": 10000
663
  },
664
  "inputWidgets": {},
665
+ "nuid": "6b2a98fa-d5ee-4c8f-8aad-208a83b2d145",
666
  "showTitle": false,
667
  "tableResultSettingsMap": {},
668
  "title": ""
 
670
  },
671
  "outputs": [],
672
  "source": [
673
+ "from pyspark.sql import functions as F\n",
674
+ "\n",
675
+ "df_2023 = (\n",
676
+ " df_clustered\n",
677
+ " .filter(F.col(\"TAXYEAR\") == 2023)\n",
678
+ " .withColumn(\"locality\", F.when(F.col(\"cluster\")==0, \"local/regional\").otherwise(F.when(F.col(\"cluster\")==1, \"international\").otherwise(F.when(F.col(\"cluster\")==2, \"national\").otherwise(None))))\n",
679
  " .select(\n",
680
+ " \"FILEREIN\",\n",
681
+ " \"TAXYEAR\",\n",
682
+ " \"FILERUSSTATE\",\n",
683
+ " F.col(\"total_grant_value\").alias(\"value_of_grants\"),\n",
684
+ " F.col(\"total_grant_count\").alias(\"number_of_grants\"),\n",
685
+ " F.col(\"total_recipient_states\").alias(\"number_of_recipient_states\"),\n",
686
+ " F.col(\"foreign_percentage\").alias(\"pct_grant_value_foreign\"),\n",
687
+ " F.col(\"max_recipient_state_percentage\").alias(\"pct_grant_value_top_state\"),\n",
688
+ " F.col(\"top_recipient_state\").alias(\"top_state\"),\n",
689
+ " F.col(\"distinct_recipient_states\").alias(\"recipient_states\"),\n",
690
+ " \"locality\",\n",
691
+ " \"source\",\n",
692
  " )\n",
693
+ ")\n",
694
+ "display(df_2023)"
695
  ]
696
  },
697
  {
 
704
  "rowLimit": 10000
705
  },
706
  "inputWidgets": {},
707
+ "nuid": "2483cf1c-4bed-47f0-91c0-0364e0f0d5da",
708
+ "showTitle": false,
709
+ "tableResultSettingsMap": {},
710
+ "title": ""
711
+ }
712
+ },
713
+ "outputs": [],
714
+ "source": [
715
+ "df_2023.write.mode(\"overwrite\").saveAsTable(\"sandbox_edward.nonprofit_mapping.locality_by_granting_activity_segmentation_funding_orgs_taxyear2023\")"
716
+ ]
717
+ },
718
+ {
719
+ "cell_type": "markdown",
720
+ "metadata": {
721
+ "application/vnd.databricks.v1+cell": {
722
+ "cellMetadata": {},
723
+ "inputWidgets": {},
724
+ "nuid": "0781ad05-93a0-46fe-9357-48fea2039b81",
725
  "showTitle": false,
726
  "tableResultSettingsMap": {},
727
  "title": ""
728
  }
729
  },
730
+ "source": [
731
+ "Cluster 0 = local/regional<br>\n",
732
+ "Cluster 1 = international<br>\n",
733
+ "Cluster 2 = national"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": null,
739
+ "metadata": {
740
+ "application/vnd.databricks.v1+cell": {
741
+ "cellMetadata": {
742
+ "byteLimit": 2048000,
743
+ "rowLimit": 10000
744
+ },
745
+ "inputWidgets": {},
746
+ "nuid": "840cdfdb-dbad-4264-b0fc-85bb060ac2aa",
747
+ "showTitle": true,
748
+ "tableResultSettingsMap": {},
749
+ "title": "summarize clusters by original features"
750
+ }
751
+ },
752
+ "outputs": [],
753
+ "source": [
754
+ "summary = (\n",
755
+ " df_clustered\n",
756
+ " .groupBy(\"cluster\")\n",
757
+ " .agg(\n",
758
+ " F.count(\"*\").alias(\"count\"),\n",
759
+ " F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
760
+ " F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
761
+ " F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
762
+ " F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
763
+ " F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
764
+ " F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
765
+ " F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
766
+ " F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
767
+ " F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
768
+ " F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
769
+ " F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
770
+ " F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
771
+ " )\n",
772
+ " .orderBy(\"cluster\")\n",
773
+ ")\n",
774
+ "\n",
775
+ "display(summary)"
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "code",
780
+ "execution_count": null,
781
+ "metadata": {
782
+ "application/vnd.databricks.v1+cell": {
783
+ "cellMetadata": {
784
+ "byteLimit": 2048000,
785
+ "rowLimit": 10000
786
+ },
787
+ "inputWidgets": {},
788
+ "nuid": "3e9a4fb8-11e3-4734-8d8e-943e03e3b738",
789
+ "showTitle": true,
790
+ "tableResultSettingsMap": {},
791
+ "title": "create distribution plots for each cluster (feature: foreign percentage)"
792
+ }
793
+ },
794
+ "outputs": [],
795
+ "source": [
796
+ "pdf_clustered = df_clustered.toPandas()\n",
797
+ "\n",
798
+ "fig_foreign = px.box(\n",
799
+ " pdf_clustered,\n",
800
+ " x=\"cluster\",\n",
801
+ " y=\"foreign_percentage\",\n",
802
+ " title=\"Foreign Percentage by Cluster\",\n",
803
+ " labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
804
+ ")\n",
805
+ "fig_foreign.show()"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": null,
811
+ "metadata": {
812
+ "application/vnd.databricks.v1+cell": {
813
+ "cellMetadata": {
814
+ "byteLimit": 2048000,
815
+ "rowLimit": 10000
816
+ },
817
+ "inputWidgets": {},
818
+ "nuid": "42bc6e79-cc9f-4745-9751-03c9223a3642",
819
+ "showTitle": true,
820
+ "tableResultSettingsMap": {},
821
+ "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
822
+ }
823
+ },
824
+ "outputs": [],
825
+ "source": [
826
+ "fig_max_recipient = px.box(\n",
827
+ " pdf_clustered,\n",
828
+ " x=\"cluster\",\n",
829
+ " y=\"max_recipient_state_percentage\",\n",
830
+ " title=\"Max Recipient State Percentage by Cluster\",\n",
831
+ " labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
832
+ ")\n",
833
+ "fig_max_recipient.show()"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": null,
839
+ "metadata": {
840
+ "application/vnd.databricks.v1+cell": {
841
+ "cellMetadata": {
842
+ "byteLimit": 2048000,
843
+ "rowLimit": 10000
844
+ },
845
+ "inputWidgets": {},
846
+ "nuid": "b05a45f8-4d81-4ac2-9cfb-b73e18d2051f",
847
+ "showTitle": true,
848
+ "tableResultSettingsMap": {},
849
+ "title": "create distribution plots for each cluster (feature: number of states)"
850
+ }
851
+ },
852
  "outputs": [],
853
  "source": [
854
+ "fig_total_states = px.box(\n",
855
+ " pdf_clustered,\n",
856
+ " x=\"cluster\",\n",
857
+ " y=\"total_recipient_states\",\n",
858
+ " title=\"Total Recipient States by Cluster\",\n",
859
+ " labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
860
+ ")\n",
861
+ "fig_total_states.show()"
862
  ]
863
  },
864
  {
 
868
  "application/vnd.databricks.v1+cell": {
869
  "cellMetadata": {},
870
  "inputWidgets": {},
871
+ "nuid": "3cf19964-c147-4cf4-b7b9-1f31a2e6a256",
872
  "showTitle": false,
873
  "tableResultSettingsMap": {},
874
  "title": ""
 
880
  ],
881
  "metadata": {
882
  "application/vnd.databricks.v1+notebook": {
883
+ "computePreferences": {
884
+ "hardware": {
885
+ "accelerator": null,
886
+ "gpuPoolId": null,
887
+ "memory": null
888
+ }
889
+ },
890
  "dashboards": [],
891
  "environmentMetadata": {
892
  "base_environment": "",
 
897
  "notebookMetadata": {
898
  "pythonIndentUnit": 4
899
  },
900
+ "notebookName": "(Clone) NP04_classification",
901
  "widgets": {}
902
  },
903
  "language_info": {