English
hassaanulhaq01 committed on
Commit
31d89f2
·
verified ·
1 Parent(s): 4b844d3

Add interactive schedule_I notebook from Databricks

Browse files
Files changed (1) hide show
  1. notebooks/NP03_schedule_I.ipynb +635 -0
notebooks/NP03_schedule_I.ipynb ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "application/vnd.databricks.v1+cell": {
7
+ "cellMetadata": {},
8
+ "inputWidgets": {},
9
+ "nuid": "939230d3-02ed-43f2-a10a-e983c2c23964",
10
+ "showTitle": false,
11
+ "tableResultSettingsMap": {},
12
+ "title": ""
13
+ }
14
+ },
15
+ "source": [
16
+ "# Funding organizations - 990 Filers\n",
17
+ "\n",
18
+ "Schedule I is completed by organizations who answer \"Yes\" on Form 990, Part IV, line 21 or 22.\n",
19
+ "\n",
20
+ "Question 21: \n",
21
+ "\"Did the organization report more than $5,000 of grants or other assistance to any domestic organization or domestic government on Part IX, column (A), line 1? If “Yes,” complete Schedule I, Parts I and II\"\n",
22
+ "\n",
23
+ "Question 22: \n",
24
+ "\"Did the organization report more than $5,000 of grants or other assistance to or for domestic individuals on Part IX, column (A), line 2? If “Yes,” complete Schedule I, Parts I and III\""
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {
31
+ "application/vnd.databricks.v1+cell": {
32
+ "cellMetadata": {
33
+ "byteLimit": 2048000,
34
+ "rowLimit": 10000
35
+ },
36
+ "inputWidgets": {},
37
+ "nuid": "9dc61c58-ae98-4c6a-b8a2-cfd67b848b61",
38
+ "showTitle": false,
39
+ "tableResultSettingsMap": {},
40
+ "title": ""
41
+ }
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "from pyspark.sql import functions as F\n",
46
+ "from pyspark.sql.window import Window"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {
53
+ "application/vnd.databricks.v1+cell": {
54
+ "cellMetadata": {
55
+ "byteLimit": 2048000,
56
+ "rowLimit": 10000
57
+ },
58
+ "inputWidgets": {},
59
+ "nuid": "d052b039-2f34-48ce-851e-416a15697695",
60
+ "showTitle": false,
61
+ "tableResultSettingsMap": {},
62
+ "title": ""
63
+ }
64
+ },
65
+ "outputs": [],
66
+ "source": [
67
+ "scheduleigrantsp2 = spark.table(\"prod_curated.irs.scheduleipart2grants\")\n",
68
+ "form990cn120fields = spark.table(\"prod_curated.irs.990standardfields\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "metadata": {
74
+ "application/vnd.databricks.v1+cell": {
75
+ "cellMetadata": {},
76
+ "inputWidgets": {},
77
+ "nuid": "71943bff-f143-4481-82b7-8dfc0c3aa965",
78
+ "showTitle": false,
79
+ "tableResultSettingsMap": {},
80
+ "title": ""
81
+ }
82
+ },
83
+ "source": [
84
+ "## Data checks"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {
91
+ "application/vnd.databricks.v1+cell": {
92
+ "cellMetadata": {},
93
+ "inputWidgets": {},
94
+ "nuid": "29735c68-9c07-49b7-9a37-816d82de393b",
95
+ "showTitle": true,
96
+ "tableResultSettingsMap": {},
97
+ "title": "Check for non-unique filer state records within a tax year"
98
+ }
99
+ },
100
+ "outputs": [],
101
+ "source": [
102
+ "# filer_states = (\n",
103
+ "# form990cn120fields\n",
104
+ "# .select(\n",
105
+ "# 'FILEREIN',\n",
106
+ "# 'TAXYEAR',\n",
107
+ "# 'FILERUSSTATE',\n",
108
+ "# )\n",
109
+ "# .distinct()\n",
110
+ "# .groupBy('FILEREIN', 'TAXYEAR')\n",
111
+ "# .agg(F.countDistinct('FILERUSSTATE').alias('state_count'))\n",
112
+ "# .filter(F.col('state_count') > 1)\n",
113
+ "# )"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": null,
119
+ "metadata": {
120
+ "application/vnd.databricks.v1+cell": {
121
+ "cellMetadata": {
122
+ "byteLimit": 2048000,
123
+ "rowLimit": 10000
124
+ },
125
+ "inputWidgets": {},
126
+ "nuid": "e8dea51f-9016-4a6d-b199-52d5509c6f32",
127
+ "showTitle": true,
128
+ "tableResultSettingsMap": {},
129
+ "title": "Check for non-unique filer city records within a tax year"
130
+ }
131
+ },
132
+ "outputs": [],
133
+ "source": [
134
+ "# filer_cities = (\n",
135
+ "# form990cn120fields\n",
136
+ "# .select(\n",
137
+ "# 'FILEREIN',\n",
138
+ "# 'TAXYEAR',\n",
139
+ "# 'FILERUSCITY',\n",
140
+ "# )\n",
141
+ "# .distinct()\n",
142
+ "# .groupBy('FILEREIN', 'TAXYEAR')\n",
143
+ "# .agg(F.countDistinct('FILERUSCITY').alias('state_count'))\n",
144
+ "# .filter(F.col('state_count') > 1)\n",
145
+ "# )\n",
146
+ "\n",
147
+ "# display(filer_cities)"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "metadata": {
154
+ "application/vnd.databricks.v1+cell": {
155
+ "cellMetadata": {
156
+ "byteLimit": 2048000,
157
+ "rowLimit": 10000
158
+ },
159
+ "inputWidgets": {},
160
+ "nuid": "d928d099-3b12-4c27-af21-bb85f83245d4",
161
+ "showTitle": true,
162
+ "tableResultSettingsMap": {},
163
+ "title": "Check recipient state counts"
164
+ }
165
+ },
166
+ "outputs": [],
167
+ "source": [
168
+ "display(\n",
169
+ " scheduleigrantsp2_cl\n",
170
+ " .groupBy(\n",
171
+ " 'RECTABADDSTA'\n",
172
+ " ).agg(\n",
173
+ " F.count('*')\n",
174
+ " )\n",
175
+ ")\n",
176
+ "\n",
177
+ "# display(\n",
178
+ "# scheduleigrantsp2_cl\n",
179
+ "# .filter(\n",
180
+ "# F.col('RECTABADDSTA') == 'AA'\n",
181
+ "# )\n",
182
+ "# )"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {
189
+ "application/vnd.databricks.v1+cell": {
190
+ "cellMetadata": {
191
+ "byteLimit": 2048000,
192
+ "rowLimit": 10000
193
+ },
194
+ "inputWidgets": {},
195
+ "nuid": "29713546-0dcb-4b5f-8717-0232b4f617d0",
196
+ "showTitle": false,
197
+ "tableResultSettingsMap": {},
198
+ "title": ""
199
+ }
200
+ },
201
+ "outputs": [],
202
+ "source": [
203
+ "display(scheduleigrantsp2.filter(F.col('RECTABADDSTA').isNull()))"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {
210
+ "application/vnd.databricks.v1+cell": {
211
+ "cellMetadata": {},
212
+ "inputWidgets": {},
213
+ "nuid": "9c2e202d-cb25-4753-8b44-0adfa0a97132",
214
+ "showTitle": true,
215
+ "tableResultSettingsMap": {},
216
+ "title": "Check overlap of null grant amounts versus null state codes"
217
+ }
218
+ },
219
+ "outputs": [],
220
+ "source": [
221
+ "# cross_section_counts = (\n",
222
+ "# scheduleigrantsp2_cl\n",
223
+ "# .select(\n",
224
+ "# F.when(F.col('RECTABADDSTA').isNull(), 'NULL').otherwise('NON_NULL').alias('RECTABADDSTA_status'),\n",
225
+ "# F.when(F.col('RETAAMOFCAGR').isNull(), 'NULL').otherwise('NON_NULL').alias('RETAAMOFCAGR_status')\n",
226
+ "# )\n",
227
+ "# .groupBy(\n",
228
+ "# 'RECTABADDSTA_status', 'RETAAMOFCAGR_status'\n",
229
+ "# ).agg(\n",
230
+ "# F.count('*').alias('count')\n",
231
+ "# )\n",
232
+ "# )\n",
233
+ "\n",
234
+ "# display(cross_section_counts)"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "markdown",
239
+ "metadata": {
240
+ "application/vnd.databricks.v1+cell": {
241
+ "cellMetadata": {},
242
+ "inputWidgets": {},
243
+ "nuid": "da3ea6fa-5d11-437e-8705-824ec6ea5d5a",
244
+ "showTitle": false,
245
+ "tableResultSettingsMap": {},
246
+ "title": ""
247
+ }
248
+ },
249
+ "source": [
250
+ "## Schedule I Filers"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "metadata": {
257
+ "application/vnd.databricks.v1+cell": {
258
+ "cellMetadata": {
259
+ "byteLimit": 2048000,
260
+ "rowLimit": 10000
261
+ },
262
+ "inputWidgets": {},
263
+ "nuid": "253b21d0-e9c6-4eea-b74a-942a96192e7f",
264
+ "showTitle": false,
265
+ "tableResultSettingsMap": {},
266
+ "title": ""
267
+ }
268
+ },
269
+ "outputs": [],
270
+ "source": [
271
+ "filer_states = (\n",
272
+ " form990cn120fields\n",
273
+ " .select(\n",
274
+ " 'FILEREIN',\n",
275
+ " 'TAXYEAR',\n",
276
+ " 'FILERUSSTATE',\n",
277
+ " ).distinct()\n",
278
+ ")"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {
285
+ "application/vnd.databricks.v1+cell": {
286
+ "cellMetadata": {
287
+ "byteLimit": 2048000,
288
+ "rowLimit": 10000
289
+ },
290
+ "inputWidgets": {},
291
+ "nuid": "ae49acb7-fc82-4b25-8644-2f1798ac70aa",
292
+ "showTitle": false,
293
+ "tableResultSettingsMap": {},
294
+ "title": ""
295
+ }
296
+ },
297
+ "outputs": [],
298
+ "source": [
299
+ "filer_cities = (\n",
300
+ " form990cn120fields\n",
301
+ " .select(\n",
302
+ " 'FILEREIN',\n",
303
+ " 'TAXYEAR',\n",
304
+ " 'FILERUSCITY',\n",
305
+ " ).distinct()\n",
306
+ ")"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "metadata": {
313
+ "application/vnd.databricks.v1+cell": {
314
+ "cellMetadata": {
315
+ "byteLimit": 2048000,
316
+ "rowLimit": 10000
317
+ },
318
+ "inputWidgets": {},
319
+ "nuid": "952f0e50-716e-42cc-abb7-b88619c7fb86",
320
+ "showTitle": false,
321
+ "tableResultSettingsMap": {},
322
+ "title": ""
323
+ }
324
+ },
325
+ "outputs": [],
326
+ "source": [
327
+ "scheduleigrantsp2_cl = (\n",
328
+ " scheduleigrantsp2\n",
329
+ " .select(\n",
330
+ " 'FILEREIN',\n",
331
+ " 'TAXYEAR',\n",
332
+ " 'RTEINORECIPI',\n",
333
+ " 'RTRNBBNLINE11',\n",
334
+ " 'RECTABADDCIT',\n",
335
+ " 'RECTABADDSTA',\n",
336
+ " 'RETAAMOFCAGR',\n",
337
+ " )\n",
338
+ ")"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "metadata": {
345
+ "application/vnd.databricks.v1+cell": {
346
+ "cellMetadata": {
347
+ "byteLimit": 2048000,
348
+ "rowLimit": 10000
349
+ },
350
+ "inputWidgets": {},
351
+ "nuid": "7f6df2c9-25a9-4eb0-9057-043e77d9b11d",
352
+ "showTitle": false,
353
+ "tableResultSettingsMap": {},
354
+ "title": ""
355
+ }
356
+ },
357
+ "outputs": [],
358
+ "source": [
359
+ "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
360
+ "state_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('total_grant_value'))\n",
361
+ "\n",
362
+ "grants_per_state = (\n",
363
+ " scheduleigrantsp2_cl\n",
364
+ " .filter(\n",
365
+ " F.col('RECTABADDSTA').isNotNull()\n",
366
+ " )\n",
367
+ " .groupBy(\n",
368
+ " 'FILEREIN', 'TAXYEAR', 'RECTABADDSTA'\n",
369
+ " ).agg(\n",
370
+ " F.sum('RETAAMOFCAGR').alias('total_grant_value'),\n",
371
+ " F.count('*').alias('total_grant_count'),\n",
372
+ " ).withColumn(\n",
373
+ " 'total',\n",
374
+ " F.sum('total_grant_value').over(filer_window)\n",
375
+ " ).filter(\n",
376
+ " F.col('total') > 0\n",
377
+ " ).withColumn(\n",
378
+ " 'proportion',\n",
379
+ " F.col('total_grant_value') / F.col('total')\n",
380
+ " ).withColumn(\n",
381
+ " 'total_states',\n",
382
+ " F.count('*').over(filer_window)\n",
383
+ " ).withColumn(\n",
384
+ " 'rank',\n",
385
+ " F.rank().over(state_rank_window) # Rank states within each filer-year based on grant value\n",
386
+ " ).groupBy(\n",
387
+ " 'FILEREIN', 'TAXYEAR'\n",
388
+ " ).agg(\n",
389
+ " F.sum('total_grant_value').alias('total_grant_value'),\n",
390
+ " F.sum('total_grant_count').alias('total_grant_count'),\n",
391
+ " F.first('total_states').alias('total_recipient_states'),\n",
392
+ " F.max('proportion').alias('max_recipient_state_percentage'),\n",
393
+ " F.collect_set('RECTABADDSTA').alias('distinct_recipient_states'), # Collect unique states into a list\n",
394
+ " F.first(F.when(F.col('rank') == 1, F.col('RECTABADDSTA')), ignorenulls=True).alias('top_recipient_state'), # Get state with highest grant value\n",
395
+ " ).join(\n",
396
+ " filer_states,\n",
397
+ " on=['FILEREIN', 'TAXYEAR'],\n",
398
+ " how='left'\n",
399
+ " ).withColumn(\n",
400
+ " 'foreign_percentage', # Schedule I is for domestic grants so assumed 0 foreign\n",
401
+ " F.lit(0) # Added column for combining datasets later on\n",
402
+ " ).select(\n",
403
+ " 'FILEREIN',\n",
404
+ " 'TAXYEAR',\n",
405
+ " 'FILERUSSTATE',\n",
406
+ " 'total_grant_value',\n",
407
+ " 'total_grant_count',\n",
408
+ " 'total_recipient_states',\n",
409
+ " 'foreign_percentage',\n",
410
+ " 'max_recipient_state_percentage',\n",
411
+ " 'distinct_recipient_states',\n",
412
+ " 'top_recipient_state',\n",
413
+ " )\n",
414
+ ")\n",
415
+ "\n",
416
+ "display(grants_per_state)\n"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": null,
422
+ "metadata": {
423
+ "application/vnd.databricks.v1+cell": {
424
+ "cellMetadata": {
425
+ "byteLimit": 2048000,
426
+ "rowLimit": 10000
427
+ },
428
+ "inputWidgets": {},
429
+ "nuid": "9772faeb-640c-4bdb-9a99-8b0e41f3bbda",
430
+ "showTitle": true,
431
+ "tableResultSettingsMap": {},
432
+ "title": "Aggregation at city level could be used to identify funders with local activity"
433
+ }
434
+ },
435
+ "outputs": [],
436
+ "source": [
437
+ "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
438
+ "city_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('total_grant_value'))\n",
439
+ "\n",
440
+ "grants_per_city = (\n",
441
+ " scheduleigrantsp2_cl\n",
442
+ " .filter(\n",
443
+ " F.col('RECTABADDCIT').isNotNull()\n",
444
+ " )\n",
445
+ " .groupBy(\n",
446
+ " 'FILEREIN', 'TAXYEAR', 'RECTABADDCIT'\n",
447
+ " ).agg(\n",
448
+ " F.sum('RETAAMOFCAGR').alias('total_grant_value'),\n",
449
+ " F.count('*').alias('total_grant_count'),\n",
450
+ " ).withColumn(\n",
451
+ " 'total',\n",
452
+ " F.sum('total_grant_value').over(filer_window)\n",
453
+ " ).filter(\n",
454
+ " F.col('total') > 0\n",
455
+ " ).withColumn(\n",
456
+ " 'proportion',\n",
457
+ " F.col('total_grant_value') / F.col('total')\n",
458
+ " ).withColumn(\n",
459
+ " 'total_cities',\n",
460
+ " F.count('*').over(filer_window)\n",
461
+ " ).withColumn(\n",
462
+ " 'rank',\n",
463
+ " F.rank().over(city_rank_window) # Rank cities within each filer-year based on grant value\n",
464
+ " ).groupBy(\n",
465
+ " 'FILEREIN', 'TAXYEAR'\n",
466
+ " ).agg(\n",
467
+ " F.sum('total_grant_value').alias('total_grant_value'),\n",
468
+ " F.sum('total_grant_count').alias('total_grant_count'),\n",
469
+ " F.first('total_cities').alias('total_recipient_cities'),\n",
470
+ " F.max('proportion').alias('max_recipient_city_percentage'),\n",
471
+ " F.collect_set('RECTABADDCIT').alias('distinct_recipient_cities'), # Collect unique cities into a list\n",
472
+ " F.first(F.when(F.col('rank') == 1, F.col('RECTABADDCIT')), ignorenulls=True).alias('top_recipient_city'), # Get city with highest grant value\n",
473
+ " ).join(\n",
474
+ " filer_cities,\n",
475
+ " on=['FILEREIN', 'TAXYEAR'],\n",
476
+ " how='left'\n",
477
+ " ).select(\n",
478
+ " 'FILEREIN',\n",
479
+ " 'TAXYEAR',\n",
480
+ " 'FILERUSCITY',\n",
481
+ " 'total_grant_value',\n",
482
+ " 'total_grant_count',\n",
483
+ " 'total_recipient_cities',\n",
484
+ " 'max_recipient_city_percentage',\n",
485
+ " 'distinct_recipient_cities',\n",
486
+ " 'top_recipient_city',\n",
487
+ " )\n",
488
+ ")\n",
489
+ "\n",
490
+ "display(grants_per_city)\n"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": null,
496
+ "metadata": {
497
+ "application/vnd.databricks.v1+cell": {
498
+ "cellMetadata": {
499
+ "byteLimit": 2048000,
500
+ "rowLimit": 10000
501
+ },
502
+ "inputWidgets": {},
503
+ "nuid": "74a921cc-2325-4a27-88a6-5cfa28f52ac0",
504
+ "showTitle": false,
505
+ "tableResultSettingsMap": {},
506
+ "title": ""
507
+ }
508
+ },
509
+ "outputs": [],
510
+ "source": [
511
+ "grants_per_state = (\n",
512
+ " grants_per_state.withColumn('source', F.lit('990 (domestic grants, schedule I)'))\n",
513
+ ")\n",
514
+ "grants_per_state.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "markdown",
519
+ "metadata": {
520
+ "application/vnd.databricks.v1+cell": {
521
+ "cellMetadata": {},
522
+ "inputWidgets": {},
523
+ "nuid": "78af447a-fd18-4f6d-8283-a431e2d82435",
524
+ "showTitle": false,
525
+ "tableResultSettingsMap": {},
526
+ "title": ""
527
+ }
528
+ },
529
+ "source": [
530
+ "## International activity\n",
531
+ "\n",
532
+ "Would ideally use data in Schedule F but this isn't in Databricks yet. \n",
533
+ "\n",
534
+ "Form 990 questions 14a, 14b, 15, and 16 could be used to identify foreign activity but this information seems to be missing from prod_curated.irs.990cn120fields: \n",
535
+ "F9_04_PC_FOREIGOFFICE (14a) \n",
536
+ "F9_04_PC_FOREIGACTIVI (14b) \n",
537
+ "F9_04_PC_MOTHKTTOORIN (15) \n",
538
+ "F9_04_PC_MOTHKTTOORIN (15) \n",
539
+ "F9_04_PC_MOTHKTTOININ (16) \n",
540
+ "F9_04_PC_MOTHKTTOINND (16) \n",
541
+ "\n",
542
+ "prod_curated.irs.990cn120fields does contain the field F9_09_PC_FOREGRANTOTA from part 9 of the form which totals the amounts given in foreign grants"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "metadata": {
549
+ "application/vnd.databricks.v1+cell": {
550
+ "cellMetadata": {
551
+ "byteLimit": 2048000,
552
+ "rowLimit": 10000
553
+ },
554
+ "inputWidgets": {},
555
+ "nuid": "30d7ddc0-df13-4683-98f5-7b24b918e01c",
556
+ "showTitle": false,
557
+ "tableResultSettingsMap": {},
558
+ "title": ""
559
+ }
560
+ },
561
+ "outputs": [],
562
+ "source": [
563
+ "foreign_activity =(\n",
564
+ " form990cn120fields\n",
565
+ " .select(\n",
566
+ " 'FILEREIN',\n",
567
+ " 'TAXYEAR',\n",
568
+ " 'FOREGRANTOTA',\n",
569
+ " ).filter(\n",
570
+ " F.col('FOREGRANTOTA') > 0\n",
571
+ " )\n",
572
+ ")"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "code",
577
+ "execution_count": null,
578
+ "metadata": {
579
+ "application/vnd.databricks.v1+cell": {
580
+ "cellMetadata": {
581
+ "byteLimit": 2048000,
582
+ "rowLimit": 10000
583
+ },
584
+ "inputWidgets": {},
585
+ "nuid": "582caba3-4d4d-4e64-b222-eb7af12ce6bd",
586
+ "showTitle": false,
587
+ "tableResultSettingsMap": {},
588
+ "title": ""
589
+ }
590
+ },
591
+ "outputs": [],
592
+ "source": [
593
+ "display(foreign_activity)"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": null,
599
+ "metadata": {
600
+ "application/vnd.databricks.v1+cell": {
601
+ "cellMetadata": {},
602
+ "inputWidgets": {},
603
+ "nuid": "00451a40-f571-4bbc-820b-d6fd31b3537a",
604
+ "showTitle": false,
605
+ "tableResultSettingsMap": {},
606
+ "title": ""
607
+ }
608
+ },
609
+ "outputs": [],
610
+ "source": []
611
+ }
612
+ ],
613
+ "metadata": {
614
+ "application/vnd.databricks.v1+notebook": {
615
+ "computePreferences": null,
616
+ "dashboards": [],
617
+ "environmentMetadata": {
618
+ "base_environment": "",
619
+ "environment_version": "2"
620
+ },
621
+ "inputWidgetPreferences": null,
622
+ "language": "python",
623
+ "notebookMetadata": {
624
+ "pythonIndentUnit": 4
625
+ },
626
+ "notebookName": "(Clone) NP03_schedule_I",
627
+ "widgets": {}
628
+ },
629
+ "language_info": {
630
+ "name": "python"
631
+ }
632
+ },
633
+ "nbformat": 4,
634
+ "nbformat_minor": 0
635
+ }