English
hassaanulhaq01 committed on
Commit
59a9eee
·
verified ·
1 Parent(s): bc26f2f

Add interactive schedule_o notebook from Databricks

Browse files
notebooks/NP02_990PF-grants-summary.ipynb ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "application/vnd.databricks.v1+cell": {
8
+ "cellMetadata": {
9
+ "byteLimit": 2048000,
10
+ "rowLimit": 10000
11
+ },
12
+ "inputWidgets": {},
13
+ "nuid": "eb152d62-ceb8-4b46-80d9-c77ceaca41a4",
14
+ "showTitle": false,
15
+ "tableResultSettingsMap": {},
16
+ "title": ""
17
+ }
18
+ },
19
+ "outputs": [],
20
+ "source": [
21
+ "from pyspark.sql import functions as F\n",
22
+ "from pyspark.sql.window import Window"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {
29
+ "application/vnd.databricks.v1+cell": {
30
+ "cellMetadata": {
31
+ "byteLimit": 2048000,
32
+ "rowLimit": 10000
33
+ },
34
+ "inputWidgets": {},
35
+ "nuid": "82b0ab9b-6936-4513-8f67-68ecfd0b16f6",
36
+ "showTitle": true,
37
+ "tableResultSettingsMap": {},
38
+ "title": "Load table built in NP01_cleaning-steps-990PF-grants"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "grants3a_cleaned = spark.table(\"sandbox_edward.nonprofit_mapping.grants3a_cleaned\")"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "metadata": {
50
+ "application/vnd.databricks.v1+cell": {
51
+ "cellMetadata": {
52
+ "byteLimit": 2048000,
53
+ "rowLimit": 10000
54
+ },
55
+ "inputWidgets": {},
56
+ "nuid": "3f193ace-ecf5-4f55-b2a5-82b94eaf1a0e",
57
+ "showTitle": false,
58
+ "tableResultSettingsMap": {},
59
+ "title": ""
60
+ }
61
+ },
62
+ "outputs": [],
63
+ "source": [
64
+ "#pfpart6astatesregistered = spark.table(\"prod_curated.irs.990pfpart6astatesregistered\")\n",
65
+ "pfstandardfields = spark.table(\"prod_curated.irs.990pfstandardfields\")"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "metadata": {
72
+ "application/vnd.databricks.v1+cell": {
73
+ "cellMetadata": {
74
+ "byteLimit": 2048000,
75
+ "rowLimit": 10000
76
+ },
77
+ "inputWidgets": {},
78
+ "nuid": "ca110791-727b-4c86-9194-46f27e976fcd",
79
+ "showTitle": true,
80
+ "tableResultSettingsMap": {},
81
+ "title": "Check for duplicate state registrations by filer / tax year"
82
+ }
83
+ },
84
+ "outputs": [],
85
+ "source": [
86
+ "display(\n",
87
+ " pfstandardfields\n",
88
+ " .groupBy(\n",
89
+ " 'FILEREIN', 'TAXYEAR'\n",
90
+ " ).agg(\n",
91
+ " F.countDistinct('FILERUSSTATE').alias('filer_total_registered_states')\n",
92
+ " ).orderBy(\n",
93
+ " 'filer_total_registered_states',\n",
94
+ " ascending=False\n",
95
+ " )\n",
96
+ ")"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "metadata": {
103
+ "application/vnd.databricks.v1+cell": {
104
+ "cellMetadata": {
105
+ "byteLimit": 2048000,
106
+ "rowLimit": 10000
107
+ },
108
+ "inputWidgets": {},
109
+ "nuid": "f20fa65c-77f1-4a40-9162-8ca929405d5d",
110
+ "showTitle": false,
111
+ "tableResultSettingsMap": {},
112
+ "title": ""
113
+ }
114
+ },
115
+ "outputs": [],
116
+ "source": [
117
+ "filer_states = (\n",
118
+ " pfstandardfields\n",
119
+ " .select(\n",
120
+ " 'FILEREIN',\n",
121
+ " 'TAXYEAR',\n",
122
+ " 'FILERUSSTATE',\n",
123
+ " ).distinct()\n",
124
+ ")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "metadata": {
131
+ "application/vnd.databricks.v1+cell": {
132
+ "cellMetadata": {
133
+ "byteLimit": 2048000,
134
+ "rowLimit": 10000
135
+ },
136
+ "inputWidgets": {},
137
+ "nuid": "33e98457-6688-4349-8eff-d65aed0fef95",
138
+ "showTitle": false,
139
+ "tableResultSettingsMap": {},
140
+ "title": ""
141
+ }
142
+ },
143
+ "outputs": [],
144
+ "source": [
145
+ "filer_window = Window.partitionBy('FILEREIN', 'TAXYEAR')\n",
146
+ "state_rank_window = Window.partitionBy('FILEREIN', 'TAXYEAR').orderBy(F.desc('proportion'))\n",
147
+ "\n",
148
+ "grants_per_state = (\n",
149
+ " grants3a_cleaned\n",
150
+ " # .filter(\n",
151
+ " # F.col('valid_state_code').isNotNull()\n",
152
+ " # )\n",
153
+ " .groupBy(\n",
154
+ " 'FILEREIN', 'TAXYEAR', 'valid_state_code',\n",
155
+ " ).agg(\n",
156
+ " F.sum('SIGOCPYAMOUN').alias('total_grant_value'),\n",
157
+ " F.count('*').alias('total_grant_count'),\n",
158
+ " ).withColumn(\n",
159
+ " 'total',\n",
160
+ " F.sum('total_grant_value').over(filer_window)\n",
161
+ " ).filter(\n",
162
+ " F.col('total') > 0\n",
163
+ " ).withColumn(\n",
164
+ " 'proportion',\n",
165
+ " F.when(F.col('valid_state_code') != 'INTL', F.col('total_grant_value') / F.col('total')).otherwise(0) # Exclude international grants - calculate separately\n",
166
+ " ).withColumn(\n",
167
+ " 'total_states',\n",
168
+ " F.sum(F.when(F.col(\"valid_state_code\") != \"INTL\", 1).otherwise(0)).over(filer_window) # Count number of states, excluding international grants\n",
169
+ " ).withColumn(\n",
170
+ " 'rank',\n",
171
+ " F.rank().over(state_rank_window) # Rank states within each filer-year based on grant value - rank by \"proportion\" to avoid INTL as the 1st state\n",
172
+ " ).withColumn(\n",
173
+ " 'pct_foreign', \n",
174
+ " F.when(F.col('valid_state_code') == 'INTL', F.col('total_grant_value') / F.col('total')).otherwise(0) # Calculate percent of foreign grants\n",
175
+ " ).groupBy(\n",
176
+ " 'FILEREIN', 'TAXYEAR',\n",
177
+ " ).agg(\n",
178
+ " F.sum('total_grant_value').alias('total_grant_value'), # Includes foreign grants\n",
179
+ " F.sum('total_grant_count').alias('total_grant_count'), # Includes foreign grants\n",
180
+ " F.first('total_states').alias('total_recipient_states'),\n",
181
+ " F.max('pct_foreign').alias('foreign_percentage'),\n",
182
+ " F.max('proportion').alias('max_recipient_state_percentage'),\n",
183
+ " F.collect_set(F.when(F.col(\"valid_state_code\") != \"INTL\", F.col(\"valid_state_code\"))).alias('distinct_recipient_states'), # Collect unique states into a list\n",
184
+ " F.first(F.when(F.col('rank') == 1, F.col('valid_state_code')), ignorenulls=True).alias('top_recipient_state'), # Top-ranked state by 'proportion' (INTL rows have proportion 0, so INTL is returned only when no non-INTL state has a positive share)\n",
185
+ " ).join(\n",
186
+ " filer_states,\n",
187
+ " on=['FILEREIN', 'TAXYEAR'],\n",
188
+ " how='left'\n",
189
+ " ).select(\n",
190
+ " 'FILEREIN',\n",
191
+ " 'TAXYEAR',\n",
192
+ " 'FILERUSSTATE',\n",
193
+ " 'total_grant_value',\n",
194
+ " 'total_grant_count',\n",
195
+ " 'total_recipient_states',\n",
196
+ " 'foreign_percentage',\n",
197
+ " 'max_recipient_state_percentage',\n",
198
+ " 'distinct_recipient_states',\n",
199
+ " 'top_recipient_state',\n",
200
+ " )\n",
201
+ ")\n",
202
+ "\n",
203
+ "display(grants_per_state)\n"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {
210
+ "application/vnd.databricks.v1+cell": {
211
+ "cellMetadata": {
212
+ "byteLimit": 2048000,
213
+ "rowLimit": 10000
214
+ },
215
+ "inputWidgets": {},
216
+ "nuid": "793c9ddc-3d69-40f0-a228-c1f77dc80ab9",
217
+ "showTitle": true,
218
+ "tableResultSettingsMap": {},
219
+ "title": "Note: some FILEREINs don't have a record in pfstandardfields"
220
+ }
221
+ },
222
+ "outputs": [],
223
+ "source": [
224
+ "display(pfstandardfields.filter(F.col('FILEREIN') == '01-0277832'))"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {
231
+ "application/vnd.databricks.v1+cell": {
232
+ "cellMetadata": {
233
+ "byteLimit": 2048000,
234
+ "rowLimit": 10000
235
+ },
236
+ "inputWidgets": {},
237
+ "nuid": "4cc7ccb8-faa6-4321-926e-10f73df0b909",
238
+ "showTitle": false,
239
+ "tableResultSettingsMap": {},
240
+ "title": ""
241
+ }
242
+ },
243
+ "outputs": [],
244
+ "source": [
245
+ "foreign_activity =(\n",
246
+ " grants3a_cleaned\n",
247
+ " .select(\n",
248
+ " 'FILEREIN',\n",
249
+ " 'TAXYEAR',\n",
250
+ " # 'FOREGRANTOTA',\n",
251
+ " 'country_from_state_lookup',\n",
252
+ " ).filter(\n",
253
+ " F.col('country_from_state_lookup') == 'Assumed international'\n",
254
+ " )\n",
255
+ ")\n",
256
+ "\n",
257
+ "display(foreign_activity)"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "metadata": {
264
+ "application/vnd.databricks.v1+cell": {
265
+ "cellMetadata": {
266
+ "byteLimit": 2048000,
267
+ "rowLimit": 10000
268
+ },
269
+ "inputWidgets": {},
270
+ "nuid": "abe8358b-eac8-4bd2-8c55-b332eaec1988",
271
+ "showTitle": false,
272
+ "tableResultSettingsMap": {},
273
+ "title": ""
274
+ }
275
+ },
276
+ "outputs": [],
277
+ "source": [
278
+ "display(grants3a_cleaned.filter(F.col('FILEREIN')=='01-0351827')) "
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {
285
+ "application/vnd.databricks.v1+cell": {
286
+ "cellMetadata": {
287
+ "byteLimit": 2048000,
288
+ "rowLimit": 10000
289
+ },
290
+ "inputWidgets": {},
291
+ "nuid": "aa091446-359a-4d39-8346-777f9b4b2610",
292
+ "showTitle": false,
293
+ "tableResultSettingsMap": {},
294
+ "title": ""
295
+ }
296
+ },
297
+ "outputs": [],
298
+ "source": [
299
+ "grants_per_state = (\n",
300
+ " grants_per_state.withColumn('source', F.lit('990pf (grants, part XIV-3a)'))\n",
301
+ ")\n",
302
+ "grants_per_state.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.grants_per_state_990pf_filers')"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "metadata": {
309
+ "application/vnd.databricks.v1+cell": {
310
+ "cellMetadata": {},
311
+ "inputWidgets": {},
312
+ "nuid": "b81634c6-e989-420f-8206-f358f0eed425",
313
+ "showTitle": false,
314
+ "tableResultSettingsMap": {},
315
+ "title": ""
316
+ }
317
+ },
318
+ "outputs": [],
319
+ "source": []
320
+ }
321
+ ],
322
+ "metadata": {
323
+ "application/vnd.databricks.v1+notebook": {
324
+ "computePreferences": null,
325
+ "dashboards": [],
326
+ "environmentMetadata": {
327
+ "base_environment": "",
328
+ "environment_version": "2"
329
+ },
330
+ "inputWidgetPreferences": null,
331
+ "language": "python",
332
+ "notebookMetadata": {
333
+ "pythonIndentUnit": 4
334
+ },
335
+ "notebookName": "NP02_990PF-grants-summary",
336
+ "widgets": {}
337
+ },
338
+ "language_info": {
339
+ "name": "python"
340
+ }
341
+ },
342
+ "nbformat": 4,
343
+ "nbformat_minor": 0
344
+ }