English
hassaanulhaq01 commited on
Commit
bc26f2f
·
verified ·
1 Parent(s): d28f2e7

Add interactive schedule_o notebook from Databricks

Browse files
notebooks/NP01_cleaning-steps-990PF-grants.ipynb ADDED
@@ -0,0 +1,1052 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "application/vnd.databricks.v1+cell": {
8
+ "cellMetadata": {
9
+ "byteLimit": 2048000,
10
+ "rowLimit": 10000
11
+ },
12
+ "inputWidgets": {},
13
+ "nuid": "7c432975-f5e2-406f-9e43-39aeb35c057e",
14
+ "showTitle": false,
15
+ "tableResultSettingsMap": {},
16
+ "title": ""
17
+ }
18
+ },
19
+ "outputs": [],
20
+ "source": [
21
+ "from pyspark.sql import functions as F\n",
22
+ "from pyspark.sql.window import Window"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {
28
+ "application/vnd.databricks.v1+cell": {
29
+ "cellMetadata": {},
30
+ "inputWidgets": {},
31
+ "nuid": "2eabbeba-4c9f-471f-a820-02a4b3a4ce89",
32
+ "showTitle": false,
33
+ "tableResultSettingsMap": {},
34
+ "title": ""
35
+ }
36
+ },
37
+ "source": [
38
+ "# Prepare 990pfp15grants3a dataset\n",
39
+ "\n",
40
+ "This notebook has some data cleaning steps for the 990pfp15grants3a dataset, which contains data from Form 990 PF, Part XIV question 3a (\"Grants and Contributions Paid During the Year\") \n",
41
+ "\n",
42
+ "I've summarized data related to the data cleaning steps [here.](https://docs.google.com/spreadsheets/d/1FLx0uBV0T_3Vq2uVCfZH_Z1DBCQkT4-S6APtTrWyPuc/edit?gid=88674851#gid=88674851) \n",
43
+ "\n",
44
+ "1. The majority of records are missing a valid country code but do have a valid US state code, so we join on valid US state codes to identify the country.\n",
45
+ "2. Filter out records where the grant value is Null or zero.\n",
46
+ "3. This leaves only 1% of records with a null country code. \n",
47
+ "4. For the remainder, I ran the place names and city codes through a geopy lookup to try to identify the country (notebook NP00_geopy-lookups). The majority of entries seem to be in foreign countries, so in the code below I've assumed that any remaining records with null country codes are foreign, but more work will need to be done to validate this."
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {
53
+ "application/vnd.databricks.v1+cell": {
54
+ "cellMetadata": {},
55
+ "inputWidgets": {},
56
+ "nuid": "e81bcd5e-cdd2-4403-ba21-27f7f2d22b8b",
57
+ "showTitle": false,
58
+ "tableResultSettingsMap": {},
59
+ "title": ""
60
+ }
61
+ },
62
+ "source": [
63
+ "## Load data"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {
70
+ "application/vnd.databricks.v1+cell": {
71
+ "cellMetadata": {
72
+ "byteLimit": 2048000,
73
+ "rowLimit": 10000
74
+ },
75
+ "inputWidgets": {},
76
+ "nuid": "882f4738-c66f-4f9c-8415-ec3eb155f562",
77
+ "showTitle": false,
78
+ "tableResultSettingsMap": {},
79
+ "title": ""
80
+ }
81
+ },
82
+ "outputs": [],
83
+ "source": [
84
+ "grants3a = spark.table(\"prod_curated.irs.990pfpart14grants3a\")\n",
85
+ "# geo_data = spark.table(\"sandbox_edward.geo_data.geonames_all_countries\")\n",
86
+ "state_code_lookup = spark.table(\"sandbox_edward.nonprofit_mapping.state_code_lookup\")\n",
87
+ "geopy_place_lookup = spark.table(\"sandbox_edward.nonprofit_mapping.geopy_place_lookup\")\n",
88
+ "geopy_city_lookup = spark.table(\"sandbox_edward.nonprofit_mapping.geopy_city_lookup\")"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {
95
+ "application/vnd.databricks.v1+cell": {
96
+ "cellMetadata": {
97
+ "byteLimit": 2048000,
98
+ "rowLimit": 10000
99
+ },
100
+ "inputWidgets": {},
101
+ "nuid": "f336e511-02ad-4fed-9e2d-eb7bc908a41b",
102
+ "showTitle": true,
103
+ "tableResultSettingsMap": {},
104
+ "title": "Clean grants3a"
105
+ }
106
+ },
107
+ "outputs": [],
108
+ "source": [
109
+ "# Regex to match exactly 9 digits\n",
110
+ "regex_pattern_9zips = r\"^\\d{9}$\"\n",
111
+ "regex_pattern_pcs = r'^(\\w{3,4}) \\w{3}$' # Matches 3 or 4 characters/digits followed by 3 characters/digits\n",
112
+ "\n",
113
+ "grants3a_cleaned = (\n",
114
+ " grants3a\n",
115
+ " .filter(\n",
116
+ " (F.col('SIGOCPYAMOUN') > 0) &\n",
117
+ " (F.col('SIGOCPYAMOUN').isNotNull())\n",
118
+ " ).withColumn( \n",
119
+ " 'original_postcode', # Keep original values in a separate column\n",
120
+ " F.col('SIGOCPYRFAPC')\n",
121
+ " ).withColumn(\n",
122
+ " 'SIGOCPYRFAPC',\n",
123
+ " F.when(\n",
124
+ " F.col(\"SIGOCPYRFAPC\").rlike(regex_pattern_9zips), # Check if the postcode is a nine digit value\n",
125
+ " F.col(\"SIGOCPYRFAPC\").substr(1, 5) # If it is, extract the first five digits\n",
126
+ " ).otherwise(F.col(\"SIGOCPYRFAPC\")) # Otherwise, keep the original value\n",
127
+ " ).withColumn(\n",
128
+ " 'SIGOCPYRFAPC',\n",
129
+ " F.when(\n",
130
+ " F.col(\"SIGOCPYRFAPC\").rlike(regex_pattern_pcs), # Check if it matches the regex\n",
131
+ " F.regexp_extract(F.col('SIGOCPYRFAPC'), regex_pattern_pcs, 1) # Keep only the leading 3-4 character group (capture group 1) if it matches\n",
132
+ " ).otherwise(F.col(\"SIGOCPYRFAPC\")) # Otherwise, keep the original value\n",
133
+ " ).withColumn(\n",
134
+ " 'SIGOCAFFRFACO',\n",
135
+ " F.when(F.col('SIGOCAFFRFACO') == \"USA\", \"US\")\n",
136
+ " .otherwise(F.col('SIGOCAFFRFACO'))\n",
137
+ " ).join(\n",
138
+ " state_code_lookup.select(\n",
139
+ " F.lower(F.col('state_code')).alias('state_code_lower'),\n",
140
+ " F.col('country').alias('country_from_state_code')\n",
141
+ " ),\n",
142
+ " on=F.lower(F.col('SIGOCPYRFAPO')) == F.col('state_code_lower'),\n",
143
+ " how='left'\n",
144
+ " ).join(\n",
145
+ " state_code_lookup.select(\n",
146
+ " F.lower(F.col('state_name')).alias('state_name_lower'),\n",
147
+ " F.col('country').alias('country_from_state_name'),\n",
148
+ " F.col('state_code')\n",
149
+ " ),\n",
150
+ " on=F.lower(F.col('SIGOCPYRFAPO')) == F.col('state_name_lower'),\n",
151
+ " how='left'\n",
152
+ " ).withColumn(\n",
153
+ " 'valid_state_code',\n",
154
+ " F.upper(F.coalesce(F.col('state_code_lower'), F.col('state_code')))\n",
155
+ " ).withColumn(\n",
156
+ " 'valid_state_code', # If the state code is null, assume international\n",
157
+ " F.when(F.col('valid_state_code').isNull(), 'INTL').otherwise(F.col('valid_state_code'))\n",
158
+ " ).withColumn(\n",
159
+ " 'country_from_state_lookup',\n",
160
+ " F.coalesce(F.col('country_from_state_code'), F.col('country_from_state_name'))\n",
161
+ " ).drop(\n",
162
+ " 'state_name_lower',\n",
163
+ " 'state_code_lower',\n",
164
+ " 'state_code',\n",
165
+ " 'country_from_state_code',\n",
166
+ " 'country_from_state_name'\n",
167
+ " ).withColumn(\n",
168
+ " 'country_from_state_lookup',\n",
169
+ " F.coalesce(F.col('country_from_state_lookup'), F.col('SIGOCAFFRFACO'))\n",
170
+ " ).withColumn(\n",
171
+ " 'designated_foreign_org',\n",
172
+ " F.when(\n",
173
+ " (F.lower(F.col('SIGOCPYRFSTA')).like('%forei%')) |\n",
174
+ " F.lower(F.col('SIGOCPYRFSTA')).like('%forgn%'),\n",
175
+ " 1\n",
176
+ " ).otherwise(0)\n",
177
+ " ).withColumn(\n",
178
+ " 'country_from_state_lookup',\n",
179
+ " F.when(F.col('country_from_state_lookup').isNull(), 'Assumed international')\n",
180
+ " .otherwise(F.col('country_from_state_lookup'))\n",
181
+ " )\n",
182
+ ")\n"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {
189
+ "application/vnd.databricks.v1+cell": {
190
+ "cellMetadata": {
191
+ "byteLimit": 2048000,
192
+ "rowLimit": 10000
193
+ },
194
+ "inputWidgets": {},
195
+ "nuid": "bbd388ea-2a1b-471f-84f0-51e3969c4169",
196
+ "showTitle": true,
197
+ "tableResultSettingsMap": {},
198
+ "title": "Write dataset"
199
+ }
200
+ },
201
+ "outputs": [],
202
+ "source": [
203
+ "grants3a_cleaned.write.mode(\"overwrite\").format(\"delta\").saveAsTable('sandbox_edward.nonprofit_mapping.grants3a_cleaned')"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {
210
+ "application/vnd.databricks.v1+cell": {
211
+ "cellMetadata": {
212
+ "byteLimit": 2048000,
213
+ "rowLimit": 10000
214
+ },
215
+ "inputWidgets": {},
216
+ "nuid": "594d69d5-8ade-48de-aed5-a07c41de5038",
217
+ "showTitle": false,
218
+ "tableResultSettingsMap": {},
219
+ "title": ""
220
+ }
221
+ },
222
+ "outputs": [],
223
+ "source": [
224
+ "display(\n",
225
+ " grants3a_cleaned\n",
226
+ " .groupBy(\n",
227
+ " 'country_from_state_lookup'\n",
228
+ " ).agg(\n",
229
+ " F.count('*').alias('count')\n",
230
+ " )\n",
231
+ ")"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "metadata": {
238
+ "application/vnd.databricks.v1+cell": {
239
+ "cellMetadata": {
240
+ "byteLimit": 2048000,
241
+ "rowLimit": 10000
242
+ },
243
+ "inputWidgets": {},
244
+ "nuid": "a5d2c47b-d666-4e63-8726-f1aec028d81e",
245
+ "showTitle": false,
246
+ "tableResultSettingsMap": {},
247
+ "title": ""
248
+ }
249
+ },
250
+ "outputs": [],
251
+ "source": [
252
+ "display(grants3a_cleaned)"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "metadata": {
259
+ "application/vnd.databricks.v1+cell": {
260
+ "cellMetadata": {
261
+ "byteLimit": 2048000,
262
+ "rowLimit": 10000
263
+ },
264
+ "inputWidgets": {},
265
+ "nuid": "3152e313-b318-497f-883b-15071e21b0e8",
266
+ "showTitle": false,
267
+ "tableResultSettingsMap": {},
268
+ "title": ""
269
+ }
270
+ },
271
+ "outputs": [],
272
+ "source": [
273
+ "no_country_no_state_cities = (\n",
274
+ " grants3a_cleaned\n",
275
+ " .filter(F.col('country').isNull() & F.col('SIGOCPYRFAPO').isNull())\n",
276
+ " .groupBy(\n",
277
+ " 'SIGOCPYRFACI'\n",
278
+ " ).agg(\n",
279
+ " F.count('*').alias('count')\n",
280
+ " ).orderBy(\n",
281
+ " F.col('count').desc()\n",
282
+ " )\n",
283
+ ")\n",
284
+ "\n",
285
+ "no_country_no_state_cities.write.format(\"delta\").saveAsTable('sandbox_edward.nonprofit_mapping.no_country_no_state_cities')"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "metadata": {
292
+ "application/vnd.databricks.v1+cell": {
293
+ "cellMetadata": {
294
+ "byteLimit": 2048000,
295
+ "rowLimit": 10000
296
+ },
297
+ "inputWidgets": {},
298
+ "nuid": "d21e77ac-9397-4545-b2ae-fda9e2ab10b1",
299
+ "showTitle": false,
300
+ "tableResultSettingsMap": {},
301
+ "title": ""
302
+ }
303
+ },
304
+ "outputs": [],
305
+ "source": [
306
+ "display(grants3a_cleaned.agg(F.sum('likely_foreign').alias('sum_likely_foreign')))"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "metadata": {
313
+ "application/vnd.databricks.v1+cell": {
314
+ "cellMetadata": {
315
+ "byteLimit": 2048000,
316
+ "rowLimit": 10000
317
+ },
318
+ "inputWidgets": {},
319
+ "nuid": "c75a63c3-3b70-4cfb-9815-d30bd51892b0",
320
+ "showTitle": true,
321
+ "tableResultSettingsMap": {},
322
+ "title": "recipient_countries"
323
+ }
324
+ },
325
+ "outputs": [],
326
+ "source": [
327
+ "window_spec = Window.partitionBy().orderBy(F.col('count').desc())\n",
328
+ "\n",
329
+ "recipient_countries = (\n",
330
+ " grants3a_cleaned\n",
331
+ " .groupBy('SIGOCAFFRFACO')\n",
332
+ " .agg(F.count('*').alias('count'))\n",
333
+ " .withColumn('total', F.sum('count').over(Window.partitionBy()))\n",
334
+ " .withColumn('proportion', F.col('count') / F.col('total'))\n",
335
+ " .withColumn('cumulative_count', F.sum('count').over(window_spec))\n",
336
+ " .withColumn('cumulative_proportion', F.col('cumulative_count') / F.col('total'))\n",
337
+ ")\n",
338
+ "\n",
339
+ "display(recipient_countries.orderBy(F.col('count').desc()))"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "metadata": {
346
+ "application/vnd.databricks.v1+cell": {
347
+ "cellMetadata": {
348
+ "byteLimit": 2048000,
349
+ "rowLimit": 10000
350
+ },
351
+ "inputWidgets": {},
352
+ "nuid": "80d1d532-f6a8-4c70-ad20-7a07abff85a6",
353
+ "showTitle": true,
354
+ "tableResultSettingsMap": {},
355
+ "title": "recipient_states"
356
+ }
357
+ },
358
+ "outputs": [],
359
+ "source": [
360
+ "window_spec = Window.partitionBy().orderBy(F.col('count').desc())\n",
361
+ "\n",
362
+ "recipient_states = (\n",
363
+ " grants3a_cleaned\n",
364
+ " .groupBy('SIGOCPYRFAPO')\n",
365
+ " .agg(F.count('*').alias('count'))\n",
366
+ " .withColumn('total', F.sum('count').over(Window.partitionBy()))\n",
367
+ " .withColumn('proportion', F.col('count') / F.col('total'))\n",
368
+ " .withColumn('cumulative_count', F.sum('count').over(window_spec))\n",
369
+ " .withColumn('cumulative_proportion', F.col('cumulative_count') / F.col('total'))\n",
370
+ ")\n",
371
+ "\n",
372
+ "display(recipient_states.orderBy(F.col('count').desc()))"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": null,
378
+ "metadata": {
379
+ "application/vnd.databricks.v1+cell": {
380
+ "cellMetadata": {
381
+ "byteLimit": 2048000,
382
+ "rowLimit": 10000
383
+ },
384
+ "inputWidgets": {},
385
+ "nuid": "e4583edc-859e-4826-ad4c-f88dfa9979ae",
386
+ "showTitle": false,
387
+ "tableResultSettingsMap": {},
388
+ "title": ""
389
+ }
390
+ },
391
+ "outputs": [],
392
+ "source": [
393
+ "window_spec = Window.partitionBy().orderBy(F.col('count').desc())\n",
394
+ "\n",
395
+ "recipient_country_states = (\n",
396
+ " grants3a_cleaned\n",
397
+ " .groupBy('SIGOCAFFRFACO', 'SIGOCPYRFAPO')\n",
398
+ " .agg(F.count('*').alias('count'))\n",
399
+ " .withColumn('total', F.sum('count').over(Window.partitionBy()))\n",
400
+ " .withColumn('proportion', F.col('count') / F.col('total'))\n",
401
+ " .withColumn('cumulative_count', F.sum('count').over(window_spec))\n",
402
+ " .withColumn('cumulative_proportion', F.col('cumulative_count') / F.col('total'))\n",
403
+ ")\n",
404
+ "\n",
405
+ "display(recipient_country_states.orderBy(F.col('count').desc()))"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": null,
411
+ "metadata": {
412
+ "application/vnd.databricks.v1+cell": {
413
+ "cellMetadata": {
414
+ "byteLimit": 2048000,
415
+ "rowLimit": 10000
416
+ },
417
+ "inputWidgets": {},
418
+ "nuid": "7abdc582-3d63-4755-80d1-cc59d05711ff",
419
+ "showTitle": false,
420
+ "tableResultSettingsMap": {},
421
+ "title": ""
422
+ }
423
+ },
424
+ "outputs": [],
425
+ "source": [
426
+ "output = (\n",
427
+ " grants3a_cleaned\n",
428
+ " .withColumn(\n",
429
+ " 'country_identified',\n",
430
+ " F.when(F.col('SIGOCAFFRFACO').isNotNull(), True)\n",
431
+ " .otherwise(False)\n",
432
+ " ).groupBy(\n",
433
+ " 'FILERNAME1', 'SIGOCPYRFAPO'\n",
434
+ " ).agg(\n",
435
+ " F.count()\n",
436
+ " )\n",
437
+ ")"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": null,
443
+ "metadata": {
444
+ "application/vnd.databricks.v1+cell": {
445
+ "cellMetadata": {
446
+ "byteLimit": 2048000,
447
+ "rowLimit": 10000
448
+ },
449
+ "inputWidgets": {},
450
+ "nuid": "efc2b6c8-7f33-420f-966c-cc2a4edbeb7d",
451
+ "showTitle": true,
452
+ "tableResultSettingsMap": {},
453
+ "title": "inconsistent_records"
454
+ }
455
+ },
456
+ "outputs": [],
457
+ "source": [
458
+ "inconsistent_records = (\n",
459
+ " grants3a\n",
460
+ " .groupBy(\n",
461
+ " 'SIGOCPYRBNBN1', 'SIGOCAFFRFACO'\n",
462
+ " ).agg(\n",
463
+ " F.count('*').alias('count')\n",
464
+ " ).withColumn(\n",
465
+ " 'id',\n",
466
+ " F.row_number().over(Window.partitionBy('SIGOCPYRBNBN1').orderBy(F.col('count').desc()))\n",
467
+ " ).withColumn(\n",
468
+ " 'duplicates',\n",
469
+ " F.max('id').over(Window.partitionBy('SIGOCPYRBNBN1'))\n",
470
+ " ).filter(\n",
471
+ " F.col('duplicates') > 1\n",
472
+ " )\n",
473
+ ")\n",
474
+ "\n",
475
+ "display(inconsistent_records)"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "markdown",
480
+ "metadata": {
481
+ "application/vnd.databricks.v1+cell": {
482
+ "cellMetadata": {},
483
+ "inputWidgets": {},
484
+ "nuid": "9b7da0b7-7be3-4e16-ba13-cbb444bb2747",
485
+ "showTitle": false,
486
+ "tableResultSettingsMap": {},
487
+ "title": ""
488
+ }
489
+ },
490
+ "source": [
491
+ "# Old cleaning attempt - can ignore"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "metadata": {
497
+ "application/vnd.databricks.v1+cell": {
498
+ "cellMetadata": {},
499
+ "inputWidgets": {},
500
+ "nuid": "87d7a6d6-9944-4c69-b895-4f4923e528b0",
501
+ "showTitle": false,
502
+ "tableResultSettingsMap": {},
503
+ "title": ""
504
+ }
505
+ },
506
+ "source": [
507
+ "## Merge country information\n",
508
+ "\n",
509
+ "Column name: SIGOCPYRFACI\\\n",
510
+ "Description: Recipient Foreign Address - City\n",
511
+ "\n",
512
+ "Column name: SIGOCPYRFAPC\\\n",
513
+ "Description: Recipient Foreign Address - Postal code\n",
514
+ "\n",
515
+ "Column name: SIGOCPYRFACO\\\n",
516
+ "Description: Recipient Foreign Address - Country\n",
517
+ "\n",
518
+ "Column name: SIGOCPYRFAPO\\\n",
519
+ "Description: Province or state"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": null,
525
+ "metadata": {
526
+ "application/vnd.databricks.v1+cell": {
527
+ "cellMetadata": {
528
+ "byteLimit": 2048000,
529
+ "rowLimit": 10000
530
+ },
531
+ "inputWidgets": {},
532
+ "nuid": "f6c96bf0-042d-4813-b64e-e2b531ed4d05",
533
+ "showTitle": false,
534
+ "tableResultSettingsMap": {},
535
+ "title": ""
536
+ }
537
+ },
538
+ "outputs": [],
539
+ "source": [
540
+ "# First, check the number of rows in the dataset:\n",
541
+ "# 13,119,448\n",
542
+ "# 13,119,506\n",
543
+ "# 13,119,506\n",
544
+ "# 13,132,628\n",
545
+ "\n",
546
+ "display(grants3a.count())"
547
+ ]
548
+ },
549
+ {
550
+ "cell_type": "code",
551
+ "execution_count": null,
552
+ "metadata": {
553
+ "application/vnd.databricks.v1+cell": {
554
+ "cellMetadata": {
555
+ "byteLimit": 2048000,
556
+ "rowLimit": 10000
557
+ },
558
+ "inputWidgets": {},
559
+ "nuid": "896f06e6-b8e6-4c2a-bff8-7953c243c163",
560
+ "showTitle": true,
561
+ "tableResultSettingsMap": {},
562
+ "title": "convert place information to lower case"
563
+ }
564
+ },
565
+ "outputs": [],
566
+ "source": [
567
+ "grants3a = (\n",
568
+ " grants3a\n",
569
+ " .withColumn('SIGOCPYRFACI', F.lower(F.col('SIGOCPYRFACI'))) # City / place name\n",
570
+ " .withColumn('SIGOCPYRFAPC', F.lower(F.col('SIGOCPYRFAPC'))) # Postcode\n",
571
+ " .withColumn('SIGOCPYRFAPO', F.lower(F.col('SIGOCPYRFAPO'))) # State codes\n",
572
+ " .withColumn('row_id', F.row_number().over(Window.orderBy(F.lit(0))))\n",
573
+ ")"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": null,
579
+ "metadata": {
580
+ "application/vnd.databricks.v1+cell": {
581
+ "cellMetadata": {
582
+ "byteLimit": 2048000,
583
+ "rowLimit": 10000
584
+ },
585
+ "inputWidgets": {},
586
+ "nuid": "4d60a9f8-a8b3-4eb8-aa99-2920450eb909",
587
+ "showTitle": true,
588
+ "tableResultSettingsMap": {},
589
+ "title": "geo_data"
590
+ }
591
+ },
592
+ "outputs": [],
593
+ "source": [
594
+ "geo_data = (\n",
595
+ " geo_data\n",
596
+ " .withColumn('place_name', F.lower(F.col('place_name')))\n",
597
+ " .withColumn('postal_code', F.lower(F.col('postal_code')))\n",
598
+ " .withColumn('admin_code1', F.lower(F.col('admin_code1')))\n",
599
+ " .withColumn('admin_name1', F.lower(F.col('admin_name1')))\n",
600
+ " # .withColumnRenamed('place_name', 'SIGOCPYRFACI')\n",
601
+ " # .withColumnRenamed('postal_code', 'SIGOCPYRFAPC')\n",
602
+ " # .withColumnRenamed('admin_code1', 'SIGOCPYRFAPO')\n",
603
+ ")"
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": null,
609
+ "metadata": {
610
+ "application/vnd.databricks.v1+cell": {
611
+ "cellMetadata": {
612
+ "byteLimit": 2048000,
613
+ "rowLimit": 10000
614
+ },
615
+ "inputWidgets": {},
616
+ "nuid": "9413adb6-10ca-4cef-a6ed-267e7d71e28e",
617
+ "showTitle": true,
618
+ "tableResultSettingsMap": {},
619
+ "title": "joined_geo"
620
+ }
621
+ },
622
+ "outputs": [],
623
+ "source": [
624
+ "# First attempt: match on place name, postcode and state code\n",
625
+ "\n",
626
+ "geo_data_1 = (\n",
627
+ " geo_data\n",
628
+ " .select('country_code', 'place_name', 'postal_code', 'admin_code1')\n",
629
+ " .withColumnRenamed('country_code', 'country_code_nm_pc_st_mtch')\n",
630
+ " .distinct()\n",
631
+ ")\n",
632
+ "\n",
633
+ "joined_geo = (\n",
634
+ " grants3a\n",
635
+ " .join(geo_data_1,\n",
636
+ " on = (\n",
637
+ " (F.col('SIGOCPYRFACI') == F.col('place_name')) &\n",
638
+ " (F.col('SIGOCPYRFAPC') == F.col('postal_code')) &\n",
639
+ " (F.col('SIGOCPYRFAPO') == F.col('admin_code1')) &\n",
640
+ " (F.col('SIGOCAFFRFACO').isNull())\n",
641
+ " ),\n",
642
+ " how='left')\n",
643
+ " .drop('place_name', 'postal_code', 'admin_code1')\n",
644
+ ")\n",
645
+ "\n",
646
+ "display(joined_geo.count())"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "code",
651
+ "execution_count": null,
652
+ "metadata": {
653
+ "application/vnd.databricks.v1+cell": {
654
+ "cellMetadata": {
655
+ "byteLimit": 2048000,
656
+ "rowLimit": 10000
657
+ },
658
+ "inputWidgets": {},
659
+ "nuid": "38b09e83-db48-4e2a-b509-a12ba5e0416f",
660
+ "showTitle": true,
661
+ "tableResultSettingsMap": {},
662
+ "title": "joined_geo2"
663
+ }
664
+ },
665
+ "outputs": [],
666
+ "source": [
667
+ "# Fallback: attempt to match on place name and postcode only (no state code)\n",
668
+ "\n",
669
+ "geo_data_2 = (\n",
670
+ " geo_data\n",
671
+ " .select('country_code', 'place_name', 'postal_code')\n",
672
+ " .withColumnRenamed('country_code', 'country_code_nm_pc_mtch')\n",
673
+ " .distinct()\n",
674
+ ")\n",
675
+ "\n",
676
+ "joined_geo2 = (\n",
677
+ " joined_geo\n",
678
+ " .join(geo_data_2,\n",
679
+ " on = (\n",
680
+ " (F.col('SIGOCPYRFACI') == F.col('place_name')) &\n",
681
+ " (F.col('SIGOCPYRFAPC') == F.col('postal_code')) &\n",
682
+ " (F.col('SIGOCAFFRFACO').isNull()) &\n",
683
+ " (F.col('country_code_nm_pc_st_mtch').isNull())\n",
684
+ " ),\n",
685
+ " how='left')\n",
686
+ " .drop('place_name', 'postal_code')\n",
687
+ ")\n",
688
+ "\n",
689
+ "display(joined_geo2.count())"
690
+ ]
691
+ },
692
+ {
693
+ "cell_type": "code",
694
+ "execution_count": null,
695
+ "metadata": {
696
+ "application/vnd.databricks.v1+cell": {
697
+ "cellMetadata": {
698
+ "byteLimit": 2048000,
699
+ "rowLimit": 10000
700
+ },
701
+ "inputWidgets": {},
702
+ "nuid": "9c68184d-2476-4bc6-8d53-7262785e8b6c",
703
+ "showTitle": true,
704
+ "tableResultSettingsMap": {},
705
+ "title": "joined_geo3"
706
+ }
707
+ },
708
+ "outputs": [],
709
+ "source": [
710
+ "# Second, attempt to match on state code and postcode\n",
711
+ "\n",
712
+ "geo_data_3 = (\n",
713
+ " geo_data\n",
714
+ " .select('country_code', 'admin_code1', 'postal_code')\n",
715
+ " .withColumnRenamed('country_code', 'country_code_st_pc_mtch')\n",
716
+ " .distinct()\n",
717
+ ")\n",
718
+ "\n",
719
+ "# Perform the join with additional conditions in the \"on\" clause\n",
720
+ "joined_geo3 = (\n",
721
+ " joined_geo2\n",
722
+ " .join(\n",
723
+ " geo_data_3,\n",
724
+ " on=(\n",
725
+ " (F.col('SIGOCPYRFAPO') == F.col('admin_code1')) & # Match state/province\n",
726
+ " (F.col('SIGOCPYRFAPC') == F.col('postal_code')) & # Match postcode\n",
727
+ " (F.col('SIGOCAFFRFACO').isNull()) &\n",
728
+ " (F.col('country_code_nm_pc_st_mtch').isNull()) & # Only join when place name & postcode match did not find a match\n",
729
+ " (F.col('country_code_nm_pc_mtch').isNull())\n",
730
+ " ),\n",
731
+ " how='left'\n",
732
+ " )\n",
733
+ " .drop('admin_code1', 'postal_code')\n",
734
+ ")\n",
735
+ "\n",
736
+ "display(joined_geo3.count())"
737
+ ]
738
+ },
739
+ {
740
+ "cell_type": "code",
741
+ "execution_count": null,
742
+ "metadata": {
743
+ "application/vnd.databricks.v1+cell": {
744
+ "cellMetadata": {
745
+ "byteLimit": 2048000,
746
+ "rowLimit": 10000
747
+ },
748
+ "inputWidgets": {},
749
+ "nuid": "995292e5-8832-4728-b891-750a033e025d",
750
+ "showTitle": true,
751
+ "tableResultSettingsMap": {},
752
+ "title": "joined_geo4"
753
+ }
754
+ },
755
+ "outputs": [],
756
+ "source": [
757
+ "# Third, attempt to match on place name and state code\n",
758
+ "\n",
759
+ "geo_data_4 = (\n",
760
+ " geo_data\n",
761
+ " .select('country_code', 'admin_code1', 'place_name')\n",
762
+ " .withColumnRenamed('country_code', 'country_code_st_nm_mtch')\n",
763
+ " .distinct()\n",
764
+ ")\n",
765
+ "\n",
766
+ "# Perform the join with additional conditions in the \"on\" clause\n",
767
+ "joined_geo4 = (\n",
768
+ " joined_geo3\n",
769
+ " .withColumn('SIGOCPYRFAPO', F.lower(F.col('SIGOCPYRFAPO')))\n",
770
+ " .join(\n",
771
+ " geo_data_4,\n",
772
+ " on=(\n",
773
+ " (F.col('SIGOCPYRFACI') == F.col('place_name')) & # Match SIGOCPYRFACI (city / place name)\n",
774
+ " (F.col('SIGOCPYRFAPO') == F.col('admin_code1')) & # Match SIGOCPYRFAPO\n",
775
+ " (F.col('SIGOCAFFRFACO').isNull()) &\n",
776
+ " (F.col('country_code_nm_pc_st_mtch').isNull()) &\n",
777
+ " (F.col('country_code_nm_pc_mtch').isNull()) & \n",
778
+ " (F.col('country_code_st_pc_mtch').isNull()) # Only match in instances where previous matches have failed\n",
779
+ " ),\n",
780
+ " how='left'\n",
781
+ " )\n",
782
+ " .drop('admin_code1', 'place_name')\n",
783
+ ")\n",
784
+ "\n",
785
+ "display(joined_geo4.count())"
786
+ ]
787
+ },
788
+ {
789
+ "cell_type": "code",
790
+ "execution_count": null,
791
+ "metadata": {
792
+ "application/vnd.databricks.v1+cell": {
793
+ "cellMetadata": {
794
+ "byteLimit": 2048000,
795
+ "rowLimit": 10000
796
+ },
797
+ "inputWidgets": {},
798
+ "nuid": "6ea92529-581a-4ad9-98f1-aa5c347422f6",
799
+ "showTitle": true,
800
+ "tableResultSettingsMap": {},
801
+ "title": "join duplicate check"
802
+ }
803
+ },
804
+ "outputs": [],
805
+ "source": [
806
+ "duplicate_window = Window.partitionBy('row_id').orderBy(F.lit(0))\n",
807
+ "\n",
808
+ "joined_geo4 = (\n",
809
+ " joined_geo3\n",
810
+ " .withColumn('repeat_count', F.sum(F.lit(1)).over(duplicate_window))\n",
811
+ ")\n",
812
+ "\n",
813
+ "display(joined_geo4.filter(F.col('repeat_count') > 1))"
814
+ ]
815
+ },
816
+ {
817
+ "cell_type": "code",
818
+ "execution_count": null,
819
+ "metadata": {
820
+ "application/vnd.databricks.v1+cell": {
821
+ "cellMetadata": {
822
+ "byteLimit": 2048000,
823
+ "rowLimit": 10000
824
+ },
825
+ "inputWidgets": {},
826
+ "nuid": "85f9154a-4e4b-4802-a9b8-a71096993ead",
827
+ "showTitle": true,
828
+ "tableResultSettingsMap": {},
829
+ "title": "recipient_countries"
830
+ }
831
+ },
832
+ "outputs": [],
833
+ "source": [
834
+ "window_spec = Window.partitionBy().orderBy(F.col('count').desc())\n",
835
+ "\n",
836
+ "recipient_countries = (\n",
837
+ " joined_geo4\n",
838
+ " .groupBy('SIGOCAFFRFACO', \n",
839
+ " 'country_code_nm_pc_st_mtch',\n",
840
+ " 'country_code_nm_pc_mtch',\n",
841
+ " 'country_code_st_pc_mtch',\n",
842
+ " 'country_code_st_nm_mtch'\n",
843
+ " )\n",
844
+ " .agg(F.count('*').alias('count'))\n",
845
+ " .withColumn('total', F.sum('count').over(Window.partitionBy()))\n",
846
+ " .withColumn('proportion', F.col('count') / F.col('total'))\n",
847
+ " .withColumn('cumulative_count', F.sum('count').over(window_spec))\n",
848
+ " .withColumn('cumulative_proportion', F.col('cumulative_count') / F.col('total'))\n",
849
+ ")\n",
850
+ "\n",
851
+ "display(recipient_countries.orderBy(F.col('count').desc()))"
852
+ ]
853
+ },
854
+ {
855
+ "cell_type": "code",
856
+ "execution_count": null,
857
+ "metadata": {
858
+ "application/vnd.databricks.v1+cell": {
859
+ "cellMetadata": {
860
+ "byteLimit": 2048000,
861
+ "rowLimit": 10000
862
+ },
863
+ "inputWidgets": {},
864
+ "nuid": "70b84b6b-b039-41b9-a8b4-3ae0818520d7",
865
+ "showTitle": true,
866
+ "tableResultSettingsMap": {},
867
+ "title": "missing_countries"
868
+ }
869
+ },
870
+ "outputs": [],
871
+ "source": [
872
+ "missing_countries = (\n",
873
+ " joined_geo4\n",
874
+ " .filter(\n",
875
+ " (F.col('SIGOCAFFRFACO').isNull()) &\n",
876
+ " (F.col('country_code_nm_pc_st_mtch').isNull()) &\n",
877
+ " (F.col('country_code_nm_pc_mtch').isNull()) &\n",
878
+ " (F.col('country_code_st_pc_mtch').isNull()) &\n",
879
+ " (F.col('country_code_st_nm_mtch').isNull()) &\n",
880
+ " (F.col('SIGOCPYAMOUN') > 0)\n",
881
+ " )\n",
882
+ " )"
883
+ ]
884
+ },
885
+ {
886
+ "cell_type": "code",
887
+ "execution_count": null,
888
+ "metadata": {
889
+ "application/vnd.databricks.v1+cell": {
890
+ "cellMetadata": {
891
+ "byteLimit": 2048000,
892
+ "rowLimit": 10000
893
+ },
894
+ "inputWidgets": {},
895
+ "nuid": "f8ffca0f-f043-41df-af46-7f225eee6cc6",
896
+ "showTitle": true,
897
+ "tableResultSettingsMap": {},
898
+ "title": "missing_countries"
899
+ }
900
+ },
901
+ "outputs": [],
902
+ "source": [
903
+ "display(missing_countries.filter(F.col('SIGOCPYRFAPO') == 'ontario'))"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "code",
908
+ "execution_count": null,
909
+ "metadata": {
910
+ "application/vnd.databricks.v1+cell": {
911
+ "cellMetadata": {
912
+ "byteLimit": 2048000,
913
+ "rowLimit": 10000
914
+ },
915
+ "inputWidgets": {},
916
+ "nuid": "a4bcbfa0-4c19-4377-981d-bd9d72a8fd0f",
917
+ "showTitle": true,
918
+ "tableResultSettingsMap": {},
919
+ "title": "missing_countries"
920
+ }
921
+ },
922
+ "outputs": [],
923
+ "source": [
924
+ "display(\n",
925
+ " missing_countries\n",
926
+ " .groupBy(\n",
927
+ " 'SIGOCPYRFAPO'\n",
928
+ " ).agg(\n",
929
+ " F.count('*').alias('count')\n",
930
+ " ).orderBy(\n",
931
+ " 'count',\n",
932
+ " ascending=False\n",
933
+ " )\n",
934
+ ")"
935
+ ]
936
+ },
937
+ {
938
+ "cell_type": "code",
939
+ "execution_count": null,
940
+ "metadata": {
941
+ "application/vnd.databricks.v1+cell": {
942
+ "cellMetadata": {
943
+ "byteLimit": 2048000,
944
+ "rowLimit": 10000
945
+ },
946
+ "inputWidgets": {},
947
+ "nuid": "79c3bb18-50f8-4538-89d7-92288d143cb5",
948
+ "showTitle": true,
949
+ "tableResultSettingsMap": {},
950
+ "title": "missing_countries1"
951
+ }
952
+ },
953
+ "outputs": [],
954
+ "source": [
955
+ "geo_data_5 = (\n",
956
+ " geo_data\n",
957
+ " .select('country_code', 'admin_name1', 'place_name')\n",
958
+ " .withColumnRenamed('country_code', 'country_code_stnm_nm_mtch')\n",
959
+ " .distinct()\n",
960
+ ")\n",
961
+ "\n",
962
+ "# Perform the join with additional conditions in the \"on\" clause\n",
963
+ "missing_countries1 = (\n",
964
+ " missing_countries\n",
965
+ " .withColumn('SIGOCPYRFAPO', F.lower(F.col('SIGOCPYRFAPO')))\n",
966
+ " .join(\n",
967
+ " geo_data_5,\n",
968
+ " on=(\n",
969
+ " (F.col('SIGOCPYRFACI') == F.col('place_name')) & # Match SIGOCPYRFACI (city / place name)\n",
970
+ " (F.col('SIGOCPYRFAPO') == F.col('admin_name1')) & # Match SIGOCPYRFAPO\n",
971
+ " (F.col('SIGOCAFFRFACO').isNull()) &\n",
972
+ " (F.col('country_code_nm_pc_st_mtch').isNull()) &\n",
973
+ " (F.col('country_code_nm_pc_mtch').isNull()) & \n",
974
+ " (F.col('country_code_st_pc_mtch').isNull()) &\n",
975
+ " (F.col('country_code_st_nm_mtch').isNull())\n",
976
+ " ),\n",
977
+ " how='left'\n",
978
+ " )\n",
979
+ " .drop('admin_name1', 'place_name')\n",
980
+ ")"
981
+ ]
982
+ },
983
+ {
984
+ "cell_type": "code",
985
+ "execution_count": null,
986
+ "metadata": {
987
+ "application/vnd.databricks.v1+cell": {
988
+ "cellMetadata": {
989
+ "byteLimit": 2048000,
990
+ "rowLimit": 10000
991
+ },
992
+ "inputWidgets": {},
993
+ "nuid": "2586ab16-8bed-4ef0-8e2c-428702bdb833",
994
+ "showTitle": false,
995
+ "tableResultSettingsMap": {},
996
+ "title": ""
997
+ }
998
+ },
999
+ "outputs": [],
1000
+ "source": [
1001
+ "display(\n",
1002
+ "joined_geo4.filter(\n",
1003
+ " (F.col('SIGOCAFFRFACO').isNull()) &\n",
1004
+ " (F.col('country_code_nm_pc_st_mtch').isNull()) &\n",
1005
+ " (F.col('country_code_nm_pc_mtch').isNull()) &\n",
1006
+ " (F.col('country_code_st_pc_mtch').isNull()) &\n",
1007
+ " (F.col('country_code_st_nm_mtch').isNull()) &\n",
1008
+ " (F.col('SIGOCPYAMOUN') > 0)\n",
1009
+ ")\n",
1010
+ ")"
1011
+ ]
1012
+ }
1013
+ ],
1014
+ "metadata": {
1015
+ "application/vnd.databricks.v1+notebook": {
1016
+ "computePreferences": {
1017
+ "hardware": {
1018
+ "accelerator": null,
1019
+ "gpuPoolId": null,
1020
+ "memory": null
1021
+ }
1022
+ },
1023
+ "dashboards": [],
1024
+ "environmentMetadata": {
1025
+ "base_environment": "",
1026
+ "dependencies": [
1027
+ "pgeocode",
1028
+ "geopy"
1029
+ ],
1030
+ "environment_version": "1"
1031
+ },
1032
+ "inputWidgetPreferences": null,
1033
+ "language": "python",
1034
+ "notebookMetadata": {
1035
+ "mostRecentlyExecutedCommandWithImplicitDF": {
1036
+ "commandId": 567296865538331,
1037
+ "dataframes": [
1038
+ "_sqldf"
1039
+ ]
1040
+ },
1041
+ "pythonIndentUnit": 4
1042
+ },
1043
+ "notebookName": "hhaq_NP01_cleaning-steps-990PF-grants",
1044
+ "widgets": {}
1045
+ },
1046
+ "language_info": {
1047
+ "name": "python"
1048
+ }
1049
+ },
1050
+ "nbformat": 4,
1051
+ "nbformat_minor": 0
1052
+ }