English
hassaanulhaq01 committed on
Commit
4b844d3
·
verified ·
1 Parent(s): f6f26a4

Delete notebooks/NP03_schedule_I.ipynb

Browse files
Files changed (1) hide show
  1. notebooks/NP03_schedule_I.ipynb +0 -909
notebooks/NP03_schedule_I.ipynb DELETED
@@ -1,909 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {
7
- "application/vnd.databricks.v1+cell": {
8
- "cellMetadata": {
9
- "byteLimit": 2048000,
10
- "rowLimit": 10000
11
- },
12
- "inputWidgets": {},
13
- "nuid": "62d4799f-4935-4a2d-8f0a-5f6383b22cf7",
14
- "showTitle": false,
15
- "tableResultSettingsMap": {},
16
- "title": ""
17
- }
18
- },
19
- "outputs": [],
20
- "source": [
21
- "df1 = spark.read.table(\"prod_curated.irs.990cn120fields\")\n",
22
- "df2 = spark.read.table(\"prod_curated.irs.990standardfields\")"
23
- ]
24
- },
25
- {
26
- "cell_type": "code",
27
- "execution_count": null,
28
- "metadata": {
29
- "application/vnd.databricks.v1+cell": {
30
- "cellMetadata": {
31
- "byteLimit": 2048000,
32
- "rowLimit": 10000
33
- },
34
- "inputWidgets": {},
35
- "nuid": "d5410f3d-7463-43f5-8bfb-528d36e80b42",
36
- "showTitle": false,
37
- "tableResultSettingsMap": {
38
- "0": {
39
- "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{\"column\":116},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1758734440525}",
40
- "filterBlob": null,
41
- "queryPlanFiltersBlob": null,
42
- "tableResultIndex": 0
43
- }
44
- },
45
- "title": ""
46
- }
47
- },
48
- "outputs": [],
49
- "source": [
50
- "from pyspark.sql import SparkSession\n",
51
- "import pandas as pd\n",
52
- "\n",
53
- "# Extract (col, dtype) as dicts\n",
54
- "df1_schema = {f.name: f.dataType.simpleString() for f in df1.schema.fields}\n",
55
- "df2_schema = {f.name: f.dataType.simpleString() for f in df2.schema.fields}\n",
56
- "\n",
57
- "# Union of all column names\n",
58
- "all_cols = set(df1_schema.keys()).union(df2_schema.keys())\n",
59
- "\n",
60
- "# Build comparison rows\n",
61
- "rows = []\n",
62
- "for col in sorted(all_cols):\n",
63
- " in_df1 = col in df1_schema\n",
64
- " in_df2 = col in df2_schema\n",
65
- " \n",
66
- " if in_df1 and in_df2:\n",
67
- " flag = \"both\"\n",
68
- " elif in_df1:\n",
69
- " flag = \"old\"\n",
70
- " else:\n",
71
- " flag = \"new\"\n",
72
- " \n",
73
- " rows.append({\n",
74
- " \"column\": col,\n",
75
- " \"in_df\": flag,\n",
76
- " \"dtype_old\": df1_schema.get(col),\n",
77
- " \"dtype_new\": df2_schema.get(col)\n",
78
- " })\n",
79
- "\n",
80
- "# Convert to pandas for inspection\n",
81
- "comparison_df = pd.DataFrame(rows)\n",
82
- "\n",
83
- "# If you prefer it as a Spark DataFrame:\n",
84
- "spark = SparkSession.builder.getOrCreate()\n",
85
- "spark_comparison_df = spark.createDataFrame(comparison_df)\n",
86
- "\n",
87
- "display(comparison_df)\n"
88
- ]
89
- },
90
- {
91
- "cell_type": "code",
92
- "execution_count": null,
93
- "metadata": {
94
- "application/vnd.databricks.v1+cell": {
95
- "cellMetadata": {
96
- "byteLimit": 2048000,
97
- "rowLimit": 10000
98
- },
99
- "inputWidgets": {},
100
- "nuid": "2634e810-1046-456f-a00e-34db0ca198a2",
101
- "showTitle": false,
102
- "tableResultSettingsMap": {},
103
- "title": ""
104
- }
105
- },
106
- "outputs": [],
107
- "source": [
108
- "from pyspark.sql import functions as F\n",
109
- "from pyspark.sql.window import Window\n",
110
- "\n",
111
- "from pyspark.ml.feature import VectorAssembler, StandardScaler\n",
112
- "from pyspark.ml.clustering import KMeans\n",
113
- "\n",
114
- "import plotly.express as px"
115
- ]
116
- },
117
- {
118
- "cell_type": "code",
119
- "execution_count": null,
120
- "metadata": {
121
- "application/vnd.databricks.v1+cell": {
122
- "cellMetadata": {
123
- "byteLimit": 2048000,
124
- "rowLimit": 10000
125
- },
126
- "inputWidgets": {},
127
- "nuid": "cc50ff8a-e01c-417d-b926-fecac95265d0",
128
- "showTitle": false,
129
- "tableResultSettingsMap": {},
130
- "title": ""
131
- }
132
- },
133
- "outputs": [],
134
- "source": [
135
- "grants_per_state_990 = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')\n",
136
- "grants_per_state_990pf = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990pf_filers')"
137
- ]
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": null,
142
- "metadata": {
143
- "application/vnd.databricks.v1+cell": {
144
- "cellMetadata": {
145
- "byteLimit": 2048000,
146
- "rowLimit": 10000
147
- },
148
- "inputWidgets": {},
149
- "nuid": "79330be0-c72e-4670-b6bd-b95665af55c8",
150
- "showTitle": true,
151
- "tableResultSettingsMap": {},
152
- "title": "check for EINs in both 990 and 990pf"
153
- }
154
- },
155
- "outputs": [],
156
- "source": [
157
- "dual_filers = (\n",
158
- " grants_per_state_990.select(\n",
159
- " 'FILEREIN', \n",
160
- " 'TAXYEAR'\n",
161
- " )\n",
162
- " .join(\n",
163
- " grants_per_state_990pf.select('FILEREIN', 'TAXYEAR'), \n",
164
- " on=['FILEREIN', 'TAXYEAR'],\n",
165
- " how='inner'\n",
166
- " )\n",
167
- ")\n",
168
- "\n",
169
- "display(dual_filers)"
170
- ]
171
- },
172
- {
173
- "cell_type": "code",
174
- "execution_count": null,
175
- "metadata": {
176
- "application/vnd.databricks.v1+cell": {
177
- "cellMetadata": {
178
- "byteLimit": 2048000,
179
- "rowLimit": 10000
180
- },
181
- "inputWidgets": {},
182
- "nuid": "3cd453d0-0bac-42d7-b9d3-51a30be32e6b",
183
- "showTitle": false,
184
- "tableResultSettingsMap": {},
185
- "title": ""
186
- }
187
- },
188
- "outputs": [],
189
- "source": [
190
- "display(grants_per_state_990.filter(F.col('FILEREIN')=='85-0462315'))"
191
- ]
192
- },
193
- {
194
- "cell_type": "code",
195
- "execution_count": null,
196
- "metadata": {
197
- "application/vnd.databricks.v1+cell": {
198
- "cellMetadata": {
199
- "byteLimit": 2048000,
200
- "rowLimit": 10000
201
- },
202
- "inputWidgets": {},
203
- "nuid": "7232db59-693a-43d9-826b-9e6c2a271626",
204
- "showTitle": false,
205
- "tableResultSettingsMap": {},
206
- "title": ""
207
- }
208
- },
209
- "outputs": [],
210
- "source": [
211
- "display(grants_per_state_990pf.filter(F.col('FILEREIN')=='85-0462315'))"
212
- ]
213
- },
214
- {
215
- "cell_type": "code",
216
- "execution_count": null,
217
- "metadata": {
218
- "application/vnd.databricks.v1+cell": {
219
- "cellMetadata": {
220
- "byteLimit": 2048000,
221
- "rowLimit": 10000
222
- },
223
- "inputWidgets": {},
224
- "nuid": "64686496-b51f-4aad-ad34-f88e2b69cf61",
225
- "showTitle": true,
226
- "tableResultSettingsMap": {},
227
- "title": "drop dual filers"
228
- }
229
- },
230
- "outputs": [],
231
- "source": [
232
- "grants_per_state_990 = grants_per_state_990.join(\n",
233
- " dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
234
- " on=['FILEREIN', 'TAXYEAR'],\n",
235
- " how='left_anti'\n",
236
- ")\n",
237
- "\n",
238
- "grants_per_state_990pf = grants_per_state_990pf.join(\n",
239
- " dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
240
- " on=['FILEREIN', 'TAXYEAR'],\n",
241
- " how='left_anti'\n",
242
- ")"
243
- ]
244
- },
245
- {
246
- "cell_type": "code",
247
- "execution_count": null,
248
- "metadata": {
249
- "application/vnd.databricks.v1+cell": {
250
- "cellMetadata": {
251
- "byteLimit": 2048000,
252
- "rowLimit": 10000
253
- },
254
- "inputWidgets": {},
255
- "nuid": "f0f23c24-a091-49d6-bc7b-64596c89ed0a",
256
- "showTitle": true,
257
- "tableResultSettingsMap": {},
258
- "title": "combine 990 & 990pf orgs into one df"
259
- }
260
- },
261
- "outputs": [],
262
- "source": [
263
- "grants_per_state = grants_per_state_990.union(grants_per_state_990pf).orderBy('FILEREIN', 'TAXYEAR')"
264
- ]
265
- },
266
- {
267
- "cell_type": "code",
268
- "execution_count": null,
269
- "metadata": {
270
- "application/vnd.databricks.v1+cell": {
271
- "cellMetadata": {
272
- "byteLimit": 2048000,
273
- "rowLimit": 10000
274
- },
275
- "inputWidgets": {},
276
- "nuid": "ac11e27e-18a2-41e0-b8a2-2f6889990a02",
277
- "showTitle": false,
278
- "tableResultSettingsMap": {},
279
- "title": ""
280
- }
281
- },
282
- "outputs": [],
283
- "source": [
284
- "display(grants_per_state)"
285
- ]
286
- },
287
- {
288
- "cell_type": "markdown",
289
- "metadata": {
290
- "application/vnd.databricks.v1+cell": {
291
- "cellMetadata": {},
292
- "inputWidgets": {},
293
- "nuid": "6b8aee01-6623-4556-a717-9f58d8af4b6e",
294
- "showTitle": false,
295
- "tableResultSettingsMap": {},
296
- "title": ""
297
- }
298
- },
299
- "source": [
300
- "## KMeans Clustering"
301
- ]
302
- },
303
- {
304
- "cell_type": "code",
305
- "execution_count": null,
306
- "metadata": {
307
- "application/vnd.databricks.v1+cell": {
308
- "cellMetadata": {
309
- "byteLimit": 2048000,
310
- "rowLimit": 10000
311
- },
312
- "inputWidgets": {},
313
- "nuid": "013375d3-74b9-48a2-9db0-fe70b09c47f5",
314
- "showTitle": true,
315
- "tableResultSettingsMap": {},
316
- "title": "feature engineering"
317
- }
318
- },
319
- "outputs": [],
320
- "source": [
321
- "# Normalize/scale features\n",
322
- "feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\"]\n",
323
- "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
324
- "df_features = assembler.transform(grants_per_state)\n",
325
- "\n",
326
- "scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
327
- "df_scaled = scaler.fit(df_features).transform(df_features)\n",
328
- "\n",
329
- "# Create a composite score - optional, may not add value\n",
330
- "# max_states = grants_per_state.select(F.max('total_recipient_states')).collect()[0][0]\n",
331
- "# grants_per_state = grants_per_state.withColumn(\n",
332
- "# \"composite_score\",\n",
333
- "# 0.5 * (1 - F.col(\"max_recipient_state_percentage\")/100) + \n",
334
- "# 0.3 * (F.col(\"total_recipient_states\")/max_states) + \n",
335
- "# 0.2 * (F.col(\"foreign_percentage\")/100)\n",
336
- "# )\n",
337
- "# feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\", \"composite_score\"]\n",
338
- "# assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
339
- "# df_features = assembler.transform(grants_per_state)\n",
340
- "\n",
341
- "# scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
342
- "# df_scaled = scaler.fit(df_features).transform(df_features)"
343
- ]
344
- },
345
- {
346
- "cell_type": "code",
347
- "execution_count": null,
348
- "metadata": {
349
- "application/vnd.databricks.v1+cell": {
350
- "cellMetadata": {
351
- "byteLimit": 2048000,
352
- "rowLimit": 10000
353
- },
354
- "inputWidgets": {},
355
- "nuid": "9bedd689-bb2c-4beb-9f7b-1756ba0c99c5",
356
- "showTitle": true,
357
- "tableResultSettingsMap": {},
358
- "title": "clustering"
359
- }
360
- },
361
- "outputs": [],
362
- "source": [
363
- "# Clustering on all the scaled features\n",
364
- "kmeans = KMeans(featuresCol=\"features\", predictionCol=\"cluster\", k=3, seed=42)\n",
365
- "model = kmeans.fit(df_scaled)\n",
366
- "\n",
367
- "# Assign clusters\n",
368
- "df_clustered = model.transform(df_scaled)"
369
- ]
370
- },
371
- {
372
- "cell_type": "code",
373
- "execution_count": null,
374
- "metadata": {
375
- "application/vnd.databricks.v1+cell": {
376
- "cellMetadata": {
377
- "byteLimit": 2048000,
378
- "rowLimit": 10000
379
- },
380
- "inputWidgets": {},
381
- "nuid": "4f47c4f6-2143-41a7-b921-55cc3405be3a",
382
- "showTitle": false,
383
- "tableResultSettingsMap": {},
384
- "title": ""
385
- }
386
- },
387
- "outputs": [],
388
- "source": [
389
- "display(df_clustered)"
390
- ]
391
- },
392
- {
393
- "cell_type": "code",
394
- "execution_count": null,
395
- "metadata": {
396
- "application/vnd.databricks.v1+cell": {
397
- "cellMetadata": {
398
- "byteLimit": 2048000,
399
- "rowLimit": 10000
400
- },
401
- "inputWidgets": {},
402
- "nuid": "5cce9fe9-0c60-4c5d-971e-1460e813a0fc",
403
- "showTitle": false,
404
- "tableResultSettingsMap": {},
405
- "title": ""
406
- }
407
- },
408
- "outputs": [],
409
- "source": [
410
- "# df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')\n",
411
- "df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
412
- ]
413
- },
414
- {
415
- "cell_type": "markdown",
416
- "metadata": {
417
- "application/vnd.databricks.v1+cell": {
418
- "cellMetadata": {},
419
- "inputWidgets": {},
420
- "nuid": "86298a0c-3526-4749-84d8-33c4119da0d8",
421
- "showTitle": false,
422
- "tableResultSettingsMap": {},
423
- "title": ""
424
- }
425
- },
426
- "source": [
427
- "## Cluster Summary - With Composite Feature"
428
- ]
429
- },
430
- {
431
- "cell_type": "code",
432
- "execution_count": null,
433
- "metadata": {
434
- "application/vnd.databricks.v1+cell": {
435
- "cellMetadata": {
436
- "byteLimit": 2048000,
437
- "rowLimit": 10000
438
- },
439
- "inputWidgets": {},
440
- "nuid": "702ac066-0d69-47e9-9411-f080d3a541ea",
441
- "showTitle": false,
442
- "tableResultSettingsMap": {},
443
- "title": ""
444
- }
445
- },
446
- "outputs": [],
447
- "source": [
448
- "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')"
449
- ]
450
- },
451
- {
452
- "cell_type": "markdown",
453
- "metadata": {
454
- "application/vnd.databricks.v1+cell": {
455
- "cellMetadata": {},
456
- "inputWidgets": {},
457
- "nuid": "3926601d-c18f-4fed-a87e-06b762d61c6e",
458
- "showTitle": false,
459
- "tableResultSettingsMap": {},
460
- "title": ""
461
- }
462
- },
463
- "source": [
464
- "Cluster 0 = local/regional<br>\n",
465
- "Cluster 1 = international<br>\n",
466
- "Cluster 2 = national"
467
- ]
468
- },
469
- {
470
- "cell_type": "code",
471
- "execution_count": null,
472
- "metadata": {
473
- "application/vnd.databricks.v1+cell": {
474
- "cellMetadata": {
475
- "byteLimit": 2048000,
476
- "rowLimit": 10000
477
- },
478
- "inputWidgets": {},
479
- "nuid": "5ae9099f-f596-4565-954a-40035f6eb880",
480
- "showTitle": true,
481
- "tableResultSettingsMap": {},
482
- "title": "summarize clusters by original features"
483
- }
484
- },
485
- "outputs": [],
486
- "source": [
487
- "summary = (\n",
488
- " df_clustered\n",
489
- " .groupBy(\"cluster\")\n",
490
- " .agg(\n",
491
- " F.count(\"*\").alias(\"count\"),\n",
492
- " F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
493
- " F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
494
- " F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
495
- " F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
496
- " F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
497
- " F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
498
- " F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
499
- " F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
500
- " F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
501
- " F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
502
- " F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
503
- " F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
504
- " )\n",
505
- " .orderBy(\"cluster\")\n",
506
- ")\n",
507
- "\n",
508
- "display(summary)"
509
- ]
510
- },
511
- {
512
- "cell_type": "code",
513
- "execution_count": null,
514
- "metadata": {
515
- "application/vnd.databricks.v1+cell": {
516
- "cellMetadata": {
517
- "byteLimit": 2048000,
518
- "rowLimit": 10000
519
- },
520
- "inputWidgets": {},
521
- "nuid": "def2f16f-982e-448e-9aa8-782aa01c2193",
522
- "showTitle": true,
523
- "tableResultSettingsMap": {},
524
- "title": "create distribution plots for each cluster (feature: foreign percentage)"
525
- }
526
- },
527
- "outputs": [],
528
- "source": [
529
- "pdf_clustered = df_clustered.toPandas()\n",
530
- "\n",
531
- "fig_foreign = px.box(\n",
532
- " pdf_clustered,\n",
533
- " x=\"cluster\",\n",
534
- " y=\"foreign_percentage\",\n",
535
- " title=\"Foreign Percentage by Cluster\",\n",
536
- " labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
537
- ")\n",
538
- "fig_foreign.show()"
539
- ]
540
- },
541
- {
542
- "cell_type": "code",
543
- "execution_count": null,
544
- "metadata": {
545
- "application/vnd.databricks.v1+cell": {
546
- "cellMetadata": {
547
- "byteLimit": 2048000,
548
- "rowLimit": 10000
549
- },
550
- "inputWidgets": {},
551
- "nuid": "01020c54-f610-42e2-b43c-f2643f98a576",
552
- "showTitle": true,
553
- "tableResultSettingsMap": {},
554
- "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
555
- }
556
- },
557
- "outputs": [],
558
- "source": [
559
- "fig_max_recipient = px.box(\n",
560
- " pdf_clustered,\n",
561
- " x=\"cluster\",\n",
562
- " y=\"max_recipient_state_percentage\",\n",
563
- " title=\"Max Recipient State Percentage by Cluster\",\n",
564
- " labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
565
- ")\n",
566
- "fig_max_recipient.show()"
567
- ]
568
- },
569
- {
570
- "cell_type": "code",
571
- "execution_count": null,
572
- "metadata": {
573
- "application/vnd.databricks.v1+cell": {
574
- "cellMetadata": {
575
- "byteLimit": 2048000,
576
- "rowLimit": 10000
577
- },
578
- "inputWidgets": {},
579
- "nuid": "170d71cf-cb3f-44d5-bf90-cc9746a3c1d3",
580
- "showTitle": true,
581
- "tableResultSettingsMap": {},
582
- "title": "create distribution plots for each cluster (feature: number of states)"
583
- }
584
- },
585
- "outputs": [],
586
- "source": [
587
- "fig_total_states = px.box(\n",
588
- " pdf_clustered,\n",
589
- " x=\"cluster\",\n",
590
- " y=\"total_recipient_states\",\n",
591
- " title=\"Total Recipient States by Cluster\",\n",
592
- " labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
593
- ")\n",
594
- "fig_total_states.show()"
595
- ]
596
- },
597
- {
598
- "cell_type": "markdown",
599
- "metadata": {
600
- "application/vnd.databricks.v1+cell": {
601
- "cellMetadata": {},
602
- "inputWidgets": {},
603
- "nuid": "12dc14fa-0066-4fe5-8a99-d9c6d05860aa",
604
- "showTitle": false,
605
- "tableResultSettingsMap": {},
606
- "title": ""
607
- }
608
- },
609
- "source": [
610
- "## Cluster Summary - Without Composite Feature"
611
- ]
612
- },
613
- {
614
- "cell_type": "code",
615
- "execution_count": null,
616
- "metadata": {
617
- "application/vnd.databricks.v1+cell": {
618
- "cellMetadata": {
619
- "byteLimit": 2048000,
620
- "rowLimit": 10000
621
- },
622
- "inputWidgets": {},
623
- "nuid": "1545cd83-3e0a-43f9-9719-14d0f12f5dcb",
624
- "showTitle": false,
625
- "tableResultSettingsMap": {},
626
- "title": ""
627
- }
628
- },
629
- "outputs": [],
630
- "source": [
631
- "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
632
- ]
633
- },
634
- {
635
- "cell_type": "code",
636
- "execution_count": null,
637
- "metadata": {
638
- "application/vnd.databricks.v1+cell": {
639
- "cellMetadata": {
640
- "byteLimit": 2048000,
641
- "rowLimit": 10000
642
- },
643
- "inputWidgets": {},
644
- "nuid": "f4a8fe26-50d1-4d55-bd18-0313a1d55136",
645
- "showTitle": false,
646
- "tableResultSettingsMap": {},
647
- "title": ""
648
- }
649
- },
650
- "outputs": [],
651
- "source": [
652
- "display(df_clustered)"
653
- ]
654
- },
655
- {
656
- "cell_type": "code",
657
- "execution_count": null,
658
- "metadata": {
659
- "application/vnd.databricks.v1+cell": {
660
- "cellMetadata": {
661
- "byteLimit": 2048000,
662
- "rowLimit": 10000
663
- },
664
- "inputWidgets": {},
665
- "nuid": "6b2a98fa-d5ee-4c8f-8aad-208a83b2d145",
666
- "showTitle": false,
667
- "tableResultSettingsMap": {},
668
- "title": ""
669
- }
670
- },
671
- "outputs": [],
672
- "source": [
673
- "from pyspark.sql import functions as F\n",
674
- "\n",
675
- "df_2023 = (\n",
676
- " df_clustered\n",
677
- " .filter(F.col(\"TAXYEAR\") == 2023)\n",
678
- " .withColumn(\"locality\", F.when(F.col(\"cluster\")==0, \"local/regional\").otherwise(F.when(F.col(\"cluster\")==1, \"international\").otherwise(F.when(F.col(\"cluster\")==2, \"national\").otherwise(None))))\n",
679
- " .select(\n",
680
- " \"FILEREIN\",\n",
681
- " \"TAXYEAR\",\n",
682
- " \"FILERUSSTATE\",\n",
683
- " F.col(\"total_grant_value\").alias(\"value_of_grants\"),\n",
684
- " F.col(\"total_grant_count\").alias(\"number_of_grants\"),\n",
685
- " F.col(\"total_recipient_states\").alias(\"number_of_recipient_states\"),\n",
686
- " F.col(\"foreign_percentage\").alias(\"pct_grant_value_foreign\"),\n",
687
- " F.col(\"max_recipient_state_percentage\").alias(\"pct_grant_value_top_state\"),\n",
688
- " F.col(\"top_recipient_state\").alias(\"top_state\"),\n",
689
- " F.col(\"distinct_recipient_states\").alias(\"recipient_states\"),\n",
690
- " \"locality\",\n",
691
- " \"source\",\n",
692
- " )\n",
693
- ")\n",
694
- "display(df_2023)"
695
- ]
696
- },
697
- {
698
- "cell_type": "code",
699
- "execution_count": null,
700
- "metadata": {
701
- "application/vnd.databricks.v1+cell": {
702
- "cellMetadata": {
703
- "byteLimit": 2048000,
704
- "rowLimit": 10000
705
- },
706
- "inputWidgets": {},
707
- "nuid": "2483cf1c-4bed-47f0-91c0-0364e0f0d5da",
708
- "showTitle": false,
709
- "tableResultSettingsMap": {},
710
- "title": ""
711
- }
712
- },
713
- "outputs": [],
714
- "source": [
715
- "df_2023.write.mode(\"overwrite\").saveAsTable(\"sandbox_edward.nonprofit_mapping.locality_by_granting_activity_segmentation_funding_orgs_taxyear2023\")"
716
- ]
717
- },
718
- {
719
- "cell_type": "markdown",
720
- "metadata": {
721
- "application/vnd.databricks.v1+cell": {
722
- "cellMetadata": {},
723
- "inputWidgets": {},
724
- "nuid": "0781ad05-93a0-46fe-9357-48fea2039b81",
725
- "showTitle": false,
726
- "tableResultSettingsMap": {},
727
- "title": ""
728
- }
729
- },
730
- "source": [
731
- "Cluster 0 = local/regional<br>\n",
732
- "Cluster 1 = international<br>\n",
733
- "Cluster 2 = national"
734
- ]
735
- },
736
- {
737
- "cell_type": "code",
738
- "execution_count": null,
739
- "metadata": {
740
- "application/vnd.databricks.v1+cell": {
741
- "cellMetadata": {
742
- "byteLimit": 2048000,
743
- "rowLimit": 10000
744
- },
745
- "inputWidgets": {},
746
- "nuid": "840cdfdb-dbad-4264-b0fc-85bb060ac2aa",
747
- "showTitle": true,
748
- "tableResultSettingsMap": {},
749
- "title": "summarize clusters by original features"
750
- }
751
- },
752
- "outputs": [],
753
- "source": [
754
- "summary = (\n",
755
- " df_clustered\n",
756
- " .groupBy(\"cluster\")\n",
757
- " .agg(\n",
758
- " F.count(\"*\").alias(\"count\"),\n",
759
- " F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
760
- " F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
761
- " F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
762
- " F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
763
- " F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
764
- " F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
765
- " F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
766
- " F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
767
- " F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
768
- " F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
769
- " F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
770
- " F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
771
- " )\n",
772
- " .orderBy(\"cluster\")\n",
773
- ")\n",
774
- "\n",
775
- "display(summary)"
776
- ]
777
- },
778
- {
779
- "cell_type": "code",
780
- "execution_count": null,
781
- "metadata": {
782
- "application/vnd.databricks.v1+cell": {
783
- "cellMetadata": {
784
- "byteLimit": 2048000,
785
- "rowLimit": 10000
786
- },
787
- "inputWidgets": {},
788
- "nuid": "3e9a4fb8-11e3-4734-8d8e-943e03e3b738",
789
- "showTitle": true,
790
- "tableResultSettingsMap": {},
791
- "title": "create distribution plots for each cluster (feature: foreign percentage)"
792
- }
793
- },
794
- "outputs": [],
795
- "source": [
796
- "pdf_clustered = df_clustered.toPandas()\n",
797
- "\n",
798
- "fig_foreign = px.box(\n",
799
- " pdf_clustered,\n",
800
- " x=\"cluster\",\n",
801
- " y=\"foreign_percentage\",\n",
802
- " title=\"Foreign Percentage by Cluster\",\n",
803
- " labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
804
- ")\n",
805
- "fig_foreign.show()"
806
- ]
807
- },
808
- {
809
- "cell_type": "code",
810
- "execution_count": null,
811
- "metadata": {
812
- "application/vnd.databricks.v1+cell": {
813
- "cellMetadata": {
814
- "byteLimit": 2048000,
815
- "rowLimit": 10000
816
- },
817
- "inputWidgets": {},
818
- "nuid": "42bc6e79-cc9f-4745-9751-03c9223a3642",
819
- "showTitle": true,
820
- "tableResultSettingsMap": {},
821
- "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
822
- }
823
- },
824
- "outputs": [],
825
- "source": [
826
- "fig_max_recipient = px.box(\n",
827
- " pdf_clustered,\n",
828
- " x=\"cluster\",\n",
829
- " y=\"max_recipient_state_percentage\",\n",
830
- " title=\"Max Recipient State Percentage by Cluster\",\n",
831
- " labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
832
- ")\n",
833
- "fig_max_recipient.show()"
834
- ]
835
- },
836
- {
837
- "cell_type": "code",
838
- "execution_count": null,
839
- "metadata": {
840
- "application/vnd.databricks.v1+cell": {
841
- "cellMetadata": {
842
- "byteLimit": 2048000,
843
- "rowLimit": 10000
844
- },
845
- "inputWidgets": {},
846
- "nuid": "b05a45f8-4d81-4ac2-9cfb-b73e18d2051f",
847
- "showTitle": true,
848
- "tableResultSettingsMap": {},
849
- "title": "create distribution plots for each cluster (feature: number of states)"
850
- }
851
- },
852
- "outputs": [],
853
- "source": [
854
- "fig_total_states = px.box(\n",
855
- " pdf_clustered,\n",
856
- " x=\"cluster\",\n",
857
- " y=\"total_recipient_states\",\n",
858
- " title=\"Total Recipient States by Cluster\",\n",
859
- " labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
860
- ")\n",
861
- "fig_total_states.show()"
862
- ]
863
- },
864
- {
865
- "cell_type": "code",
866
- "execution_count": null,
867
- "metadata": {
868
- "application/vnd.databricks.v1+cell": {
869
- "cellMetadata": {},
870
- "inputWidgets": {},
871
- "nuid": "3cf19964-c147-4cf4-b7b9-1f31a2e6a256",
872
- "showTitle": false,
873
- "tableResultSettingsMap": {},
874
- "title": ""
875
- }
876
- },
877
- "outputs": [],
878
- "source": []
879
- }
880
- ],
881
- "metadata": {
882
- "application/vnd.databricks.v1+notebook": {
883
- "computePreferences": {
884
- "hardware": {
885
- "accelerator": null,
886
- "gpuPoolId": null,
887
- "memory": null
888
- }
889
- },
890
- "dashboards": [],
891
- "environmentMetadata": {
892
- "base_environment": "",
893
- "environment_version": "2"
894
- },
895
- "inputWidgetPreferences": null,
896
- "language": "python",
897
- "notebookMetadata": {
898
- "pythonIndentUnit": 4
899
- },
900
- "notebookName": "(Clone) NP04_classification",
901
- "widgets": {}
902
- },
903
- "language_info": {
904
- "name": "python"
905
- }
906
- },
907
- "nbformat": 4,
908
- "nbformat_minor": 0
909
- }