English
hassaanulhaq01 committed on
Commit
18fdc8b
·
verified ·
1 Parent(s): 31d89f2

Add interactive schedule_o notebook from Databricks

Browse files
Files changed (1) hide show
  1. notebooks/NP04_classification.ipynb +909 -0
notebooks/NP04_classification.ipynb ADDED
@@ -0,0 +1,909 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "application/vnd.databricks.v1+cell": {
8
+ "cellMetadata": {
9
+ "byteLimit": 2048000,
10
+ "rowLimit": 10000
11
+ },
12
+ "inputWidgets": {},
13
+ "nuid": "62d4799f-4935-4a2d-8f0a-5f6383b22cf7",
14
+ "showTitle": false,
15
+ "tableResultSettingsMap": {},
16
+ "title": ""
17
+ }
18
+ },
19
+ "outputs": [],
20
+ "source": [
21
+ "df1 = spark.read.table(\"prod_curated.irs.990cn120fields\")\n",
22
+ "df2 = spark.read.table(\"prod_curated.irs.990standardfields\")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {
29
+ "application/vnd.databricks.v1+cell": {
30
+ "cellMetadata": {
31
+ "byteLimit": 2048000,
32
+ "rowLimit": 10000
33
+ },
34
+ "inputWidgets": {},
35
+ "nuid": "d5410f3d-7463-43f5-8bfb-528d36e80b42",
36
+ "showTitle": false,
37
+ "tableResultSettingsMap": {
38
+ "0": {
39
+ "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{\"column\":116},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1758734440525}",
40
+ "filterBlob": null,
41
+ "queryPlanFiltersBlob": null,
42
+ "tableResultIndex": 0
43
+ }
44
+ },
45
+ "title": ""
46
+ }
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "from pyspark.sql import SparkSession\n",
51
+ "import pandas as pd\n",
52
+ "\n",
53
+ "# Extract (col, dtype) as dicts\n",
54
+ "df1_schema = {f.name: f.dataType.simpleString() for f in df1.schema.fields}\n",
55
+ "df2_schema = {f.name: f.dataType.simpleString() for f in df2.schema.fields}\n",
56
+ "\n",
57
+ "# Union of all column names\n",
58
+ "all_cols = set(df1_schema.keys()).union(df2_schema.keys())\n",
59
+ "\n",
60
+ "# Build comparison rows\n",
61
+ "rows = []\n",
62
+ "for col in sorted(all_cols):\n",
63
+ " in_df1 = col in df1_schema\n",
64
+ " in_df2 = col in df2_schema\n",
65
+ " \n",
66
+ " if in_df1 and in_df2:\n",
67
+ " flag = \"both\"\n",
68
+ " elif in_df1:\n",
69
+ " flag = \"old\"\n",
70
+ " else:\n",
71
+ " flag = \"new\"\n",
72
+ " \n",
73
+ " rows.append({\n",
74
+ " \"column\": col,\n",
75
+ " \"in_df\": flag,\n",
76
+ " \"dtype_old\": df1_schema.get(col),\n",
77
+ " \"dtype_new\": df2_schema.get(col)\n",
78
+ " })\n",
79
+ "\n",
80
+ "# Convert to pandas for inspection\n",
81
+ "comparison_df = pd.DataFrame(rows)\n",
82
+ "\n",
83
+ "# If you prefer it as a Spark DataFrame:\n",
84
+ "spark = SparkSession.builder.getOrCreate()\n",
85
+ "spark_comparison_df = spark.createDataFrame(comparison_df)\n",
86
+ "\n",
87
+ "display(comparison_df)\n"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": null,
93
+ "metadata": {
94
+ "application/vnd.databricks.v1+cell": {
95
+ "cellMetadata": {
96
+ "byteLimit": 2048000,
97
+ "rowLimit": 10000
98
+ },
99
+ "inputWidgets": {},
100
+ "nuid": "2634e810-1046-456f-a00e-34db0ca198a2",
101
+ "showTitle": false,
102
+ "tableResultSettingsMap": {},
103
+ "title": ""
104
+ }
105
+ },
106
+ "outputs": [],
107
+ "source": [
108
+ "from pyspark.sql import functions as F\n",
109
+ "from pyspark.sql.window import Window\n",
110
+ "\n",
111
+ "from pyspark.ml.feature import VectorAssembler, StandardScaler\n",
112
+ "from pyspark.ml.clustering import KMeans\n",
113
+ "\n",
114
+ "import plotly.express as px"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {
121
+ "application/vnd.databricks.v1+cell": {
122
+ "cellMetadata": {
123
+ "byteLimit": 2048000,
124
+ "rowLimit": 10000
125
+ },
126
+ "inputWidgets": {},
127
+ "nuid": "cc50ff8a-e01c-417d-b926-fecac95265d0",
128
+ "showTitle": false,
129
+ "tableResultSettingsMap": {},
130
+ "title": ""
131
+ }
132
+ },
133
+ "outputs": [],
134
+ "source": [
135
+ "grants_per_state_990 = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990_filers')\n",
136
+ "grants_per_state_990pf = spark.read.table('sandbox_edward.nonprofit_mapping.grants_per_state_990pf_filers')"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {
143
+ "application/vnd.databricks.v1+cell": {
144
+ "cellMetadata": {
145
+ "byteLimit": 2048000,
146
+ "rowLimit": 10000
147
+ },
148
+ "inputWidgets": {},
149
+ "nuid": "79330be0-c72e-4670-b6bd-b95665af55c8",
150
+ "showTitle": true,
151
+ "tableResultSettingsMap": {},
152
+ "title": "check for EINs in both 990 and 990pf"
153
+ }
154
+ },
155
+ "outputs": [],
156
+ "source": [
157
+ "dual_filers = (\n",
158
+ " grants_per_state_990.select(\n",
159
+ " 'FILEREIN', \n",
160
+ " 'TAXYEAR'\n",
161
+ " )\n",
162
+ " .join(\n",
163
+ " grants_per_state_990pf.select('FILEREIN', 'TAXYEAR'), \n",
164
+ " on=['FILEREIN', 'TAXYEAR'],\n",
165
+ " how='inner'\n",
166
+ " )\n",
167
+ ")\n",
168
+ "\n",
169
+ "display(dual_filers)"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "metadata": {
176
+ "application/vnd.databricks.v1+cell": {
177
+ "cellMetadata": {
178
+ "byteLimit": 2048000,
179
+ "rowLimit": 10000
180
+ },
181
+ "inputWidgets": {},
182
+ "nuid": "3cd453d0-0bac-42d7-b9d3-51a30be32e6b",
183
+ "showTitle": false,
184
+ "tableResultSettingsMap": {},
185
+ "title": ""
186
+ }
187
+ },
188
+ "outputs": [],
189
+ "source": [
190
+ "display(grants_per_state_990.filter(F.col('FILEREIN')=='85-0462315'))"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "metadata": {
197
+ "application/vnd.databricks.v1+cell": {
198
+ "cellMetadata": {
199
+ "byteLimit": 2048000,
200
+ "rowLimit": 10000
201
+ },
202
+ "inputWidgets": {},
203
+ "nuid": "7232db59-693a-43d9-826b-9e6c2a271626",
204
+ "showTitle": false,
205
+ "tableResultSettingsMap": {},
206
+ "title": ""
207
+ }
208
+ },
209
+ "outputs": [],
210
+ "source": [
211
+ "display(grants_per_state_990pf.filter(F.col('FILEREIN')=='85-0462315'))"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {
218
+ "application/vnd.databricks.v1+cell": {
219
+ "cellMetadata": {
220
+ "byteLimit": 2048000,
221
+ "rowLimit": 10000
222
+ },
223
+ "inputWidgets": {},
224
+ "nuid": "64686496-b51f-4aad-ad34-f88e2b69cf61",
225
+ "showTitle": true,
226
+ "tableResultSettingsMap": {},
227
+ "title": "drop dual filers"
228
+ }
229
+ },
230
+ "outputs": [],
231
+ "source": [
232
+ "grants_per_state_990 = grants_per_state_990.join(\n",
233
+ " dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
234
+ " on=['FILEREIN', 'TAXYEAR'],\n",
235
+ " how='left_anti'\n",
236
+ ")\n",
237
+ "\n",
238
+ "grants_per_state_990pf = grants_per_state_990pf.join(\n",
239
+ " dual_filers.select(F.col('FILEREIN'), F.col('TAXYEAR')),\n",
240
+ " on=['FILEREIN', 'TAXYEAR'],\n",
241
+ " how='left_anti'\n",
242
+ ")"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": null,
248
+ "metadata": {
249
+ "application/vnd.databricks.v1+cell": {
250
+ "cellMetadata": {
251
+ "byteLimit": 2048000,
252
+ "rowLimit": 10000
253
+ },
254
+ "inputWidgets": {},
255
+ "nuid": "f0f23c24-a091-49d6-bc7b-64596c89ed0a",
256
+ "showTitle": true,
257
+ "tableResultSettingsMap": {},
258
+ "title": "combine 990 & 990pf orgs into one df"
259
+ }
260
+ },
261
+ "outputs": [],
262
+ "source": [
263
+ "grants_per_state = grants_per_state_990.union(grants_per_state_990pf).orderBy('FILEREIN', 'TAXYEAR')"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "metadata": {
270
+ "application/vnd.databricks.v1+cell": {
271
+ "cellMetadata": {
272
+ "byteLimit": 2048000,
273
+ "rowLimit": 10000
274
+ },
275
+ "inputWidgets": {},
276
+ "nuid": "ac11e27e-18a2-41e0-b8a2-2f6889990a02",
277
+ "showTitle": false,
278
+ "tableResultSettingsMap": {},
279
+ "title": ""
280
+ }
281
+ },
282
+ "outputs": [],
283
+ "source": [
284
+ "display(grants_per_state)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "markdown",
289
+ "metadata": {
290
+ "application/vnd.databricks.v1+cell": {
291
+ "cellMetadata": {},
292
+ "inputWidgets": {},
293
+ "nuid": "6b8aee01-6623-4556-a717-9f58d8af4b6e",
294
+ "showTitle": false,
295
+ "tableResultSettingsMap": {},
296
+ "title": ""
297
+ }
298
+ },
299
+ "source": [
300
+ "## KMeans Clustering"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": null,
306
+ "metadata": {
307
+ "application/vnd.databricks.v1+cell": {
308
+ "cellMetadata": {
309
+ "byteLimit": 2048000,
310
+ "rowLimit": 10000
311
+ },
312
+ "inputWidgets": {},
313
+ "nuid": "013375d3-74b9-48a2-9db0-fe70b09c47f5",
314
+ "showTitle": true,
315
+ "tableResultSettingsMap": {},
316
+ "title": "feature engineering"
317
+ }
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "# Normalize/scale features\n",
322
+ "feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\"]\n",
323
+ "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
324
+ "df_features = assembler.transform(grants_per_state)\n",
325
+ "\n",
326
+ "scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
327
+ "df_scaled = scaler.fit(df_features).transform(df_features)\n",
328
+ "\n",
329
+ "# Create a composite score - optional, may not add value\n",
330
+ "# max_states = grants_per_state.select(F.max('total_recipient_states')).collect()[0][0]\n",
331
+ "# grants_per_state = grants_per_state.withColumn(\n",
332
+ "# \"composite_score\",\n",
333
+ "# 0.5 * (1 - F.col(\"max_recipient_state_percentage\")/100) + \n",
334
+ "# 0.3 * (F.col(\"total_recipient_states\")/max_states) + \n",
335
+ "# 0.2 * (F.col(\"foreign_percentage\")/100)\n",
336
+ "# )\n",
337
+ "# feature_cols = [\"foreign_percentage\", \"max_recipient_state_percentage\", \"total_recipient_states\", \"composite_score\"]\n",
338
+ "# assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features_unscaled\")\n",
339
+ "# df_features = assembler.transform(grants_per_state)\n",
340
+ "\n",
341
+ "# scaler = StandardScaler(inputCol=\"features_unscaled\", outputCol=\"features\", withStd=True, withMean=True)\n",
342
+ "# df_scaled = scaler.fit(df_features).transform(df_features)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": null,
348
+ "metadata": {
349
+ "application/vnd.databricks.v1+cell": {
350
+ "cellMetadata": {
351
+ "byteLimit": 2048000,
352
+ "rowLimit": 10000
353
+ },
354
+ "inputWidgets": {},
355
+ "nuid": "9bedd689-bb2c-4beb-9f7b-1756ba0c99c5",
356
+ "showTitle": true,
357
+ "tableResultSettingsMap": {},
358
+ "title": "clustering"
359
+ }
360
+ },
361
+ "outputs": [],
362
+ "source": [
363
+ "# Clustering on all the scaled features\n",
364
+ "kmeans = KMeans(featuresCol=\"features\", predictionCol=\"cluster\", k=3, seed=42)\n",
365
+ "model = kmeans.fit(df_scaled)\n",
366
+ "\n",
367
+ "# Assign clusters\n",
368
+ "df_clustered = model.transform(df_scaled)"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": null,
374
+ "metadata": {
375
+ "application/vnd.databricks.v1+cell": {
376
+ "cellMetadata": {
377
+ "byteLimit": 2048000,
378
+ "rowLimit": 10000
379
+ },
380
+ "inputWidgets": {},
381
+ "nuid": "4f47c4f6-2143-41a7-b921-55cc3405be3a",
382
+ "showTitle": false,
383
+ "tableResultSettingsMap": {},
384
+ "title": ""
385
+ }
386
+ },
387
+ "outputs": [],
388
+ "source": [
389
+ "display(df_clustered)"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": null,
395
+ "metadata": {
396
+ "application/vnd.databricks.v1+cell": {
397
+ "cellMetadata": {
398
+ "byteLimit": 2048000,
399
+ "rowLimit": 10000
400
+ },
401
+ "inputWidgets": {},
402
+ "nuid": "5cce9fe9-0c60-4c5d-971e-1460e813a0fc",
403
+ "showTitle": false,
404
+ "tableResultSettingsMap": {},
405
+ "title": ""
406
+ }
407
+ },
408
+ "outputs": [],
409
+ "source": [
410
+ "# df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')\n",
411
+ "df_clustered.write.mode('overwrite').saveAsTable('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "markdown",
416
+ "metadata": {
417
+ "application/vnd.databricks.v1+cell": {
418
+ "cellMetadata": {},
419
+ "inputWidgets": {},
420
+ "nuid": "86298a0c-3526-4749-84d8-33c4119da0d8",
421
+ "showTitle": false,
422
+ "tableResultSettingsMap": {},
423
+ "title": ""
424
+ }
425
+ },
426
+ "source": [
427
+ "## Cluster Summary - With Composite Feature"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": null,
433
+ "metadata": {
434
+ "application/vnd.databricks.v1+cell": {
435
+ "cellMetadata": {
436
+ "byteLimit": 2048000,
437
+ "rowLimit": 10000
438
+ },
439
+ "inputWidgets": {},
440
+ "nuid": "702ac066-0d69-47e9-9411-f080d3a541ea",
441
+ "showTitle": false,
442
+ "tableResultSettingsMap": {},
443
+ "title": ""
444
+ }
445
+ },
446
+ "outputs": [],
447
+ "source": [
448
+ "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_with_composite_feature')"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "markdown",
453
+ "metadata": {
454
+ "application/vnd.databricks.v1+cell": {
455
+ "cellMetadata": {},
456
+ "inputWidgets": {},
457
+ "nuid": "3926601d-c18f-4fed-a87e-06b762d61c6e",
458
+ "showTitle": false,
459
+ "tableResultSettingsMap": {},
460
+ "title": ""
461
+ }
462
+ },
463
+ "source": [
464
+ "Cluster 0 = local/regional<br>\n",
465
+ "Cluster 1 = international<br>\n",
466
+ "Cluster 2 = national"
467
+ ]
468
+ },
469
+ {
470
+ "cell_type": "code",
471
+ "execution_count": null,
472
+ "metadata": {
473
+ "application/vnd.databricks.v1+cell": {
474
+ "cellMetadata": {
475
+ "byteLimit": 2048000,
476
+ "rowLimit": 10000
477
+ },
478
+ "inputWidgets": {},
479
+ "nuid": "5ae9099f-f596-4565-954a-40035f6eb880",
480
+ "showTitle": true,
481
+ "tableResultSettingsMap": {},
482
+ "title": "summarize clusters by original features"
483
+ }
484
+ },
485
+ "outputs": [],
486
+ "source": [
487
+ "summary = (\n",
488
+ " df_clustered\n",
489
+ " .groupBy(\"cluster\")\n",
490
+ " .agg(\n",
491
+ " F.count(\"*\").alias(\"count\"),\n",
492
+ " F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
493
+ " F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
494
+ " F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
495
+ " F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
496
+ " F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
497
+ " F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
498
+ " F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
499
+ " F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
500
+ " F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
501
+ " F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
502
+ " F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
503
+ " F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
504
+ " )\n",
505
+ " .orderBy(\"cluster\")\n",
506
+ ")\n",
507
+ "\n",
508
+ "display(summary)"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": null,
514
+ "metadata": {
515
+ "application/vnd.databricks.v1+cell": {
516
+ "cellMetadata": {
517
+ "byteLimit": 2048000,
518
+ "rowLimit": 10000
519
+ },
520
+ "inputWidgets": {},
521
+ "nuid": "def2f16f-982e-448e-9aa8-782aa01c2193",
522
+ "showTitle": true,
523
+ "tableResultSettingsMap": {},
524
+ "title": "create distribution plots for each cluster (feature: foreign percentage)"
525
+ }
526
+ },
527
+ "outputs": [],
528
+ "source": [
529
+ "pdf_clustered = df_clustered.toPandas()\n",
530
+ "\n",
531
+ "fig_foreign = px.box(\n",
532
+ " pdf_clustered,\n",
533
+ " x=\"cluster\",\n",
534
+ " y=\"foreign_percentage\",\n",
535
+ " title=\"Foreign Percentage by Cluster\",\n",
536
+ " labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
537
+ ")\n",
538
+ "fig_foreign.show()"
539
+ ]
540
+ },
541
+ {
542
+ "cell_type": "code",
543
+ "execution_count": null,
544
+ "metadata": {
545
+ "application/vnd.databricks.v1+cell": {
546
+ "cellMetadata": {
547
+ "byteLimit": 2048000,
548
+ "rowLimit": 10000
549
+ },
550
+ "inputWidgets": {},
551
+ "nuid": "01020c54-f610-42e2-b43c-f2643f98a576",
552
+ "showTitle": true,
553
+ "tableResultSettingsMap": {},
554
+ "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
555
+ }
556
+ },
557
+ "outputs": [],
558
+ "source": [
559
+ "fig_max_recipient = px.box(\n",
560
+ " pdf_clustered,\n",
561
+ " x=\"cluster\",\n",
562
+ " y=\"max_recipient_state_percentage\",\n",
563
+ " title=\"Max Recipient State Percentage by Cluster\",\n",
564
+ " labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
565
+ ")\n",
566
+ "fig_max_recipient.show()"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": null,
572
+ "metadata": {
573
+ "application/vnd.databricks.v1+cell": {
574
+ "cellMetadata": {
575
+ "byteLimit": 2048000,
576
+ "rowLimit": 10000
577
+ },
578
+ "inputWidgets": {},
579
+ "nuid": "170d71cf-cb3f-44d5-bf90-cc9746a3c1d3",
580
+ "showTitle": true,
581
+ "tableResultSettingsMap": {},
582
+ "title": "create distribution plots for each cluster (feature: number of states)"
583
+ }
584
+ },
585
+ "outputs": [],
586
+ "source": [
587
+ "fig_total_states = px.box(\n",
588
+ " pdf_clustered,\n",
589
+ " x=\"cluster\",\n",
590
+ " y=\"total_recipient_states\",\n",
591
+ " title=\"Total Recipient States by Cluster\",\n",
592
+ " labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
593
+ ")\n",
594
+ "fig_total_states.show()"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "markdown",
599
+ "metadata": {
600
+ "application/vnd.databricks.v1+cell": {
601
+ "cellMetadata": {},
602
+ "inputWidgets": {},
603
+ "nuid": "12dc14fa-0066-4fe5-8a99-d9c6d05860aa",
604
+ "showTitle": false,
605
+ "tableResultSettingsMap": {},
606
+ "title": ""
607
+ }
608
+ },
609
+ "source": [
610
+ "## Cluster Summary - Without Composite Feature"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": null,
616
+ "metadata": {
617
+ "application/vnd.databricks.v1+cell": {
618
+ "cellMetadata": {
619
+ "byteLimit": 2048000,
620
+ "rowLimit": 10000
621
+ },
622
+ "inputWidgets": {},
623
+ "nuid": "1545cd83-3e0a-43f9-9719-14d0f12f5dcb",
624
+ "showTitle": false,
625
+ "tableResultSettingsMap": {},
626
+ "title": ""
627
+ }
628
+ },
629
+ "outputs": [],
630
+ "source": [
631
+ "df_clustered = spark.read.table('sandbox_edward.nonprofit_mapping.funding_orgs_local_vs_national_kmeans_without_composite_feature')"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": null,
637
+ "metadata": {
638
+ "application/vnd.databricks.v1+cell": {
639
+ "cellMetadata": {
640
+ "byteLimit": 2048000,
641
+ "rowLimit": 10000
642
+ },
643
+ "inputWidgets": {},
644
+ "nuid": "f4a8fe26-50d1-4d55-bd18-0313a1d55136",
645
+ "showTitle": false,
646
+ "tableResultSettingsMap": {},
647
+ "title": ""
648
+ }
649
+ },
650
+ "outputs": [],
651
+ "source": [
652
+ "display(df_clustered)"
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "code",
657
+ "execution_count": null,
658
+ "metadata": {
659
+ "application/vnd.databricks.v1+cell": {
660
+ "cellMetadata": {
661
+ "byteLimit": 2048000,
662
+ "rowLimit": 10000
663
+ },
664
+ "inputWidgets": {},
665
+ "nuid": "6b2a98fa-d5ee-4c8f-8aad-208a83b2d145",
666
+ "showTitle": false,
667
+ "tableResultSettingsMap": {},
668
+ "title": ""
669
+ }
670
+ },
671
+ "outputs": [],
672
+ "source": [
673
+ "from pyspark.sql import functions as F\n",
674
+ "\n",
675
+ "df_2023 = (\n",
676
+ " df_clustered\n",
677
+ " .filter(F.col(\"TAXYEAR\") == 2023)\n",
678
+ " .withColumn(\"locality\", F.when(F.col(\"cluster\")==0, \"local/regional\").otherwise(F.when(F.col(\"cluster\")==1, \"international\").otherwise(F.when(F.col(\"cluster\")==2, \"national\").otherwise(None))))\n",
679
+ " .select(\n",
680
+ " \"FILEREIN\",\n",
681
+ " \"TAXYEAR\",\n",
682
+ " \"FILERUSSTATE\",\n",
683
+ " F.col(\"total_grant_value\").alias(\"value_of_grants\"),\n",
684
+ " F.col(\"total_grant_count\").alias(\"number_of_grants\"),\n",
685
+ " F.col(\"total_recipient_states\").alias(\"number_of_recipient_states\"),\n",
686
+ " F.col(\"foreign_percentage\").alias(\"pct_grant_value_foreign\"),\n",
687
+ " F.col(\"max_recipient_state_percentage\").alias(\"pct_grant_value_top_state\"),\n",
688
+ " F.col(\"top_recipient_state\").alias(\"top_state\"),\n",
689
+ " F.col(\"distinct_recipient_states\").alias(\"recipient_states\"),\n",
690
+ " \"locality\",\n",
691
+ " \"source\",\n",
692
+ " )\n",
693
+ ")\n",
694
+ "display(df_2023)"
695
+ ]
696
+ },
697
+ {
698
+ "cell_type": "code",
699
+ "execution_count": null,
700
+ "metadata": {
701
+ "application/vnd.databricks.v1+cell": {
702
+ "cellMetadata": {
703
+ "byteLimit": 2048000,
704
+ "rowLimit": 10000
705
+ },
706
+ "inputWidgets": {},
707
+ "nuid": "2483cf1c-4bed-47f0-91c0-0364e0f0d5da",
708
+ "showTitle": false,
709
+ "tableResultSettingsMap": {},
710
+ "title": ""
711
+ }
712
+ },
713
+ "outputs": [],
714
+ "source": [
715
+ "df_2023.write.mode(\"overwrite\").saveAsTable(\"sandbox_edward.nonprofit_mapping.locality_by_granting_activity_segmentation_funding_orgs_taxyear2023\")"
716
+ ]
717
+ },
718
+ {
719
+ "cell_type": "markdown",
720
+ "metadata": {
721
+ "application/vnd.databricks.v1+cell": {
722
+ "cellMetadata": {},
723
+ "inputWidgets": {},
724
+ "nuid": "0781ad05-93a0-46fe-9357-48fea2039b81",
725
+ "showTitle": false,
726
+ "tableResultSettingsMap": {},
727
+ "title": ""
728
+ }
729
+ },
730
+ "source": [
731
+ "Cluster 0 = local/regional<br>\n",
732
+ "Cluster 1 = international<br>\n",
733
+ "Cluster 2 = national"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": null,
739
+ "metadata": {
740
+ "application/vnd.databricks.v1+cell": {
741
+ "cellMetadata": {
742
+ "byteLimit": 2048000,
743
+ "rowLimit": 10000
744
+ },
745
+ "inputWidgets": {},
746
+ "nuid": "840cdfdb-dbad-4264-b0fc-85bb060ac2aa",
747
+ "showTitle": true,
748
+ "tableResultSettingsMap": {},
749
+ "title": "summarize clusters by original features"
750
+ }
751
+ },
752
+ "outputs": [],
753
+ "source": [
754
+ "summary = (\n",
755
+ " df_clustered\n",
756
+ " .groupBy(\"cluster\")\n",
757
+ " .agg(\n",
758
+ " F.count(\"*\").alias(\"count\"),\n",
759
+ " F.avg(\"foreign_percentage\").alias(\"avg_foreign_percentage\"),\n",
760
+ " F.median(\"foreign_percentage\").alias(\"median_foreign_percentage\"),\n",
761
+ " F.min(\"foreign_percentage\").alias(\"min_foreign_percentage\"),\n",
762
+ " F.max(\"foreign_percentage\").alias(\"max_foreign_percentage\"),\n",
763
+ " F.avg(\"max_recipient_state_percentage\").alias(\"avg_max_state_pct\"),\n",
764
+ " F.median(\"max_recipient_state_percentage\").alias(\"median_max_state_pct\"),\n",
765
+ " F.min(\"max_recipient_state_percentage\").alias(\"min_max_state_pct\"),\n",
766
+ " F.max(\"max_recipient_state_percentage\").alias(\"max_max_state_pct\"),\n",
767
+ " F.avg(\"total_recipient_states\").alias(\"avg_distinct_states\"),\n",
768
+ " F.median(\"total_recipient_states\").alias(\"median_distinct_states\"),\n",
769
+ " F.min(\"total_recipient_states\").alias(\"min_distinct_states\"),\n",
770
+ " F.max(\"total_recipient_states\").alias(\"max_distinct_states\"),\n",
771
+ " )\n",
772
+ " .orderBy(\"cluster\")\n",
773
+ ")\n",
774
+ "\n",
775
+ "display(summary)"
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "code",
780
+ "execution_count": null,
781
+ "metadata": {
782
+ "application/vnd.databricks.v1+cell": {
783
+ "cellMetadata": {
784
+ "byteLimit": 2048000,
785
+ "rowLimit": 10000
786
+ },
787
+ "inputWidgets": {},
788
+ "nuid": "3e9a4fb8-11e3-4734-8d8e-943e03e3b738",
789
+ "showTitle": true,
790
+ "tableResultSettingsMap": {},
791
+ "title": "create distribution plots for each cluster (feature: foreign percentage)"
792
+ }
793
+ },
794
+ "outputs": [],
795
+ "source": [
796
+ "pdf_clustered = df_clustered.toPandas()\n",
797
+ "\n",
798
+ "fig_foreign = px.box(\n",
799
+ " pdf_clustered,\n",
800
+ " x=\"cluster\",\n",
801
+ " y=\"foreign_percentage\",\n",
802
+ " title=\"Foreign Percentage by Cluster\",\n",
803
+ " labels={\"foreign_percentage\": \"Foreign Percentage\", \"cluster\": \"Cluster\"}\n",
804
+ ")\n",
805
+ "fig_foreign.show()"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": null,
811
+ "metadata": {
812
+ "application/vnd.databricks.v1+cell": {
813
+ "cellMetadata": {
814
+ "byteLimit": 2048000,
815
+ "rowLimit": 10000
816
+ },
817
+ "inputWidgets": {},
818
+ "nuid": "42bc6e79-cc9f-4745-9751-03c9223a3642",
819
+ "showTitle": true,
820
+ "tableResultSettingsMap": {},
821
+ "title": "create distribution plots for each cluster (feature: max recipient state percentage)"
822
+ }
823
+ },
824
+ "outputs": [],
825
+ "source": [
826
+ "fig_max_recipient = px.box(\n",
827
+ " pdf_clustered,\n",
828
+ " x=\"cluster\",\n",
829
+ " y=\"max_recipient_state_percentage\",\n",
830
+ " title=\"Max Recipient State Percentage by Cluster\",\n",
831
+ " labels={\"max_recipient_state_percentage\": \"Max Recipient State Percentage\", \"cluster\": \"Cluster\"}\n",
832
+ ")\n",
833
+ "fig_max_recipient.show()"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": null,
839
+ "metadata": {
840
+ "application/vnd.databricks.v1+cell": {
841
+ "cellMetadata": {
842
+ "byteLimit": 2048000,
843
+ "rowLimit": 10000
844
+ },
845
+ "inputWidgets": {},
846
+ "nuid": "b05a45f8-4d81-4ac2-9cfb-b73e18d2051f",
847
+ "showTitle": true,
848
+ "tableResultSettingsMap": {},
849
+ "title": "create distribution plots for each cluster (feature: number of states)"
850
+ }
851
+ },
852
+ "outputs": [],
853
+ "source": [
854
+ "fig_total_states = px.box(\n",
855
+ " pdf_clustered,\n",
856
+ " x=\"cluster\",\n",
857
+ " y=\"total_recipient_states\",\n",
858
+ " title=\"Total Recipient States by Cluster\",\n",
859
+ " labels={\"total_recipient_states\": \"Total Recipient States\", \"cluster\": \"Cluster\"}\n",
860
+ ")\n",
861
+ "fig_total_states.show()"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "code",
866
+ "execution_count": null,
867
+ "metadata": {
868
+ "application/vnd.databricks.v1+cell": {
869
+ "cellMetadata": {},
870
+ "inputWidgets": {},
871
+ "nuid": "3cf19964-c147-4cf4-b7b9-1f31a2e6a256",
872
+ "showTitle": false,
873
+ "tableResultSettingsMap": {},
874
+ "title": ""
875
+ }
876
+ },
877
+ "outputs": [],
878
+ "source": []
879
+ }
880
+ ],
881
+ "metadata": {
882
+ "application/vnd.databricks.v1+notebook": {
883
+ "computePreferences": {
884
+ "hardware": {
885
+ "accelerator": null,
886
+ "gpuPoolId": null,
887
+ "memory": null
888
+ }
889
+ },
890
+ "dashboards": [],
891
+ "environmentMetadata": {
892
+ "base_environment": "",
893
+ "environment_version": "2"
894
+ },
895
+ "inputWidgetPreferences": null,
896
+ "language": "python",
897
+ "notebookMetadata": {
898
+ "pythonIndentUnit": 4
899
+ },
900
+ "notebookName": "(Clone) NP04_classification",
901
+ "widgets": {}
902
+ },
903
+ "language_info": {
904
+ "name": "python"
905
+ }
906
+ },
907
+ "nbformat": 4,
908
+ "nbformat_minor": 0
909
+ }