{ "cells": [ { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "b7236467-00b7-43e0-bba8-6ed909c52fb5", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "source": [ "##### Classifies a Form 990 Schedule O text description based on rules for both Form 990 and Form 990-EZ using the open text response from the filer describing which part of the form they are providing supplementary information for. \n", "#### It outputs a string classification label from the following list:'I EZ', 'II EZ', 'III EZ', 'V EZ', 'III', 'V', 'VI', 'VII', 'IX', 'XI', 'XII', 'Unknown'. \n", "\n", "##### Note: Change the source and target tables according to the environment\n", "\n", "##### Author: GivingTuesday Data Commons" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "833ba8f0-d981-446d-9a8d-5b8ee3cad525", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [ "import re\n", "import pandas as pd\n", "import string\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import udf, col, when\n", "from pyspark.sql.types import StringType\n", "\n", "def classify_schedule_o_submission(sidfalrdesc: str) -> str:\n", " \"\"\"\n", " Classifies a Form 990 Schedule O text description based on rules for both\n", " Form 990 and Form 990-EZ.\n", "\n", " Args:\n", " sidfalrdesc: The open text response from the filer describing which\n", " part of the form they are providing supplementary\n", " information for.\n", "\n", " Returns:\n", " A string classification label from the following list:\n", " 'I EZ', 'II EZ', 'III EZ', 'V EZ', 'III', 'V', 'VI', 'VII', 'IX',\n", " 'XI', 'XII', 'Unknown'.\n", " \"\"\"\n", " if sidfalrdesc is None or sidfalrdesc == '':\n", " return 'Unknown'\n", " \n", " text = sidfalrdesc.lower()\n", "\n", " text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))\n", " \n", " # Replace multiple spaces with a single space\n", " text = re.sub(r'\\s+', ' ', text).strip()\n", " # Define a common pattern for 'part' or 'pt' for cleaner regexes\n", " part_or_pt_pattern = r'(part|pt)\\s+'\n", "\n", " rules = [\n", " # ======================================================================\n", " # Rule Group 1: Direct and Unambiguous EZ Form Mentions\n", " # ======================================================================\n", " ('I EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(i|1)\\b|' + part_or_pt_pattern + r'(i|1)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(i|1)\\b)'),\n", " ('II EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(ii|2)\\b|' + part_or_pt_pattern + r'(ii|2)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(ii|2)\\b)'),\n", " ('III EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(iii|3)\\b|' + part_or_pt_pattern + r'(iii|3)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(iii|3)\\b)'),\n", " ('V EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(v|5)\\b|' + part_or_pt_pattern + r'(v|5)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(v|5)\\b)'),\n", "\n", " # ======================================================================\n", " # Rule Group 2: Specific Part/Pt + Line Number Combinations (990 & 990-EZ)\n", " # These rules identify parts by their unique line numbers, regardless of surrounding context.\n", " # ======================================================================\n", " ('I EZ', r'\\b' + part_or_pt_pattern + r'(i|1)\\b.*(line|ln)\\s+(8|10|16|20)\\b'), # EZ Part I specific lines\n", " ('II EZ', r'\\b' + part_or_pt_pattern + r'(ii|2)\\b.*(line|ln)\\s+(24|26)\\b'), # EZ Part II specific lines\n", " ('III EZ', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b.*(line|ln)\\s+(31)\\b'), # EZ Part III specific line\n", " ('V EZ', r'\\b' + part_or_pt_pattern + r'(v|5)\\b.*(line|ln)\\s+(33|34|35b|44d)\\b'), # EZ Part V specific lines\n", "\n", " ('III', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b.*(line|ln)\\s+(2|3|4)\\b'), # Standard 990 Part III specific lines\n", " ('V', r'\\b' + part_or_pt_pattern + r'(v|5)\\b.*(line|ln)\\s+(3b|13a|14b)\\b'), # Standard 990 Part V specific lines\n", "\n", " ('VI', r'\\b' + part_or_pt_pattern + r'(vi|6)\\b.*(line|ln)\\s+(11b|19)\\b'),\n", " ('IX', r'\\b' + part_or_pt_pattern + r'(ix|9)\\b.*(line|ln)\\s+(11g|24e)\\b'),\n", " ('XII', r'\\b' + part_or_pt_pattern + r'(xii|12)\\b.*(line|ln)\\s+(1|2c)\\b'),\n", "\n", " # ======================================================================\n", " # Rule Group 3: Highly Specific Contextual Keywords (No mandatory line number or part mention)\n", " # ======================================================================\n", " ('VI', r'governance|governing\\s+body|board\\s+of\\s+directors|conflict\\s+of\\s+interest|whistleblower\\s+policy|document\\s+retention'\n", " r'|independent\\s+voting\\s+members|minutes|states\\s+where\\s+copy\\s+of\\s+return\\s+is\\s+filed|public\\s+disclosure'\n", " r'|process\\s+for\\s+review|process\\s+for\\s+determining\\s+compensation'),\n", " ('VII', r'compensation.*related\\s+organization|form\\s+w-2|form\\s+1099-misc|form\\s+1099-nec|severance\\s+payment|highest\\s+compensated'),\n", " ('IX', r'functional\\s+expenses|statement\\s+of\\s+functional\\s+expenses|lobbying|professional\\s+fundraising|investment\\s+management\\s+fees'\n", " r'|other\\s+fees\\s+for\\s+services|all\\s+other\\s+expenses'),\n", " ('XI', r'reconciliation\\s+of\\s+net\\s+assets|prior\\s+period\\s+adjustment|unrealized\\s+gains|donated\\s+services'),\n", " ('XII', r'financial\\s+statements\\s+and\\s+reporting|audited\\s+financial\\s+statements|reviewed\\s+financial\\s+statements|compiled\\s+financial\\s+statements'\n", " r'|fin\\s+48|asc\\s+740|basis\\s+of\\s+accounting|accounting\\s+method|committee\\s+oversight'),\n", " ('V', r'other\\s+irs\\s+filings|tax\\s+compliance|fbar|foreign\\s+bank\\s+account|form\\s+4720|tax-exempt\\s+bonds'),\n", " ('III', r'exempt\\s+purpose\\s+achievements'),\n", " ('III EZ', r'program\\s+service\\s+accomplishments|program\\s+service|additional\\s+program\\s+services'),\n", " ('V EZ', r'significant\\s+activity\\s+changes|amended\\s+bylaws|changes\\s+to\\s+organizing\\s+documents|unrelated\\s+business\\s+gross\\s+income|list\\s+of\\s+states\\s+return\\s+filed\\s+in'),\n", " ('II EZ', r'\\bother\\s+assets\\b|\\btotal\\s+assets\\b|\\bloans\\s+from\\s+officers\\b|\\baccounts\\s+payable\\b'),\n", "\n", " # ======================================================================\n", " # Rule Group 4: Ambiguous Part/Pt Mentions (Default to Standard 990)\n", " # ======================================================================\n", " ('XII', r'\\b' + part_or_pt_pattern + r'(xii|12)\\b'),\n", " ('XI', r'\\b' + part_or_pt_pattern + r'(xi|11)\\b'),\n", " ('IX', r'\\b' + part_or_pt_pattern + r'(ix|9)\\b'),\n", " ('VII', r'\\b' + part_or_pt_pattern + r'(vii|7)\\b'),\n", " ('VI', r'\\b' + part_or_pt_pattern + r'(vi|6)\\b'),\n", " ('V', r'\\b' + part_or_pt_pattern + r'(v|5)\\b'), \n", " ('III', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b'),\n", "\n", " # ======================================================================\n", " # Rule Group 5: Ambiguous Keyword Mentions (Default to Standard 990)\n", " # ======================================================================\n", " ('IX', r'\\bother\\s+expenses\\b'), # Catch \"other expenses\" as a low-priority IX match\n", " ('XI', r'other\\s+changes\\s+in\\s+net\\s+assets'),\n", " ]\n", "\n", " for classification, pattern in rules:\n", " if re.search(pattern, text):\n", " return classification\n", "\n", " return 'Unknown'\n", "\n", "classify_udf = udf(classify_schedule_o_submission, StringType())\n", "\n", "\n", "def process_with_filters(source_table: str = \"---Put the source table name here schedule O Table Name here---\", \n", " target_table: str = \"put target table path here,\n", " tax_years: list = [2020,2021,2022,2023,2024,2025],\n", " limit_records: int = None):\n", " \"\"\"\n", " Process Schedule O classifications with optional filters.\n", " \n", " Args:\n", " source_table: Name of the source table\n", " target_table: Target table to write results\n", " tax_years: List of tax years to process (optional)\n", " limit_records: Limit number of records for testing (optional)\n", " \"\"\"\n", " \n", " df = spark.table(source_table)\n", " \n", " if tax_years:\n", " df = df.filter(col(\"TAXYEAR\").isin(tax_years))\n", " \n", " if limit_records:\n", " df = df.limit(limit_records)\n", " \n", " df_classified = df.withColumn(\n", " \"SCHEDULE_O_CLASSIFICATION\",\n", " classify_udf(col(\"SIDFALRDESC\"))\n", " )\n", "\n", " df_classified.write \\\n", " .mode(\"overwrite\") \\\n", " .option(\"overwriteSchema\", \"true\") \\\n", " .saveAsTable(target_table)\n", " \n", " df_classified.groupBy(\"SCHEDULE_O_CLASSIFICATION\").count().orderBy(\"count\", ascending=False).show()\n" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c3bf2484-e248-4f39-a1b9-f76e6e4cf67e", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "outputs": [ { "output_type": "stream", "name": "stdout", "output_type": "stream", "text": [ "+-------------------------+-------+\n|SCHEDULE_O_CLASSIFICATION| count|\n+-------------------------+-------+\n| VI|4696909|\n| I EZ|1723572|\n| IX|1590432|\n| II EZ| 471703|\n| Unknown| 450339|\n| XI| 234674|\n| III| 214748|\n| III EZ| 184880|\n| XII| 160507|\n| VII| 45941|\n| V| 34822|\n| V EZ| 13777|\n+-------------------------+-------+\n\n" ] } ], "source": [ "process_with_filters()" ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { "computePreferences": null, "dashboards": [], "environmentMetadata": { "base_environment": "", "environment_version": "4" }, "inputWidgetPreferences": null, "language": "python", "notebookMetadata": { "pythonIndentUnit": 4 }, "notebookName": "schedule_o_classifier_notebook", "widgets": {} }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }