File size: 12,168 Bytes

532394f

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "b7236467-00b7-43e0-bba8-6ed909c52fb5",
     "showTitle": false,
     "tableResultSettingsMap": {},
     "title": ""
    }
   },
   "source": [
    "##### Classifies a Form 990 Schedule O text description based on rules for both Form 990 and Form 990-EZ using the open text response from the filer describing which part of the form they are providing supplementary information for. \n",
    "#### It outputs a string classification label from the following list:'I EZ', 'II EZ', 'III EZ', 'V EZ', 'III', 'V', 'VI', 'VII', 'IX', 'XI', 'XII', 'Unknown'. \n",
    "\n",
    "##### Note: Change the source and target tables according to the environment\n",
    "\n",
    "##### Author: GivingTuesday Data Commons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "833ba8f0-d981-446d-9a8d-5b8ee3cad525",
     "showTitle": false,
     "tableResultSettingsMap": {},
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import string\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import udf, col, when\n",
    "from pyspark.sql.types import StringType\n",
    "\n",
    "def classify_schedule_o_submission(sidfalrdesc: str) -> str:\n",
    "    \"\"\"\n",
    "    Classifies a Form 990 Schedule O text description based on rules for both\n",
    "    Form 990 and Form 990-EZ.\n",
    "\n",
    "    Args:\n",
    "        sidfalrdesc: The open text response from the filer describing which\n",
    "                     part of the form they are providing supplementary\n",
    "                     information for.\n",
    "\n",
    "    Returns:\n",
    "        A string classification label from the following list:\n",
    "        'I EZ', 'II EZ', 'III EZ', 'V EZ', 'III', 'V', 'VI', 'VII', 'IX',\n",
    "        'XI', 'XII', 'Unknown'.\n",
    "    \"\"\"\n",
    "    if sidfalrdesc is None or sidfalrdesc == '':\n",
    "        return 'Unknown'\n",
    "    \n",
    "    text = sidfalrdesc.lower()\n",
    "\n",
    "    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))\n",
    "    \n",
    "    # Replace multiple spaces with a single space\n",
    "    text = re.sub(r'\\s+', ' ', text).strip()\n",
    "    # Define a common pattern for 'part' or 'pt' for cleaner regexes\n",
    "    part_or_pt_pattern = r'(part|pt)\\s+'\n",
    "\n",
    "    rules = [\n",
    "        # ======================================================================\n",
    "        # Rule Group 1: Direct and Unambiguous EZ Form Mentions\n",
    "        # ======================================================================\n",
    "        ('I EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(i|1)\\b|' + part_or_pt_pattern + r'(i|1)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(i|1)\\b)'),\n",
    "        ('II EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(ii|2)\\b|' + part_or_pt_pattern + r'(ii|2)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(ii|2)\\b)'),\n",
    "        ('III EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(iii|3)\\b|' + part_or_pt_pattern + r'(iii|3)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(iii|3)\\b)'),\n",
    "        ('V EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(v|5)\\b|' + part_or_pt_pattern + r'(v|5)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(v|5)\\b)'),\n",
    "\n",
    "        # ======================================================================\n",
    "        # Rule Group 2: Specific Part/Pt + Line Number Combinations (990 & 990-EZ)\n",
    "        # These rules identify parts by their unique line numbers, regardless of surrounding context.\n",
    "        # ======================================================================\n",
    "        ('I EZ', r'\\b' + part_or_pt_pattern + r'(i|1)\\b.*(line|ln)\\s+(8|10|16|20)\\b'), # EZ Part I specific lines\n",
    "        ('II EZ', r'\\b' + part_or_pt_pattern + r'(ii|2)\\b.*(line|ln)\\s+(24|26)\\b'), # EZ Part II specific lines\n",
    "        ('III EZ', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b.*(line|ln)\\s+(31)\\b'), # EZ Part III specific line\n",
    "        ('V EZ', r'\\b' + part_or_pt_pattern + r'(v|5)\\b.*(line|ln)\\s+(33|34|35b|44d)\\b'), # EZ Part V specific lines\n",
    "\n",
    "        ('III', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b.*(line|ln)\\s+(2|3|4)\\b'), # Standard 990 Part III specific lines\n",
    "        ('V', r'\\b' + part_or_pt_pattern + r'(v|5)\\b.*(line|ln)\\s+(3b|13a|14b)\\b'), # Standard 990 Part V specific lines\n",
    "\n",
    "        ('VI', r'\\b' + part_or_pt_pattern + r'(vi|6)\\b.*(line|ln)\\s+(11b|19)\\b'),\n",
    "        ('IX', r'\\b' + part_or_pt_pattern + r'(ix|9)\\b.*(line|ln)\\s+(11g|24e)\\b'),\n",
    "        ('XII', r'\\b' + part_or_pt_pattern + r'(xii|12)\\b.*(line|ln)\\s+(1|2c)\\b'),\n",
    "\n",
    "        # ======================================================================\n",
    "        # Rule Group 3: Highly Specific Contextual Keywords (No mandatory line number or part mention)\n",
    "        # ======================================================================\n",
    "        ('VI', r'governance|governing\\s+body|board\\s+of\\s+directors|conflict\\s+of\\s+interest|whistleblower\\s+policy|document\\s+retention'\n",
    "                 r'|independent\\s+voting\\s+members|minutes|states\\s+where\\s+copy\\s+of\\s+return\\s+is\\s+filed|public\\s+disclosure'\n",
    "                 r'|process\\s+for\\s+review|process\\s+for\\s+determining\\s+compensation'),\n",
    "        ('VII', r'compensation.*related\\s+organization|form\\s+w-2|form\\s+1099-misc|form\\s+1099-nec|severance\\s+payment|highest\\s+compensated'),\n",
    "        ('IX', r'functional\\s+expenses|statement\\s+of\\s+functional\\s+expenses|lobbying|professional\\s+fundraising|investment\\s+management\\s+fees'\n",
    "                 r'|other\\s+fees\\s+for\\s+services|all\\s+other\\s+expenses'),\n",
    "        ('XI', r'reconciliation\\s+of\\s+net\\s+assets|prior\\s+period\\s+adjustment|unrealized\\s+gains|donated\\s+services'),\n",
    "        ('XII', r'financial\\s+statements\\s+and\\s+reporting|audited\\s+financial\\s+statements|reviewed\\s+financial\\s+statements|compiled\\s+financial\\s+statements'\n",
    "                  r'|fin\\s+48|asc\\s+740|basis\\s+of\\s+accounting|accounting\\s+method|committee\\s+oversight'),\n",
    "        ('V', r'other\\s+irs\\s+filings|tax\\s+compliance|fbar|foreign\\s+bank\\s+account|form\\s+4720|tax-exempt\\s+bonds'),\n",
    "        ('III', r'exempt\\s+purpose\\s+achievements'),\n",
    "        ('III EZ', r'program\\s+service\\s+accomplishments|program\\s+service|additional\\s+program\\s+services'),\n",
    "        ('V EZ', r'significant\\s+activity\\s+changes|amended\\s+bylaws|changes\\s+to\\s+organizing\\s+documents|unrelated\\s+business\\s+gross\\s+income|list\\s+of\\s+states\\s+return\\s+filed\\s+in'),\n",
    "        ('II EZ', r'\\bother\\s+assets\\b|\\btotal\\s+assets\\b|\\bloans\\s+from\\s+officers\\b|\\baccounts\\s+payable\\b'),\n",
    "\n",
    "        # ======================================================================\n",
    "        # Rule Group 4: Ambiguous Part/Pt Mentions (Default to Standard 990)\n",
    "        # ======================================================================\n",
    "        ('XII', r'\\b' + part_or_pt_pattern + r'(xii|12)\\b'),\n",
    "        ('XI', r'\\b' + part_or_pt_pattern + r'(xi|11)\\b'),\n",
    "        ('IX', r'\\b' + part_or_pt_pattern + r'(ix|9)\\b'),\n",
    "        ('VII', r'\\b' + part_or_pt_pattern + r'(vii|7)\\b'),\n",
    "        ('VI', r'\\b' + part_or_pt_pattern + r'(vi|6)\\b'),\n",
    "        ('V', r'\\b' + part_or_pt_pattern + r'(v|5)\\b'),   \n",
    "        ('III', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b'),\n",
    "\n",
    "        # ======================================================================\n",
    "        # Rule Group 5: Ambiguous Keyword Mentions (Default to Standard 990)\n",
    "        # ======================================================================\n",
    "        ('IX', r'\\bother\\s+expenses\\b'), # Catch \"other expenses\" as a low-priority IX match\n",
    "        ('XI', r'other\\s+changes\\s+in\\s+net\\s+assets'),\n",
    "    ]\n",
    "\n",
    "    for classification, pattern in rules:\n",
    "        if re.search(pattern, text):\n",
    "            return classification\n",
    "\n",
    "    return 'Unknown'\n",
    "\n",
    "classify_udf = udf(classify_schedule_o_submission, StringType())\n",
    "\n",
    "\n",
    "def process_with_filters(source_table: str = \"---Put the source table name here schedule O Table Name here---\", \n",
    "                        target_table: str = \"put target table path here,\n",
    "                        tax_years: list = [2020,2021,2022,2023,2024,2025],\n",
    "                        limit_records: int = None):\n",
    "    \"\"\"\n",
    "    Process Schedule O classifications with optional filters.\n",
    "    \n",
    "    Args:\n",
    "        source_table: Name of the source table\n",
    "        target_table: Target table to write results\n",
    "        tax_years: List of tax years to process (optional)\n",
    "        limit_records: Limit number of records for testing (optional)\n",
    "    \"\"\"\n",
    "    \n",
    "    df = spark.table(source_table)\n",
    "    \n",
    "    if tax_years:\n",
    "        df = df.filter(col(\"TAXYEAR\").isin(tax_years))\n",
    "    \n",
    "    if limit_records:\n",
    "        df = df.limit(limit_records)\n",
    "    \n",
    "    df_classified = df.withColumn(\n",
    "        \"SCHEDULE_O_CLASSIFICATION\",\n",
    "        classify_udf(col(\"SIDFALRDESC\"))\n",
    "    )\n",
    "\n",
    "    df_classified.write \\\n",
    "        .mode(\"overwrite\") \\\n",
    "        .option(\"overwriteSchema\", \"true\") \\\n",
    "        .saveAsTable(target_table)\n",
    "    \n",
    "    df_classified.groupBy(\"SCHEDULE_O_CLASSIFICATION\").count().orderBy(\"count\", ascending=False).show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "c3bf2484-e248-4f39-a1b9-f76e6e4cf67e",
     "showTitle": false,
     "tableResultSettingsMap": {},
     "title": ""
    }
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------+-------+\n|SCHEDULE_O_CLASSIFICATION|  count|\n+-------------------------+-------+\n|                       VI|4696909|\n|                     I EZ|1723572|\n|                       IX|1590432|\n|                    II EZ| 471703|\n|                  Unknown| 450339|\n|                       XI| 234674|\n|                      III| 214748|\n|                   III EZ| 184880|\n|                      XII| 160507|\n|                      VII|  45941|\n|                        V|  34822|\n|                     V EZ|  13777|\n+-------------------------+-------+\n\n"
     ]
    }
   ],
   "source": [
    "process_with_filters()"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "computePreferences": null,
   "dashboards": [],
   "environmentMetadata": {
    "base_environment": "",
    "environment_version": "4"
   },
   "inputWidgetPreferences": null,
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 4
   },
   "notebookName": "schedule_o_classifier_notebook",
   "widgets": {}
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}