Spaces:

boettiger-lab
/

tpl

Sleeping

tpl

File size: 35,363 Bytes

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a70ecae6-ed3d-47ab-8450-f500ab2f0362",
   "metadata": {},
   "source": [
    "# Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59760595-9aaa-49ee-b825-09c0ffab9af2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from cng.utils import *\n",
    "from cng.h3 import *\n",
    "from ibis import _\n",
    "import ibis.selectors as s\n",
    "import os\n",
    "from osgeo import gdal\n",
    "from minio import Minio\n",
    "import streamlit \n",
    "from datetime import timedelta\n",
    "import geopandas as gpd\n",
    "import re\n",
    "\n",
    "# Get signed URLs to access license-controlled layers\n",
    "key = st.secrets[\"MINIO_KEY\"]\n",
    "secret = st.secrets[\"MINIO_SECRET\"]\n",
    "client = Minio(\"minio.carlboettiger.info\", key, secret)\n",
    "\n",
    "con = ibis.duckdb.connect(extensions = [\"spatial\", \"h3\"])\n",
    "endpoint = os.getenv(\"AWS_S3_ENDPOINT\", \"minio.carlboettiger.info\")\n",
    "duckdb_install_h3()\n",
    "\n",
    "set_secrets(con)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "deb47703-31fd-4039-84ce-df6a24cdf702",
   "metadata": {},
   "outputs": [],
   "source": [
    "census_path = \"s3://public-census/2024/\"\n",
    "state_file = census_path + 'state/2024_us_state.parquet'\n",
    "county_file = census_path + 'county/2024_us_county.parquet'\n",
    "\n",
    "state_h3_file = census_path + 'state/2024_us_state_h3_z8.parquet'\n",
    "county_h3_file = census_path + 'county/2024_us_county_h3_z8.parquet'\n",
    "city_h3_file = census_path + 'places_subdivisions/2024_us_places_subdivisions_h3_z8.parquet'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b086a1a-af23-487b-923d-fca595a19111",
   "metadata": {},
   "source": [
    "#### Converting data to hexes at zoom 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "498c7474-7ee6-48b0-8242-6d29a28841f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def h3_from_geom(con, name, cols, save_path, zoom = 8):\n",
    "    \"\"\"\n",
    "    Computes hexes directly from geometry.\n",
    "    \"\"\"\n",
    "    cols = \", \".join(cols) if isinstance(cols, list) else cols\n",
    "    con.raw_sql(f'''\n",
    "    CREATE OR REPLACE TEMP TABLE t2 AS\n",
    "    SELECT {cols},\n",
    "           h3_polygon_wkt_to_cells_string(ST_Force2D(dump.geom), {zoom}) AS h{zoom}\n",
    "    FROM (\n",
    "        SELECT {cols}, UNNEST(ST_Dump(geom)) AS dump\n",
    "        FROM {name}\n",
    "    )\n",
    "    ''')\n",
    "    con.sql(f'''\n",
    "        SELECT {cols}, UNNEST(h{zoom}) AS h{zoom},\n",
    "        ST_GeomFromText(h3_cell_to_boundary_wkt(UNNEST(h{zoom}))) AS geom\n",
    "        FROM t2\n",
    "    ''').to_parquet(save_path)\n",
    "    return "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "084a33f5-3647-40bc-9c03-f145c60b95d1",
   "metadata": {},
   "source": [
    "# TPL Conservation Almanac\n",
    "\n",
    "Hexing this data at zoom 8 level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1afa510b-3434-4ded-88c7-e06844ee503e",
   "metadata": {},
   "outputs": [],
   "source": [
    "tpl = client.get_presigned_url(\n",
    "    \"GET\",\n",
    "    \"shared-tpl\",\n",
    "    \"tpl.parquet\",\n",
    "    expires=timedelta(hours=2),\n",
    ")\n",
    "\n",
    "cols = ['fid', 'tpl_id', 'state_id', 'state', 'county', 'municipality',\n",
    "        'site', 'acres', 'year', 'date', 'owner','owner_type','manager',\n",
    "        'manager_type','purchase_type','easement','easement_type',\n",
    "        'access_type','purpose_type','duration_type','data_provider',\n",
    "        'data_source','source_date','data_aggregator','comments','amount',\n",
    "        'program_id','program','sponsor_id','sponsor','sponsor_type','FIPS']\n",
    "\n",
    "state_ids = con.read_parquet(state_file).drop('geom')\n",
    "\n",
    "tpl_table = (con.read_parquet(tpl)\n",
    "    .rename(tpl_id = \"TPL_ID\", state = \"State\", county = \"County\", municipality = \"Municipality\", site = \"Site_Name\",\n",
    "            acres = \"Reported_Acres\", area = \"Shape_Area\", year = \"Close_Year\", date = \"Close_Date\", owner = \"Owner_Name\",\n",
    "            owner_type = \"Owner_Type\", manager = \"Manager_Name\", manager_type = \"Manager_Type\",\n",
    "            purchase_type = \"Purchase_Type\", easement = \"EasementHolder_Name\", easement_type = \"EasementHolder_Type\",\n",
    "            access_type = \"Public_Access_Type\", purpose_type = \"Purpose_Type\", duration_type = \"Duration_Type\",\n",
    "            data_provider = \"Data_Provider\", data_source = \"Data_Source\", source_date = \"Source_Date\",\n",
    "            data_aggregator = \"Data_Aggregator\", comments = \"Comments\", amount = \"Amount\", program_id = 'Program_ID',\n",
    "            program = 'Program_Name', sponsor_id = \"Sponsor_ID\", sponsor = \"Sponsor_Name\", sponsor_type = \"Sponsor_Type\")\n",
    "    .mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
    "    .inner_join(state_ids, 'state'))\n",
    "    \n",
    "con.create_table('tpl', tpl_table, overwrite=True)\n",
    "# h3_from_geom(con, 'tpl', cols, save_path = 's3://shared-tpl/conservation_almanac/z8/tpl_h3_z8.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9612c804-0474-4bfe-924d-89dae0105663",
   "metadata": {},
   "source": [
    "#### Generate PMTiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "932476f8-53a9-4aa9-9520-cf3fa42b8150",
   "metadata": {},
   "outputs": [],
   "source": [
    "tpl_table.to_parquet('s3://shared-tpl/conservation_almanac/tpl.parquet')\n",
    "tpl_table.to_parquet('tpl_epsg4326.parquet') #local copy to use to_geojson\n",
    "to_geojson('tpl_epsg4326.parquet', \"tpl.geojson\")\n",
    "pmtiles = to_pmtiles(\"tpl.geojson\", \"tpl.pmtiles\")\n",
    "s3_cp('tpl.pmtiles', \"s3://shared-tpl/conservation_almanac/tpl.pmtiles\", \"minio\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f00cfe9-520c-4839-aeed-46a83b11ecce",
   "metadata": {},
   "source": [
    "# Census\n",
    "\n",
    "Getting polygons and FIPS codes from Census state, county, place, and subdivision data. \n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cd589ad-5b03-41de-8936-c20091a937e1",
   "metadata": {},
   "source": [
    "#### State"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08a861e6-fdcd-480a-ad0a-423c38cc1bc1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/STATE/tl_2024_us_state.zip\"\n",
    "state = (con.read_geo(url)\n",
    "            .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
    "            .rename(FIPS = \"GEOID\", state_id = \"STUSPS\", name = \"NAME\")\n",
    "            .select('FIPS','state_id','name','geom')\n",
    "             )\n",
    "state.to_parquet(state_file)\n",
    "\n",
    "#get h3\n",
    "con.read_parquet(state_file, table_name = 'state')\n",
    "cols = ['state','state_id','FIPS']\n",
    "h3_from_geom(con, 'state', cols, save_path = state_h3_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c37ea182-ac50-4ee8-a14b-5dcbcb14a044",
   "metadata": {},
   "source": [
    "#### County"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a6ea98d-878a-4ea5-8eda-ee10f34444e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "## CT counties changed to \"planning regions\" in 2022, so I'm grabbing older data to get the county boundaries \n",
    "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2020/COUNTY/tl_2020_us_county.zip\"\n",
    "con.read_geo(url)\n",
    "CT_counties = (con.read_geo(url)\n",
    "            .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
    "            .rename(FIPS = \"GEOID\", county = \"NAMELSAD\")\n",
    "            .select('FIPS','STATEFP','county','geom')\n",
    "            .filter(_.STATEFP == '09')\n",
    "             )\n",
    "\n",
    "# US counties \n",
    "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/COUNTY/tl_2024_us_county.zip\"\n",
    "con.read_geo(url)\n",
    "county = (con.read_geo(url)\n",
    "            .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
    "            .rename(FIPS = \"GEOID\", county = \"NAMELSAD\")\n",
    "            .select('FIPS','STATEFP','county','geom')\n",
    "            .union(CT_counties)\n",
    "         ) \n",
    "\n",
    "#adding states to counties\n",
    "state_ids = con.read_parquet(state_file).drop('geom')\n",
    "county.inner_join(state_ids, [state_ids.FIPS == county.STATEFP]).select('FIPS','state_id','state','county','geom').to_parquet(county_file)\n",
    "\n",
    "#get h3\n",
    "con.read_parquet(county_file, table_name = 'county')\n",
    "cols = ['state_id','state','county','FIPS']\n",
    "# h3_from_geom(con, 'county', cols, save_path = county_h3_file)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e703812-5369-4bb3-857f-37ed946168e7",
   "metadata": {},
   "source": [
    "#### Cities (places + subdivisions)\n",
    "\n",
    "Note: Some cities are listed in both \"Places\" and \"Subdivisions\", so we will use `distinct()` to avoid duplicates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a0e9a96-5fcb-409b-812c-b79f7a319eaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
    "# match_pattern = r\"(?i)(?<![a-z])(?:city|town|charter|municipality|[Bb]orough)(?![a-z])\"\n",
    "\n",
    "city_cols = [\"state_id\",\"county\",\"FIPS\",\"name\",'city']\n",
    "\n",
    "places_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_place_by_county2020.txt\"\n",
    "places_fips = (con.read_csv(places_url)\n",
    "               .rename(state_id = \"STATE\", county = \"COUNTYNAME\", city = \"PLACENAME\")\n",
    "               .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
    "               .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
    "               .select(city_cols))\n",
    "\n",
    "subdivisions_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_cousub2020.txt\"\n",
    "subdivisions_fips = (con.read_csv(subdivisions_url)\n",
    "                     .rename(state_id = \"STATE\", county = \"COUNTYNAME\", city = \"COUSUBNAME\")\n",
    "                     .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
    "                     .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
    "                     .select(city_cols))\n",
    "\n",
    "#get unique -> some cities are listed in both places and subdivisions\n",
    "city_fips = places_fips.union(subdivisions_fips).distinct() \n",
    "\n",
    "#get h3 from counties \n",
    "county_h3 = con.read_parquet(county_h3_file)\n",
    "city_fips.inner_join(county_h3, 'FIPS').select('FIPS','state_id','state','county','city','name','geom','h8').to_parquet(city_h3_file)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62511665-7859-4997-98d8-ae7c08b64f46",
   "metadata": {},
   "source": [
    "# Landvote\n",
    "\n",
    "We want to join Landvote data with TPL Conservation Almanac, but Landvote doesn't have spatial data.\n",
    "\n",
    "However, we can join Landvote with Census data to get FIPS codes and hexes. \n",
    "- First, need to split up landvote into its 3 jurisdictions: state, county, and municipals\n",
    "- Join states with Census \"states\" to get state FIPS/hex\n",
    "- Join counties with Census \"counties\" to get county FIPS/hex\n",
    "- Join special districts with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
    "- Join municipals with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
    "- Then join all municipal, county, special district, and state data back together!\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "793ba16f-b876-48f0-b1c1-1d2e1b1d3f3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "landvote_csv = client.get_presigned_url(\n",
    "    \"GET\",\n",
    "    \"shared-tpl\",\n",
    "    \"landvote/landvote_utf8.csv\",\n",
    "    expires=timedelta(hours=2),\n",
    ")\n",
    "collapse_spaces = r\"\\s+\"\n",
    "match_pattern = r\"(?i)\\b(city|town|charter|municipality|Borough)\\b\"\n",
    "landvote_ = (con.read_csv(landvote_csv) #it skips the row with a unicode error \n",
    "            .rename(jurisdiction = \"Jurisdiction Type\", state_id = \"State\")\n",
    "            .mutate(state_id = _.state_id.substitute({'Ore':'OR'}))\n",
    "            .mutate(name=_['Jurisdiction Name'].re_replace(match_pattern, \"\").strip())\n",
    "            .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    "            .mutate(landvote_id=ibis.row_number().over(order_by=[_.state_id, _.jurisdiction, _.Date]))\n",
    "            .mutate(_['Conservation Funds Approved'].replace('$', '')\n",
    "                    .replace(',', '').cast('float').name('Conservation Funds Approved'))\n",
    "            .mutate(year = _.Date.year())\n",
    "            .rename(date = \"Date\", description = \"Description\", finance_mechanism = \"Finance Mechanism\",\n",
    "                    other_comments = '\"Other\" Comment', purpose = \"Purpose\", total_funds_at_stake = \"Total Funds at Stake\",\n",
    "                    conservation_funds_at_stake = \"Conservation Funds at Stake\", total_funds_approved = \"Total Funds Approved\",\n",
    "                    conservation_funds_approved = \"Conservation Funds Approved\", passed = \"Pass?\", status = \"Status\", \n",
    "                    percent_yes = '% Yes', percent_no = '% No', notes = 'Notes', voted_acq_measure = \"Voted Acq. Measure\")\n",
    "            )\n",
    "\n",
    "#landvote_id is made with a window function, which can be a bit buggy, so it helps to materialize it after generating \n",
    "landvote_with_ids = landvote_.execute()  \n",
    "landvote = con.create_table(\"landvote\", landvote_with_ids, overwrite = True)\n",
    "\n",
    "final_columns = ['landvote_id','FIPS',\n",
    "    'state_id','state','county',\n",
    "    'city','jurisdiction','year','date',\n",
    "    'description','finance_mechanism',\n",
    "    'other_comments','purpose',\n",
    "    'total_funds_at_stake',\n",
    "    'conservation_funds_at_stake',\n",
    "    'total_funds_approved',\n",
    "    'conservation_funds_approved',\n",
    "    'passed','status','percent_yes','percent_no',\n",
    "    'notes','voted_acq_measure',\n",
    "    'geom','h8']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da89e4e6-1845-4842-baa4-aa6908a1cde1",
   "metadata": {},
   "source": [
    "#### State level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00bb7c0c-80bb-4d6d-9088-0877e365a324",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_z8 = con.read_parquet(state_h3_file)\n",
    "states = (landvote.filter(_.jurisdiction == \"State\")\n",
    "            .rename(state = \"Jurisdiction Name\")\n",
    "            .mutate(county = ibis.literal('None'))\n",
    "            .mutate(county_fips = ibis.literal('None'))\n",
    "            .mutate(city = ibis.literal('None')))\n",
    "\n",
    "landvote_state_z8 = (states.inner_join(state_z8, [states.state.upper() == state_z8.state.upper()])\n",
    "                   .select(final_columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85f50f50-f56a-4b15-a1ad-5b87fe80dd54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# getting non hex version \n",
    "state_geo = con.read_parquet(state_file)\n",
    "landvote_state_geo = (states.inner_join(state_geo, [states.state.upper() == state_geo.state.upper()])).select(final_columns[:-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6202f31a-5868-4e74-951e-23acb49f0bc7",
   "metadata": {},
   "source": [
    "#### County level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a792367-dcab-4869-94b4-6343a3204e38",
   "metadata": {},
   "outputs": [],
   "source": [
    "county_match_pattern = r\"(?i)(?:(\\b[\\w-]+(?:\\s[\\w-]+)*)\\sCounty\\b|of\\s+([\\w-]+(?:\\s[\\w-]+)*))\"\n",
    "county_vals = {'Columbus and Franklin County Metro Parks':'Franklin',\n",
    "                ' Columbus and Franklin County Metro Parks':'Franklin',\n",
    "                'Athens-Clarke County': 'Clarke',\n",
    "                'City and County of San Francisco':'San Francisco',\n",
    "                'Cleveland Metropolitan Park District':'Cuyahoga',\n",
    "                'Denver City and County':'Denver',\n",
    "                'East Baton Rouge Parish':'East Baton Rouge Parish',\n",
    "                'Five Rivers MetroParks':'Montgomery',\n",
    "                'Forest Preserve District of DuPage County':'DuPage',\n",
    "                'Forest Preserve District of Kane County':'Kane',\n",
    "                'Forest Preserves of Cook County':'Cook',\n",
    "                'Great Parks of Hamilton County':'Hamilton',\n",
    "                'Jacksonville':'Duval',\n",
    "                'James City County': 'James City',\n",
    "                'Johnny Appleseed Park District':'Allen',\n",
    "                'Licking Park District':'Licking',\n",
    "                'Matanuska-Susitna Borough':'Matanuska-Susitna Borough',\n",
    "                'MetroParks of Butler County':'Butler',\n",
    "                ' Metropolitan Park District of Toledo Area':'Lucas',\n",
    "                'Metropolitan Park District of the Toledo Area':'Lucas',\n",
    "                'Metropolitan Park District of Toledo Area':'Lucas',\n",
    "                'Metropolitan Park District of Toledo Area ':'Lucas',\n",
    "                'Park District of Ottawa County':'Ottawa',\n",
    "                'Portage Park District':'Portage',\n",
    "                'Preservation Park District of Delaware County':'Delaware',\n",
    "                'Preservation Parks of Delaware County':'Delaware',\n",
    "                'Santa Clara Valley Water District': 'Santa Clara',\n",
    "                'St. Tammany Parish':'St. Tammany Parish',\n",
    "                'Summit Metro Parks':'Summit'}\n",
    "\n",
    "county_z8 = (con.read_parquet(county_h3_file)\n",
    "            .mutate(name=_.county.re_extract(county_match_pattern, 1).strip())\n",
    "            .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
    "             )\n",
    "\n",
    "counties = (landvote.filter(_.jurisdiction == \"County\")\n",
    "            .rename(county = \"Jurisdiction Name\")\n",
    "            .mutate(city = ibis.literal('None'))\n",
    "            .mutate(name=_.name.re_extract(county_match_pattern, 1).strip())\n",
    "            .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
    "           )\n",
    "\n",
    "landvote_county_z8 = (counties\n",
    "    .inner_join(county_z8, [counties.name.upper() == county_z8.name.upper(), counties.state_id == county_z8.state_id])\n",
    "    .select(final_columns)\n",
    "                     )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96846068-4efa-4908-a48b-3208e08001ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# getting non hex version \n",
    "county_geo = (con.read_parquet(county_file)\n",
    "                .mutate(name=_.county.re_extract(county_match_pattern, 1).strip())\n",
    "                .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
    "            )\n",
    "\n",
    "landvote_county_geo = (counties.inner_join(county_geo, [counties.name.upper() == county_geo.name.upper(), \n",
    "                                                    counties.state_id == county_geo.state_id])\n",
    "                   .select(final_columns[:-1])\n",
    "                  )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99be20da-2a62-4ece-97e5-69118f400c62",
   "metadata": {},
   "source": [
    "#### Special District Level\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d5a1b9e-49b9-482b-8814-7cbcabb6daae",
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_match_pattern = r\"(?i)\\b(city|town|CDP|CCD|village|charter|municipality|Borough|Park District|Authority|Basin|Mountains|2|1|District|Services|Special|Preservation|Assessment|Initiative|Open Space|Metro|Parks|Community|Recreation District)\\b\"\n",
    "sd_z8 = (\n",
    "    con.read_parquet(city_h3_file)\n",
    "    .mutate(name=_.city.re_replace(sd_match_pattern, \"\"))\n",
    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    ")\n",
    "\n",
    "sd_vals = {'Tri-Lakes Park and Recreation District':'Monument',\n",
    "           'Urban Drainage and Flood Control District':'Denver',\n",
    "           'Blue Heron Recreation District':'Phoenix',\n",
    "           'Mountains Recreation and Conservation Authority':'Santa Monica',\n",
    "           'St. Helena Parish Recreation and Parks District':'Greensburg',\n",
    "           'West Geauga Park and Recreation District':'Chardon',\n",
    "           'Marin County Open Space District':'San Rafael',\n",
    "          }\n",
    "\n",
    "# filtering landvote to just special districts \n",
    "sd = (landvote.filter(_.jurisdiction == \"Special District\")\n",
    "            .rename(city = \"Jurisdiction Name\")\n",
    "            .mutate(name=_.name.re_replace(sd_match_pattern, \"\"))\n",
    "            .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    "            .mutate(name=_.city.substitute(value=sd_vals, else_=_.name))\n",
    "     )\n",
    "\n",
    "# detecting if a record has multiple counties listed in the notes field \n",
    "multiple_counties_ = (\n",
    "    sd\n",
    "    .filter(~_.notes.isnull())\n",
    "    .filter( \n",
    "        (_.notes.contains(\"counties\")) |\n",
    "        (_.notes.contains(\"Counties\")) |\n",
    "        (_.notes.split(\"County\").length()-1>1) \n",
    "    )\n",
    ")\n",
    "\n",
    "#extracting multiple counties from notes column \n",
    "@ibis.udf.scalar.python\n",
    "def extract_counties_udf(note: str) -> list[str]:\n",
    "    pattern = r\"((?:[A-Z][a-zA-Z.\\'-]*(?:\\s+[A-Z][a-zA-Z.\\'-]*)*)(?:,\\s*)?(?:\\s+and\\s+)?)+(?=\\s+(?:[Cc]ounty|[Cc]ounties))\"\n",
    "    p = re.compile(pattern)\n",
    "    matches = [m.group(0) for m in p.finditer(note)]  # <-- Use finditer with group(0)\n",
    "    counties = []\n",
    "    for match in matches:\n",
    "        parts = re.split(r',\\s*|\\s+and\\s+', match)\n",
    "        counties.extend(f\"{part.strip()} County\" for part in parts if part.strip())\n",
    "    return counties\n",
    "\n",
    "multiple_counties = (multiple_counties_\n",
    "    .mutate(county_list=extract_counties_udf(_.notes))\n",
    "    .unnest([\"county_list\"])\n",
    "    .mutate(county=_.county_list)\n",
    "    .drop(\"county_list\")\n",
    ")\n",
    "\n",
    "multiple_counties_ids = multiple_counties.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
    "\n",
    "# Only has 1 county in the notes field\n",
    "single_county_pattern = r'([A-Z][a-zA-Z]+(?:\\s[A-Z][a-zA-Z]*)*\\sCounty)\\.?'\n",
    "single_county = (sd\n",
    "    .filter(~_.notes.isnull())\n",
    "    .filter(_.landvote_id.notin(multiple_counties_ids))\n",
    "    .mutate(county=_.notes.re_extract(single_county_pattern, 1).strip())\n",
    "    .mutate(county=_.county.cases(\n",
    "        ('',_.city.re_extract(single_county_pattern, 1).strip()),\n",
    "        else_ = _.county))\n",
    "    .filter(_.county != '')\n",
    ")\n",
    "single_county_ids= single_county.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
    "\n",
    "# Nothing in notes, need to join with census data to get county\n",
    "manually_fill = (sd\n",
    "    .filter(_.landvote_id.notin(multiple_counties_ids))\n",
    "    .filter(_.landvote_id.notin(single_county_ids))\n",
    "    .inner_join(sd_z8,[_.name.upper() == sd_z8.name.upper(),\n",
    "                        _.state_id == sd_z8.state_id]) \n",
    "    .select(final_columns)\n",
    "    .distinct()\n",
    ")\n",
    "\n",
    "sd_county_vals = {'Western Summit County':'Summit County'}\n",
    "sd_with_counties = single_county.union(multiple_counties).mutate(county=_.county.substitute(value=sd_county_vals, else_=_.county))\n",
    "\n",
    "#since we are joining on counties, there may be duplicate hexes because of the cities \n",
    "landvote_sd_z8 = (sd_with_counties\n",
    "                .inner_join(county_z8.distinct(), [sd_with_counties.county.upper() == county_z8.county.upper(), \n",
    "                                                        sd_with_counties.state_id == county_z8.state_id])\n",
    "                .select(final_columns)\n",
    "                .union(manually_fill)\n",
    "              )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63d8df3a-ddc8-49b3-9f3a-1762883472cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "sd = landvote_sd_z8.drop('h8','geom').distinct()\n",
    "landvote_sd_geo = (sd.inner_join(county_geo,[sd.county.upper() == county_geo.county.upper(), sd.state_id == county_geo.state_id])\n",
    "    .select(final_columns[:-1]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cca12e06-8d7f-4a50-906c-7dc02e370072",
   "metadata": {},
   "source": [
    "#### Municipal level\n",
    "\n",
    "Because there isn't a 1 to 1 match from municipals to Census data, we need to use both \"Places\" and \"Subdivisons\". "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa81e457-00ba-4b01-86d7-cf46bec04edd",
   "metadata": {},
   "outputs": [],
   "source": [
    "municipal_vals = {\n",
    "    \"Addison\": \"Addison village\",\n",
    "    \"Anderson Township Park District\": \"Anderson township\",\n",
    "    \"Bainbridge Island Metropolitan Park & Recreation District\": \"Bainbridge Island\",\n",
    "    \"Bainbridge Island Metropolitan Park and Recreation District \": \"Bainbridge Island\",\n",
    "    \"Bel-Ridge\": \"Bel-Ridge village\",\n",
    "    \"Bend Park and Recreation District\": \"Bend\",\n",
    "    \"Boardman Township Park District\": \"Boardman township\",\n",
    "    \"Carney's Point Township\": \"Carneys Point township\",\n",
    "    \"Castro Valley\": \"Castro Valley CDP\",\n",
    "    \"Charter Township of Meridian\": \"Meridian township\",\n",
    "    \"Charter Township of Oakland\": \"Oakland township\",\n",
    "    \"Corrales\": \"Corrales village\",\n",
    "    \"Dobbs Ferry\": \"Dobbs Ferry village\",\n",
    "    \"Downers Grove Park District\": \"Downers Grove village\",\n",
    "    \"Gates Mills\": \"Gates Mills village\",\n",
    "    \"Glen Ellyn Park District\": \"Glen Ellyn village\",\n",
    "    \"Hillsborough\": \"Hillsborough township\",\n",
    "    \"Irvington\": \"Irvington village\",\n",
    "    \"Lake Zurich\": \"Lake Zurich village\",\n",
    "    \"Lake in the Hills\": \"Lake in the Hills village\",\n",
    "    \"Libertyville\": \"Libertyville township\",\n",
    "    \"Loch Arbor Village\": \"Loch Arbour Village\",\n",
    "    \"Lockport Township Park District\": \"Lockport township\",\n",
    "    \"Moapa\": \"Moapa CDP\",\n",
    "    \"Nunda\": \"Nunda township\",\n",
    "    \"Orland Park\": \"Orland Park village\",\n",
    "    \"Park Ridge Recreation and Park District\": \"Park Ridge\",\n",
    "    \"Peapack-Gladstone Borough\": \"Peapack and Gladstone\",\n",
    "    \"Princeton Township\": \"Princeton\",\n",
    "    \"Romeoville\": \"Romeoville village\",\n",
    "    \"San Diego Open Space Park Facilities District No. 1\": \"San Diego\",\n",
    "    \"Seattle Park District\": \"Seattle\",\n",
    "    \"Stookey\": \"Stookey township\",\n",
    "    \"Tarrytown\": \"Tarrytown village\",\n",
    "    \"Tofte\": \"Tofte township\",\n",
    "    \"Village of Corrales\": \"Corrales village\",\n",
    "    \"Village of Lake Barrington\": \"Lake Barrington village\",\n",
    "    \"Village of Los Ranchos de Albuquerque\": \"Los Ranchos de Albuquerque village\",\n",
    "    \"West Paterson Borough\": \"Woodland Park\",\n",
    "    \"Westampton\": \"Westampton township\",\n",
    "    \"Willamalane Park and Recreation District\": \"Springfield\",\n",
    "    \"Wilmette Park District\": \"Wilmette village\",  \n",
    "}\n",
    "collapse_spaces = r\"\\s+\"\n",
    "city_z8 = (\n",
    "    con.read_parquet(city_h3_file)\n",
    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    ")\n",
    "\n",
    "# filter to only ciites\n",
    "municipals = (landvote.filter(_.jurisdiction == \"Municipal\")\n",
    "            .rename(city = \"Jurisdiction Name\")\n",
    "            .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    "            .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
    "             )\n",
    "\n",
    "# join with census data \n",
    "city_joined = (municipals.inner_join(city_z8, [municipals.name.upper() == city_z8.name.upper(), \n",
    "                                                municipals.state_id == city_z8.state_id]).select(final_columns))\n",
    "\n",
    "# handling cities with multiple counties\n",
    "dupes = city_joined.drop('h8','geom').distinct().group_by(\"landvote_id\").agg(county_count = _.count()).filter(_.county_count > 1)\n",
    "duplicate_ids = dupes.execute()['landvote_id'].to_list()\n",
    "\n",
    "# 105 that are already filled in, manually scraping the counties from the notes \n",
    "pattern = r'^\\s*([A-Z][a-z]+(?:\\s[A-Z][a-z]+)*)\\s(?:County|Co)\\.?\\s*$'\n",
    "counties_filled = (municipals.filter(_.landvote_id.isin(duplicate_ids))\n",
    "                    .filter(~_.notes.isnull())\n",
    "                    .mutate(county=_.notes.re_extract(pattern, 1).strip()+ ibis.literal(' County'))\n",
    "                    .filter(_.county !=' County')\n",
    "                  )\n",
    "\n",
    "# since we added the county, join it with the rest of the census data \n",
    "counties_filled_join = (counties_filled\n",
    "    .inner_join(city_z8,[counties_filled.name.upper() == city_z8.name.upper(),\n",
    "                          counties_filled.county.upper() == city_z8.county.upper(), \n",
    "                          counties_filled.state_id == city_z8.state_id])\n",
    "    .select(final_columns))\n",
    "\n",
    "counties_filled_ids = counties_filled_join.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
    "\n",
    "# join with the rest of the municipal data\n",
    "landvote_city_z8 = city_joined.filter(~_.landvote_id.isin(counties_filled_ids)).union(counties_filled_join).distinct()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c751dabd-b39a-4c54-b9ed-17d2b0cb32ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "match_pattern = r\"(?i)\\b(city|town|charter|municipality|[Bb]orough)\\b\"\n",
    "\n",
    "city_geo = (city_fips.inner_join(county_geo, 'FIPS').select(~s.endswith('_right')).drop('name')\n",
    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip()))\n",
    "\n",
    "municipals_counties = (counties_filled\n",
    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    "    .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
    "    .inner_join(city_geo,[_.name.upper() == city_geo.name.upper(),\n",
    "                          _.county.upper() == city_geo.county.upper(), \n",
    "                          _.state_id == city_geo.state_id])\n",
    "    .select(final_columns[:-1])\n",
    "                      )\n",
    "\n",
    "other_municipals = (municipals.filter(~_.landvote_id.isin(counties_filled_ids))\n",
    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
    "    .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
    "    .inner_join(city_geo,[_.name.upper() == city_geo.name.upper(),_.state_id == city_geo.state_id])\n",
    "    .select(final_columns[:-1]))\n",
    "\n",
    "landvote_city_geo = municipals_counties.union(other_municipals).distinct() "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d52d97b-ad14-4d04-89a8-79a813f80353",
   "metadata": {},
   "source": [
    "#### Joining all the landvote data with census\n",
    "Note: `landvote_joined` has more unique rows than `landvote` because some cities/special districts span multiple counties. Each additional county creates a new row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d481736-82f3-4be2-b5af-6280be5e9d75",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "landvote_joined_z8 = landvote_city_z8.union(landvote_county_z8).union(landvote_sd_z8).union(landvote_state_z8)\n",
    "landvote_joined_z8.to_parquet(\"s3://shared-tpl/landvote/z8/landvote_h3_z8.parquet\")\n",
    "\n",
    "\n",
    "# and non-hex version \n",
    "landvote_joined_geo = landvote_city_geo.union(landvote_county_geo).union(landvote_sd_geo).union(landvote_state_geo)\n",
    "landvote_joined_geo.to_parquet(\"s3://shared-tpl/landvote/landvote_geom.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "066e4fdd-f069-4d5d-b2be-f10124cfe19c",
   "metadata": {},
   "source": [
    "#### Generate PMTiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "100f0fd8-9588-4f65-a657-52bd5b942089",
   "metadata": {},
   "outputs": [],
   "source": [
    "parquet = client.get_presigned_url(\n",
    "    \"GET\",\n",
    "    \"shared-tpl\",\n",
    "    \"landvote/landvote_geom.parquet\",\n",
    "    expires=timedelta(hours=2),\n",
    ")\n",
    "to_geojson(parquet, \"landvote_geom.geojson\")\n",
    "pmtiles = to_pmtiles(\"landvote_geom.geojson\", \"landvote_geom.pmtiles\")\n",
    "s3_cp('landvote_geom.pmtiles', \"s3://shared-tpl/landvote/landvote_geom.pmtiles\", \"minio\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}