Spaces:

boettiger-lab
/

pad-us

Sleeping

App Files Files Community

cboettig committed on Jun 27, 2024

Commit

5b900d3

1 Parent(s): 0c5b57e

resizing

Browse files

Files changed (8) hide show

AK-HI-preprocess.py +0 -272
app.py +6 -3
chatmap.py +0 -58
pad-AK-HI-stats.parquet +0 -3
pad-stats.parquet +0 -3
pad.duckdb +0 -3
preprocess.py +2 -8
raster-vector-extract.py +6 -2

AK-HI-preprocess.py DELETED Viewed

@@ -1,272 +0,0 @@
-# +
-import ibis
-import ibis.selectors as s
-from ibis import _
-import fiona
-import geopandas as gpd
-import rioxarray
-from shapely.geometry import box
-vec_file = 'pad-AK-HI-stats.parquet'
-# +
-fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
-parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
-# gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
-con = ibis.duckdb.connect()
-con.load_extension("spatial")
-threads = 1
-# or read the fgb version, much slower
-# pad = con.read_geo(fgb)
-# pad = con.read_parquet(parquet)
-# Currently ibis doesn't detect that this is GeoParquet.  We need a SQL escape-hatch to cast the geometry
-agency_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet").select(manager_name_id = "Code", manager_name = "Dom")
-agency_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-type.parquet").select(manager_type_id = "Code", manager_type = "Dom")
-desig_type = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-desgination-type.parquet").select(designation_type_id = "Code", designation_type = "Dom")
-public_access = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-public-access.parquet").select(public_access_id = "Code", public_access = "Dom")
-state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
-iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
-con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
-pad = con.table("pad")
-# -
-# Get the CRS
-# fiona is not built with parquet support, must read this from fgb.  ideally duckdb's st_read_meta would do this from the parquet
-meta = fiona.open(fgb)
-crs = meta.crs
-# Now we can do all the usual SQL queries to subset the data.  Note the `geom.within()` spatial filter!
-focal_columns = ["row_n", "FeatClass", "Mang_Name",
-                 "Mang_Type",  "Des_Tp", "Pub_Access",
-                 "GAP_Sts",  "IUCN_Cat",   "Unit_Nm",
-                 "State_Nm", "EsmtHldr", "Date_Est",
-                 "SHAPE_Area", "geom"]
-(
-    pad
-    .mutate(row_n=ibis.row_number())
-    .filter(_.FeatClass.isin(["Easement", "Fee"]))
-    .filter(_.State_Nm.isin(["AK", "HI"]))
-    .select(focal_columns)
-    .rename(geometry="geom")
-    .rename(manager_name_id = "Mang_Name",
-            manager_type_id = "Mang_Type",
-            designation_type_id = "Des_Tp",
-            public_access_id = "Pub_Access",
-            category = "FeatClass",
-            iucn_code = "IUCN_Cat",
-            gap_code = "GAP_Sts",
-            state = "State_Nm",
-            easement_holder = "EsmtHldr",
-            date_established = "Date_Est",
-            area_square_meters = "SHAPE_Area",
-            area_name = "Unit_Nm")
-    .left_join(agency_name, "manager_name_id")
-    .left_join(agency_type, "manager_type_id")
-    .left_join(desig_type, "designation_type_id")
-    .left_join(public_access, "public_access_id")
-    .left_join(state_name, "state")
-    .left_join(iucn, "iucn_code")
-    .select(~s.contains("_right"))
-#   .select(~s.contains("_id"))
-# if we keep the original geoparquet WKB 'geometry' column, to_pandas() (or execute) gives us only a normal pandas data.frame, and geopandas doesn't see the metadata.
-# if we replace the geometry with duckdb-native 'geometry' type, to_pandas() gives us a geopandas GeoDataFrame!  But this requires reading into RAM.
-    .to_pandas()
-    .set_crs(crs)
-    .to_parquet(vec_file)
-)
-# +
-import rasterio
-from rasterstats import zonal_stats
-import geopandas as gpd
-import pandas as pd
-from joblib import Parallel, delayed
-def big_zonal_stats(vec_file, tif_file, stats, col_name, n_jobs, verbose = 10, timeout=10000):
-    # read in vector as geopandas, match CRS to raster
-    with rasterio.open(tif_file) as src:
-        raster_profile = src.profile
-    gdf = gpd.read_parquet(vec_file).to_crs(raster_profile['crs'])
-    # row_n is a global id, may refer to excluded polygons
-    # gdf["row_id"] = gdf.index + 1
-    # helper fn to run zonal_stats on a single slice (one row):
-    def get_stats(geom_slice, tif_file, stats):
-        stats = zonal_stats(geom_slice.geometry, tif_file, stats = stats)
-        stats[0]['row_n'] = geom_slice.row_n
-        # print(geom_slice.row_n)
-        return stats[0]
-    # iteration (could be a list comprehension?)
-    jobs = []
-    for r in gdf.itertuples():
-        jobs.append(delayed(get_stats)(r, tif_file, stats))
-    # And here we go
-    output = Parallel(n_jobs=n_jobs, timeout=timeout, verbose=verbose)(jobs)
-    # reshape output
-    df = (
-        pd.DataFrame(output)
-        .rename(columns={'mean': col_name})
-        .merge(gdf, how='right', on = 'row_n')
-        )
-    gdf = gpd.GeoDataFrame(df, geometry="geometry")
-    return gdf
-# -
-tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
-threads=1
-# +
-#import geopandas as gpd
-#test = gpd.read_parquet("pad-processed.parquet")
-#test.columns
-# +
-# %%time
-#
-tif_file = "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif"
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
-                     col_name = "human_impact", n_jobs=1, verbose=0)
-gpd.GeoDataFrame(df, geometry="geometry").to_parquet(vec_file)
-# -
-# %%time
-tif_file = '/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif'
-big_zonal_stats(vec_file, tif_file, stats = ['mean'], col_name = "richness", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
-                      col_name = "rsr", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
-                     col_name = "deforest_carbon", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_bii_100m_cog.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
-                     col_name = "biodiversity_intactness_loss", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_fii_100m_cog.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],
-                     col_name = "forest_integrity_loss", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_expansion_100m_cog.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],  col_name = "crop_expansion", n_jobs=threads, verbose=0)
-gpd.GeoDataFrame(df, geometry="geometry").to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/natcrop_reduction_100m_cog.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],  col_name = "crop_reduction", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/irrecoverable_c_total_2018.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],  col_name = "irrecoverable_carbon", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],  col_name = "manageable_carbon", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],  col_name = "all_species_rwr", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-# %%time
-tif_file = '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif'
-df = big_zonal_stats(vec_file, tif_file, stats = ['mean'],  col_name = "all_species_richness", n_jobs=threads, verbose=0).to_parquet(vec_file)
-# +
-columns = '''
-area_name,
-manager_name,
-manager_name_id,
-manager_type,
-manager_type_id,
-manager_group,
-designation_type,
-designation_type_id,
-public_access,
-category,
-iucn_code,
-iucn_category,
-gap_code,
-state,
-state_name,
-easement_holder,
-date_established,
-area_square_meters,
-geometry,
-all_species_richness,
-all_species_rwr,
-manageable_carbon,
-irrecoverable_carbon,
-crop_reduction,
-crop_expansion,
-deforest_carbon,
-richness,
-rsr,
-forest_integrity_loss,
-biodiversity_intactness_loss
-'''
-items = columns.split(',')
-# Remove empty strings and whitespace
-items = [item.strip() for item in items if item.strip()]
-items
-# -
-import ibis
-from ibis import _
-df = ibis.read_parquet(vec_file).select(items).to_parquet(vec_file)
-import ibis
-from ibis import _
-ibis.read_parquet("pad-AK-HI-stats.parquet")

app.py CHANGED Viewed

@@ -115,12 +115,12 @@ def area_plot(df, column):
         alt.Theta("percent_protected:Q").stack(True),
     )
     pie = ( base
-           .mark_arc(innerRadius= 40, outerRadius=70)
            .encode(alt.Color("color:N").scale(None).legend(None),
                    tooltip=['percent_protected', 'hectares_protected', column])
     )
     text = ( base
-            .mark_text(radius=60, size=12, color="white")
             .encode(text = column + ":N")
     )
     plot = pie # pie + text
@@ -292,12 +292,15 @@ bil_fill = {
     "fill-extrusion-opacity": 0.9,
 }
 # +
 st.set_page_config(layout="wide", page_title="Protected Areas Explorer", page_icon=":globe:")
 '''
-# US Protected Area Database Explorer
 '''

         alt.Theta("percent_protected:Q").stack(True),
     )
     pie = ( base
+           .mark_arc(innerRadius= 40, outerRadius=100)
            .encode(alt.Color("color:N").scale(None).legend(None),
                    tooltip=['percent_protected', 'hectares_protected', column])
     )
     text = ( base
+            .mark_text(radius=80, size=14, color="white")
             .encode(text = column + ":N")
     )
     plot = pie # pie + text
     "fill-extrusion-opacity": 0.9,
 }
+###########################################################################################################
 # +
 st.set_page_config(layout="wide", page_title="Protected Areas Explorer", page_icon=":globe:")
 '''
+# US Conservation Atlas Prototype
 '''

chatmap.py DELETED Viewed

@@ -1,58 +0,0 @@
-# This example does not use a langchain agent.
-# The langchain sql chain has knowledge of the database, but doesn't interact with it beyond initialization.
-# The output of the sql chain is parsed separately and passed to `duckdb.sql()` by streamlit
-import streamlit as st
-## Database connection
-from sqlalchemy import create_engine
-from langchain.sql_database import SQLDatabase
-db_uri = "duckdb:///pad.duckdb"
-engine = create_engine(db_uri, connect_args={'read_only': True})
-db = SQLDatabase(engine, view_support=True)
-import duckdb
-con = duckdb.connect("pad.duckdb", read_only=True)
-con.install_extension("spatial")
-con.load_extension("spatial")
-## ChatGPT Connection
-from langchain_openai import ChatOpenAI
-chatgpt_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])
-chatgpt4_llm = ChatOpenAI(model="gpt-4", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])
-# Requires ollama server running locally
-from langchain_community.llms import Ollama
-## # from langchain_community.llms import ChatOllama
-ollama_llm = Ollama(model="duckdb-nsql", temperature=0)
-models = {"ollama": ollama_llm, "chatgpt3.5": chatgpt_llm, "chatgpt4": chatgpt4_llm}
-with st.sidebar:
-    choice = st.radio("Select an LLM:", models)
-    llm = models[choice]
-## A SQL Chain
-from langchain.chains import create_sql_query_chain
-chain = create_sql_query_chain(llm, db)
-# agent does not work
-# agent = create_sql_agent(llm, db=db, verbose=True)
-if prompt := st.chat_input():
-    st.chat_message("user").write(prompt)
-    with st.chat_message("assistant"):
-        response = chain.invoke({"question": prompt})
-        st.write(response)
-        tbl = con.sql(response).to_df()
-        st.dataframe(tbl)
-# duckdb_sql fails but chatgpt3.5 succeeds with a query like:
-# use the st_area function and st_GeomFromWKB functions to compute the area of the Shape column in the fee table, and then use that to compute the total area under each GAP_Sts category
-# Federal agencies are identified as 'FED' in the Mang_Type column in the 'combined' data table. The Mang_Name column indicates the different agencies. Which federal agencies manage the greatest area of GAP_Sts 1 or 2 land?

pad-AK-HI-stats.parquet DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b1019bf85ac264c5ebe437ebfea942809bf9df6394837c54f315bc94b487c566
-size 151708809

pad-stats.parquet DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7ac89736cc42bb2390853137b2793b9e9f1d4d11cefc34f307a1280043243ca1
-size 894911787

pad.duckdb DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fd6f7206c3d03bdd00516f53e9fded8037bcbbf98ee3a8d9a90c1bc258cb47f7
-size 1084502016

preprocess.py CHANGED Viewed

@@ -16,8 +16,6 @@ public_access = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/
 state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
 iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
-# +
 fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
 parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 # gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
@@ -25,11 +23,8 @@ parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 # pad = con.read_geo(fgb)
 # pad = con.read_parquet(parquet)
 # Currently ibis doesn't detect that this is GeoParquet.  We need a SQL escape-hatch to cast the geometry
 con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
 pad = con.table("pad")
-# -
 # Get the CRS
@@ -52,10 +47,9 @@ focal_columns = ["row_n", "FeatClass", "Mang_Name",
 pad_parquet = (
     pad
     .mutate(row_n=ibis.row_number())
-    .filter((_.FeatClass.isin(["Easement", "Fee"])) | (
-           (_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
            )
-    .filter(_.geom.within(bounds))
     .select(focal_columns)
     .rename(geometry="geom")
 )

 state_name = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet").select(state = "Code", state_name = "Dom")
 iucn = con.read_parquet("https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet").select(iucn_code = "CODE", iucn_category = "DOM")
 fgb = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.fgb"
 parquet = "https://data.source.coop/cboettig/pad-us-3/pad-us3-combined.parquet"
 # gdb = "https://data.source.coop/cboettig/pad-us-3/PADUS3/PAD_US3_0.gdb" # original, all tables
 # pad = con.read_geo(fgb)
 # pad = con.read_parquet(parquet)
 # Currently ibis doesn't detect that this is GeoParquet.  We need a SQL escape-hatch to cast the geometry
 con.raw_sql(f"CREATE OR REPLACE VIEW pad AS SELECT *, st_geomfromwkb(geometry) as geom from read_parquet('{parquet}')")
 pad = con.table("pad")
 # Get the CRS
 pad_parquet = (
     pad
     .mutate(row_n=ibis.row_number())
+    .filter((_.FeatClass.isin(["Easement", "Fee"]))  # | ((_.FeatClass == "Proclamation") & (_.Mang_Name == "TRIB"))
            )
+#    .filter(_.geom.within(bounds))
     .select(focal_columns)
     .rename(geometry="geom")
 )

raster-vector-extract.py CHANGED Viewed

@@ -32,6 +32,9 @@ def extract(raster, vector, layer, output = None):
 vector = "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3_0Geopackage.gpkg"
 layer = "PADUS3_0Combined_DOD_TRIB_Fee_Designation_Easement"
 rasters = [
 "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif",
 '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif',
@@ -43,8 +46,9 @@ rasters = [
 '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif',
 '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif',
 '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif',
-#'/home/rstudio/source.coop/cboettig/mobi/species-richness-all/SpeciesRichness_All.tif',
-#'/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif',
 ]
 # extract(rasters[0], vector, layer) # just one

 vector = "/home/rstudio/source.coop/cboettig/pad-us-3/PADUS3_0Geopackage.gpkg"
 layer = "PADUS3_0Combined_DOD_TRIB_Fee_Designation_Easement"
+# +
+# Can possibly use remote addresses just fine with vsicurl
 rasters = [
 "/home/rstudio/boettiger-lab/us-pa-policy/hfp_2021_100m_v1-2_cog.tif",
 '/home/rstudio/source.coop/vizzuality/lg-land-carbon-data/deforest_carbon_100m_cog.tif',
 '/home/rstudio/source.coop/cboettig/carbon/cogs/manageable_c_total_2018.tif',
 '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_rwr_2022.tif',
 '/home/rstudio/minio/shared-biodiversity/redlist/cog/combined_sr_2022.tif',
+# CONUS coverage only
+'/home/rstudio/source.coop/cboettig/mobi/species-richness-all/mobi-species-richness.tif', # byte-encoded gdal_translate -ot Byte <in> <out>
+'/home/rstudio/source.coop/cboettig/mobi/range-size-rarity-all/RSR_All.tif',
 ]
 # extract(rasters[0], vector, layer) # just one