Spaces:
Running
Running
File size: 3,111 Bytes
0d4a0ba | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | import pandas as pd
import geopandas as gpd
import shapely
from shapely.geometry import Point
from src.utils import PERIODS, CURRENT_YEAR, YEAR_FROM, UNIT_NAME_COL
def build_geodataframe(df: pd.DataFrame) -> gpd.GeoDataFrame:
    """Turn an occurrence table into a point GeoDataFrame in EPSG:4326.

    Args:
        df: Table holding ``decimalLongitude`` and ``decimalLatitude``
            columns. The input frame is not modified; a copy is used.

    Returns:
        A GeoDataFrame with the columns of ``df`` plus a point geometry
        built from (longitude, latitude), with CRS set to ``EPSG:4326``.

    Raises:
        KeyError: If either coordinate column is missing from ``df``.
    """
    # Build all points in one vectorised call instead of constructing one
    # shapely Point per row in a Python-level loop.
    geometry = gpd.points_from_xy(df["decimalLongitude"], df["decimalLatitude"])
    return gpd.GeoDataFrame(df.copy(), geometry=geometry, crs="EPSG:4326")
def _prepare_boundary(boundary_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Reproject to WGS84 and drop invalid/degenerate geometries.

    Steps, in order: bring the frame to EPSG:4326, drop rows whose geometry
    is null or empty, repair the remaining geometries with
    ``shapely.make_valid``, then drop rows whose bounds contain NaN or Inf.

    Args:
        boundary_gdf: Boundary polygons. If it carries no CRS, its
            coordinates are taken to already be EPSG:4326.

    Returns:
        A new GeoDataFrame in EPSG:4326 containing only usable rows. All
        columns are preserved, even when no rows survive.
    """
    import math  # local import: only needed for the finite-bounds check

    # Ensure CRS is set and in WGS84
    if boundary_gdf.crs is None:
        gdf = boundary_gdf.set_crs("EPSG:4326", allow_override=True)
    else:
        gdf = boundary_gdf.to_crs("EPSG:4326")

    # Work on the active geometry column rather than assuming its name.
    geom_col = gdf.geometry.name

    # Drop null / empty geometries
    usable = gdf[geom_col].notna() & ~gdf[geom_col].is_empty
    gdf = gdf[usable].copy()

    # Repair invalid geometries (fixes self-intersections etc.)
    gdf[geom_col] = shapely.make_valid(gdf[geom_col])

    def _finite_bounds(geom) -> bool:
        # Best effort: a geometry whose bounds cannot be read is treated as
        # unusable instead of aborting the whole preparation step.
        try:
            return all(math.isfinite(v) for v in geom.bounds)
        except Exception:
            return False

    # Build an explicit boolean mask. Series.apply on a zero-row geometry
    # column returns a geometry-dtype Series, which DataFrame indexing would
    # interpret as a (empty) column selection and strip every column.
    finite = pd.Series(
        [_finite_bounds(geom) for geom in gdf[geom_col]],
        index=gdf.index,
        dtype=bool,
    )
    return gdf[finite].copy()
def spatial_join(occurrences_gdf: gpd.GeoDataFrame, boundary_gdf: gpd.GeoDataFrame,
                 scope: str) -> gpd.GeoDataFrame:
    """Attach boundary attributes to every occurrence that intersects a unit.

    Args:
        occurrences_gdf: Occurrence geometries. If it carries no CRS, its
            coordinates are taken to already be EPSG:4326.
        boundary_gdf: Boundary geometries; cleaned via ``_prepare_boundary``
            before joining.
        scope: Key into ``UNIT_NAME_COL`` selecting the unit-name column;
            unknown scopes fall back to ``"GEN"``.

    Returns:
        The occurrences (in EPSG:4326) inner-joined with the boundary
        attribute columns that exist, without the ``index_right`` helper
        column. An empty frame with the occurrence columns is returned when
        no usable boundary remains.
    """
    if occurrences_gdf.crs:
        occ = occurrences_gdf.to_crs("EPSG:4326")
    else:
        occ = occurrences_gdf.set_crs("EPSG:4326")

    bnd = _prepare_boundary(boundary_gdf)
    if bnd.empty:
        # Slice the reprojected frame so the empty result carries the same
        # CRS as a non-empty result would.
        return occ.iloc[0:0]

    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    candidates = [unit_col, "AGS", "bundesland", "ISO_A2", "admin", "iso_a2"]
    # dict.fromkeys removes repeats while keeping order: when unit_col is
    # itself one of the fixed names, selecting it twice would create
    # duplicate column labels and break later group-bys on that column.
    attr_cols = [c for c in dict.fromkeys(candidates) if c in bnd.columns]
    bnd_slim = bnd[attr_cols + [bnd.geometry.name]].copy()

    # NOTE(review): with predicate="intersects", a point lying exactly on a
    # border shared by two units matches both and yields two rows - confirm
    # that this double count is acceptable downstream.
    joined = gpd.sjoin(occ, bnd_slim, how="inner", predicate="intersects")
    return joined.drop(columns=["index_right"], errors="ignore")
def count_by_unit(joined_gdf: gpd.GeoDataFrame, unit_col: str) -> pd.Series:
    """Count the rows of ``joined_gdf`` per distinct value of ``unit_col``.

    Args:
        joined_gdf: Frame to count rows in.
        unit_col: Name of the column to group by.

    Returns:
        A Series named ``"count"`` indexed by unit value. If ``unit_col`` is
        not a column of ``joined_gdf``, an empty integer Series with the same
        name is returned.
    """
    if unit_col not in joined_gdf.columns:
        # Keep the name identical to the regular result so callers can treat
        # both outcomes uniformly.
        return pd.Series(dtype=int, name="count")
    return joined_gdf.groupby(unit_col).size().rename("count")
def build_count_table(joined_gdf: gpd.GeoDataFrame, boundary_gdf: gpd.GeoDataFrame,
                      scope: str, year_from: int = YEAR_FROM,
                      year_to: int = CURRENT_YEAR) -> pd.DataFrame:
    """Build a unit-by-year table of occurrence counts.

    Args:
        joined_gdf: Joined occurrences; filtered on its ``year`` column and
            counted per unit via ``count_by_unit``.
        boundary_gdf: Source of the full list of unit names.
        scope: Key into ``UNIT_NAME_COL`` selecting the unit-name column;
            unknown scopes fall back to ``"GEN"``.
        year_from: First year to include.
        year_to: Last year to include (inclusive).

    Returns:
        One row per unit with a column per year (labelled with the year as a
        string) plus a ``"Gesamt"`` column holding the row sum. Units whose
        sum is zero are left out, and rows are ordered by ``"Gesamt"``
        descending. An empty DataFrame is returned when the unit-name column
        is absent from ``boundary_gdf``.
    """
    name_col = UNIT_NAME_COL.get(scope, "GEN")
    if name_col not in boundary_gdf.columns:
        return pd.DataFrame()

    # Start from every known unit so units without records still align.
    unit_names = boundary_gdf[name_col].dropna().unique()
    result = pd.DataFrame(index=unit_names)
    result.index.name = name_col

    for yr in range(year_from, year_to + 1):
        rows_this_year = joined_gdf[joined_gdf["year"] == yr]
        per_unit = count_by_unit(rows_this_year, name_col)
        # Units missing from this year's counts become 0 rather than NaN.
        aligned = per_unit.reindex(result.index)
        result[str(yr)] = aligned.fillna(0).astype(int)

    result["Gesamt"] = result.sum(axis=1)
    has_records = result["Gesamt"] > 0
    ranked = result[has_records].sort_values("Gesamt", ascending=False)
    return ranked.reset_index()
|