"""Spatial helpers for occurrence records.

Builds point GeoDataFrames from longitude/latitude columns, joins them to
boundary polygons, and tabulates per-unit, per-year occurrence counts.
"""

import math

import geopandas as gpd
import pandas as pd
import shapely
from shapely.geometry import Point

from src.utils import PERIODS, CURRENT_YEAR, YEAR_FROM, UNIT_NAME_COL


def build_geodataframe(df: pd.DataFrame) -> gpd.GeoDataFrame:
    """Return a copy of *df* as a point GeoDataFrame in EPSG:4326.

    Point geometries are built from the ``decimalLongitude`` (x) and
    ``decimalLatitude`` (y) columns. The input frame is not modified.

    Raises:
        KeyError: If either coordinate column is missing from *df*.
    """
    # Vectorised construction instead of a Python-level Point(...) loop.
    geometry = gpd.points_from_xy(df["decimalLongitude"], df["decimalLatitude"])
    return gpd.GeoDataFrame(df.copy(), geometry=geometry, crs="EPSG:4326")


def _has_finite_bounds(geom) -> bool:
    """Return True when every value in ``geom.bounds`` is a finite number."""
    try:
        return all(math.isfinite(v) for v in geom.bounds)
    except Exception:
        # Deliberately broad: any geometry whose bounds cannot be evaluated
        # is treated as unusable and dropped rather than aborting the run.
        return False


def _prepare_boundary(boundary_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Reproject to WGS84 and drop invalid/degenerate geometries.

    Steps, in order:
      1. Assign EPSG:4326 when no CRS is set, otherwise reproject to it.
      2. Drop rows whose ``geometry`` is null or empty.
      3. Repair invalid geometries with ``shapely.make_valid``.
      4. Drop rows whose geometry bounds contain NaN or +/-Inf.

    Returns a new GeoDataFrame; *boundary_gdf* itself is left untouched.
    """
    # Ensure CRS is set and in WGS84
    if boundary_gdf.crs is None:
        gdf = boundary_gdf.set_crs("EPSG:4326", allow_override=True)
    else:
        gdf = boundary_gdf.to_crs("EPSG:4326")

    # Drop null / empty geometries
    geoms = gdf["geometry"]
    gdf = gdf[~(geoms.isna() | geoms.is_empty)].copy()
    if gdf.empty:
        # Nothing left to repair or filter.
        return gdf

    # Repair invalid geometries (fixes self-intersections etc.)
    gdf["geometry"] = shapely.make_valid(gdf["geometry"])

    # Drop geometries with non-finite coordinate bounds (NaN / Inf).
    # The explicit bool cast keeps the result a row mask regardless of the
    # dtype that ``apply`` happens to return.
    finite_mask = gdf["geometry"].apply(_has_finite_bounds).astype(bool)
    return gdf[finite_mask].copy()


def spatial_join(
    occurrences_gdf: gpd.GeoDataFrame,
    boundary_gdf: gpd.GeoDataFrame,
    scope: str,
) -> gpd.GeoDataFrame:
    """Attach boundary attributes to every occurrence that intersects a unit.

    Both inputs are brought into EPSG:4326 first. The boundary is cleaned
    with ``_prepare_boundary`` and reduced to the unit-name column for
    *scope* (looked up in ``UNIT_NAME_COL``, falling back to ``"GEN"``) plus
    a fixed set of attribute columns, where present, and the geometry.

    Returns:
        The inner join (``predicate="intersects"``) without the
        ``index_right`` helper column. If no usable boundary geometry
        remains, an empty slice of the reprojected occurrences is returned.
        An occurrence that intersects several units appears once per unit.
    """
    if occurrences_gdf.crs:
        occ = occurrences_gdf.to_crs("EPSG:4326")
    else:
        occ = occurrences_gdf.set_crs("EPSG:4326")

    bnd = _prepare_boundary(boundary_gdf)
    if bnd.empty:
        # Slice the reprojected frame so the empty result carries the same
        # CRS as the non-empty path.
        return occ.iloc[0:0]

    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    candidates = [unit_col, "AGS", "bundesland", "ISO_A2", "admin", "iso_a2"]
    # dict.fromkeys de-duplicates while preserving order: unit_col may equal
    # one of the fixed names, and selecting a column twice would yield
    # duplicate labels that break later group-bys on that column.
    join_cols = [c for c in dict.fromkeys(candidates) if c in bnd.columns]
    join_cols.append("geometry")
    bnd_slim = bnd[join_cols].copy()

    joined = gpd.sjoin(occ, bnd_slim, how="inner", predicate="intersects")
    return joined.drop(columns=["index_right"], errors="ignore")


def count_by_unit(joined_gdf: gpd.GeoDataFrame, unit_col: str) -> pd.Series:
    """Count rows of *joined_gdf* per distinct value of *unit_col*.

    Returns:
        A Series named ``"count"`` indexed by unit value. When *unit_col*
        is not a column of *joined_gdf*, an empty integer Series with the
        same name is returned.
    """
    if unit_col not in joined_gdf.columns:
        return pd.Series(dtype=int, name="count")
    return joined_gdf.groupby(unit_col).size().rename("count")


def build_count_table(
    joined_gdf: gpd.GeoDataFrame,
    boundary_gdf: gpd.GeoDataFrame,
    scope: str,
    year_from: int = YEAR_FROM,
    year_to: int = CURRENT_YEAR,
) -> pd.DataFrame:
    """Build a unit-by-year table of occurrence counts.

    Rows are the distinct non-null unit names found in *boundary_gdf* (the
    unit column is looked up in ``UNIT_NAME_COL`` for *scope*, defaulting to
    ``"GEN"``). There is one column per year in ``year_from..year_to``
    inclusive, labelled with the year as a string, plus a ``"Gesamt"``
    column holding the row sum.

    Returns:
        The table restricted to units with ``Gesamt > 0``, sorted by
        ``Gesamt`` descending, with the unit name as a regular column.
        An empty DataFrame if the unit column is missing from
        *boundary_gdf*.

    Raises:
        KeyError: If *joined_gdf* has no ``"year"`` column and the year
            range is non-empty.
    """
    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    if unit_col not in boundary_gdf.columns:
        return pd.DataFrame()

    units = pd.Index(boundary_gdf[unit_col].dropna().unique(), name=unit_col)

    # Collect all year columns first and build the frame once, rather than
    # inserting columns one by one into an existing DataFrame.
    per_year = {}
    for year in range(year_from, year_to + 1):
        year_rows = joined_gdf[joined_gdf["year"] == year]
        counts = count_by_unit(year_rows, unit_col)
        # Units without any record in this year get an explicit 0.
        per_year[str(year)] = counts.reindex(units).fillna(0).astype(int)

    table = pd.DataFrame(per_year, index=units)
    table["Gesamt"] = table.sum(axis=1)
    table = table[table["Gesamt"] > 0].sort_values("Gesamt", ascending=False)
    return table.reset_index()