Spaces:
Running
Running
| import pandas as pd | |
| import geopandas as gpd | |
| import shapely | |
| from shapely.geometry import Point | |
| from src.utils import PERIODS, CURRENT_YEAR, YEAR_FROM, UNIT_NAME_COL | |
def build_geodataframe(df: pd.DataFrame) -> gpd.GeoDataFrame:
    """Turn a table with decimalLongitude/decimalLatitude columns into a WGS84 point GeoDataFrame.

    The input frame is copied, so the caller's DataFrame is never mutated.
    """
    lons = df["decimalLongitude"]
    lats = df["decimalLatitude"]
    points = [Point(x, y) for x, y in zip(lons, lats)]
    return gpd.GeoDataFrame(df.copy(), geometry=points, crs="EPSG:4326")
def _prepare_boundary(boundary_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Reproject to WGS84 and drop invalid/degenerate geometries."""
    # Normalize CRS: assume WGS84 when none is set, otherwise reproject.
    if boundary_gdf.crs is None:
        gdf = boundary_gdf.set_crs("EPSG:4326", allow_override=True)
    else:
        gdf = boundary_gdf.to_crs("EPSG:4326")

    # Discard rows whose geometry is missing or empty before validation.
    gdf = gdf[~gdf["geometry"].isna()].copy()
    gdf = gdf[~gdf["geometry"].is_empty].copy()

    # Repair invalid geometries (self-intersections etc.) in one vectorized pass.
    gdf["geometry"] = shapely.make_valid(gdf["geometry"])

    def _has_finite_bounds(geom):
        # NaN fails the v == v test; +/-inf fails the abs() comparison.
        try:
            return all(v == v and abs(v) != float("inf") for v in geom.bounds)
        except Exception:
            return False

    # Keep only geometries whose bounding box is entirely finite.
    keep = gdf["geometry"].apply(_has_finite_bounds)
    return gdf[keep].copy()
def spatial_join(occurrences_gdf: gpd.GeoDataFrame, boundary_gdf: gpd.GeoDataFrame,
                 scope: str) -> gpd.GeoDataFrame:
    """Intersect occurrence points with boundary polygons, attaching admin-unit columns.

    Returns an empty frame with the occurrence schema when the boundary data
    contains no usable geometries.
    """
    if occurrences_gdf.crs:
        occ = occurrences_gdf.to_crs("EPSG:4326")
    else:
        occ = occurrences_gdf.set_crs("EPSG:4326")
    bnd = _prepare_boundary(boundary_gdf)
    if bnd.empty:
        # Nothing to join against — preserve the occurrence columns, zero rows.
        return occurrences_gdf.iloc[0:0]
    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    candidates = [unit_col, "AGS", "bundesland", "ISO_A2", "admin", "iso_a2"]
    keep = [c for c in candidates if c in bnd.columns]
    keep.append("geometry")
    joined = gpd.sjoin(occ, bnd[keep].copy(), how="inner", predicate="intersects")
    # sjoin's bookkeeping column is of no use downstream.
    return joined.drop(columns=["index_right"], errors="ignore")
def count_by_unit(joined_gdf: gpd.GeoDataFrame, unit_col: str) -> pd.Series:
    """Count joined occurrence rows per administrative unit.

    Returns an empty int Series named "count" when *unit_col* is absent.
    """
    if unit_col in joined_gdf.columns:
        return joined_gdf.groupby(unit_col).size().rename("count")
    return pd.Series(dtype=int)
def build_count_table(joined_gdf: gpd.GeoDataFrame, boundary_gdf: gpd.GeoDataFrame,
                      scope: str, year_from: int = YEAR_FROM,
                      year_to: int = CURRENT_YEAR) -> pd.DataFrame:
    """Build a per-unit, per-year occurrence count table.

    One row per administrative unit taken from *boundary_gdf*, one column per
    year in [year_from, year_to], plus a "Gesamt" (total) column. Units with a
    zero total are dropped and rows are sorted by total, descending.

    Returns an empty DataFrame when the scope's unit column is missing from
    the boundary data.
    """
    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    if unit_col not in boundary_gdf.columns:
        return pd.DataFrame()
    all_units = boundary_gdf[unit_col].dropna().unique()
    table = pd.DataFrame(index=all_units)
    table.index.name = unit_col
    # Guard: a join result without a "year" column (e.g. occurrence data that
    # lacked the field) previously raised KeyError; treat it as zero counts.
    has_year = "year" in joined_gdf.columns
    for year in range(year_from, year_to + 1):
        if has_year:
            counts = count_by_unit(joined_gdf[joined_gdf["year"] == year], unit_col)
        else:
            counts = pd.Series(dtype=int)
        # Align counts to the full unit index; units with no hits get 0.
        table[str(year)] = counts.reindex(table.index).fillna(0).astype(int)
    table["Gesamt"] = table.sum(axis=1)
    table = table[table["Gesamt"] > 0].sort_values("Gesamt", ascending=False)
    return table.reset_index()