File size: 3,111 Bytes
0d4a0ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import geopandas as gpd
import shapely
from shapely.geometry import Point

from src.utils import PERIODS, CURRENT_YEAR, YEAR_FROM, UNIT_NAME_COL


def build_geodataframe(df: pd.DataFrame) -> gpd.GeoDataFrame:
    """Convert an occurrence table into a point GeoDataFrame.

    Args:
        df: Table with ``decimalLongitude`` (used as x) and
            ``decimalLatitude`` (used as y) columns.

    Returns:
        A copy of ``df`` with a point ``geometry`` column, labelled with
        CRS ``EPSG:4326``. The input frame is not modified.
    """
    # Vectorised point construction; avoids creating one shapely Point per
    # row in a Python-level loop.
    geometry = gpd.points_from_xy(df["decimalLongitude"], df["decimalLatitude"])
    return gpd.GeoDataFrame(df.copy(), geometry=geometry, crs="EPSG:4326")


def _prepare_boundary(boundary_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Return a cleaned EPSG:4326 copy of the boundary layer.

    Steps, in order: bring the layer to EPSG:4326 (a layer without a CRS is
    labelled as EPSG:4326 rather than transformed), drop null and empty
    geometries, repair invalid geometries with ``shapely.make_valid``, and
    finally drop rows whose geometry bounds contain NaN or infinity.
    """

    def _has_finite_bounds(geom):
        # Any failure while reading or checking the bounds counts as unusable.
        try:
            for value in geom.bounds:
                # value != value is True only for NaN.
                if value != value or abs(value) == float("inf"):
                    return False
        except Exception:
            return False
        return True

    if boundary_gdf.crs is not None:
        cleaned = boundary_gdf.to_crs("EPSG:4326")
    else:
        cleaned = boundary_gdf.set_crs("EPSG:4326", allow_override=True)

    # Remove rows with a missing geometry, then rows with an empty one.
    has_geometry = cleaned["geometry"].notna()
    cleaned = cleaned[has_geometry].copy()
    non_empty = ~cleaned["geometry"].is_empty
    cleaned = cleaned[non_empty].copy()

    # Fix self-intersections and similar validity problems.
    cleaned["geometry"] = shapely.make_valid(cleaned["geometry"])

    keep = cleaned["geometry"].apply(_has_finite_bounds)
    return cleaned[keep].copy()


def spatial_join(occurrences_gdf: gpd.GeoDataFrame, boundary_gdf: gpd.GeoDataFrame,
                 scope: str) -> gpd.GeoDataFrame:
    """Attach boundary attributes to every occurrence that intersects a unit.

    Args:
        occurrences_gdf: Occurrence geometries. A frame without a CRS is
            labelled as EPSG:4326; otherwise it is reprojected to EPSG:4326.
        boundary_gdf: Boundary layer; cleaned via ``_prepare_boundary`` first.
        scope: Key into ``UNIT_NAME_COL`` selecting the unit-name column
            (falls back to ``"GEN"`` for unknown scopes).

    Returns:
        The inner join of occurrences with the boundary attributes (predicate
        ``intersects``), without the ``index_right`` helper column. An
        occurrence that intersects several units appears once per unit.
        If no usable boundary geometry remains, an empty slice of
        ``occurrences_gdf`` is returned.
    """
    # Explicit None check, consistent with _prepare_boundary.
    if occurrences_gdf.crs is not None:
        occ = occurrences_gdf.to_crs("EPSG:4326")
    else:
        occ = occurrences_gdf.set_crs("EPSG:4326")
    bnd = _prepare_boundary(boundary_gdf)

    if bnd.empty:
        return occurrences_gdf.iloc[0:0]

    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    candidates = [unit_col, "AGS", "bundesland", "ISO_A2", "admin", "iso_a2"]
    # dict.fromkeys de-duplicates while preserving order: if the unit column
    # is itself one of the fixed attribute names, selecting it twice would
    # produce duplicate column labels and break a later groupby on it.
    attr_cols = [c for c in dict.fromkeys(candidates)
                 if c != "geometry" and c in bnd.columns]
    bnd_slim = bnd[attr_cols + ["geometry"]].copy()

    joined = gpd.sjoin(occ, bnd_slim, how="inner", predicate="intersects")
    return joined.drop(columns=["index_right"], errors="ignore")


def count_by_unit(joined_gdf: gpd.GeoDataFrame, unit_col: str) -> pd.Series:
    """Count rows per distinct value of ``unit_col``.

    Returns a Series named ``"count"`` indexed by the unit values, or an
    empty integer Series when ``unit_col`` is not a column of ``joined_gdf``.
    """
    if unit_col in joined_gdf.columns:
        rows_per_unit = joined_gdf.groupby(unit_col).size()
        return rows_per_unit.rename("count")
    return pd.Series(dtype=int)


def build_count_table(joined_gdf: gpd.GeoDataFrame, boundary_gdf: gpd.GeoDataFrame,
                      scope: str, year_from: int = YEAR_FROM,
                      year_to: int = CURRENT_YEAR) -> pd.DataFrame:
    """Build a unit-by-year table of occurrence counts.

    Args:
        joined_gdf: Joined occurrences; must have a ``year`` column and is
            grouped by the scope's unit column.
        boundary_gdf: Boundary layer supplying the full list of unit names.
        scope: Key into ``UNIT_NAME_COL`` (falls back to ``"GEN"``).
        year_from: First year to include.
        year_to: Last year to include (inclusive).

    Returns:
        One row per unit with at least one occurrence, one column per year
        (column names are the years as strings) plus a ``Gesamt`` total
        column, sorted by ``Gesamt`` in descending order, with the unit name
        as a regular column. An empty DataFrame is returned when the unit
        column is missing from ``boundary_gdf``.
    """
    unit_col = UNIT_NAME_COL.get(scope, "GEN")
    if unit_col not in boundary_gdf.columns:
        return pd.DataFrame()

    # Every non-null unit name from the boundary layer, named after the column.
    unit_index = pd.Index(boundary_gdf[unit_col].dropna().unique(), name=unit_col)

    per_year = {}
    for year in range(year_from, year_to + 1):
        in_year = joined_gdf.loc[joined_gdf["year"] == year]
        year_counts = count_by_unit(in_year, unit_col)
        # Units without occurrences in this year get an explicit 0.
        per_year[str(year)] = year_counts.reindex(unit_index).fillna(0).astype(int)

    table = pd.DataFrame(per_year, index=unit_index)
    table["Gesamt"] = table.sum(axis=1)

    with_occurrences = table.loc[table["Gesamt"] > 0]
    ranked = with_occurrences.sort_values("Gesamt", ascending=False)
    return ranked.reset_index()