Spaces:
Sleeping
Sleeping
update scripts with path
Browse files
- utils/add_missing.py +7 -4
- utils/add_missing_coordinates_app.py +11 -6
- utils/compare_old_coord.py +9 -6
- utils/get_coordinates.py +25 -21
- utils/process_data.py +6 -5
utils/add_missing.py
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
-
import os
|
| 3 |
|
| 4 |
-
MISSING_ENRICHED =
|
| 5 |
-
CITIES_ENRICHED =
|
| 6 |
-
CITIES_ENRICHED_MANUALLY =
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def merge_missing_coord(df, missing):
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
|
| 5 |
+
MISSING_ENRICHED = Path(__file__).parent.parent / "data" / "missing_enriched.csv"
|
| 6 |
+
CITIES_ENRICHED = Path(__file__).parent.parent / "data" / "cities_enriched_final.csv"
|
| 7 |
+
CITIES_ENRICHED_MANUALLY = (
|
| 8 |
+
Path(__file__).parent.parent / "data" / "cities_enriched_manually.csv"
|
| 9 |
+
)
|
| 10 |
|
| 11 |
|
| 12 |
def merge_missing_coord(df, missing):
|
utils/add_missing_coordinates_app.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
import os
|
| 3 |
-
|
| 4 |
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
CITIES_ENRICHED_FINAL =
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def load_df(path: str) -> pd.DataFrame:
|
|
@@ -20,7 +25,7 @@ def get_missing(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 20 |
|
| 21 |
st.header("Impute missing coordinates")
|
| 22 |
st.write(
|
| 23 |
-
|
| 24 |
coordinates to cities.csv file for the Musterdatenkatalog.
|
| 25 |
Missing coordinates are cities with empty or missing Geometry column.
|
| 26 |
The reason for this can be that the city administation unit has changed and,
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import streamlit as st
|
| 6 |
|
| 7 |
+
CITIES_ENRICHED_FINAL = (
|
| 8 |
+
Path(__file__).parent.parent / "data" / "cities_enriched_final.csv"
|
| 9 |
+
)
|
| 10 |
+
CITIES_ENRICHED_MANUALLY = (
|
| 11 |
+
Path(__file__).parent.parent / "data" / "cities_enriched_manually.csv"
|
| 12 |
+
)
|
| 13 |
+
MISSING_ENRICHED = Path(__file__).parent.parent / "data" / "missing_enriched.csv"
|
| 14 |
|
| 15 |
|
| 16 |
def load_df(path: str) -> pd.DataFrame:
|
|
|
|
| 25 |
|
| 26 |
st.header("Impute missing coordinates")
|
| 27 |
st.write(
|
| 28 |
+
"""This is an application to manually add and save missing
|
| 29 |
coordinates to cities.csv file for the Musterdatenkatalog.
|
| 30 |
Missing coordinates are cities with empty or missing Geometry column.
|
| 31 |
The reason for this can be that the city administation unit has changed and,
|
utils/compare_old_coord.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import pandas as pd
|
| 3 |
import logging
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
# define logger
|
| 6 |
logging.basicConfig(
|
|
@@ -13,10 +14,12 @@ logging.basicConfig(
|
|
| 13 |
)
|
| 14 |
|
| 15 |
|
| 16 |
-
CITIES_ENRICHED_OLD =
|
| 17 |
-
CITIES_ENRICHED_NEW =
|
| 18 |
-
CITIES_ENRICHED_FINAL =
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
def load_data(path: str) -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
|
| 6 |
# define logger
|
| 7 |
logging.basicConfig(
|
|
|
|
| 14 |
)
|
| 15 |
|
| 16 |
|
| 17 |
+
CITIES_ENRICHED_OLD = Path(__file__).parent.parent / "data" / "cities_enriched_old.csv"
|
| 18 |
+
CITIES_ENRICHED_NEW = Path(__file__).parent.parent / "data" / "cities_enriched.csv"
|
| 19 |
+
CITIES_ENRICHED_FINAL = (
|
| 20 |
+
Path(__file__).parent.parent / "data" / "cities_enriched_final.csv"
|
| 21 |
+
)
|
| 22 |
+
MISSING = Path(__file__).parent.parent / "data" / "missing_final.csv"
|
| 23 |
|
| 24 |
|
| 25 |
def load_data(path: str) -> pd.DataFrame:
|
utils/get_coordinates.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import os
|
| 3 |
import logging
|
| 4 |
-
import numpy as np
|
| 5 |
-
import ast
|
| 6 |
-
import math
|
| 7 |
from pathlib import Path
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
# define logger
|
| 10 |
logging.basicConfig(
|
| 11 |
level=logging.INFO,
|
|
@@ -17,18 +15,25 @@ logging.basicConfig(
|
|
| 17 |
)
|
| 18 |
|
| 19 |
|
| 20 |
-
CITIES_DATA =
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# meta data for kreis codes ( variable in coordinates table)
|
| 24 |
-
NAME_CODE_DATA =
|
| 25 |
-
CODES_KOMMUNEN =
|
|
|
|
| 26 |
# coordinates for Gemeinden
|
| 27 |
-
COORDINATES =
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
def load_cities(path: str) -> pd.DataFrame:
|
|
@@ -60,7 +65,6 @@ def create_code_mapper(path: str) -> dict:
|
|
| 60 |
def map_code(org_name, code_mapper):
|
| 61 |
# Split the org_name string into parts
|
| 62 |
parts = org_name.split()
|
| 63 |
-
# print(parts, type(parts[0]))
|
| 64 |
# Find a key in code_mapper that contains all parts of the split org_name
|
| 65 |
for key in code_mapper.keys():
|
| 66 |
# look first for whole name (cases like "Landkreis München" , "kreisfreie Stadt München")
|
|
@@ -166,7 +170,7 @@ def merge_coordinates(df: pd.DataFrame, coordinates: pd.DataFrame) -> pd.DataFra
|
|
| 166 |
modified_row = row
|
| 167 |
modified_rows.append(modified_row)
|
| 168 |
df["Geometry"] = geometries
|
| 169 |
-
|
| 170 |
modified = pd.DataFrame(modified_rows)
|
| 171 |
modified["Geometry"] = geometries
|
| 172 |
return modified
|
|
@@ -179,7 +183,7 @@ def aggregate_coordinates(geo_element: str) -> list:
|
|
| 179 |
else:
|
| 180 |
actual_list = geo_element # ast.literal_eval(geo_element)
|
| 181 |
processed_list = [list(map(float, coord.split(", "))) for coord in actual_list]
|
| 182 |
-
|
| 183 |
if len(processed_list) > 1:
|
| 184 |
coordinates = np.mean(np.array(processed_list), axis=0)
|
| 185 |
else:
|
|
@@ -195,16 +199,17 @@ if __name__ == "__main__":
|
|
| 195 |
missing = data[data["Code"].isnull()]
|
| 196 |
logging.info(f"Missing values Gebietscode: {len(missing)}")
|
| 197 |
data.to_csv(
|
| 198 |
-
|
| 199 |
index=False,
|
| 200 |
)
|
| 201 |
-
# data = pd.read_csv(
|
| 202 |
-
# os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"))
|
| 203 |
data["Code"] = data["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
|
| 204 |
coordinates = load_coordinates(COORDINATES)
|
| 205 |
data = merge_coordinates(data, coordinates)
|
| 206 |
data.to_csv(
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
| 208 |
index=False,
|
| 209 |
)
|
| 210 |
logging.info("Coordinates merged")
|
|
@@ -222,6 +227,5 @@ if __name__ == "__main__":
|
|
| 222 |
logging.info(f"Missing geometry: {len(missing_geometry)}")
|
| 223 |
missing_geometry.to_csv(MISSING, index=False)
|
| 224 |
|
| 225 |
-
# data = pd.read_csv(os.path.join("data", "cities_enriched_manually.csv"))
|
| 226 |
data["Geometry"] = data["Geometry"].apply(aggregate_coordinates)
|
| 227 |
data.to_csv(DATA_ENRICHED, index=False)
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
|
|
|
|
|
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
# define logger
|
| 8 |
logging.basicConfig(
|
| 9 |
level=logging.INFO,
|
|
|
|
| 15 |
)
|
| 16 |
|
| 17 |
|
| 18 |
+
CITIES_DATA = (
|
| 19 |
+
Path(__file__).parent.parent / "data" / "raw" / "2025_06_12_cities_1474_v6.csv"
|
| 20 |
+
)
|
| 21 |
+
DATA_ENRICHED = Path(__file__).parent.parent / "data" / "cities_enriched.csv"
|
| 22 |
|
| 23 |
# meta data for kreis codes ( variable in coordinates table)
|
| 24 |
+
NAME_CODE_DATA = Path(__file__).parent.parent / "data" / "raw" / "name_kreiscode.csv"
|
| 25 |
+
CODES_KOMMUNEN = Path(__file__).parent.parent / "data" / "raw" / "Deutschlandatlas.csv"
|
| 26 |
+
|
| 27 |
# coordinates for Gemeinden
|
| 28 |
+
COORDINATES = (
|
| 29 |
+
Path(__file__).parent.parent / "data" / "raw" / "coordinates_plz_kreiscode.csv"
|
| 30 |
+
)
|
| 31 |
+
MISSING = Path(__file__).parent.parent / "data" / "missing_first_parser.csv"
|
| 32 |
+
|
| 33 |
|
| 34 |
+
preprocessed_path = Path(__file__).parent.parent / "data" / "preprocessed"
|
| 35 |
+
if not preprocessed_path.exists():
|
| 36 |
+
preprocessed_path.mkdir(parents=True, exist_ok=True)
|
| 37 |
|
| 38 |
|
| 39 |
def load_cities(path: str) -> pd.DataFrame:
|
|
|
|
| 65 |
def map_code(org_name, code_mapper):
|
| 66 |
# Split the org_name string into parts
|
| 67 |
parts = org_name.split()
|
|
|
|
| 68 |
# Find a key in code_mapper that contains all parts of the split org_name
|
| 69 |
for key in code_mapper.keys():
|
| 70 |
# look first for whole name (cases like "Landkreis München" , "kreisfreie Stadt München")
|
|
|
|
| 170 |
modified_row = row
|
| 171 |
modified_rows.append(modified_row)
|
| 172 |
df["Geometry"] = geometries
|
| 173 |
+
|
| 174 |
modified = pd.DataFrame(modified_rows)
|
| 175 |
modified["Geometry"] = geometries
|
| 176 |
return modified
|
|
|
|
| 183 |
else:
|
| 184 |
actual_list = geo_element # ast.literal_eval(geo_element)
|
| 185 |
processed_list = [list(map(float, coord.split(", "))) for coord in actual_list]
|
| 186 |
+
|
| 187 |
if len(processed_list) > 1:
|
| 188 |
coordinates = np.mean(np.array(processed_list), axis=0)
|
| 189 |
else:
|
|
|
|
| 199 |
missing = data[data["Code"].isnull()]
|
| 200 |
logging.info(f"Missing values Gebietscode: {len(missing)}")
|
| 201 |
data.to_csv(
|
| 202 |
+
Path(__file__).parent.parent / "data" / "preprocessed" / "cities_enriched.csv",
|
| 203 |
index=False,
|
| 204 |
)
|
|
|
|
|
|
|
| 205 |
data["Code"] = data["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
|
| 206 |
coordinates = load_coordinates(COORDINATES)
|
| 207 |
data = merge_coordinates(data, coordinates)
|
| 208 |
data.to_csv(
|
| 209 |
+
Path(__file__).parent.parent
|
| 210 |
+
/ "data"
|
| 211 |
+
/ "preprocessed"
|
| 212 |
+
/ "cities_enriched_with_coordinates.csv",
|
| 213 |
index=False,
|
| 214 |
)
|
| 215 |
logging.info("Coordinates merged")
|
|
|
|
| 227 |
logging.info(f"Missing geometry: {len(missing_geometry)}")
|
| 228 |
missing_geometry.to_csv(MISSING, index=False)
|
| 229 |
|
|
|
|
| 230 |
data["Geometry"] = data["Geometry"].apply(aggregate_coordinates)
|
| 231 |
data.to_csv(DATA_ENRICHED, index=False)
|
utils/process_data.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
import os
|
| 2 |
import logging
|
| 3 |
-
|
| 4 |
|
|
|
|
| 5 |
|
| 6 |
# define logger
|
| 7 |
logging.basicConfig(
|
|
@@ -13,11 +13,12 @@ logging.basicConfig(
|
|
| 13 |
],
|
| 14 |
)
|
| 15 |
|
|
|
|
| 16 |
# change these to paths if you want to generate the map_data.csv separately from the app
|
| 17 |
-
DATA_RAW =
|
| 18 |
-
CITIES_ENRICHED =
|
| 19 |
|
| 20 |
-
OUTPUT =
|
| 21 |
|
| 22 |
|
| 23 |
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
+
from pathlib import Path
|
| 3 |
|
| 4 |
+
import pandas as pd
|
| 5 |
|
| 6 |
# define logger
|
| 7 |
logging.basicConfig(
|
|
|
|
| 13 |
],
|
| 14 |
)
|
| 15 |
|
| 16 |
+
|
| 17 |
# change these to paths if you want to generate the map_data.csv separately from the app
|
| 18 |
+
DATA_RAW = Path(__file__).parent.parent / "2025-06-13_musterdatenkatalog.json"
|
| 19 |
+
CITIES_ENRICHED = Path(__file__).parent.parent / "data" / "cities_enriched_manually.csv"
|
| 20 |
|
| 21 |
+
OUTPUT = Path(__file__).parent.parent / "data" / "preprocessed" / "map_data.csv"
|
| 22 |
|
| 23 |
|
| 24 |
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
|