Rahka committed on
Commit
ad87157
·
verified ·
1 Parent(s): 6c60c4e

update scripts with path

Browse files
utils/add_missing.py CHANGED
@@ -1,9 +1,12 @@
 
 
1
  import pandas as pd
2
- import os
3
 
4
- MISSING_ENRICHED = os.path.join("data", "missing_enriched.csv")
5
- CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
6
- CITIES_ENRICHED_MANUALLY = os.path.join("data", "cities_enriched_manually.csv")
 
 
7
 
8
 
9
  def merge_missing_coord(df, missing):
 
1
+ from pathlib import Path
2
+
3
  import pandas as pd
 
4
 
5
+ MISSING_ENRICHED = Path(__file__).parent.parent / "data" / "missing_enriched.csv"
6
+ CITIES_ENRICHED = Path(__file__).parent.parent / "data" / "cities_enriched_final.csv"
7
+ CITIES_ENRICHED_MANUALLY = (
8
+ Path(__file__).parent.parent / "data" / "cities_enriched_manually.csv"
9
+ )
10
 
11
 
12
  def merge_missing_coord(df, missing):
utils/add_missing_coordinates_app.py CHANGED
@@ -1,11 +1,16 @@
1
- import pandas as pd
2
  import os
3
- import streamlit as st
4
 
 
 
5
 
6
- CITIES_ENRICHED_FINAL = os.path.join("data", "cities_enriched_final.csv")
7
- CITIES_ENRICHED_MANUALLY = os.path.join("data", "cities_enriched_manually.csv")
8
- MISSING_ENRICHED = os.path.join("data", "missing_enriched.csv")
 
 
 
 
9
 
10
 
11
  def load_df(path: str) -> pd.DataFrame:
@@ -20,7 +25,7 @@ def get_missing(df: pd.DataFrame) -> pd.DataFrame:
20
 
21
  st.header("Impute missing coordinates")
22
  st.write(
23
- f"""This is an application to manually add and save missing
24
  coordinates to cities.csv file for the Musterdatenkatalog.
25
  Missing coordinates are cities with empty or missing Geometry column.
26
  The reason for this can be that the city administation unit has changed and,
 
 
1
  import os
2
+ from pathlib import Path
3
 
4
+ import pandas as pd
5
+ import streamlit as st
6
 
7
+ CITIES_ENRICHED_FINAL = (
8
+ Path(__file__).parent.parent / "data" / "cities_enriched_final.csv"
9
+ )
10
+ CITIES_ENRICHED_MANUALLY = (
11
+ Path(__file__).parent.parent / "data" / "cities_enriched_manually.csv"
12
+ )
13
+ MISSING_ENRICHED = Path(__file__).parent.parent / "data" / "missing_enriched.csv"
14
 
15
 
16
  def load_df(path: str) -> pd.DataFrame:
 
25
 
26
  st.header("Impute missing coordinates")
27
  st.write(
28
+ """This is an application to manually add and save missing
29
  coordinates to cities.csv file for the Musterdatenkatalog.
30
  Missing coordinates are cities with empty or missing Geometry column.
31
  The reason for this can be that the city administation unit has changed and,
utils/compare_old_coord.py CHANGED
@@ -1,6 +1,7 @@
1
- import os
2
- import pandas as pd
3
  import logging
 
 
 
4
 
5
  # define logger
6
  logging.basicConfig(
@@ -13,10 +14,12 @@ logging.basicConfig(
13
  )
14
 
15
 
16
- CITIES_ENRICHED_OLD = os.path.join("data", "cities_enriched_old.csv")
17
- CITIES_ENRICHED_NEW = os.path.join("data", "cities_enriched.csv")
18
- CITIES_ENRICHED_FINAL = os.path.join("data", "cities_enriched_final.csv")
19
- MISSING = os.path.join("data", "missing_final.csv")
 
 
20
 
21
 
22
  def load_data(path: str) -> pd.DataFrame:
 
 
 
1
  import logging
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
 
6
  # define logger
7
  logging.basicConfig(
 
14
  )
15
 
16
 
17
+ CITIES_ENRICHED_OLD = Path(__file__).parent.parent / "data" / "cities_enriched_old.csv"
18
+ CITIES_ENRICHED_NEW = Path(__file__).parent.parent / "data" / "cities_enriched.csv"
19
+ CITIES_ENRICHED_FINAL = (
20
+ Path(__file__).parent.parent / "data" / "cities_enriched_final.csv"
21
+ )
22
+ MISSING = Path(__file__).parent.parent / "data" / "missing_final.csv"
23
 
24
 
25
  def load_data(path: str) -> pd.DataFrame:
utils/get_coordinates.py CHANGED
@@ -1,11 +1,9 @@
1
- import pandas as pd
2
- import os
3
  import logging
4
- import numpy as np
5
- import ast
6
- import math
7
  from pathlib import Path
8
 
 
 
 
9
  # define logger
10
  logging.basicConfig(
11
  level=logging.INFO,
@@ -17,18 +15,25 @@ logging.basicConfig(
17
  )
18
 
19
 
20
- CITIES_DATA = os.path.join("data", "raw", "2024_08_20_cities_1310_v5.csv")
21
- DATA_ENRICHED = os.path.join("data", "cities_enriched.csv")
 
 
22
 
23
  # meta data for kreis codes ( variable in coordinates table)
24
- NAME_CODE_DATA = os.path.join("data", "raw", "name_kreiscode.csv")
25
- CODES_KOMMUNEN = os.path.join("data", "raw", "Deutschlandatlas.csv")
 
26
  # coordinates for Gemeinden
27
- COORDINATES = os.path.join("data", "raw", "coordinates_plz_kreiscode.csv")
28
- MISSING = os.path.join("data", "missing_first_parser.csv")
 
 
 
29
 
30
- if not os.path.exists(os.path.join("data", "preprocessed")):
31
- Path(os.path.join("data", "preprocessed")).mkdir(parents=True, exist_ok=True)
 
32
 
33
 
34
  def load_cities(path: str) -> pd.DataFrame:
@@ -60,7 +65,6 @@ def create_code_mapper(path: str) -> dict:
60
  def map_code(org_name, code_mapper):
61
  # Split the org_name string into parts
62
  parts = org_name.split()
63
- # print(parts, type(parts[0]))
64
  # Find a key in code_mapper that contains all parts of the split org_name
65
  for key in code_mapper.keys():
66
  # look first for whole name (cases like "Landkreis München" , "kreisfreie Stadt München")
@@ -166,7 +170,7 @@ def merge_coordinates(df: pd.DataFrame, coordinates: pd.DataFrame) -> pd.DataFra
166
  modified_row = row
167
  modified_rows.append(modified_row)
168
  df["Geometry"] = geometries
169
- # print(modified_rows)
170
  modified = pd.DataFrame(modified_rows)
171
  modified["Geometry"] = geometries
172
  return modified
@@ -179,7 +183,7 @@ def aggregate_coordinates(geo_element: str) -> list:
179
  else:
180
  actual_list = geo_element # ast.literal_eval(geo_element)
181
  processed_list = [list(map(float, coord.split(", "))) for coord in actual_list]
182
- # print(processed_list)
183
  if len(processed_list) > 1:
184
  coordinates = np.mean(np.array(processed_list), axis=0)
185
  else:
@@ -195,16 +199,17 @@ if __name__ == "__main__":
195
  missing = data[data["Code"].isnull()]
196
  logging.info(f"Missing values Gebietscode: {len(missing)}")
197
  data.to_csv(
198
- os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"),
199
  index=False,
200
  )
201
- # data = pd.read_csv(
202
- # os.path.join("data", "preprocessed", "cities_enriched_with_code.csv"))
203
  data["Code"] = data["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
204
  coordinates = load_coordinates(COORDINATES)
205
  data = merge_coordinates(data, coordinates)
206
  data.to_csv(
207
- os.path.join("data", "preprocessed", "cities_enriched_with_coordinates.csv"),
 
 
 
208
  index=False,
209
  )
210
  logging.info("Coordinates merged")
@@ -222,6 +227,5 @@ if __name__ == "__main__":
222
  logging.info(f"Missing geometry: {len(missing_geometry)}")
223
  missing_geometry.to_csv(MISSING, index=False)
224
 
225
- # data = pd.read_csv(os.path.join("data", "cities_enriched_manually.csv"))
226
  data["Geometry"] = data["Geometry"].apply(aggregate_coordinates)
227
  data.to_csv(DATA_ENRICHED, index=False)
 
 
 
1
  import logging
 
 
 
2
  from pathlib import Path
3
 
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
  # define logger
8
  logging.basicConfig(
9
  level=logging.INFO,
 
15
  )
16
 
17
 
18
+ CITIES_DATA = (
19
+ Path(__file__).parent.parent / "data" / "raw" / "2025_06_12_cities_1474_v6.csv"
20
+ )
21
+ DATA_ENRICHED = Path(__file__).parent.parent / "data" / "cities_enriched.csv"
22
 
23
  # meta data for kreis codes ( variable in coordinates table)
24
+ NAME_CODE_DATA = Path(__file__).parent.parent / "data" / "raw" / "name_kreiscode.csv"
25
+ CODES_KOMMUNEN = Path(__file__).parent.parent / "data" / "raw" / "Deutschlandatlas.csv"
26
+
27
  # coordinates for Gemeinden
28
+ COORDINATES = (
29
+ Path(__file__).parent.parent / "data" / "raw" / "coordinates_plz_kreiscode.csv"
30
+ )
31
+ MISSING = Path(__file__).parent.parent / "data" / "missing_first_parser.csv"
32
+
33
 
34
+ preprocessed_path = Path(__file__).parent.parent / "data" / "preprocessed"
35
+ if not preprocessed_path.exists():
36
+ preprocessed_path.mkdir(parents=True, exist_ok=True)
37
 
38
 
39
  def load_cities(path: str) -> pd.DataFrame:
 
65
  def map_code(org_name, code_mapper):
66
  # Split the org_name string into parts
67
  parts = org_name.split()
 
68
  # Find a key in code_mapper that contains all parts of the split org_name
69
  for key in code_mapper.keys():
70
  # look first for whole name (cases like "Landkreis München" , "kreisfreie Stadt München")
 
170
  modified_row = row
171
  modified_rows.append(modified_row)
172
  df["Geometry"] = geometries
173
+
174
  modified = pd.DataFrame(modified_rows)
175
  modified["Geometry"] = geometries
176
  return modified
 
183
  else:
184
  actual_list = geo_element # ast.literal_eval(geo_element)
185
  processed_list = [list(map(float, coord.split(", "))) for coord in actual_list]
186
+
187
  if len(processed_list) > 1:
188
  coordinates = np.mean(np.array(processed_list), axis=0)
189
  else:
 
199
  missing = data[data["Code"].isnull()]
200
  logging.info(f"Missing values Gebietscode: {len(missing)}")
201
  data.to_csv(
202
+ Path(__file__).parent.parent / "data" / "preprocessed" / "cities_enriched.csv",
203
  index=False,
204
  )
 
 
205
  data["Code"] = data["Code"].apply(lambda x: int(x) if pd.notna(x) else None)
206
  coordinates = load_coordinates(COORDINATES)
207
  data = merge_coordinates(data, coordinates)
208
  data.to_csv(
209
+ Path(__file__).parent.parent
210
+ / "data"
211
+ / "preprocessed"
212
+ / "cities_enriched_with_coordinates.csv",
213
  index=False,
214
  )
215
  logging.info("Coordinates merged")
 
227
  logging.info(f"Missing geometry: {len(missing_geometry)}")
228
  missing_geometry.to_csv(MISSING, index=False)
229
 
 
230
  data["Geometry"] = data["Geometry"].apply(aggregate_coordinates)
231
  data.to_csv(DATA_ENRICHED, index=False)
utils/process_data.py CHANGED
@@ -1,7 +1,7 @@
1
- import os
2
  import logging
3
- import pandas as pd
4
 
 
5
 
6
  # define logger
7
  logging.basicConfig(
@@ -13,11 +13,12 @@ logging.basicConfig(
13
  ],
14
  )
15
 
 
16
  # change these to paths if you want to generate the map_data.csv separately from the app
17
- DATA_RAW = os.path.join("2024-08-21_musterdatenkatalog.json")
18
- CITIES_ENRICHED = os.path.join("data", "cities_enriched_manually.csv")
19
 
20
- OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
21
 
22
 
23
  def load_data(path: str = DATA_RAW) -> pd.DataFrame:
 
 
1
  import logging
2
+ from pathlib import Path
3
 
4
+ import pandas as pd
5
 
6
  # define logger
7
  logging.basicConfig(
 
13
  ],
14
  )
15
 
16
+
17
  # change these to paths if you want to generate the map_data.csv separately from the app
18
+ DATA_RAW = Path(__file__).parent.parent / "2025-06-13_musterdatenkatalog.json"
19
+ CITIES_ENRICHED = Path(__file__).parent.parent / "data" / "cities_enriched_manually.csv"
20
 
21
+ OUTPUT = Path(__file__).parent.parent / "data" / "preprocessed" / "map_data.csv"
22
 
23
 
24
  def load_data(path: str = DATA_RAW) -> pd.DataFrame: