Spaces:
Sleeping
Sleeping
| import os | |
| import logging | |
| import pandas as pd | |
# Logging setup: INFO and above is written to process_data.log and to a
# default stream handler.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)

# Default file locations. Point these at other paths if you want to generate
# map_data.csv separately from the app.
DATA_RAW = "2025-06-13_musterdatenkatalog.json"
CITIES_ENRICHED = os.path.join("data", "cities_enriched_manually.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how often each ``ORG`` value occurs in a JSON file.

    Args:
        path: Location of the JSON file, read with ``pandas.read_json``.

    Returns:
        A frame with one row per distinct ``ORG`` value and exactly two
        columns: ``ORG`` and ``Count`` (the number of records carrying
        that value).
    """
    records = pd.read_json(path)
    org_counts = records["ORG"].value_counts()
    # Name the index and the values explicitly so the resulting column
    # labels do not depend on what value_counts() calls them.
    return org_counts.rename_axis("ORG").reset_index(name="Count")
def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach the city columns to ``data`` and back-fill missing geometries.

    ``data`` is left-joined with ``cities`` on ``data["ORG"] ==
    cities["Kommune"]``. For rows that end up without a ``Geometry``, a
    second attempt is made by matching ``ORG`` against ``cities["name"]``;
    the first matching city's ``Geometry`` is used.

    Args:
        data: Frame with an ``ORG`` column.
        cities: Frame with ``Kommune``, ``name`` and ``Geometry`` columns.

    Returns:
        The merged frame. Rows that match neither ``Kommune`` nor ``name``
        keep a missing ``Geometry``.
    """
    merged = data.merge(cities, left_on="ORG", right_on="Kommune", how="left")

    missing = merged["Geometry"].isna()
    missing_count = int(missing.sum())
    if missing_count == 0:
        return merged

    logging.warning(f"Missing {missing_count} geometries in the data.")

    # Build the name -> geometry lookup once instead of scanning ``cities``
    # for every unresolved row. Rows without a name can never match, and
    # keeping the first duplicate mirrors taking the first filtered value.
    named = cities.dropna(subset=["name"]).drop_duplicates(
        subset="name", keep="first"
    )
    geometry_by_name = dict(zip(named["name"], named["Geometry"]))

    for idx, org in merged.loc[missing, "ORG"].items():
        if org in geometry_by_name:
            merged.at[idx, "Geometry"] = geometry_by_name[org]
            logging.info("data found in citiesname.")
    return merged
def _split_geometry(geometry):
    """Turn a ``"[a b]"`` string into ``["a", "b"]``; pass other values through."""
    if isinstance(geometry, str):
        return geometry.strip("[]").split()
    # Non-string values (e.g. NaN for a row without a geometry) are only
    # reported and left unchanged.
    # NOTE(review): the value is logged twice; presumably one of them was
    # meant to be another field - confirm.
    logging.info(f"{geometry}, {geometry}")
    return geometry


def _coordinate(geometry, position: int):
    """Return ``geometry[position]`` as a float, or None if it is unavailable."""
    try:
        return float(geometry[position])
    except (TypeError, IndexError):
        # TypeError: geometry is NaN/None and cannot be indexed.
        # IndexError: geometry has fewer entries than expected.
        return None


def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Split the ``Geometry`` column into numeric ``lat`` and ``lon`` columns.

    String geometries of the form ``"[<lat> <lon>]"`` are replaced by a list
    of their whitespace-separated parts. ``lat`` is taken from the first part
    and ``lon`` from the second. Rows whose geometry is missing get a missing
    ``lat``/``lon`` instead of raising.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same frame, with ``Geometry`` parsed and ``lat``/``lon`` added.

    Raises:
        ValueError: If a geometry part cannot be converted to ``float``.
    """
    data["Geometry"] = data["Geometry"].apply(_split_geometry)
    data["lat"] = data["Geometry"].apply(_coordinate, args=(0,))
    data["lon"] = data["Geometry"].apply(_coordinate, args=(1,))
    return data
if __name__ == "__main__":
    # Pipeline: count records per ORG, attach city geometries, derive
    # lat/lon columns, then write the result to OUTPUT.
    extraction = load_data()
    logging.info("Extraction data loaded.")

    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    # Rows without a geometry are not filtered out here.
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")