Spaces:
Sleeping
Sleeping
| import logging | |
| from pathlib import Path | |
| import pandas as pd | |
# Configure the root logger at import time: records at INFO level and above
# are formatted as "<time> [<level>] <message>" and sent both to the file
# "process_data.log" (relative to the working directory) and to a stream
# handler (StreamHandler's default stream).
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
)
# Every CSV used by this script lives in the "data" directory located one
# level above the directory that contains this file.
_DATA_DIR = Path(__file__).parent.parent / "data"

# Input files.
CITIES_ENRICHED_OLD = _DATA_DIR / "cities_enriched_old.csv"
CITIES_ENRICHED_NEW = _DATA_DIR / "cities_enriched.csv"

# Output files.
CITIES_ENRICHED_FINAL = _DATA_DIR / "cities_enriched_final.csv"
MISSING = _DATA_DIR / "missing_final.csv"
def load_data(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame using pandas' default parsing options.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        The parsed table.
    """
    return pd.read_csv(path)
def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Report which ``Kommune`` values were added or removed between two tables.

    Args:
        old: Earlier table; must contain a ``Kommune`` column.
        new: Later table; must contain a ``Kommune`` column.

    Returns:
        A ``(new_cities, deleted_cities)`` pair of sets. ``new_cities`` holds
        the names present in *new* but not in *old*; ``deleted_cities`` holds
        the names present in *old* but not in *new*.
    """
    old_names = set(old["Kommune"].unique())
    new_names = set(new["Kommune"].unique())
    # Both differences must be taken against the complete name sets. Reusing
    # the "added" set as the right-hand side would report every old city as
    # deleted.
    return new_names - old_names, old_names - new_names
def enrich_new(old, new) -> pd.DataFrame:
    """Fill missing ``Geometry`` values in *new* from matching rows of *old*.

    A geometry counts as missing when it is the literal string ``"[]"`` or a
    null value (``None``/``NaN``, which is what ``pd.read_csv`` yields for an
    empty cell). For each such row, the first *old* row with the same
    ``Kommune`` supplies the geometry. If no name matches, the first *old*
    row with the same ``Code`` is used instead. Rows without any match are
    left unchanged.

    Args:
        old: Table providing replacement geometries; needs the columns
            ``Kommune``, ``Code`` and ``Geometry``.
        new: Table to enrich; needs the same three columns. It is modified
            in place.

    Returns:
        *new* itself, after the in-place updates.
    """
    geometry = new["Geometry"]
    # Vectorised mask: also catches NaN, which an ``is None`` test never does.
    missing = new[geometry.isna() | geometry.eq("[]")]

    for idx, name, code in zip(missing.index, missing["Kommune"], missing["Code"]):
        by_name = old.loc[old["Kommune"] == name, "Geometry"]
        if not by_name.empty:
            new.at[idx, "Geometry"] = by_name.iloc[0]
            continue

        by_code = old.loc[old["Code"] == code, "Geometry"]
        if not by_code.empty:
            # The code match is only a lookup key: the value found is a
            # geometry, so it belongs in the Geometry column, not in Code.
            new.at[idx, "Geometry"] = by_code.iloc[0]

    return new
def report_missing(new):
    """Log and return the rows of *new* that have no usable geometry.

    A geometry counts as missing when it is the literal string ``"[]"`` or a
    null value (``None``/``NaN``, which is what ``pd.read_csv`` yields for an
    empty cell).

    Args:
        new: Table with at least the columns ``Kommune`` and ``Geometry``.

    Returns:
        The subset of rows of *new* whose geometry is missing. The unique
        ``Kommune`` values of that subset are logged at INFO level.
    """
    geometry = new["Geometry"]
    # Vectorised mask: also catches NaN, which an ``is None`` test never does.
    missing = new[geometry.isna() | geometry.eq("[]")]
    logging.info("Finally missing cities: %s", missing["Kommune"].unique())
    return missing
def main() -> None:
    """Run the full pipeline: load, compare, enrich, then write the results.

    Reads the two input CSVs, logs the city-name differences between them,
    fills missing geometries in the newer table from the older one, writes
    the enriched table, and finally writes the rows still lacking a geometry.
    """
    previous = load_data(CITIES_ENRICHED_OLD)
    current = load_data(CITIES_ENRICHED_NEW)

    added, removed = compare_cities(previous, current)
    logging.info(f"New cities: {added}")
    logging.info(f"Deleted cities: {removed}")

    enriched = enrich_new(previous, current)
    enriched.to_csv(CITIES_ENRICHED_FINAL, index=False)

    still_missing = report_missing(enriched)
    still_missing.to_csv(MISSING, index=False)


if __name__ == "__main__":
    main()