dmarr committed on
Commit
bad9d6d
·
1 Parent(s): 8ad04d8

Update for v0.7 RTE

Browse files
Files changed (1) hide show
  1. app.py +89 -41
app.py CHANGED
@@ -157,49 +157,97 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
157
  # print(mongo_db_data)
158
  mongo_df = pd.DataFrame(mongo_db_data)
159
 
160
- mongo_df = mongo_df[['identifier', 'version', 'updated_date', 'type', 'production_type', 'message_id', 'unit', 'status', 'values',
161
- 'publication_date', 'unavailability_type', 'fuel_type',
162
- 'affected_asset_or_unit_name',
163
- 'affected_asset_or_unit_installed_capacity', 'event_status']]
164
-
165
- # 1. Normalize “unit” into a DataFrame of its own
166
- unit_expanded = pd.json_normalize(mongo_df["unit"])
167
- # values_expanded = pd.json_normalize(mongo_df["values"])
168
-
169
- # (This produces a new DF with columns “eic_code” and “name”.)
170
-
171
- # 2. Concatenate those new columns back onto df, then drop the old “unit” column
172
- mongo_df_2 = pd.concat([mongo_df.drop(columns=["unit"]), unit_expanded], axis=1)
173
- # mongo_df_2 = pd.concat([mongo_df_2.drop(columns=["values"]), values_expanded], axis=1)
174
- # 1. Create a temporary column that is “the first dict” of each list (or {} if empty/NaN)
175
- mongo_df_2["values_first"] = mongo_df_2["values"].apply(
176
- lambda lst: lst[0] if isinstance(lst, list) and len(lst) > 0 else {}
177
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- # 2. Normalize that dict into separate columns
180
- values_expanded = pd.json_normalize(mongo_df_2["values_first"])
181
- # e.g. this produces columns like “start_date”, “end_date”, etc.
182
-
183
- # 3. Concatenate back and drop the originals
184
- mongo_df_2 = pd.concat(
185
- [
186
- mongo_df_2.drop(columns=["values", "values_first"]),
187
- values_expanded
188
- ],
189
- axis=1
190
- )
 
 
 
 
191
 
192
- mongo_df_2["fuel_type"] = mongo_df_2["fuel_type"].combine_first(mongo_df_2["production_type"])
193
- mongo_df_2["publication_date"] = mongo_df_2["publication_date"].combine_first(mongo_df_2["updated_date"])
194
- mongo_df_2["event_status"] = mongo_df_2["event_status"].combine_first(mongo_df_2["status"])
195
- mongo_df_2["affected_asset_or_unit_installed_capacity"] = mongo_df_2["affected_asset_or_unit_installed_capacity"].combine_first(mongo_df_2["installed_capacity"])
196
- mongo_df_2["affected_asset_or_unit_name"] = mongo_df_2["affected_asset_or_unit_name"].combine_first(mongo_df_2["name"])
197
- mongo_df_2["unavailability_type"] = (
198
- mongo_df_2["unavailability_type"]
199
- .combine_first(mongo_df_2.loc[:, "type"].iloc[:, 0])
200
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- mongo_df_2 = mongo_df_2.drop(columns=["production_type", "updated_date", "status", "installed_capacity", "name", "type", "eic_code"])
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  # Convert the date columns to datetime objects
205
  for col in ["publication_date", "start_date", "end_date"]:
@@ -210,7 +258,7 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
210
  # mongo_df_2[col] = mongo_df_2[col].dt.tz_convert("Europe/Paris")
211
 
212
  # mongo_df_2 = mongo_df_2.drop_duplicates(subset='identifier', keep='first')
213
- mongo_df_2['version'] = mongo_df_2['version'].astype(int)
214
  # Sort by identifier and version to ensure the latest version is at the top
215
  # Method 1: Use groupby + idxmax to pick the row with the largest version per identifier
216
  idx = mongo_df_2.groupby("identifier")["version"].idxmax()
 
157
  # print(mongo_db_data)
158
  mongo_df = pd.DataFrame(mongo_db_data)
159
 
160
+ # the five columns you care about
161
+ _optional = {
162
+ "updated_date",
163
+ "type",
164
+ "production_type",
165
+ "unit",
166
+ "status",
167
+ }
168
+
169
+ # see which of those actually exist
170
+ present = set(mongo_df.columns) & _optional
171
+
172
+ if _optional.issubset(mongo_df.columns):
173
+ # 0. first, pick the columns you know exist, plus the always‑present ones
174
+ cols = [
175
+ "identifier", "version", "message_id",
176
+ "values", "publication_date", "unavailability_type", "fuel_type",
177
+ "affected_asset_or_unit_name", "affected_asset_or_unit_installed_capacity",
178
+ "event_status"
179
+ ] + list(_optional)
180
+ mongo_df = mongo_df[cols]
181
+
182
+ # 1. normalize “unit”
183
+ unit_expanded = pd.json_normalize(mongo_df["unit"])
184
+ mongo_df_2 = pd.concat([mongo_df.drop(columns=["unit"]), unit_expanded], axis=1)
185
+
186
+ # 2. normalize the first element of “values”
187
+ mongo_df_2["values_first"] = (
188
+ mongo_df_2["values"]
189
+ .apply(lambda lst: lst[0] if isinstance(lst, list) and lst else {})
190
+ )
191
+ values_expanded = pd.json_normalize(mongo_df_2["values_first"])
192
+ mongo_df_2 = pd.concat(
193
+ [mongo_df_2.drop(columns=["values", "values_first"]), values_expanded],
194
+ axis=1
195
+ )
196
 
197
+ # 3. coalesce and drop old cols
198
+ mongo_df_2["fuel_type"] = mongo_df_2["fuel_type"].combine_first(mongo_df_2["production_type"])
199
+ mongo_df_2["publication_date"] = mongo_df_2["publication_date"].combine_first(mongo_df_2["updated_date"])
200
+ mongo_df_2["event_status"] = mongo_df_2["event_status"].combine_first(mongo_df_2["status"])
201
+ mongo_df_2["affected_asset_or_unit_installed_capacity"] = (
202
+ mongo_df_2["affected_asset_or_unit_installed_capacity"]
203
+ .combine_first(mongo_df_2["installed_capacity"])
204
+ )
205
+ mongo_df_2["affected_asset_or_unit_name"] = (
206
+ mongo_df_2["affected_asset_or_unit_name"]
207
+ .combine_first(mongo_df_2["name"])
208
+ )
209
+ mongo_df_2["unavailability_type"] = (
210
+ mongo_df_2["unavailability_type"]
211
+ .combine_first(mongo_df_2["type"].iloc[:, 0])
212
+ )
213
 
214
+ drop_cols = [
215
+ "production_type",
216
+ "updated_date",
217
+ "status",
218
+ "installed_capacity",
219
+ "name",
220
+ "type",
221
+ "eic_code",
222
+ ]
223
+ # only drop those that are actually there
224
+ drop_cols = [c for c in drop_cols if c in mongo_df_2.columns]
225
+ mongo_df_2 = mongo_df_2.drop(columns=drop_cols)
226
+
227
+ # now mongo_df_2 is your final
228
+ else:
229
+ # at least one of the required columns is missing:
230
+ # present contains the ones you did find
231
+ missing = _optional - present
232
+ print(f"Skipping normalize process because these columns are missing: {missing}")
233
+ mongo_df_2 = mongo_df.copy() # or however you want to proceed
234
+
235
+ mongo_df_2["values_first"] = mongo_df_2["values"].apply(
236
+ lambda lst: lst[0] if isinstance(lst, list) and len(lst) > 0 else {}
237
+ )
238
 
239
+ # 2. Normalize that dict into separate columns
240
+ values_expanded = pd.json_normalize(mongo_df_2["values_first"])
241
+ # e.g. this produces columns like “start_date”, “end_date”, etc.
242
+
243
+ # 3. Concatenate back and drop the originals
244
+ mongo_df_2 = pd.concat(
245
+ [
246
+ mongo_df_2.drop(columns=["values", "values_first", "start_date", "end_date"]),
247
+ values_expanded
248
+ ],
249
+ axis=1
250
+ )
251
 
252
  # Convert the date columns to datetime objects
253
  for col in ["publication_date", "start_date", "end_date"]:
 
258
  # mongo_df_2[col] = mongo_df_2[col].dt.tz_convert("Europe/Paris")
259
 
260
  # mongo_df_2 = mongo_df_2.drop_duplicates(subset='identifier', keep='first')
261
+ mongo_df_2['version'] = mongo_df_2['version'].astype(float)
262
  # Sort by identifier and version to ensure the latest version is at the top
263
  # Method 1: Use groupby + idxmax to pick the row with the largest version per identifier
264
  idx = mongo_df_2.groupby("identifier")["version"].idxmax()