dmarr committed on
Commit
8ad04d8
·
1 Parent(s): 11e7be0

Update nucpy with the rte fix

Browse files
Files changed (2) hide show
  1. app.py +203 -267
  2. app_all.py +280 -216
app.py CHANGED
@@ -24,48 +24,80 @@ def mongo_unavs_call(user_input_start_date, user_input_end_date, user_input_past
24
  passw = "tN9XpCCQM2MtYDme"
25
  host = "nucmonitordata.xxcwx9k.mongodb.net"
26
  client = pymongo.MongoClient(
27
- f"mongodb+srv://{user}:{passw}@{host}/?retryWrites=true&w=majority&connectTimeoutMS=5000"
28
  )
29
 
30
  db = client["data"]
31
  collection_past_unavs = db["unavs"]
32
- collection_unavs =db["unavs_update"]
33
 
34
  start_date = f"{user_input_start_date}T00:00:00"
35
  end_date = f"{user_input_end_date}T23:59:59"
36
  past_date = f"{user_input_past_date}T23:59:59"
37
-
38
- pipeline = [
 
 
 
 
 
 
 
 
39
  {
40
- "$unwind": "$results"
 
 
 
 
 
 
 
 
41
  },
 
 
42
  {
43
- "$unwind": "$results.generation_unavailabilities"
44
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  {
46
  "$match": {
47
- "results.generation_unavailabilities.production_type": "NUCLEAR",
48
- # "results.generation_unavailabilities.start_date": {"$lte": end_date},
49
- # "results.generation_unavailabilities.end_date": {"$gte": start_date},
50
- # "results.generation_unavailabilities.updated_date": {"$lte": end_date}
51
- "results.generation_unavailabilities.updated_date": {"$lte": past_date}
 
52
  }
53
  },
 
 
54
  {
55
- "$project": {
56
- "_id": 0,
57
- "generation_unavailabilities": "$results.generation_unavailabilities"
58
  }
59
  }
60
  ]
61
 
62
- result1 = list(collection_past_unavs.aggregate(pipeline))
63
- result2 = list(collection_unavs.aggregate(pipeline))
 
 
64
 
65
- # Merge the two lists of JSON results
66
- merged_result = result1 + result2
67
-
68
- return merged_result
69
 
70
  # --------------------------------------------------------------------------------------- #
71
 
@@ -125,281 +157,187 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
125
  # print(mongo_db_data)
126
  mongo_df = pd.DataFrame(mongo_db_data)
127
 
128
- # print(mongo_df)
129
- # Unpack the dictionaries into separate columns
130
- mongo_df_unpacked = pd.json_normalize(mongo_df['generation_unavailabilities'])
 
131
 
132
- # Concatenate the unpacked columns with the original DataFrame
133
- mongo_df_result = pd.concat([mongo_df, mongo_df_unpacked], axis=1)
 
134
 
135
- # Drop the original column
136
- mongo_df_result.drop(columns=['generation_unavailabilities'], inplace=True)
137
 
138
- mongo_df_result['start_date'] = mongo_df_result['values'].apply(lambda x: x[0]['start_date'])
139
- mongo_df_result['end_date'] = mongo_df_result['values'].apply(lambda x: x[0]['end_date'])
140
- mongo_df_result['available_capacity'] = mongo_df_result['values'].apply(lambda x: x[0]['available_capacity'])
141
- mongo_df_result['unavailable_capacity'] = mongo_df_result['values'].apply(lambda x: x[0]['unavailable_capacity'])
142
- # print(mongo_df_result)
143
- # print(mongo_df_result.columns)
144
- # Drop the original 'values' column
145
- mongo_df_result.drop('values', axis=1, inplace=True)
146
- mongo_df2 = mongo_df_result
147
- mongo_df2.rename(columns=lambda col: col.replace('unit.', ''), inplace=True)
148
-
149
- # --------------------- INITIAL DATA CLEANING FOR MONGO DATA ------------------------ #
150
 
151
- # Make the two dataframes have the same columns
152
- mongo_unavs = mongo_df2.copy()
153
- mongo_unavs.drop(columns="type", inplace=True)
 
 
 
 
 
 
 
 
 
154
 
155
- # merged_df['updated_date'] = merged_df['updated_date'].astype(str)
 
 
 
 
 
 
 
 
156
 
157
- # --------------------------- HERE IS THE CHANGE TO GET ONLY ACTIVE OR ACTIVE AND INACTIVE --------------------------- #
158
- # start_date_str = usr_start_date.strftime("%Y-%m-%d")
159
- start_date_str = str(usr_start_date)
160
- # end_date_str = usr_end_date.strftime("%Y-%m-%d")
161
- end_date_str = str(usr_end_date)
162
- current_datetime = datetime.datetime.now()
163
- past_date_str = str(past_date.strftime("%Y-%m-%dT%H:%M:%S%z"))
164
- current_datetime_str = current_datetime.strftime("%Y-%m-%d")
165
 
166
- # nuclear_unav = mongo_unavs.copy()[(mongo_unavs.copy()["production_type"] == "NUCLEAR") & (mongo_unavs.copy()["updated_date"] <= past_date_str)]
167
- # print(past_date_str)
168
- # Sort by updated date
169
- sorted_df = mongo_unavs.copy().sort_values(by='updated_date')
170
 
171
- sorted_df = sorted_df.copy().reset_index(drop=True)
 
 
172
 
173
- # cruas_2 = sorted_df.copy()[(sorted_df.copy()["name"] == "ST ALBAN 2") & (sorted_df.copy()["end_date"] >= start_date_str)]
174
- # print(cruas_2[['updated_date', 'end_date', 'available_capacity']])
 
 
 
 
175
 
176
- # Filter to get identifiers
177
- filtered_id_df = sorted_df.copy()
178
 
179
- # I commented this out
180
- filtered_id_df = filtered_id_df.drop_duplicates(subset='identifier', keep='last')
181
 
182
- # cruas_2 = filtered_id_df.copy()[(filtered_id_df.copy()["name"] == "ST ALBAN 2") & (filtered_id_df.copy()["end_date"] >= start_date_str)]
183
- # print(cruas_2[['updated_date', 'end_date', 'available_capacity']])
184
 
185
- filtered_id_df = filtered_id_df.copy().reset_index(drop=True)
 
 
186
 
187
- filtered_df = filtered_id_df[
188
- (filtered_id_df["production_type"] == "NUCLEAR") &
189
- # (mongo_unavs["updated_date"] <= past_date_str) &
190
- (filtered_id_df["status"] != "DISMISSED")]
191
 
192
- # if photo_date == True:
193
- # nuclear_unav = merged_df.copy()[(merged_df.copy()["production_type"] == "NUCLEAR") & (merged_df.copy()["updated_date"] <= past_date_str)]
194
- # photo_date = True
195
- # else: # need to add updated_date as a conditional to get the newest for that day
196
- # nuclear_unav = merged_df.copy()[(merged_df.copy()["production_type"] == "NUCLEAR") & (merged_df.copy()["updated_date"] <= end_date_str)]
197
 
198
- # --------------------------- HERE IS THE CHANGE TO GET ONLY ACTIVE OR ACTIVE AND INACTIVE --------------------------- #
199
 
200
- # --------------------- SECOND DATA CLEANING ------------------------ #
201
- # This filter should take only the most recent id and discard the rest
202
 
 
 
 
 
 
 
203
 
 
 
204
 
205
- # This filter should take all the dates with unavs that include days with unavs in the range of the start and end date
 
206
 
 
 
 
207
 
208
- # This filter might take out the most recent identifiers (Message ID) that change the dates of unavailability of a plant.
209
- # This means that the actual unavailability is something else
210
- # filtered_df = filtered_id_df.copy()[(filtered_id_df.copy()['start_date'] <= end_date_str) & (filtered_id_df.copy()['end_date'] >= start_date_str)]
211
-
212
- # Need to eventually do a filter that takes the most restrictive updated identifier instead of the most recent when there
213
- # is an overlap
214
-
215
- # Update available_capacity where the condition is True
216
-
217
- # Standardize datetime in dataframe
218
- filtered_df2 = filtered_df.copy() # This code will just standardize datetime stuff
219
- filtered_df2['creation_date'] = pd.to_datetime(filtered_df2['creation_date'], utc=True)
220
- filtered_df2['updated_date'] = pd.to_datetime(filtered_df2['updated_date'], utc=True)
221
- filtered_df2['start_date'] = pd.to_datetime(filtered_df2['start_date'], utc=True)
222
- filtered_df2['end_date'] = pd.to_datetime(filtered_df2['end_date'], utc=True)
223
-
224
- # Drop the duplicates
225
- filtered_df3 = filtered_df2.copy().drop_duplicates()
226
-
227
- # start_date_datetime = pd.to_datetime(start_date_str, utc=True) # Remove timezone info
228
- start_date_datetime = pd.Timestamp(start_date_str, tz='UTC')
229
- # end_date_datetime = pd.to_datetime(end_date_str, utc=True)
230
- end_date_datetime = pd.Timestamp(end_date_str, tz='UTC')
231
-
232
- # Turn df into dict for json processing
233
- filtered_unavs = filtered_df3.copy().to_dict(orient='records')
234
-
235
- results = {}
236
-
237
- for unav in filtered_unavs:
238
- plant_name = unav['name']
239
- if plant_name in results:
240
- # If the key is already in the dictionary, append unavailability to the list
241
- results[plant_name].append({'status': unav['status'],
242
- 'id': unav['message_id'],
243
- 'creation_date': unav['creation_date'],
244
- 'updated_date': unav['updated_date'],
245
- 'start_date': unav['start_date'],
246
- 'end_date': unav['end_date'],
247
- 'available_capacity': unav['available_capacity']})
248
- else:
249
- # if the key of the plant is not there yet, create a new element of the dictionary
250
-
251
- # Get message_id instead of identifier, easier to identify stuff with it
252
- results[plant_name] = [{'status': unav['status'],
253
- 'id': unav['message_id'],
254
- 'creation_date': unav['creation_date'],
255
- 'updated_date': unav['updated_date'],
256
- 'start_date': unav['start_date'],
257
- 'end_date': unav['end_date'],
258
- 'available_capacity': unav['available_capacity']}]
259
-
260
- # Custom encoder to handle datetime objects
261
- class DateTimeEncoder(json.JSONEncoder):
262
- def default(self, o):
263
- if isinstance(o, datetime.datetime):
264
- return o.isoformat()
265
- return super().default(o)
266
-
267
- results_holder = results
268
-
269
- # Create new dict with each plant only having start_date less than user_end_date and an end_date greater than user_start_date
270
- # should just be doing the same as above in the df for filtering only dates that inclued the start and end date
271
- start_date = start_date_datetime.date()
272
- end_date = end_date_datetime.date()
273
- results_filtered = results_holder
274
- for key, value in results_filtered.items():
275
- filtered_values = []
276
- for item in value:
277
- item_start_date = item['start_date'].date()
278
- item_end_date = item['end_date'].date()
279
- identifier = item['id']
280
- if item_start_date < end_date and item_end_date > start_date and identifier not in filtered_values:
281
- filtered_values.append(item)
282
- results_filtered[key] = filtered_values
283
-
284
-
285
- sorted_results = results_filtered
286
- # --------------------- SECOND DATA CLEANING ------------------------ #
287
-
288
- # --------------------------- HERE IS THE FINAL PROCESS --------------------------- #
289
-
290
- for key, value in sorted_results.items():
291
- sorted_results[key] = sorted(value, key=lambda x: x['updated_date'])
292
-
293
- results_sorted = sorted_results
294
-
295
- dates_of_interest = [start_date] # We are creating a list of dates ranging from user specified start and end dates
296
- date_plus_one = start_date
297
-
298
- while date_plus_one < end_date:
299
- date_plus_one = date_plus_one + datetime.timedelta(days=1)
300
- dates_of_interest.append(date_plus_one)
301
-
302
- # This is to standardize the datetimes. Without this, the datetime calculations for each power plant will not work
303
- # This is just getting the plant metadata and giving it updated_date????? With an amount of items based on the length of the
304
- # date range????
305
- results_plants = {plant_name: {date: {"available_capacity": power, "updated_date": pd.to_datetime("1970-01-01", utc=True)} for date in dates_of_interest}
306
- for plant_name, power in plants_metadata.items()}
307
-
308
- # print(results_sorted)
309
- for plant, unavailabilities in results_sorted.items():
310
- # Get the full power of a given plant according to the sorted results
311
- original_power = plants_metadata[plant]
312
- # Get all the unavailabilities scheduled for the plant.
313
- # This is actually apparently just getting the metadata though???
314
- results_current_plant = results_plants[plant]
315
 
316
- for unavailability in unavailabilities:
317
- # For each unavailability, the resulting power, start and end datetime are collected. Need to collect updated_date
318
- power_unavailability = unavailability["available_capacity"]
319
- updated_date_unav = unavailability["updated_date"]
320
- # The date comes as a string
321
- start_datetime_unav = unavailability["start_date"]
322
- end_datetime_unav = unavailability["end_date"]
323
- start_date_unav = start_datetime_unav.date() # Extract date part
324
- end_date_unav = end_datetime_unav.date() # Extract date part
325
-
326
- # For the current unavailability, we want to find which days it affects
327
- for day in dates_of_interest:
328
-
329
- start_hour = start_datetime_unav.hour
330
- start_minute = start_datetime_unav.minute
331
- end_hour = end_datetime_unav.hour
332
- end_minute = end_datetime_unav.minute
333
-
334
- if start_date_unav <= day <= end_date_unav:
335
- # Check if the day is already updated with a later update_date
336
-
337
- if day in results_current_plant and updated_date_unav <= results_current_plant[day]["updated_date"]:
338
- # Here is likely where we can do the filter for worst case scenario
339
- # --------------------------- !!!!!!CREATE NEW FILTER THAT KEEPS ONLY MOST RESTRICTIVE OVERLAP!!!!!! --------------------------- #
340
- # if power_unavailability < results_current_plant[day]['available_capacity']:
341
-
342
- # # Calculate the % of the day that the plant is under maintenance
343
- # if start_date_unav == day and day == end_date_unav:
344
- # # The unavailability starts and ends on the same day
345
- # percentage_of_day = (end_hour * 60 + end_minute - start_hour * 60 - start_minute) / (24 * 60)
346
- # elif start_date_unav == day:
347
- # # The unavailability starts on the current day but ends on a later day
348
- # percentage_of_day = (24 * 60 - (start_hour * 60 + start_minute)) / (24 * 60)
349
- # elif day == end_date_unav:
350
- # # The unavailability starts on a previous day and ends on the current day
351
- # percentage_of_day = (end_hour * 60 + end_minute) / (24 * 60)
352
- # else:
353
- # # The unavailability covers the entire day
354
- # percentage_of_day = 1
355
-
356
- # --------------------------- !!!!!!CREATE NEW FILTER THAT KEEPS ONLY MOST RESTRICTIVE OVERLAP!!!!!! --------------------------- #
357
- # else:
358
-
359
- continue # Skip to the next loop if there is already information for a later update_date
360
-
361
- # Calculate the % of the day that the plant is under maintenance
362
- if start_date_unav == day and day == end_date_unav:
363
- # The unavailability starts and ends on the same day
364
- percentage_of_day = (end_hour * 60 + end_minute - start_hour * 60 - start_minute) / (24 * 60)
365
- elif start_date_unav == day:
366
- # The unavailability starts on the current day but ends on a later day
367
- percentage_of_day = (24 * 60 - (start_hour * 60 + start_minute)) / (24 * 60)
368
- elif day == end_date_unav:
369
- # The unavailability starts on a previous day and ends on the current day
370
- percentage_of_day = (end_hour * 60 + end_minute) / (24 * 60)
371
- else:
372
- # The unavailability covers the entire day
373
- percentage_of_day = 1
374
-
375
- # The average power of the day is calculated
376
- power_of_day = percentage_of_day * power_unavailability + (1 - percentage_of_day) * original_power
377
-
378
- # Update the available_capacity for the day only if it's not already updated with a later update_date
379
- if (day not in results_current_plant):
380
- results_current_plant[day] = {"available_capacity": power_of_day, "updated_date": updated_date_unav}
381
-
382
- elif (day in results_current_plant) and (updated_date_unav > results_current_plant[day]["updated_date"]) and (power_of_day < results_current_plant[day]['available_capacity']):
383
- results_current_plant[day] = {"available_capacity": power_of_day, "updated_date": updated_date_unav}
384
-
385
- else:
386
- continue
387
 
 
388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
  output_results = {}
391
  for plant, plant_data in results_plants.items():
392
  available_capacity_per_day = {str(date): data["available_capacity"] for date, data in plant_data.items()}
393
  output_results[plant] = available_capacity_per_day
394
 
395
- # print(output_results)
396
  add_total(output_results)
397
- # print("Done")
398
- # print(results_plants)
399
- # Convert datetime key to string to store in mongodb
400
  output_results = {plant: {str(date): power for date, power in plant_data.items()} for plant, plant_data in output_results.items()}
401
  output_results = pd.DataFrame(output_results)
402
- print(output_results)
403
 
404
  # -------------------------------------------------
405
  # Calculate the average of each column excluding the last row
@@ -407,7 +345,7 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
407
 
408
  # Replace the last row with the calculated averages
409
  output_results.iloc[-1, :] = averages
410
-
411
  output_results = output_results.to_dict()
412
 
413
  def turn_total_row_to_avg(data):
@@ -418,8 +356,6 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
418
 
419
  turn_total_row_to_avg(output_results)
420
 
421
- # print(output_results)
422
-
423
  json_data = json.dumps(output_results)
424
  # print(json_data)
425
  return json_data
@@ -537,10 +473,10 @@ def run_app():
537
  df_photo_date_2.index = pd.to_datetime(df_photo_date_2.index)
538
 
539
  # Calculate monthly averages with date in yyyy-mm format
540
- monthly_average_nucmonitor = df_nucmonitor_2.resample('M').mean()
541
  monthly_average_nucmonitor.index = monthly_average_nucmonitor.index.strftime('%Y-%m')
542
 
543
- monthly_average_photo_date = df_photo_date_2.resample('M').mean()
544
  monthly_average_photo_date.index = monthly_average_photo_date.index.strftime('%Y-%m')
545
 
546
 
 
24
  passw = "tN9XpCCQM2MtYDme"
25
  host = "nucmonitordata.xxcwx9k.mongodb.net"
26
  client = pymongo.MongoClient(
27
+ f"mongodb+srv://{user}:{passw}@{host}/?retryWrites=true&w=majority&connectTimeoutMS=100000"
28
  )
29
 
30
  db = client["data"]
31
  collection_past_unavs = db["unavs"]
32
+ collection_unavs = db["unavs_update"]
33
 
34
  start_date = f"{user_input_start_date}T00:00:00"
35
  end_date = f"{user_input_end_date}T23:59:59"
36
  past_date = f"{user_input_past_date}T23:59:59"
37
+
38
+
39
+ pipeline_v4 = [
40
+ # 1) Expand each results element into its own doc
41
+ { "$unwind": "$results" },
42
+
43
+ # 2) Expand each generation_unavailabilities element
44
+ { "$unwind": "$results.generation_unavailabilities" },
45
+
46
+ # 3) Keep only those that match your fuel_type + date criteria
47
  {
48
+ "$match": {
49
+ "results.generation_unavailabilities.production_type": "NUCLEAR",
50
+ "results.generation_unavailabilities.updated_date": { "$lte": past_date },
51
+ "results.generation_unavailabilities.start_date": { "$lte": end_date },
52
+ "results.generation_unavailabilities.start_date": { "$gte": start_date },
53
+ "results.generation_unavailabilities.end_date": { "$gte": start_date },
54
+ "results.generation_unavailabilities.end_date": { "$lte": end_date }
55
+
56
+ }
57
  },
58
+
59
+ # 4) Replace the entire document with just that sub-doc
60
  {
61
+ "$replaceRoot": {
62
+ "newRoot": "$results.generation_unavailabilities"
63
+ }
64
+ }
65
+ ]
66
+
67
+
68
+ pipeline_v6 = [
69
+ # 1) Expand each results element into its own doc
70
+ { "$unwind": "$results" },
71
+
72
+ # 2) Expand each generation_unavailabilities element
73
+ { "$unwind": "$results.generation_unavailabilities" },
74
+
75
+ # 3) Keep only those that match your fuel_type + date criteria
76
  {
77
  "$match": {
78
+ "results.generation_unavailabilities.fuel_type": "NUCLEAR",
79
+ "results.generation_unavailabilities.publication_date": { "$lte": past_date },
80
+ "results.generation_unavailabilities.start_date": { "$lte": end_date },
81
+ # "results.generation_unavailabilities.start_date": { "$gte": start_date },
82
+ "results.generation_unavailabilities.end_date": { "$gte": start_date },
83
+ # "results.generation_unavailabilities.end_date": { "$lte": end_date }
84
  }
85
  },
86
+
87
+ # 4) Replace the entire document with just that sub-doc
88
  {
89
+ "$replaceRoot": {
90
+ "newRoot": "$results.generation_unavailabilities"
 
91
  }
92
  }
93
  ]
94
 
95
+ result1 = list(collection_past_unavs.aggregate(pipeline_v4))
96
+ result2 = list(collection_unavs.aggregate(pipeline_v4))
97
+ result_v6 = list(collection_unavs.aggregate(pipeline_v6))
98
+ merge_results = result1 + result2 + result_v6
99
 
100
+ return merge_results
 
 
 
101
 
102
  # --------------------------------------------------------------------------------------- #
103
 
 
157
  # print(mongo_db_data)
158
  mongo_df = pd.DataFrame(mongo_db_data)
159
 
160
+ mongo_df = mongo_df[['identifier', 'version', 'updated_date', 'type', 'production_type', 'message_id', 'unit', 'status', 'values',
161
+ 'publication_date', 'unavailability_type', 'fuel_type',
162
+ 'affected_asset_or_unit_name',
163
+ 'affected_asset_or_unit_installed_capacity', 'event_status']]
164
 
165
+ # 1. Normalize “unit” into a DataFrame of its own
166
+ unit_expanded = pd.json_normalize(mongo_df["unit"])
167
+ # values_expanded = pd.json_normalize(mongo_df["values"])
168
 
169
+ # (This produces a new DF with columns “eic_code” and “name”.)
 
170
 
171
+ # 2. Concatenate those new columns back onto df, then drop the old “unit” column
172
+ mongo_df_2 = pd.concat([mongo_df.drop(columns=["unit"]), unit_expanded], axis=1)
173
+ # mongo_df_2 = pd.concat([mongo_df_2.drop(columns=["values"]), values_expanded], axis=1)
174
+ # 1. Create a temporary column that is “the first dict” of each list (or {} if empty/NaN)
175
+ mongo_df_2["values_first"] = mongo_df_2["values"].apply(
176
+ lambda lst: lst[0] if isinstance(lst, list) and len(lst) > 0 else {}
177
+ )
 
 
 
 
 
178
 
179
+ # 2. Normalize that dict into separate columns
180
+ values_expanded = pd.json_normalize(mongo_df_2["values_first"])
181
+ # e.g. this produces columns like “start_date”, “end_date”, etc.
182
+
183
+ # 3. Concatenate back and drop the originals
184
+ mongo_df_2 = pd.concat(
185
+ [
186
+ mongo_df_2.drop(columns=["values", "values_first"]),
187
+ values_expanded
188
+ ],
189
+ axis=1
190
+ )
191
 
192
+ mongo_df_2["fuel_type"] = mongo_df_2["fuel_type"].combine_first(mongo_df_2["production_type"])
193
+ mongo_df_2["publication_date"] = mongo_df_2["publication_date"].combine_first(mongo_df_2["updated_date"])
194
+ mongo_df_2["event_status"] = mongo_df_2["event_status"].combine_first(mongo_df_2["status"])
195
+ mongo_df_2["affected_asset_or_unit_installed_capacity"] = mongo_df_2["affected_asset_or_unit_installed_capacity"].combine_first(mongo_df_2["installed_capacity"])
196
+ mongo_df_2["affected_asset_or_unit_name"] = mongo_df_2["affected_asset_or_unit_name"].combine_first(mongo_df_2["name"])
197
+ mongo_df_2["unavailability_type"] = (
198
+ mongo_df_2["unavailability_type"]
199
+ .combine_first(mongo_df_2.loc[:, "type"].iloc[:, 0])
200
+ )
201
 
202
+ mongo_df_2 = mongo_df_2.drop(columns=["production_type", "updated_date", "status", "installed_capacity", "name", "type", "eic_code"])
 
 
 
 
 
 
 
203
 
204
+ # Convert the date columns to datetime objects
205
+ for col in ["publication_date", "start_date", "end_date"]:
206
+ mongo_df_2[col] = pd.to_datetime(mongo_df_2[col], utc=True)
 
207
 
208
+ # # Now convert everything to French time (CET/CEST):
209
+ # for col in ["publication_date", "start_date", "end_date"]:
210
+ # mongo_df_2[col] = mongo_df_2[col].dt.tz_convert("Europe/Paris")
211
 
212
+ # mongo_df_2 = mongo_df_2.drop_duplicates(subset='identifier', keep='first')
213
+ mongo_df_2['version'] = mongo_df_2['version'].astype(int)
214
+ # Sort by identifier and version to ensure the latest version is at the top
215
+ # Method 1: Use groupby + idxmax to pick the row with the largest version per identifier
216
+ idx = mongo_df_2.groupby("identifier")["version"].idxmax()
217
+ mongo_df_2 = mongo_df_2.loc[idx].reset_index(drop=True)
218
 
219
+ mongo_df_2 = mongo_df_2[mongo_df_2['event_status'] != 'DISMISSED']
 
220
 
 
 
221
 
222
+ # Create the final dataframe
223
+ final_df = pd.DataFrame()
224
 
225
+ # Create the date column, with date range from start_date to end_date in daily granularity
226
+ final_df['Date'] = pd.date_range(start=usr_start_date, end=usr_end_date, freq='D')
227
+ final_df['Date'] = [ts.strftime("%Y-%m-%d") for ts in final_df['Date']]
228
 
229
+ # For each plant create a new column with the plant name
230
+ for plant, capacity in plants_metadata.items():
231
+ # Create a new column for each plant
232
+ final_df[plant] = np.nan # Initialize with zeros
233
 
234
+ mongo_df_3 = mongo_df_2.copy()
 
 
 
 
235
 
236
+ dates_of_interest = list(pd.date_range(start=usr_start_date, end=usr_end_date, freq="D"))
237
 
238
+ # Now convert each Timestamp → “YYYY-MM-DD” string:
239
+ dates_of_interest = [ts.strftime("%Y-%m-%d") for ts in dates_of_interest]
240
 
241
+ mongo_df_3['start_day'] = mongo_df_3['start_date'].dt.day
242
+ mongo_df_3['start_hour'] = mongo_df_3['start_date'].dt.hour
243
+ mongo_df_3['start_minute'] = mongo_df_3['start_date'].dt.minute
244
+ mongo_df_3['end_day'] = mongo_df_3['end_date'].dt.day
245
+ mongo_df_3['end_hour'] = mongo_df_3['end_date'].dt.hour
246
+ mongo_df_3['end_minute'] = mongo_df_3['end_date'].dt.minute
247
 
248
+ # mongo_df_3 = mongo_df_3.sort_values(by=['publication_date'], ascending=False)
249
+ mongo_df_3 = mongo_df_3.sort_values(by=['publication_date'])
250
 
251
+ # results_plants = {plant_name: {date: {"available_capacity": power, "publication_date": pd.to_datetime("1970-01-01", utc=True)} for date in dates_of_interest}
252
+ # for plant_name, power in plants_metadata.items()}
253
 
254
+ results_plants = {plant_name: {date: {"available_capacity": power, "publication_date": pd.to_datetime("1970-01-01", utc=True)}
255
+ for date in dates_of_interest}
256
+ for plant_name, power in plants_metadata.items()}
257
 
258
+ for row in mongo_df_3.itertuples():
259
+ # Get the start and end dates for the unavailability
260
+ row_start_date = str(row.start_date.date())
261
+ row_end_date = str(row.end_date.date())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
+ # Get the plant name and capacity
264
+ plant_name = row.affected_asset_or_unit_name
265
+ plant_capacity = plants_metadata.get(plant_name, 0) # Default to 0 if not found
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
+ results_current_plant = results_plants[plant_name]
268
 
269
+ power_unavailability = row.available_capacity
270
+ publication_date_unav = row.publication_date
271
+
272
+ for day in dates_of_interest:
273
+ # percentage_of_day = results_current_plant[day]["percentage_of_day"]
274
+
275
+
276
+ if row_start_date <= day <= row_end_date:
277
+
278
+ # Check if the day is already updated with a later (more recent) update_date; by sorting the DataFrame by publication_date,
279
+ # we ensure that the latest unavailability is applied
280
+ # Get the percentage of day that the plant is unavailable
281
+
282
+ # if day in final_df['Date'] and pd.notna(final_df.loc[final_df['Date'] == day, plant_name]).any():
283
+ if (day in results_current_plant) and (publication_date_unav <= results_current_plant[day]["publication_date"]):
284
+ # If the plant's capacity for that day is already set, skip to the next day
285
+ continue
286
+
287
+ # The unavailability starts and ends on the same day
288
+ if row_start_date == day and day == row_end_date:
289
+ percentage_of_day = (row.end_hour * 60 + row.end_minute - row.start_hour * 60 - row.start_minute) / (24 * 60)
290
+ # results_current_plant[day]["percentage_of_day"] += percentage_of_day
291
+ # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
292
+ # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
293
+
294
+ # The unavailability starts on the current day but ends on a later day
295
+ elif row_start_date == day and day < row_end_date:
296
+ percentage_of_day = (24 * 60 - (row.start_hour * 60 + row.start_minute)) / (24 * 60)
297
+ # results_current_plant[day]["percentage_of_day"] += percentage_of_day
298
+
299
+ # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
300
+ # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
301
+
302
+ # # The unavailability starts on a previous day and ends on the current day
303
+ elif row_end_date == day and row_start_date < day:
304
+ percentage_of_day = (row.end_hour * 60 + row.end_minute) / (24 * 60)
305
+ # results_current_plant[day]["percentage_of_day"] += percentage_of_day
306
+
307
+ # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
308
+ # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
309
+
310
+ else:
311
+ # The unavailability starts on a previous day and ends on a later day
312
+ percentage_of_day = 1
313
+ # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
314
+ # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
315
+
316
+ power_of_day = percentage_of_day * power_unavailability + (1 - percentage_of_day) * plant_capacity
317
+
318
+ # Update the available_capacity for the day only if it's not already updated with a later update_date
319
+ if (day not in results_current_plant):
320
+ results_current_plant[day] = {"available_capacity": power_of_day, "publication_date": publication_date_unav}
321
+
322
+ elif (day in results_current_plant) and (publication_date_unav > results_current_plant[day]["publication_date"]) \
323
+ and (power_of_day < results_current_plant[day]['available_capacity']):
324
+ # results_current_plant[day]["available_capacity"] *= power_of_day
325
+ # results_current_plant[day]["publication_date"] = publication_date_unav
326
+
327
+ results_current_plant[day] = {"available_capacity": power_of_day, "publication_date": publication_date_unav}
328
+
329
+ else:
330
+ continue
331
 
332
  output_results = {}
333
  for plant, plant_data in results_plants.items():
334
  available_capacity_per_day = {str(date): data["available_capacity"] for date, data in plant_data.items()}
335
  output_results[plant] = available_capacity_per_day
336
 
 
337
  add_total(output_results)
338
+
 
 
339
  output_results = {plant: {str(date): power for date, power in plant_data.items()} for plant, plant_data in output_results.items()}
340
  output_results = pd.DataFrame(output_results)
 
341
 
342
  # -------------------------------------------------
343
  # Calculate the average of each column excluding the last row
 
345
 
346
  # Replace the last row with the calculated averages
347
  output_results.iloc[-1, :] = averages
348
+
349
  output_results = output_results.to_dict()
350
 
351
  def turn_total_row_to_avg(data):
 
356
 
357
  turn_total_row_to_avg(output_results)
358
 
 
 
359
  json_data = json.dumps(output_results)
360
  # print(json_data)
361
  return json_data
 
473
  df_photo_date_2.index = pd.to_datetime(df_photo_date_2.index)
474
 
475
  # Calculate monthly averages with date in yyyy-mm format
476
+ monthly_average_nucmonitor = df_nucmonitor_2.resample('ME').mean()
477
  monthly_average_nucmonitor.index = monthly_average_nucmonitor.index.strftime('%Y-%m')
478
 
479
+ monthly_average_photo_date = df_photo_date_2.resample('ME').mean()
480
  monthly_average_photo_date.index = monthly_average_photo_date.index.strftime('%Y-%m')
481
 
482
 
app_all.py CHANGED
@@ -24,80 +24,48 @@ def mongo_unavs_call(user_input_start_date, user_input_end_date, user_input_past
24
  passw = "tN9XpCCQM2MtYDme"
25
  host = "nucmonitordata.xxcwx9k.mongodb.net"
26
  client = pymongo.MongoClient(
27
- f"mongodb+srv://{user}:{passw}@{host}/?retryWrites=true&w=majority&connectTimeoutMS=100000"
28
  )
29
 
30
  db = client["data"]
31
  collection_past_unavs = db["unavs"]
32
- collection_unavs = db["unavs_update"]
33
 
34
  start_date = f"{user_input_start_date}T00:00:00"
35
  end_date = f"{user_input_end_date}T23:59:59"
36
  past_date = f"{user_input_past_date}T23:59:59"
37
-
38
-
39
- pipeline_v4 = [
40
- # 1) Expand each results element into its own doc
41
- { "$unwind": "$results" },
42
-
43
- # 2) Expand each generation_unavailabilities element
44
- { "$unwind": "$results.generation_unavailabilities" },
45
-
46
- # 3) Keep only those that match your fuel_type + date criteria
47
  {
48
- "$match": {
49
- "results.generation_unavailabilities.production_type": "NUCLEAR",
50
- "results.generation_unavailabilities.updated_date": { "$lte": past_date },
51
- "results.generation_unavailabilities.start_date": { "$lte": end_date },
52
- "results.generation_unavailabilities.start_date": { "$gte": start_date },
53
- "results.generation_unavailabilities.end_date": { "$gte": start_date },
54
- "results.generation_unavailabilities.end_date": { "$lte": end_date }
55
-
56
- }
57
  },
58
-
59
- # 4) Replace the entire document with just that sub-doc
60
  {
61
- "$replaceRoot": {
62
- "newRoot": "$results.generation_unavailabilities"
63
- }
64
- }
65
- ]
66
-
67
-
68
- pipeline_v6 = [
69
- # 1) Expand each results element into its own doc
70
- { "$unwind": "$results" },
71
-
72
- # 2) Expand each generation_unavailabilities element
73
- { "$unwind": "$results.generation_unavailabilities" },
74
-
75
- # 3) Keep only those that match your fuel_type + date criteria
76
  {
77
  "$match": {
78
- "results.generation_unavailabilities.fuel_type": "NUCLEAR",
79
- "results.generation_unavailabilities.publication_date": { "$lte": past_date },
80
- "results.generation_unavailabilities.start_date": { "$lte": end_date },
81
- # "results.generation_unavailabilities.start_date": { "$gte": start_date },
82
- "results.generation_unavailabilities.end_date": { "$gte": start_date },
83
- # "results.generation_unavailabilities.end_date": { "$lte": end_date }
84
  }
85
  },
86
-
87
- # 4) Replace the entire document with just that sub-doc
88
  {
89
- "$replaceRoot": {
90
- "newRoot": "$results.generation_unavailabilities"
 
91
  }
92
  }
93
  ]
94
 
95
- result1 = list(collection_past_unavs.aggregate(pipeline_v4))
96
- result2 = list(collection_unavs.aggregate(pipeline_v4))
97
- result_v6 = list(collection_unavs.aggregate(pipeline_v6))
98
- merge_results = result1 + result2 + result_v6
99
 
100
- return merge_results
 
 
 
101
 
102
  # --------------------------------------------------------------------------------------- #
103
 
@@ -135,19 +103,19 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
135
  # # Slightly changed metadata to fit the data from the RTE API: ST-LAURENT B 2 --> ST LAURENT 2, ....
136
 
137
  plants_metadata = {"BELLEVILLE 1": 1310.0, "BELLEVILLE 2": 1310.0, "BLAYAIS 1": 910.0, "BLAYAIS 2": 910.0,
138
- "BLAYAIS 3": 910.0, "BLAYAIS 4": 910.0, "BUGEY 2": 910.0, "BUGEY 3": 910.0, "BUGEY 4": 880.0,
139
- "BUGEY 5": 880.0, "CATTENOM 1": 1300.0, "CATTENOM 2": 1300.0, "CATTENOM 3": 1300.0,
140
- "CATTENOM 4": 1300.0, "CHINON 1": 905.0, "CHINON 2": 905.0, "CHINON 3": 905.0,
141
- "CHINON 4": 905.0, "CHOOZ 1": 1500.0, "CHOOZ 2": 1500.0, "CIVAUX 1": 1495.0,
142
- "CIVAUX 2": 1495.0, "CRUAS 1": 915.0, "CRUAS 2": 915.0, "CRUAS 3": 915.0, "CRUAS 4": 915.0,
143
- "DAMPIERRE 1": 890.0, "DAMPIERRE 2": 890.0, "DAMPIERRE 3": 890.0, "DAMPIERRE 4": 890.0,
144
- "FLAMANVILLE 1": 1330.0, "FLAMANVILLE 2": 1330.0, "FLAMANVILLE 3": 1620.0, "GOLFECH 1": 1310.0, "GOLFECH 2": 1310.0,
145
- "GRAVELINES 1": 910.0, "GRAVELINES 2": 910.0, "GRAVELINES 3": 910.0, "GRAVELINES 4": 910.0,
146
- "GRAVELINES 5": 910.0, "GRAVELINES 6": 910.0, "NOGENT 1": 1310.0, "NOGENT 2": 1310.0,
147
- "PALUEL 1": 1330.0, "PALUEL 2": 1330.0, "PALUEL 3": 1330.0, "PALUEL 4": 1330.0, "PENLY 1": 1330.0,
148
- "PENLY 2": 1330.0, "ST ALBAN 1": 1335.0, "ST ALBAN 2": 1335.0, "ST LAURENT 1": 915.0,
149
- "ST LAURENT 2": 915.0, "TRICASTIN 1": 915.0, "TRICASTIN 2": 915.0, "TRICASTIN 3": 915.0,
150
- "TRICASTIN 4": 915.0, "FESSENHEIM 1": 0.0, "FESSENHEIM 2": 0.0}
151
 
152
  # --------------------- INITIAL DATA CLEANING FOR MONGO DATA ------------------------ #
153
 
@@ -157,187 +125,281 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
157
  # print(mongo_db_data)
158
  mongo_df = pd.DataFrame(mongo_db_data)
159
 
160
- mongo_df = mongo_df[['identifier', 'version', 'updated_date', 'type', 'production_type', 'message_id', 'unit', 'status', 'values',
161
- 'publication_date', 'unavailability_type', 'fuel_type',
162
- 'affected_asset_or_unit_name',
163
- 'affected_asset_or_unit_installed_capacity', 'event_status']]
164
 
165
- # 1. Normalize “unit” into a DataFrame of its own
166
- unit_expanded = pd.json_normalize(mongo_df["unit"])
167
- # values_expanded = pd.json_normalize(mongo_df["values"])
168
 
169
- # (This produces a new DF with columns “eic_code” and “name”.)
 
170
 
171
- # 2. Concatenate those new columns back onto df, then drop the old “unit” column
172
- mongo_df_2 = pd.concat([mongo_df.drop(columns=["unit"]), unit_expanded], axis=1)
173
- # mongo_df_2 = pd.concat([mongo_df_2.drop(columns=["values"]), values_expanded], axis=1)
174
- # 1. Create a temporary column that is “the first dict” of each list (or {} if empty/NaN)
175
- mongo_df_2["values_first"] = mongo_df_2["values"].apply(
176
- lambda lst: lst[0] if isinstance(lst, list) and len(lst) > 0 else {}
177
- )
 
 
 
178
 
179
- # 2. Normalize that dict into separate columns
180
- values_expanded = pd.json_normalize(mongo_df_2["values_first"])
181
- # e.g. this produces columns like “start_date”, “end_date”, etc.
182
-
183
- # 3. Concatenate back and drop the originals
184
- mongo_df_2 = pd.concat(
185
- [
186
- mongo_df_2.drop(columns=["values", "values_first"]),
187
- values_expanded
188
- ],
189
- axis=1
190
- )
191
 
192
- mongo_df_2["fuel_type"] = mongo_df_2["fuel_type"].combine_first(mongo_df_2["production_type"])
193
- mongo_df_2["publication_date"] = mongo_df_2["publication_date"].combine_first(mongo_df_2["updated_date"])
194
- mongo_df_2["event_status"] = mongo_df_2["event_status"].combine_first(mongo_df_2["status"])
195
- mongo_df_2["affected_asset_or_unit_installed_capacity"] = mongo_df_2["affected_asset_or_unit_installed_capacity"].combine_first(mongo_df_2["installed_capacity"])
196
- mongo_df_2["affected_asset_or_unit_name"] = mongo_df_2["affected_asset_or_unit_name"].combine_first(mongo_df_2["name"])
197
- mongo_df_2["unavailability_type"] = (
198
- mongo_df_2["unavailability_type"]
199
- .combine_first(mongo_df_2.loc[:, "type"].iloc[:, 0])
200
- )
201
 
202
- mongo_df_2 = mongo_df_2.drop(columns=["production_type", "updated_date", "status", "installed_capacity", "name", "type", "eic_code"])
 
 
 
 
 
 
 
203
 
204
- # Convert the date columns to datetime objects
205
- for col in ["publication_date", "start_date", "end_date"]:
206
- mongo_df_2[col] = pd.to_datetime(mongo_df_2[col], utc=True)
 
207
 
208
- # # Now convert everything to French time (CET/CEST):
209
- # for col in ["publication_date", "start_date", "end_date"]:
210
- # mongo_df_2[col] = mongo_df_2[col].dt.tz_convert("Europe/Paris")
211
 
212
- # mongo_df_2 = mongo_df_2.drop_duplicates(subset='identifier', keep='first')
213
- mongo_df_2['version'] = mongo_df_2['version'].astype(int)
214
- # Sort by identifier and version to ensure the latest version is at the top
215
- # Method 1: Use groupby + idxmax to pick the row with the largest version per identifier
216
- idx = mongo_df_2.groupby("identifier")["version"].idxmax()
217
- mongo_df_2 = mongo_df_2.loc[idx].reset_index(drop=True)
218
 
219
- mongo_df_2 = mongo_df_2[mongo_df_2['event_status'] != 'DISMISSED']
 
220
 
 
 
221
 
222
- # Create the final dataframe
223
- final_df = pd.DataFrame()
224
 
225
- # Create the date column, with date range from start_date to end_date in daily granularity
226
- final_df['Date'] = pd.date_range(start=usr_start_date, end=usr_end_date, freq='D')
227
- final_df['Date'] = [ts.strftime("%Y-%m-%d") for ts in final_df['Date']]
228
 
229
- # For each plant create a new column with the plant name
230
- for plant, capacity in plants_metadata.items():
231
- # Create a new column for each plant
232
- final_df[plant] = np.nan # Initialize with zeros
233
 
234
- mongo_df_3 = mongo_df_2.copy()
 
 
 
 
235
 
236
- dates_of_interest = list(pd.date_range(start=usr_start_date, end=usr_end_date, freq="D"))
237
 
238
- # Now convert each Timestamp → “YYYY-MM-DD” string:
239
- dates_of_interest = [ts.strftime("%Y-%m-%d") for ts in dates_of_interest]
240
 
241
- mongo_df_3['start_day'] = mongo_df_3['start_date'].dt.day
242
- mongo_df_3['start_hour'] = mongo_df_3['start_date'].dt.hour
243
- mongo_df_3['start_minute'] = mongo_df_3['start_date'].dt.minute
244
- mongo_df_3['end_day'] = mongo_df_3['end_date'].dt.day
245
- mongo_df_3['end_hour'] = mongo_df_3['end_date'].dt.hour
246
- mongo_df_3['end_minute'] = mongo_df_3['end_date'].dt.minute
247
 
248
- # mongo_df_3 = mongo_df_3.sort_values(by=['publication_date'], ascending=False)
249
- mongo_df_3 = mongo_df_3.sort_values(by=['publication_date'])
250
 
251
- # results_plants = {plant_name: {date: {"available_capacity": power, "publication_date": pd.to_datetime("1970-01-01", utc=True)} for date in dates_of_interest}
252
- # for plant_name, power in plants_metadata.items()}
253
 
254
- results_plants = {plant_name: {date: {"available_capacity": power, "publication_date": pd.to_datetime("1970-01-01", utc=True)}
255
- for date in dates_of_interest}
256
- for plant_name, power in plants_metadata.items()}
257
 
258
- for row in mongo_df_3.itertuples():
259
- # Get the start and end dates for the unavailability
260
- row_start_date = str(row.start_date.date())
261
- row_end_date = str(row.end_date.date())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- # Get the plant name and capacity
264
- plant_name = row.affected_asset_or_unit_name
265
- plant_capacity = plants_metadata.get(plant_name, 0) # Default to 0 if not found
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
- results_current_plant = results_plants[plant_name]
268
 
269
- power_unavailability = row.available_capacity
270
- publication_date_unav = row.publication_date
271
-
272
- for day in dates_of_interest:
273
- # percentage_of_day = results_current_plant[day]["percentage_of_day"]
274
-
275
-
276
- if row_start_date <= day <= row_end_date:
277
-
278
- # Check if the day is already updated with a later (more recent) update_date; by sorting the DataFrame by publication_date,
279
- # we ensure that the latest unavailability is applied
280
- # Get the percentage of day that the plant is unavailable
281
-
282
- # if day in final_df['Date'] and pd.notna(final_df.loc[final_df['Date'] == day, plant_name]).any():
283
- if (day in results_current_plant) and (publication_date_unav <= results_current_plant[day]["publication_date"]):
284
- # If the plant's capacity for that day is already set, skip to the next day
285
- continue
286
-
287
- # The unavailability starts and ends on the same day
288
- if row_start_date == day and day == row_end_date:
289
- percentage_of_day = (row.end_hour * 60 + row.end_minute - row.start_hour * 60 - row.start_minute) / (24 * 60)
290
- # results_current_plant[day]["percentage_of_day"] += percentage_of_day
291
- # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
292
- # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
293
-
294
- # The unavailability starts on the current day but ends on a later day
295
- elif row_start_date == day and day < row_end_date:
296
- percentage_of_day = (24 * 60 - (row.start_hour * 60 + row.start_minute)) / (24 * 60)
297
- # results_current_plant[day]["percentage_of_day"] += percentage_of_day
298
-
299
- # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
300
- # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
301
-
302
- # # The unavailability starts on a previous day and ends on the current day
303
- elif row_end_date == day and row_start_date < day:
304
- percentage_of_day = (row.end_hour * 60 + row.end_minute) / (24 * 60)
305
- # results_current_plant[day]["percentage_of_day"] += percentage_of_day
306
-
307
- # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
308
- # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
309
-
310
- else:
311
- # The unavailability starts on a previous day and ends on a later day
312
- percentage_of_day = 1
313
- # power_of_day = percentage_of_day * row.available_capacity + (1 - percentage_of_day) * plant_capacity
314
- # final_df.loc[final_df['Date'] == day, plant_name] = power_of_day
315
-
316
- power_of_day = percentage_of_day * power_unavailability + (1 - percentage_of_day) * plant_capacity
317
-
318
- # Update the available_capacity for the day only if it's not already updated with a later update_date
319
- if (day not in results_current_plant):
320
- results_current_plant[day] = {"available_capacity": power_of_day, "publication_date": publication_date_unav}
321
-
322
- elif (day in results_current_plant) and (publication_date_unav > results_current_plant[day]["publication_date"]) \
323
- and (power_of_day < results_current_plant[day]['available_capacity']):
324
- # results_current_plant[day]["available_capacity"] *= power_of_day
325
- # results_current_plant[day]["publication_date"] = publication_date_unav
326
-
327
- results_current_plant[day] = {"available_capacity": power_of_day, "publication_date": publication_date_unav}
328
-
329
- else:
330
- continue
331
 
332
  output_results = {}
333
  for plant, plant_data in results_plants.items():
334
  available_capacity_per_day = {str(date): data["available_capacity"] for date, data in plant_data.items()}
335
  output_results[plant] = available_capacity_per_day
336
 
 
337
  add_total(output_results)
338
-
 
 
339
  output_results = {plant: {str(date): power for date, power in plant_data.items()} for plant, plant_data in output_results.items()}
340
  output_results = pd.DataFrame(output_results)
 
341
 
342
  # -------------------------------------------------
343
  # Calculate the average of each column excluding the last row
@@ -345,7 +407,7 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
345
 
346
  # Replace the last row with the calculated averages
347
  output_results.iloc[-1, :] = averages
348
-
349
  output_results = output_results.to_dict()
350
 
351
  def turn_total_row_to_avg(data):
@@ -356,6 +418,8 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
356
 
357
  turn_total_row_to_avg(output_results)
358
 
 
 
359
  json_data = json.dumps(output_results)
360
  # print(json_data)
361
  return json_data
@@ -473,10 +537,10 @@ def run_app():
473
  df_photo_date_2.index = pd.to_datetime(df_photo_date_2.index)
474
 
475
  # Calculate monthly averages with date in yyyy-mm format
476
- monthly_average_nucmonitor = df_nucmonitor_2.resample('ME').mean()
477
  monthly_average_nucmonitor.index = monthly_average_nucmonitor.index.strftime('%Y-%m')
478
 
479
- monthly_average_photo_date = df_photo_date_2.resample('ME').mean()
480
  monthly_average_photo_date.index = monthly_average_photo_date.index.strftime('%Y-%m')
481
 
482
 
 
24
  passw = "tN9XpCCQM2MtYDme"
25
  host = "nucmonitordata.xxcwx9k.mongodb.net"
26
  client = pymongo.MongoClient(
27
+ f"mongodb+srv://{user}:{passw}@{host}/?retryWrites=true&w=majority&connectTimeoutMS=5000"
28
  )
29
 
30
  db = client["data"]
31
  collection_past_unavs = db["unavs"]
32
+ collection_unavs =db["unavs_update"]
33
 
34
  start_date = f"{user_input_start_date}T00:00:00"
35
  end_date = f"{user_input_end_date}T23:59:59"
36
  past_date = f"{user_input_past_date}T23:59:59"
37
+
38
+ pipeline = [
 
 
 
 
 
 
 
 
39
  {
40
+ "$unwind": "$results"
 
 
 
 
 
 
 
 
41
  },
 
 
42
  {
43
+ "$unwind": "$results.generation_unavailabilities"
44
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  {
46
  "$match": {
47
+ "results.generation_unavailabilities.production_type": "NUCLEAR",
48
+ # "results.generation_unavailabilities.start_date": {"$lte": end_date},
49
+ # "results.generation_unavailabilities.end_date": {"$gte": start_date},
50
+ # "results.generation_unavailabilities.updated_date": {"$lte": end_date}
51
+ "results.generation_unavailabilities.updated_date": {"$lte": past_date}
 
52
  }
53
  },
 
 
54
  {
55
+ "$project": {
56
+ "_id": 0,
57
+ "generation_unavailabilities": "$results.generation_unavailabilities"
58
  }
59
  }
60
  ]
61
 
62
+ result1 = list(collection_past_unavs.aggregate(pipeline))
63
+ result2 = list(collection_unavs.aggregate(pipeline))
 
 
64
 
65
+ # Merge the two lists of JSON results
66
+ merged_result = result1 + result2
67
+
68
+ return merged_result
69
 
70
  # --------------------------------------------------------------------------------------- #
71
 
 
103
  # # Slightly changed metadata to fit the data from the RTE API: ST-LAURENT B 2 --> ST LAURENT 2, ....
104
 
105
  plants_metadata = {"BELLEVILLE 1": 1310.0, "BELLEVILLE 2": 1310.0, "BLAYAIS 1": 910.0, "BLAYAIS 2": 910.0,
106
+ "BLAYAIS 3": 910.0, "BLAYAIS 4": 910.0, "BUGEY 2": 910.0, "BUGEY 3": 910.0, "BUGEY 4": 880.0,
107
+ "BUGEY 5": 880.0, "CATTENOM 1": 1300.0, "CATTENOM 2": 1300.0, "CATTENOM 3": 1300.0,
108
+ "CATTENOM 4": 1300.0, "CHINON 1": 905.0, "CHINON 2": 905.0, "CHINON 3": 905.0,
109
+ "CHINON 4": 905.0, "CHOOZ 1": 1500.0, "CHOOZ 2": 1500.0, "CIVAUX 1": 1495.0,
110
+ "CIVAUX 2": 1495.0, "CRUAS 1": 915.0, "CRUAS 2": 915.0, "CRUAS 3": 915.0, "CRUAS 4": 915.0,
111
+ "DAMPIERRE 1": 890.0, "DAMPIERRE 2": 890.0, "DAMPIERRE 3": 890.0, "DAMPIERRE 4": 890.0,
112
+ "FLAMANVILLE 1": 1330.0, "FLAMANVILLE 2": 1330.0, "GOLFECH 1": 1310.0, "GOLFECH 2": 1310.0,
113
+ "GRAVELINES 1": 910.0, "GRAVELINES 2": 910.0, "GRAVELINES 3": 910.0, "GRAVELINES 4": 910.0,
114
+ "GRAVELINES 5": 910.0, "GRAVELINES 6": 910.0, "NOGENT 1": 1310.0, "NOGENT 2": 1310.0,
115
+ "PALUEL 1": 1330.0, "PALUEL 2": 1330.0, "PALUEL 3": 1330.0, "PALUEL 4": 1330.0, "PENLY 1": 1330.0,
116
+ "PENLY 2": 1330.0, "ST ALBAN 1": 1335.0, "ST ALBAN 2": 1335.0, "ST LAURENT 1": 915.0,
117
+ "ST LAURENT 2": 915.0, "TRICASTIN 1": 915.0, "TRICASTIN 2": 915.0, "TRICASTIN 3": 915.0,
118
+ "TRICASTIN 4": 915.0, "FESSENHEIM 1": 880.0, "FESSENHEIM 2": 880.0}
119
 
120
  # --------------------- INITIAL DATA CLEANING FOR MONGO DATA ------------------------ #
121
 
 
125
  # print(mongo_db_data)
126
  mongo_df = pd.DataFrame(mongo_db_data)
127
 
128
+ # print(mongo_df)
129
+ # Unpack the dictionaries into separate columns
130
+ mongo_df_unpacked = pd.json_normalize(mongo_df['generation_unavailabilities'])
 
131
 
132
+ # Concatenate the unpacked columns with the original DataFrame
133
+ mongo_df_result = pd.concat([mongo_df, mongo_df_unpacked], axis=1)
 
134
 
135
+ # Drop the original column
136
+ mongo_df_result.drop(columns=['generation_unavailabilities'], inplace=True)
137
 
138
+ mongo_df_result['start_date'] = mongo_df_result['values'].apply(lambda x: x[0]['start_date'])
139
+ mongo_df_result['end_date'] = mongo_df_result['values'].apply(lambda x: x[0]['end_date'])
140
+ mongo_df_result['available_capacity'] = mongo_df_result['values'].apply(lambda x: x[0]['available_capacity'])
141
+ mongo_df_result['unavailable_capacity'] = mongo_df_result['values'].apply(lambda x: x[0]['unavailable_capacity'])
142
+ # print(mongo_df_result)
143
+ # print(mongo_df_result.columns)
144
+ # Drop the original 'values' column
145
+ mongo_df_result.drop('values', axis=1, inplace=True)
146
+ mongo_df2 = mongo_df_result
147
+ mongo_df2.rename(columns=lambda col: col.replace('unit.', ''), inplace=True)
148
 
149
+ # --------------------- INITIAL DATA CLEANING FOR MONGO DATA ------------------------ #
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ # Make the two dataframes have the same columns
152
+ mongo_unavs = mongo_df2.copy()
153
+ mongo_unavs.drop(columns="type", inplace=True)
154
+
155
+ # merged_df['updated_date'] = merged_df['updated_date'].astype(str)
 
 
 
 
156
 
157
+ # --------------------------- HERE IS THE CHANGE TO GET ONLY ACTIVE OR ACTIVE AND INACTIVE --------------------------- #
158
+ # start_date_str = usr_start_date.strftime("%Y-%m-%d")
159
+ start_date_str = str(usr_start_date)
160
+ # end_date_str = usr_end_date.strftime("%Y-%m-%d")
161
+ end_date_str = str(usr_end_date)
162
+ current_datetime = datetime.datetime.now()
163
+ past_date_str = str(past_date.strftime("%Y-%m-%dT%H:%M:%S%z"))
164
+ current_datetime_str = current_datetime.strftime("%Y-%m-%d")
165
 
166
+ # nuclear_unav = mongo_unavs.copy()[(mongo_unavs.copy()["production_type"] == "NUCLEAR") & (mongo_unavs.copy()["updated_date"] <= past_date_str)]
167
+ # print(past_date_str)
168
+ # Sort by updated date
169
+ sorted_df = mongo_unavs.copy().sort_values(by='updated_date')
170
 
171
+ sorted_df = sorted_df.copy().reset_index(drop=True)
 
 
172
 
173
+ # cruas_2 = sorted_df.copy()[(sorted_df.copy()["name"] == "ST ALBAN 2") & (sorted_df.copy()["end_date"] >= start_date_str)]
174
+ # print(cruas_2[['updated_date', 'end_date', 'available_capacity']])
 
 
 
 
175
 
176
+ # Filter to get identifiers
177
+ filtered_id_df = sorted_df.copy()
178
 
179
+ # I commented this out
180
+ filtered_id_df = filtered_id_df.drop_duplicates(subset='identifier', keep='last')
181
 
182
+ # cruas_2 = filtered_id_df.copy()[(filtered_id_df.copy()["name"] == "ST ALBAN 2") & (filtered_id_df.copy()["end_date"] >= start_date_str)]
183
+ # print(cruas_2[['updated_date', 'end_date', 'available_capacity']])
184
 
185
+ filtered_id_df = filtered_id_df.copy().reset_index(drop=True)
 
 
186
 
187
+ filtered_df = filtered_id_df[
188
+ (filtered_id_df["production_type"] == "NUCLEAR") &
189
+ # (mongo_unavs["updated_date"] <= past_date_str) &
190
+ (filtered_id_df["status"] != "DISMISSED")]
191
 
192
+ # if photo_date == True:
193
+ # nuclear_unav = merged_df.copy()[(merged_df.copy()["production_type"] == "NUCLEAR") & (merged_df.copy()["updated_date"] <= past_date_str)]
194
+ # photo_date = True
195
+ # else: # need to add updated_date as a conditional to get the newest for that day
196
+ # nuclear_unav = merged_df.copy()[(merged_df.copy()["production_type"] == "NUCLEAR") & (merged_df.copy()["updated_date"] <= end_date_str)]
197
 
198
+ # --------------------------- HERE IS THE CHANGE TO GET ONLY ACTIVE OR ACTIVE AND INACTIVE --------------------------- #
199
 
200
+ # --------------------- SECOND DATA CLEANING ------------------------ #
201
+ # This filter should take only the most recent id and discard the rest
202
 
 
 
 
 
 
 
203
 
 
 
204
 
205
+ # This filter should take all the dates with unavs that include days with unavs in the range of the start and end date
 
206
 
 
 
 
207
 
208
+ # This filter might take out the most recent identifiers (Message ID) that change the dates of unavailability of a plant.
209
+ # This means that the actual unavailability is something else
210
+ # filtered_df = filtered_id_df.copy()[(filtered_id_df.copy()['start_date'] <= end_date_str) & (filtered_id_df.copy()['end_date'] >= start_date_str)]
211
+
212
+ # Need to eventually do a filter that takes the most restrictive updated identifier instead of the most recent when there
213
+ # is an overlap
214
+
215
+ # Update available_capacity where the condition is True
216
+
217
+ # Standardize datetime in dataframe
218
+ filtered_df2 = filtered_df.copy() # This code will just standardize datetime stuff
219
+ filtered_df2['creation_date'] = pd.to_datetime(filtered_df2['creation_date'], utc=True)
220
+ filtered_df2['updated_date'] = pd.to_datetime(filtered_df2['updated_date'], utc=True)
221
+ filtered_df2['start_date'] = pd.to_datetime(filtered_df2['start_date'], utc=True)
222
+ filtered_df2['end_date'] = pd.to_datetime(filtered_df2['end_date'], utc=True)
223
+
224
+ # Drop the duplicates
225
+ filtered_df3 = filtered_df2.copy().drop_duplicates()
226
+
227
+ # start_date_datetime = pd.to_datetime(start_date_str, utc=True) # Remove timezone info
228
+ start_date_datetime = pd.Timestamp(start_date_str, tz='UTC')
229
+ # end_date_datetime = pd.to_datetime(end_date_str, utc=True)
230
+ end_date_datetime = pd.Timestamp(end_date_str, tz='UTC')
231
+
232
+ # Turn df into dict for json processing
233
+ filtered_unavs = filtered_df3.copy().to_dict(orient='records')
234
+
235
+ results = {}
236
+
237
+ for unav in filtered_unavs:
238
+ plant_name = unav['name']
239
+ if plant_name in results:
240
+ # If the key is already in the dictionary, append unavailability to the list
241
+ results[plant_name].append({'status': unav['status'],
242
+ 'id': unav['message_id'],
243
+ 'creation_date': unav['creation_date'],
244
+ 'updated_date': unav['updated_date'],
245
+ 'start_date': unav['start_date'],
246
+ 'end_date': unav['end_date'],
247
+ 'available_capacity': unav['available_capacity']})
248
+ else:
249
+ # if the key of the plant is not there yet, create a new element of the dictionary
250
+
251
+ # Get message_id instead of identifier, easier to identify stuff with it
252
+ results[plant_name] = [{'status': unav['status'],
253
+ 'id': unav['message_id'],
254
+ 'creation_date': unav['creation_date'],
255
+ 'updated_date': unav['updated_date'],
256
+ 'start_date': unav['start_date'],
257
+ 'end_date': unav['end_date'],
258
+ 'available_capacity': unav['available_capacity']}]
259
+
260
+ # Custom encoder to handle datetime objects
261
+ class DateTimeEncoder(json.JSONEncoder):
262
+ def default(self, o):
263
+ if isinstance(o, datetime.datetime):
264
+ return o.isoformat()
265
+ return super().default(o)
266
+
267
+ results_holder = results
268
+
269
+ # Create new dict with each plant only having start_date less than user_end_date and an end_date greater than user_start_date
270
+ # should just be doing the same as above in the df for filtering only dates that inclued the start and end date
271
+ start_date = start_date_datetime.date()
272
+ end_date = end_date_datetime.date()
273
+ results_filtered = results_holder
274
+ for key, value in results_filtered.items():
275
+ filtered_values = []
276
+ for item in value:
277
+ item_start_date = item['start_date'].date()
278
+ item_end_date = item['end_date'].date()
279
+ identifier = item['id']
280
+ if item_start_date < end_date and item_end_date > start_date and identifier not in filtered_values:
281
+ filtered_values.append(item)
282
+ results_filtered[key] = filtered_values
283
+
284
+
285
+ sorted_results = results_filtered
286
+ # --------------------- SECOND DATA CLEANING ------------------------ #
287
+
288
+ # --------------------------- HERE IS THE FINAL PROCESS --------------------------- #
289
+
290
+ for key, value in sorted_results.items():
291
+ sorted_results[key] = sorted(value, key=lambda x: x['updated_date'])
292
+
293
+ results_sorted = sorted_results
294
+
295
+ dates_of_interest = [start_date] # We are creating a list of dates ranging from user specified start and end dates
296
+ date_plus_one = start_date
297
+
298
+ while date_plus_one < end_date:
299
+ date_plus_one = date_plus_one + datetime.timedelta(days=1)
300
+ dates_of_interest.append(date_plus_one)
301
+
302
+ # This is to standardize the datetimes. Without this, the datetime calculations for each power plant will not work
303
+ # This is just getting the plant metadata and giving it updated_date????? With an amount of items based on the length of the
304
+ # date range????
305
+ results_plants = {plant_name: {date: {"available_capacity": power, "updated_date": pd.to_datetime("1970-01-01", utc=True)} for date in dates_of_interest}
306
+ for plant_name, power in plants_metadata.items()}
307
+
308
+ # print(results_sorted)
309
+ for plant, unavailabilities in results_sorted.items():
310
+ # Get the full power of a given plant according to the sorted results
311
+ original_power = plants_metadata[plant]
312
+ # Get all the unavailabilities scheduled for the plant.
313
+ # This is actually apparently just getting the metadata though???
314
+ results_current_plant = results_plants[plant]
315
 
316
+ for unavailability in unavailabilities:
317
+ # For each unavailability, the resulting power, start and end datetime are collected. Need to collect updated_date
318
+ power_unavailability = unavailability["available_capacity"]
319
+ updated_date_unav = unavailability["updated_date"]
320
+ # The date comes as a string
321
+ start_datetime_unav = unavailability["start_date"]
322
+ end_datetime_unav = unavailability["end_date"]
323
+ start_date_unav = start_datetime_unav.date() # Extract date part
324
+ end_date_unav = end_datetime_unav.date() # Extract date part
325
+
326
+ # For the current unavailability, we want to find which days it affects
327
+ for day in dates_of_interest:
328
+
329
+ start_hour = start_datetime_unav.hour
330
+ start_minute = start_datetime_unav.minute
331
+ end_hour = end_datetime_unav.hour
332
+ end_minute = end_datetime_unav.minute
333
+
334
+ if start_date_unav <= day <= end_date_unav:
335
+ # Check if the day is already updated with a later update_date
336
+
337
+ if day in results_current_plant and updated_date_unav <= results_current_plant[day]["updated_date"]:
338
+ # Here is likely where we can do the filter for worst case scenario
339
+ # --------------------------- !!!!!!CREATE NEW FILTER THAT KEEPS ONLY MOST RESTRICTIVE OVERLAP!!!!!! --------------------------- #
340
+ # if power_unavailability < results_current_plant[day]['available_capacity']:
341
+
342
+ # # Calculate the % of the day that the plant is under maintenance
343
+ # if start_date_unav == day and day == end_date_unav:
344
+ # # The unavailability starts and ends on the same day
345
+ # percentage_of_day = (end_hour * 60 + end_minute - start_hour * 60 - start_minute) / (24 * 60)
346
+ # elif start_date_unav == day:
347
+ # # The unavailability starts on the current day but ends on a later day
348
+ # percentage_of_day = (24 * 60 - (start_hour * 60 + start_minute)) / (24 * 60)
349
+ # elif day == end_date_unav:
350
+ # # The unavailability starts on a previous day and ends on the current day
351
+ # percentage_of_day = (end_hour * 60 + end_minute) / (24 * 60)
352
+ # else:
353
+ # # The unavailability covers the entire day
354
+ # percentage_of_day = 1
355
+
356
+ # --------------------------- !!!!!!CREATE NEW FILTER THAT KEEPS ONLY MOST RESTRICTIVE OVERLAP!!!!!! --------------------------- #
357
+ # else:
358
+
359
+ continue # Skip to the next loop if there is already information for a later update_date
360
+
361
+ # Calculate the % of the day that the plant is under maintenance
362
+ if start_date_unav == day and day == end_date_unav:
363
+ # The unavailability starts and ends on the same day
364
+ percentage_of_day = (end_hour * 60 + end_minute - start_hour * 60 - start_minute) / (24 * 60)
365
+ elif start_date_unav == day:
366
+ # The unavailability starts on the current day but ends on a later day
367
+ percentage_of_day = (24 * 60 - (start_hour * 60 + start_minute)) / (24 * 60)
368
+ elif day == end_date_unav:
369
+ # The unavailability starts on a previous day and ends on the current day
370
+ percentage_of_day = (end_hour * 60 + end_minute) / (24 * 60)
371
+ else:
372
+ # The unavailability covers the entire day
373
+ percentage_of_day = 1
374
+
375
+ # The average power of the day is calculated
376
+ power_of_day = percentage_of_day * power_unavailability + (1 - percentage_of_day) * original_power
377
+
378
+ # Update the available_capacity for the day only if it's not already updated with a later update_date
379
+ if (day not in results_current_plant):
380
+ results_current_plant[day] = {"available_capacity": power_of_day, "updated_date": updated_date_unav}
381
+
382
+ elif (day in results_current_plant) and (updated_date_unav > results_current_plant[day]["updated_date"]) and (power_of_day < results_current_plant[day]['available_capacity']):
383
+ results_current_plant[day] = {"available_capacity": power_of_day, "updated_date": updated_date_unav}
384
+
385
+ else:
386
+ continue
387
 
 
388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
  output_results = {}
391
  for plant, plant_data in results_plants.items():
392
  available_capacity_per_day = {str(date): data["available_capacity"] for date, data in plant_data.items()}
393
  output_results[plant] = available_capacity_per_day
394
 
395
+ # print(output_results)
396
  add_total(output_results)
397
+ # print("Done")
398
+ # print(results_plants)
399
+ # Convert datetime key to string to store in mongodb
400
  output_results = {plant: {str(date): power for date, power in plant_data.items()} for plant, plant_data in output_results.items()}
401
  output_results = pd.DataFrame(output_results)
402
+ print(output_results)
403
 
404
  # -------------------------------------------------
405
  # Calculate the average of each column excluding the last row
 
407
 
408
  # Replace the last row with the calculated averages
409
  output_results.iloc[-1, :] = averages
410
+
411
  output_results = output_results.to_dict()
412
 
413
  def turn_total_row_to_avg(data):
 
418
 
419
  turn_total_row_to_avg(output_results)
420
 
421
+ # print(output_results)
422
+
423
  json_data = json.dumps(output_results)
424
  # print(json_data)
425
  return json_data
 
537
  df_photo_date_2.index = pd.to_datetime(df_photo_date_2.index)
538
 
539
  # Calculate monthly averages with date in yyyy-mm format
540
+ monthly_average_nucmonitor = df_nucmonitor_2.resample('M').mean()
541
  monthly_average_nucmonitor.index = monthly_average_nucmonitor.index.strftime('%Y-%m')
542
 
543
+ monthly_average_photo_date = df_photo_date_2.resample('M').mean()
544
  monthly_average_photo_date.index = monthly_average_photo_date.index.strftime('%Y-%m')
545
 
546