dmarr committed on
Commit
bad9d6d
·
1 Parent(s): 8ad04d8

Update for v0.7 RTE

Browse files
Files changed (1) hide show
  1. app.py +89 -41
app.py CHANGED
@@ -157,49 +157,97 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
157
  # print(mongo_db_data)
158
  mongo_df = pd.DataFrame(mongo_db_data)
159
 
160
- mongo_df = mongo_df[['identifier', 'version', 'updated_date', 'type', 'production_type', 'message_id', 'unit', 'status', 'values',
161
- 'publication_date', 'unavailability_type', 'fuel_type',
162
- 'affected_asset_or_unit_name',
163
- 'affected_asset_or_unit_installed_capacity', 'event_status']]
164
-
165
- # 1. Normalize “unit” into a DataFrame of its own
166
- unit_expanded = pd.json_normalize(mongo_df["unit"])
167
- # values_expanded = pd.json_normalize(mongo_df["values"])
168
-
169
- # (This produces a new DF with columns “eic_code” and “name”.)
170
-
171
- # 2. Concatenate those new columns back onto df, then drop the old “unit” column
172
- mongo_df_2 = pd.concat([mongo_df.drop(columns=["unit"]), unit_expanded], axis=1)
173
- # mongo_df_2 = pd.concat([mongo_df_2.drop(columns=["values"]), values_expanded], axis=1)
174
- # 1. Create a temporary column that is “the first dict” of each list (or {} if empty/NaN)
175
- mongo_df_2["values_first"] = mongo_df_2["values"].apply(
176
- lambda lst: lst[0] if isinstance(lst, list) and len(lst) > 0 else {}
177
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- # 2. Normalize that dict into separate columns
180
- values_expanded = pd.json_normalize(mongo_df_2["values_first"])
181
- # e.g. this produces columns like “start_date”, “end_date”, etc.
182
-
183
- # 3. Concatenate back and drop the originals
184
- mongo_df_2 = pd.concat(
185
- [
186
- mongo_df_2.drop(columns=["values", "values_first"]),
187
- values_expanded
188
- ],
189
- axis=1
190
- )
 
 
 
 
191
 
192
- mongo_df_2["fuel_type"] = mongo_df_2["fuel_type"].combine_first(mongo_df_2["production_type"])
193
- mongo_df_2["publication_date"] = mongo_df_2["publication_date"].combine_first(mongo_df_2["updated_date"])
194
- mongo_df_2["event_status"] = mongo_df_2["event_status"].combine_first(mongo_df_2["status"])
195
- mongo_df_2["affected_asset_or_unit_installed_capacity"] = mongo_df_2["affected_asset_or_unit_installed_capacity"].combine_first(mongo_df_2["installed_capacity"])
196
- mongo_df_2["affected_asset_or_unit_name"] = mongo_df_2["affected_asset_or_unit_name"].combine_first(mongo_df_2["name"])
197
- mongo_df_2["unavailability_type"] = (
198
- mongo_df_2["unavailability_type"]
199
- .combine_first(mongo_df_2.loc[:, "type"].iloc[:, 0])
200
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- mongo_df_2 = mongo_df_2.drop(columns=["production_type", "updated_date", "status", "installed_capacity", "name", "type", "eic_code"])
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  # Convert the date columns to datetime objects
205
  for col in ["publication_date", "start_date", "end_date"]:
@@ -210,7 +258,7 @@ def nuc_monitor(usr_start_date, usr_end_date, past_date, mongo_db_data):
210
  # mongo_df_2[col] = mongo_df_2[col].dt.tz_convert("Europe/Paris")
211
 
212
  # mongo_df_2 = mongo_df_2.drop_duplicates(subset='identifier', keep='first')
213
- mongo_df_2['version'] = mongo_df_2['version'].astype(int)
214
  # Sort by identifier and version to ensure the latest version is at the top
215
  # Method 1: Use groupby + idxmax to pick the row with the largest version per identifier
216
  idx = mongo_df_2.groupby("identifier")["version"].idxmax()
 
157
  # print(mongo_db_data)
158
  mongo_df = pd.DataFrame(mongo_db_data)
159
 
160
+ # the five columns you care about
161
+ _optional = {
162
+ "updated_date",
163
+ "type",
164
+ "production_type",
165
+ "unit",
166
+ "status",
167
+ }
168
+
169
+ # see which of those actually exist
170
+ present = set(mongo_df.columns) & _optional
171
+
172
+ if _optional.issubset(mongo_df.columns):
173
+ # 0. first, pick the columns you know exist, plus the always‑present ones
174
+ cols = [
175
+ "identifier", "version", "message_id",
176
+ "values", "publication_date", "unavailability_type", "fuel_type",
177
+ "affected_asset_or_unit_name", "affected_asset_or_unit_installed_capacity",
178
+ "event_status"
179
+ ] + list(_optional)
180
+ mongo_df = mongo_df[cols]
181
+
182
+ # 1. normalize “unit”
183
+ unit_expanded = pd.json_normalize(mongo_df["unit"])
184
+ mongo_df_2 = pd.concat([mongo_df.drop(columns=["unit"]), unit_expanded], axis=1)
185
+
186
+ # 2. normalize the first element of “values”
187
+ mongo_df_2["values_first"] = (
188
+ mongo_df_2["values"]
189
+ .apply(lambda lst: lst[0] if isinstance(lst, list) and lst else {})
190
+ )
191
+ values_expanded = pd.json_normalize(mongo_df_2["values_first"])
192
+ mongo_df_2 = pd.concat(
193
+ [mongo_df_2.drop(columns=["values", "values_first"]), values_expanded],
194
+ axis=1
195
+ )
196
 
197
+ # 3. coalesce and drop old cols
198
+ mongo_df_2["fuel_type"] = mongo_df_2["fuel_type"].combine_first(mongo_df_2["production_type"])
199
+ mongo_df_2["publication_date"] = mongo_df_2["publication_date"].combine_first(mongo_df_2["updated_date"])
200
+ mongo_df_2["event_status"] = mongo_df_2["event_status"].combine_first(mongo_df_2["status"])
201
+ mongo_df_2["affected_asset_or_unit_installed_capacity"] = (
202
+ mongo_df_2["affected_asset_or_unit_installed_capacity"]
203
+ .combine_first(mongo_df_2["installed_capacity"])
204
+ )
205
+ mongo_df_2["affected_asset_or_unit_name"] = (
206
+ mongo_df_2["affected_asset_or_unit_name"]
207
+ .combine_first(mongo_df_2["name"])
208
+ )
209
+ mongo_df_2["unavailability_type"] = (
210
+ mongo_df_2["unavailability_type"]
211
+ .combine_first(mongo_df_2["type"].iloc[:, 0])
212
+ )
213
 
214
+ drop_cols = [
215
+ "production_type",
216
+ "updated_date",
217
+ "status",
218
+ "installed_capacity",
219
+ "name",
220
+ "type",
221
+ "eic_code",
222
+ ]
223
+ # only drop those that are actually there
224
+ drop_cols = [c for c in drop_cols if c in mongo_df_2.columns]
225
+ mongo_df_2 = mongo_df_2.drop(columns=drop_cols)
226
+
227
+ # now mongo_df_2 is your final
228
+ else:
229
+ # at least one of the required columns is missing:
230
+ # present contains the ones you did find
231
+ missing = _optional - present
232
+ print(f"Skipping normalize process because these columns are missing: {missing}")
233
+ mongo_df_2 = mongo_df.copy() # or however you want to proceed
234
+
235
+ mongo_df_2["values_first"] = mongo_df_2["values"].apply(
236
+ lambda lst: lst[0] if isinstance(lst, list) and len(lst) > 0 else {}
237
+ )
238
 
239
+ # 2. Normalize that dict into separate columns
240
+ values_expanded = pd.json_normalize(mongo_df_2["values_first"])
241
+ # e.g. this produces columns like “start_date”, “end_date”, etc.
242
+
243
+ # 3. Concatenate back and drop the originals
244
+ mongo_df_2 = pd.concat(
245
+ [
246
+ mongo_df_2.drop(columns=["values", "values_first", "start_date", "end_date"]),
247
+ values_expanded
248
+ ],
249
+ axis=1
250
+ )
251
 
252
  # Convert the date columns to datetime objects
253
  for col in ["publication_date", "start_date", "end_date"]:
 
258
  # mongo_df_2[col] = mongo_df_2[col].dt.tz_convert("Europe/Paris")
259
 
260
  # mongo_df_2 = mongo_df_2.drop_duplicates(subset='identifier', keep='first')
261
+ mongo_df_2['version'] = mongo_df_2['version'].astype(float)
262
  # Sort by identifier and version to ensure the latest version is at the top
263
  # Method 1: Use groupby + idxmax to pick the row with the largest version per identifier
264
  idx = mongo_df_2.groupby("identifier")["version"].idxmax()