P2SAMAPA committed on
Commit
19c35a0
·
unverified ·
1 Parent(s): ff640d6

Update data_manager.py

Browse files
Files changed (1) hide show
  1. data_manager.py +30 -33
data_manager.py CHANGED
@@ -163,41 +163,38 @@ def smart_update_hf_dataset(new_data, token, force_upload=False):
163
  if existing_df.index.tz is not None:
164
  existing_df.index = existing_df.index.tz_localize(None)
165
 
166
- # ── Step 1: merge recent new_data on top of existing ─────────────────
167
- combined = new_data.combine_first(existing_df)
168
-
169
- # ── Step 2: detect ETFs missing / all-NaN in existing dataset ────────
170
- new_etf_cols = []
171
- all_etfs = [c.replace("_Ret", "") for c in new_data.columns if c.endswith("_Ret")]
172
- for etf in all_etfs:
173
- ret_col = f"{etf}_Ret"
174
- if ret_col not in existing_df.columns or existing_df[ret_col].isna().mean() > 0.9:
175
- new_etf_cols.append(etf)
176
-
 
 
 
177
  if new_etf_cols:
178
- st.info(f"πŸ†• New ETFs detected: {new_etf_cols} β€” fetching full history from 2008...")
179
- full_history = fetch_etf_data(new_etf_cols, start_date="2008-01-01")
180
- if not full_history.empty:
181
- if full_history.index.tz is not None:
182
- full_history.index = full_history.index.tz_localize(None)
183
- # Expand combined to cover all dates in full_history
184
- full_index = combined.index.union(full_history.index)
185
- combined = combined.reindex(full_index)
186
- # Write full history directly into combined (overwrites NaN-only columns)
187
- cols_to_backfill = [
188
- c for c in full_history.columns
189
- if c not in combined.columns or combined[c].isna().mean() > 0.9
190
- ]
191
- for col in cols_to_backfill:
192
- combined[col] = full_history.reindex(full_index)[col]
193
- st.success(
194
- f"βœ… Full history fetched for {new_etf_cols}: "
195
- f"{len(full_history)} rows "
196
- f"({full_history.index[0].date()} β†’ {full_history.index[-1].date()}), "
197
- f"{len(cols_to_backfill)} columns backfilled"
198
  )
199
- else:
200
- st.warning(f"⚠️ Could not fetch full history for {new_etf_cols}")
201
 
202
  # ── Step 3: decide whether to upload ─────────────────────────────────
203
  new_rows = len(combined) - len(existing_df)
 
163
  if existing_df.index.tz is not None:
164
  existing_df.index = existing_df.index.tz_localize(None)
165
 
166
+ # ── Step 1: fetch FULL history for all target ETFs from 2008 ─────────
167
+ # Always fetch full history so new ETFs get complete backfill and
168
+ # existing ETFs stay current. This is the authoritative data source.
169
+ st.info("πŸ“‘ Fetching full ETF history from 2008...")
170
+ full_etf = fetch_etf_data(ETF_LIST, start_date="2008-01-01")
171
+ if full_etf.index.tz is not None:
172
+ full_etf.index = full_etf.index.tz_localize(None)
173
+
174
+ # Detect which ETFs are new (missing or all-NaN in existing dataset)
175
+ new_etf_cols = [
176
+ etf for etf in ETF_LIST
177
+ if f"{etf}_Ret" not in existing_df.columns
178
+ or existing_df[f"{etf}_Ret"].isna().mean() > 0.9
179
+ ]
180
  if new_etf_cols:
181
+ st.info(f"πŸ†• New ETFs: {new_etf_cols} β€” will be fully backfilled")
182
+
183
+ # ── Step 2: build combined β€” full ETF history + existing macro ────────
184
+ # Start from existing, expand index to cover all ETF dates
185
+ full_index = existing_df.index.union(full_etf.index)
186
+ combined = existing_df.reindex(full_index)
187
+ # Write all ETF columns from full history (overwrites stale/NaN data)
188
+ for col in full_etf.columns:
189
+ combined[col] = full_etf.reindex(full_index)[col]
190
+ # Merge in any macro columns from new_data not already in combined
191
+ for col in new_data.columns:
192
+ if col not in full_etf.columns:
193
+ combined[col] = new_data.reindex(full_index)[col].combine_first(
194
+ combined.get(col, pd.Series(dtype=float))
 
 
 
 
 
 
195
  )
196
+ st.success(f"βœ… ETF history: {len(full_etf)} rows, {len(full_etf.columns)} columns"
197
+ + (f" | New ETFs backfilled: {new_etf_cols}" if new_etf_cols else ""))
198
 
199
  # ── Step 3: decide whether to upload ─────────────────────────────────
200
  new_rows = len(combined) - len(existing_df)