Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -438,29 +438,38 @@ def load_all_data(folder_path):
|
|
| 438 |
|
| 439 |
def clean_data_and_report_outliers(df):
|
| 440 |
"""
|
| 441 |
-
|
| 442 |
-
|
| 443 |
"""
|
| 444 |
outlier_report = []
|
| 445 |
-
# Identify price columns (exclude _Volume, _High, _Low
|
| 446 |
-
price_columns = [col for col in df.columns if not (
|
| 447 |
|
| 448 |
for ticker in price_columns:
|
| 449 |
-
# Check if column exists
|
| 450 |
if ticker in df.columns:
|
| 451 |
-
# Ensure
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
if numeric_prices.isna().all(): continue # Skip fully NaN columns
|
| 455 |
-
|
| 456 |
-
daily_pct_change = numeric_prices.pct_change().abs()
|
| 457 |
-
# Outlier Threshold: > 1.0 (100% change in one day)
|
| 458 |
-
outlier_days = daily_pct_change[daily_pct_change > 1.0].index
|
| 459 |
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
return df, outlier_report
|
| 465 |
|
| 466 |
# --- 2. Custom Backtesting Engine ---
|
|
|
|
| 438 |
|
| 439 |
def clean_data_and_report_outliers(df, *, window=20, low_ratio=0.20, high_ratio=5.0):
    """
    Blank out price glitches using a centered rolling-median filter.

    Every column whose name does not contain '_Volume', '_High' or '_Low' is
    treated as a price column. For each one, a value is considered bad when it
    is below ``low_ratio`` times, or above ``high_ratio`` times, the rolling
    median around it (e.g. a 'Pence vs Pound' glitch such as 154 -> 1.56).
    Bad values are replaced with NaN.

    Args:
        df: DataFrame of prices, one column per ticker. It is modified in
            place; the same object is also returned.
        window: Number of rows in the centered rolling window
            (default 20, roughly one trading month).
        low_ratio: Values below ``median * low_ratio`` are flagged
            (default 0.20, i.e. an 80% drop).
        high_ratio: Values above ``median * high_ratio`` are flagged
            (default 5.0).

    Returns:
        A ``(df, outlier_report)`` tuple. ``outlier_report`` is a list with
        one dict per affected column:
        ``{'Ticker': <column>, 'Type': 'Rolling Filter', 'Count': <n bad rows>}``.
    """
    outlier_report = []
    excluded_suffixes = ('_Volume', '_High', '_Low')
    price_columns = [
        col for col in df.columns
        if not any(suffix in str(col) for suffix in excluded_suffixes)
    ]

    for ticker in price_columns:
        # Coerce only for detection: unparseable entries become NaN here and
        # are never flagged (comparisons with NaN are False). The column in
        # `df` itself is not rewritten with the coerced values.
        series = pd.to_numeric(df[ticker], errors='coerce')

        # center=True looks both backwards and forwards to estimate the local
        # 'true' level; min_periods=1 keeps the filter usable at the edges.
        rolling_median = series.rolling(
            window=window, center=True, min_periods=1
        ).median()

        # NOTE(review): the ratio thresholds assume positive prices.
        low_threshold = rolling_median * low_ratio
        high_threshold = rolling_median * high_ratio

        # Positional boolean mask, so that only the offending rows are blanked
        # even when the index contains duplicate labels.
        bad_rows = ((series < low_threshold) | (series > high_threshold)).to_numpy()
        bad_count = int(bad_rows.sum())

        if bad_count:
            df.loc[bad_rows, ticker] = np.nan
            outlier_report.append(
                {'Ticker': ticker, 'Type': 'Rolling Filter', 'Count': bad_count}
            )

    return df, outlier_report
|
| 474 |
|
| 475 |
# --- 2. Custom Backtesting Engine ---
|