Daveabc12 committed on
Commit
f252f94
·
verified ·
1 Parent(s): 909a3e9

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -438,29 +438,38 @@ def load_all_data(folder_path):
438
 
439
  def clean_data_and_report_outliers(df):
440
  """
441
- Scans for massive daily percentage moves (>100%) which are likely data errors
442
- and sets them to NaN. Preserved from original code.
443
  """
444
  outlier_report = []
445
- # Identify price columns (exclude _Volume, _High, _Low if present)
446
- price_columns = [col for col in df.columns if not ('_Volume' in str(col) or '_High' in str(col) or '_Low' in str(col))]
447
 
448
  for ticker in price_columns:
449
- # Check if column exists
450
  if ticker in df.columns:
451
- # Ensure it's numeric before pct_change
452
- numeric_prices = pd.to_numeric(df[ticker], errors='coerce').replace(0, np.nan)
453
-
454
- if numeric_prices.isna().all(): continue # Skip fully NaN columns
455
-
456
- daily_pct_change = numeric_prices.pct_change().abs()
457
- # Outlier Threshold: > 1.0 (100% change in one day)
458
- outlier_days = daily_pct_change[daily_pct_change > 1.0].index
459
 
460
- if not outlier_days.empty:
461
- outlier_report.append({'Ticker': ticker, 'Outliers Removed': len(outlier_days)})
462
- df.loc[outlier_days, ticker] = np.nan # Set outliers to NaN
463
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  return df, outlier_report
465
 
466
  # --- 2. Custom Backtesting Engine ---
 
438
 
439
  def clean_data_and_report_outliers(df):
440
  """
441
+ Cleans data using a 'Rolling Median' filter (User Defined: 1 Month Window).
442
+ Adapts to long-term trends while catching sudden 'Pence vs Pound' glitches.
443
  """
444
  outlier_report = []
445
+ # Identify price columns (exclude _Volume, _High, _Low)
446
+ price_columns = [col for col in df.columns if not any(x in str(col) for x in ['_Volume', '_High', '_Low'])]
447
 
448
  for ticker in price_columns:
 
449
  if ticker in df.columns:
450
+ # Ensure numeric
451
+ series = pd.to_numeric(df[ticker], errors='coerce')
 
 
 
 
 
 
452
 
453
+ # --- ROLLING MEDIAN FILTER (1 Month / 20 Days) ---
454
+ # center=True looks at 10 days before and 10 days after (if available) to find the 'true' level.
455
+ # min_periods=1 ensures it works even at the start of the file.
456
+ rolling_median = series.rolling(window=20, center=True, min_periods=1).median()
457
+
458
+ # 1. Catch the 'Pence vs Pounds' Crash (e.g., 154 -> 1.56)
459
+ # Logic: If price is < 20% of the recent median (an 80% drop).
460
+ low_threshold = rolling_median * 0.20
461
+
462
+ # 2. Catch massive data spikes (e.g., 1.56 -> 154 if logic reversed)
463
+ # Logic: If price is > 5x the recent median (500% spike).
464
+ high_threshold = rolling_median * 5.0
465
+
466
+ bad_data_mask = (series < low_threshold) | (series > high_threshold)
467
+ bad_days = series[bad_data_mask].index
468
+
469
+ if not bad_days.empty:
470
+ df.loc[bad_days, ticker] = np.nan
471
+ outlier_report.append({'Ticker': ticker, 'Type': 'Rolling Filter', 'Count': len(bad_days)})
472
+
473
  return df, outlier_report
474
 
475
  # --- 2. Custom Backtesting Engine ---