Spaces:

Daveabc12
/

Backtesting-App

Running

App Files Files Community

Daveabc12 committed on Dec 6, 2025

Commit

f252f94

verified ·

1 Parent(s): 909a3e9

Upload app.py

Browse files

Files changed (1) hide show

app.py +26 -17

app.py CHANGED Viewed

@@ -438,29 +438,38 @@ def load_all_data(folder_path):
 def clean_data_and_report_outliers(df):
     """
-    Scans for massive daily percentage moves (>100%) which are likely data errors
-    and sets them to NaN. Preserved from original code.
     """
     outlier_report = []
-    # Identify price columns (exclude _Volume, _High, _Low if present)
-    price_columns = [col for col in df.columns if not ('_Volume' in str(col) or '_High' in str(col) or '_Low' in str(col))]
     for ticker in price_columns:
-        # Check if column exists
         if ticker in df.columns:
-            # Ensure it's numeric before pct_change
-            numeric_prices = pd.to_numeric(df[ticker], errors='coerce').replace(0, np.nan)
-            if numeric_prices.isna().all(): continue # Skip fully NaN columns
-            daily_pct_change = numeric_prices.pct_change().abs()
-            # Outlier Threshold: > 1.0 (100% change in one day)
-            outlier_days = daily_pct_change[daily_pct_change > 1.0].index
-            if not outlier_days.empty:
-                outlier_report.append({'Ticker': ticker, 'Outliers Removed': len(outlier_days)})
-                df.loc[outlier_days, ticker] = np.nan # Set outliers to NaN
     return df, outlier_report
 # --- 2. Custom Backtesting Engine ---

 def clean_data_and_report_outliers(df, *, window=20, low_ratio=0.20, high_ratio=5.0):
     """
     Null out sudden price glitches using a centered rolling-median filter.

     For every column whose name does not contain '_Volume', '_High' or '_Low',
     each value is compared with the rolling median of its neighbourhood.
     Values below ``low_ratio`` times the median (e.g. a 'Pence vs Pounds'
     crash such as 154 -> 1.56) or above ``high_ratio`` times the median
     (a spike) are overwritten with NaN.

     Args:
         df: DataFrame with one column per series. It is modified IN PLACE.
         window: Number of rows in the rolling window (default 20, about one
             trading month).
         low_ratio: Values below ``median * low_ratio`` are treated as bad.
         high_ratio: Values above ``median * high_ratio`` are treated as bad.

     Returns:
         A ``(df, outlier_report)`` tuple. ``df`` is the same object that was
         passed in; ``outlier_report`` holds one
         ``{'Ticker', 'Type', 'Count'}`` dict per column that had values removed.

     Raises:
         ValueError: If the ratios do not satisfy ``0 < low_ratio < high_ratio``.
     """
     if not 0 < low_ratio < high_ratio:
         raise ValueError("low_ratio must be positive and smaller than high_ratio")
     excluded_markers = ('_Volume', '_High', '_Low')
     price_columns = [
         col for col in df.columns
         if not any(marker in str(col) for marker in excluded_markers)
     ]
     outlier_report = []
     for ticker in price_columns:
         # Non-numeric cells become NaN and are ignored by the comparisons below.
         series = pd.to_numeric(df[ticker], errors='coerce')
         # Build the reference level from positive values only. Zero padding or
         # negative junk would otherwise drag the median to <= 0, which collapses
         # or inverts both thresholds and wipes out valid prices next to it.
         # center=True looks both backwards and forwards; min_periods=1 keeps the
         # filter active at the very start and end of the data.
         baseline = (
             series.where(series > 0)
             .rolling(window=window, center=True, min_periods=1)
             .median()
         )
         too_low = series < baseline * low_ratio
         too_high = series > baseline * high_ratio
         # A positional boolean mask hits exactly the flagged rows, even when the
         # index contains duplicate labels.
         bad_data_mask = (too_low | too_high).to_numpy()
         bad_count = int(bad_data_mask.sum())
         if bad_count:
             df.loc[bad_data_mask, ticker] = np.nan
             outlier_report.append({'Ticker': ticker, 'Type': 'Rolling Filter', 'Count': bad_count})
     return df, outlier_report
 # --- 2. Custom Backtesting Engine ---