yasirapunsith commited on
Commit
c9b7b21
·
1 Parent(s): fbc408c
utils/FixedLengthTransformer.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fixed length transformer, pad or truncate panel to fixed length."""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from sktime.transformations.base import BaseTransformer
7
+ from sktime.utils.pandas import df_map
8
+
9
+ __all__ = ["FixedLengthTransformer"]
10
+ __author__ = ["user"]
11
+
12
+
13
class FixedLengthTransformer(BaseTransformer):
    """Transform panel of variable length time series to fixed length.

    Transforms input dataset to a fixed length by either:

    - Padding shorter series with a fill value (default: 0)
    - Truncating longer series to the specified length

    Unlike PaddingTransformer, this transformer requires a fixed_length
    parameter and will both pad and truncate as needed.

    Parameters
    ----------
    fixed_length : int
        The exact length that all series will be transformed to
    fill_value : any, optional (default=0)
        The value used to pad shorter series

    Example
    -------
    >>> import pandas as pd
    >>> data = {
    ...     'feature1': [
    ...         pd.Series([1, 2, 3]), pd.Series([4, 5]), pd.Series([6, 7, 8, 9])
    ...     ],
    ...     'feature2': [
    ...         pd.Series([10, 11]), pd.Series([12, 13, 14]), pd.Series([15])
    ...     ]
    ... }
    >>> X = pd.DataFrame(data)
    >>> transformer = FixedLengthTransformer(fixed_length=3)
    >>> Xt = transformer.fit_transform(X)
    """

    _tags = {
        "authors": ["user"],
        "maintainers": ["user"],
        "scitype:transform-input": "Series",
        "scitype:transform-output": "Series",
        "scitype:instancewise": False,
        "X_inner_mtype": "nested_univ",
        "y_inner_mtype": "None",
        "fit_is_empty": True,  # No need to compute anything during fit
        "capability:unequal_length:removes": True,
    }

    def __init__(self, fixed_length, fill_value=0):
        if fixed_length is None or fixed_length <= 0:
            raise ValueError("fixed_length must be a positive integer")

        self.fixed_length = fixed_length
        self.fill_value = fill_value
        super().__init__()

    def _fit(self, X, y=None):
        """Fit transformer to X and y.

        This is a no-op: the transform is fully determined by the
        ``fixed_length`` and ``fill_value`` parameters.  With the
        ``fit_is_empty`` tag set, the base class never calls this.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_features]
            each cell of X must contain pandas.Series
        y : ignored argument for interface compatibility

        Returns
        -------
        self : reference to self
        """
        return self

    def _transform_series(self, series):
        """Transform a single series to fixed length by padding or truncating.

        Parameters
        ----------
        series : pandas.Series
            The input series to transform

        Returns
        -------
        numpy.ndarray
            Array of length ``self.fixed_length``
        """
        series_length = len(series)

        if series_length == self.fixed_length:
            # Series is already the correct length
            return series.to_numpy()
        if series_length < self.fixed_length:
            # Pad by concatenation rather than writing into a
            # float-initialised buffer: numpy promotes dtypes consistently,
            # so padded cells keep the same dtype behaviour as truncated or
            # untouched cells, and a non-numeric fill_value no longer fails
            # on coercion to float.
            padding = np.full(self.fixed_length - series_length, self.fill_value)
            return np.concatenate([series.to_numpy(), padding])
        # Truncate the series
        return series.iloc[: self.fixed_length].to_numpy()

    def _transform(self, X, y=None):
        """Transform X and return a transformed version.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_features]
            each cell of X must contain pandas.Series
        y : ignored argument for interface compatibility

        Returns
        -------
        Xt : nested pandas DataFrame of shape [n_instances, n_features]
            each cell of Xt contains pandas.Series with fixed length
        """
        # Elementwise map preserves X's index and columns; the previous
        # row-wise rebuild reset the index to a RangeIndex, which breaks
        # alignment for panels with a non-default instance index.
        return df_map(X)(lambda cell: pd.Series(self._transform_series(cell)))
utils/drawPlots.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ # import mplfinance as mpf
3
+
4
def plot_ohlc_segment(data_segment):
    """Plot a segment of OHLC data using mplfinance (currently disabled).

    Parameters
    ----------
    data_segment : pd.DataFrame
        A DataFrame containing columns ['Open', 'High', 'Low', 'Close', 'Volume']

    Returns
    -------
    None
        Plotting is disabled; this function is a no-op stub.
    """
    # Plotting is disabled (the mplfinance import at module top is commented
    # out).  The former implementation is kept below as real comments — the
    # original kept it as a bare triple-quoted string, which is evaluated and
    # discarded at every call instead of being ignored by the parser.
    #
    # # Ensure the DataFrame index is datetime for mplfinance
    # data_segment = data_segment.copy()
    # data_segment.index = pd.date_range(start='2024-01-01', periods=len(data_segment), freq='D')
    #
    # # Plot the candlestick chart
    # mpf.plot(data_segment, type='candle', style='charles',
    #          volume=True, ylabel='Price', ylabel_lower='Volume',
    #          title="OHLC Segment", figsize=(10, 6))
    return None
23
+
24
+
utils/functionalPatternLocateAndPlot.py ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# import matplotlib
# matplotlib.use('Agg')
import numpy as np
from scipy.signal import find_peaks

from utils.formatAndPreprocessNewPatterns import get_pattern_encoding
6
+
7
+ path = 'Datasets/OHLC data'
8
+ pattern_encoding = get_pattern_encoding()
9
+
10
def calc_head_and_sholder_top(row, ohlc_data_pattern_segment):
    """Locate a Head & Shoulders top in an OHLC segment and annotate ``row``.

    (Name keeps the historical 'sholder' spelling for caller compatibility.)

    Parameters
    ----------
    row : dict-like supporting item assignment
        Record annotated in place with the detected landmark dates.
    ohlc_data_pattern_segment : pandas.DataFrame
        Segment with at least 'Date', 'High' and 'Low' columns.

    Returns
    -------
    row with 'HS_*' landmark dates, 'Peak_Dates', 'Valley_Dates',
    'Calc_Start' and 'Calc_End' added, or None if no valid pattern is found.
    """
    high_prices = ohlc_data_pattern_segment['High'].values
    low_prices = ohlc_data_pattern_segment['Low'].values

    # Adjust this parameter to suit your data – lower values detect smaller features.
    prominence_value = 0.1

    # Find peaks (local maxima)
    peak_indices, _ = find_peaks(high_prices, prominence=prominence_value)
    # Find valleys (local minima) by inverting the low prices
    valley_indices, _ = find_peaks(-low_prices, prominence=prominence_value)

    # create a list of dates for peaks and valleys
    peak_dates = ohlc_data_pattern_segment['Date'].iloc[peak_indices]
    valley_dates = ohlc_data_pattern_segment['Date'].iloc[valley_indices]

    if len(peak_indices) < 3 or len(valley_indices) < 2:
        print("Not enough peaks and valleys to form a Head & Shoulders pattern.")
        return

    try:
        # Head = highest peak; shoulders = highest peak on each side of it.
        H_index = np.argmax(high_prices[peak_indices])
        H = peak_indices[H_index]
        LS_index = np.argmax(high_prices[peak_indices[0:H_index]])
        LS = peak_indices[LS_index]
        RS_index = np.argmax(high_prices[peak_indices[H_index + 1:]]) + H_index + 1
        RS = peak_indices[RS_index]

        # Neckline anchors: deepest valley on each side of the head.
        vally_left = valley_indices[(valley_indices > LS) & (valley_indices < H)]
        vally_right = valley_indices[(valley_indices > H) & (valley_indices < RS)]
        NL1 = vally_left[np.argmin(low_prices[vally_left])]
        NL2 = vally_right[np.argmin(low_prices[vally_right])]

        # Ensure the middle peak is the highest
        if high_prices[H] <= max(high_prices[LS], high_prices[RS]):
            print("Not a valid Head & Shoulders pattern.")
            return

        LS_date = ohlc_data_pattern_segment['Date'].iloc[LS]
        H_date = ohlc_data_pattern_segment['Date'].iloc[H]
        RS_date = ohlc_data_pattern_segment['Date'].iloc[RS]
        NL1_date = ohlc_data_pattern_segment['Date'].iloc[NL1]
        NL2_date = ohlc_data_pattern_segment['Date'].iloc[NL2]

        # add the dates to the row
        row['HS_Left_Shoulder'] = LS_date
        row['HS_Head'] = H_date
        row['HS_Right_Shoulder'] = RS_date
        row['HS_Neckline_1'] = NL1_date
        row['HS_Neckline_2'] = NL2_date
        row['Peak_Dates'] = peak_dates
        row['Valley_Dates'] = valley_dates
        row['Calc_Start'] = LS_date
        row['Calc_End'] = RS_date

        return row
    except (ValueError, IndexError):
        # argmax/argmin on an empty side, or a missing valley between the
        # landmarks.  Was a bare `except:`, which also masked the NameError
        # from the (previously) missing numpy import.
        print("Error in finding the peaks or valleys in the Head and Shoulders pattern")
        return
69
+
70
def calc_head_and_shoulder_bottom(row, ohlc_data_pattern_segment):
    """Locate a Head & Shoulders bottom (inverse H&S) and annotate ``row``.

    Parameters
    ----------
    row : dict-like supporting item assignment
        Record annotated in place with the detected landmark dates.
    ohlc_data_pattern_segment : pandas.DataFrame
        Segment with at least 'Date', 'High' and 'Low' columns.

    Returns
    -------
    row with 'HS_*' landmark dates, 'Valley_Dates', 'Peak_Dates',
    'Calc_Start' and 'Calc_End' added, or None if no valid pattern is found.
    """
    high_prices = ohlc_data_pattern_segment['High'].values
    low_prices = ohlc_data_pattern_segment['Low'].values

    # Adjust this parameter to suit your data – lower values detect smaller features.
    prominence_value = 0.1

    # Find valleys (local minima)
    valley_indices, _ = find_peaks(-low_prices, prominence=prominence_value)
    # Find peaks (local maxima)
    peak_indices, _ = find_peaks(high_prices, prominence=prominence_value)

    # Create lists of dates for valleys and peaks
    valley_dates = ohlc_data_pattern_segment['Date'].iloc[valley_indices]
    peak_dates = ohlc_data_pattern_segment['Date'].iloc[peak_indices]

    if len(valley_indices) < 3 or len(peak_indices) < 2:
        print("Not enough valleys and peaks to form a Head & Shoulders Bottom pattern.")
        return

    try:
        # Head = lowest valley; shoulders = lowest valley on each side of it.
        H_index = np.argmin(low_prices[valley_indices])
        H = valley_indices[H_index]
        LS_index = np.argmin(low_prices[valley_indices[0:H_index]])
        LS = valley_indices[LS_index]
        RS_index = np.argmin(low_prices[valley_indices[H_index + 1:]]) + H_index + 1
        RS = valley_indices[RS_index]

        # Neckline anchors: highest peak on each side of the head.
        peak_left = peak_indices[(peak_indices > LS) & (peak_indices < H)]
        peak_right = peak_indices[(peak_indices > H) & (peak_indices < RS)]
        NL1 = peak_left[np.argmax(high_prices[peak_left])]
        NL2 = peak_right[np.argmax(high_prices[peak_right])]

        # Ensure the middle valley is the lowest
        if low_prices[H] >= min(low_prices[LS], low_prices[RS]):
            print("Not a valid Head & Shoulders Bottom pattern.")
            return

        LS_date = ohlc_data_pattern_segment['Date'].iloc[LS]
        H_date = ohlc_data_pattern_segment['Date'].iloc[H]
        RS_date = ohlc_data_pattern_segment['Date'].iloc[RS]
        NL1_date = ohlc_data_pattern_segment['Date'].iloc[NL1]
        NL2_date = ohlc_data_pattern_segment['Date'].iloc[NL2]

        # Add the detected pattern data to the row
        row['HS_Left_Shoulder'] = LS_date
        row['HS_Head'] = H_date
        row['HS_Right_Shoulder'] = RS_date
        row['HS_Neckline_1'] = NL1_date
        row['HS_Neckline_2'] = NL2_date
        row['Valley_Dates'] = valley_dates
        row['Peak_Dates'] = peak_dates
        row['Calc_Start'] = LS_date
        row['Calc_End'] = RS_date

        return row
    except (ValueError, IndexError):
        # argmin/argmax on an empty side, or a missing peak between the
        # landmarks.  Was a bare `except:`, which also masked the NameError
        # from the (previously) missing numpy import.
        print("Error in finding the valleys or peaks in the Head and Shoulders Bottom pattern")
        return
129
+
130
def calc_double_top_aa(row, ohlc_data_pattern_segment):
    """Locate a Double Top (Adam & Adam) pattern and annotate ``row``.

    Parameters
    ----------
    row : dict-like supporting item assignment
        Record annotated in place with the detected landmark dates.
    ohlc_data_pattern_segment : pandas.DataFrame
        Segment with at least 'Date', 'High' and 'Low' columns.

    Returns
    -------
    row with 'DT_*' landmark dates, 'Peak_Dates', 'Valley_Dates',
    'Calc_Start' and 'Calc_End' added, or None if no pattern is found.
    """
    high_prices = ohlc_data_pattern_segment['High'].values
    low_prices = ohlc_data_pattern_segment['Low'].values

    # Adjust this parameter to suit your data – lower values detect smaller features.
    prominence_value = 0.1

    # Find peaks (local maxima)
    peak_indices, _ = find_peaks(high_prices, prominence=prominence_value)
    # Find valleys (local minima) by inverting the low prices
    valley_indices, _ = find_peaks(-low_prices, prominence=prominence_value)

    # create a list of dates for peaks and valleys
    peak_dates = ohlc_data_pattern_segment['Date'].iloc[peak_indices]
    valley_dates = ohlc_data_pattern_segment['Date'].iloc[valley_indices]

    if len(peak_indices) < 2 or len(valley_indices) < 1:
        print("Not enough peaks and valleys to form a Double Top pattern.")
        return

    try:
        # First top = highest peak; second top = highest peak to its right.
        H1_index = np.argmax(high_prices[peak_indices])
        H1 = peak_indices[H1_index]
        H2_index = np.argmax(high_prices[peak_indices[H1_index + 1:]]) + H1_index + 1
        H2 = peak_indices[H2_index]
        # Intervening valley between the two tops.  NOTE(review): this picks
        # the valley with the *highest* low (np.argmax) — presumably
        # intentional, but confirm against the pattern definition.
        valley_indices_between_H1_H2 = valley_indices[(valley_indices > H1) & (valley_indices < H2)]
        V = valley_indices_between_H1_H2[np.argmax(low_prices[valley_indices_between_H1_H2])]

        H1_date = ohlc_data_pattern_segment['Date'].iloc[H1]
        H2_date = ohlc_data_pattern_segment['Date'].iloc[H2]
        V_date = ohlc_data_pattern_segment['Date'].iloc[V]

        # add the dates to the row
        row['DT_Peak_1'] = H1_date
        row['DT_Peak_2'] = H2_date
        row['DT_Valley'] = V_date
        row['Peak_Dates'] = peak_dates
        row['Valley_Dates'] = valley_dates
        row['Calc_Start'] = H1_date
        row['Calc_End'] = H2_date

        return row
    except (ValueError, IndexError):
        # No peak right of H1, or no valley between the two tops (empty
        # argmax input).  Was a bare `except:`, which also masked the
        # NameError from the (previously) missing numpy import.
        print("Error in finding the peaks or valleys in the Double Top pattern")
        return
182
+
183
def calc_double_bottom_aa(row, ohlc_data_pattern_segment):
    """Locate a Double Bottom (Adam & Adam) pattern and annotate ``row``.

    Parameters
    ----------
    row : dict-like supporting item assignment
        Record annotated in place with the detected landmark dates.
    ohlc_data_pattern_segment : pandas.DataFrame
        Segment with at least 'Date', 'High' and 'Low' columns.

    Returns
    -------
    row with 'DB_*' landmark dates, 'Valley_Dates', 'Peak_Dates',
    'Calc_Start' and 'Calc_End' added, or None if no pattern is found.
    """
    high_prices = ohlc_data_pattern_segment['High'].values
    low_prices = ohlc_data_pattern_segment['Low'].values

    # Adjust this parameter to suit your data – lower values detect smaller features.
    prominence_value = 0.05

    # Find valleys (local minima)
    valley_indices, _ = find_peaks(-low_prices, prominence=prominence_value)
    # Find peaks (local maxima)
    peak_indices, _ = find_peaks(high_prices, prominence=prominence_value)

    # Create lists of dates for valleys and peaks
    valley_dates = ohlc_data_pattern_segment['Date'].iloc[valley_indices]
    peak_dates = ohlc_data_pattern_segment['Date'].iloc[peak_indices]

    if len(valley_indices) < 2 or len(peak_indices) < 1:
        print("Not enough valleys and peaks to form a Double Bottom pattern.")
        return

    try:
        # First bottom = lowest valley; second bottom = lowest valley to
        # its right.
        H1_index = np.argmin(low_prices[valley_indices])
        H1 = valley_indices[H1_index]
        H2_index = np.argmin(low_prices[valley_indices[H1_index + 1:]]) + H1_index + 1
        H2 = valley_indices[H2_index]
        # Highest peak between the two bottoms.
        peak_indices_between_H1_H2 = peak_indices[(peak_indices > H1) & (peak_indices < H2)]
        P = peak_indices_between_H1_H2[np.argmax(high_prices[peak_indices_between_H1_H2])]

        H1_date = ohlc_data_pattern_segment['Date'].iloc[H1]
        H2_date = ohlc_data_pattern_segment['Date'].iloc[H2]
        P_date = ohlc_data_pattern_segment['Date'].iloc[P]

        # Add the detected pattern data to the row
        row['DB_Valley_1'] = H1_date
        row['DB_Valley_2'] = H2_date
        row['DB_Peak'] = P_date
        row['Valley_Dates'] = valley_dates
        row['Peak_Dates'] = peak_dates
        row['Calc_Start'] = H1_date
        row['Calc_End'] = H2_date

        return row
    except (ValueError, IndexError):
        # No valley right of H1, or no peak between the two bottoms (empty
        # argmin/argmax input).  Was a bare `except:`, which also masked the
        # NameError from the (previously) missing numpy import.
        print("Error in finding the valleys or peaks in the Double Bottom pattern")
        return
234
+
235
def calc_double_bottom_ea(row, ohlc_data_pattern_segment):
    """Locate a Double Bottom (Eve & Adam) pattern and annotate ``row``.

    Like :func:`calc_double_bottom_aa`, but the first bottom is searched
    among wide, rounded ("Eve") valleys.

    Parameters
    ----------
    row : dict-like supporting item assignment
        Record annotated in place with the detected landmark dates.
    ohlc_data_pattern_segment : pandas.DataFrame
        Segment with at least 'Date', 'High' and 'Low' columns.

    Returns
    -------
    row with 'DB_*' landmark dates, 'Valley_Dates', 'Peak_Dates',
    'Calc_Start' and 'Calc_End' added, or None if no pattern is found.
    """
    high_prices = ohlc_data_pattern_segment['High'].values
    low_prices = ohlc_data_pattern_segment['Low'].values

    # Adjust this parameter to suit your data – lower values detect smaller features.
    prominence_value = 0.1

    # Find valleys (local minima)
    valley_indices, _ = find_peaks(-low_prices, prominence=prominence_value)
    # Find peaks (local maxima)
    peak_indices, _ = find_peaks(high_prices, prominence=prominence_value)

    # Wide, shallow ("rounded") valleys for the Eve bottom.
    round_vallies, _ = find_peaks(-low_prices, prominence=0.01, width=3, threshold=0.01)

    # Create lists of dates for valleys and peaks
    valley_dates = ohlc_data_pattern_segment['Date'].iloc[valley_indices]
    peak_dates = ohlc_data_pattern_segment['Date'].iloc[peak_indices]

    if len(valley_indices) < 2 or len(peak_indices) < 1:
        print("Not enough valleys and peaks to form a Double Bottom pattern.")
        return

    try:
        # NOTE(review): H1_index is a position within `round_vallies`, yet it
        # is used to index `valley_indices` — the two arrays are generally
        # different, so this looks like an index-mismatch bug (should it be
        # round_vallies[H1_index]?).  Behavior kept as-is; confirm intent.
        H1_index = np.argmin(low_prices[round_vallies])
        H1 = valley_indices[H1_index]
        H2_index = np.argmin(low_prices[valley_indices[H1_index + 1:]]) + H1_index + 1
        H2 = valley_indices[H2_index]
        # Highest peak between the two bottoms.
        peak_indices_between_H1_H2 = peak_indices[(peak_indices > H1) & (peak_indices < H2)]
        P = peak_indices_between_H1_H2[np.argmax(high_prices[peak_indices_between_H1_H2])]

        H1_date = ohlc_data_pattern_segment['Date'].iloc[H1]
        H2_date = ohlc_data_pattern_segment['Date'].iloc[H2]
        P_date = ohlc_data_pattern_segment['Date'].iloc[P]

        # Add the detected pattern data to the row
        row['DB_Valley_1'] = H1_date
        row['DB_Valley_2'] = H2_date
        row['DB_Peak'] = P_date
        row['Valley_Dates'] = valley_dates
        row['Peak_Dates'] = peak_dates
        row['Calc_Start'] = H1_date
        row['Calc_End'] = H2_date

        return row
    except (ValueError, IndexError):
        # Empty rounded-valley set, no valley right of H1, or no peak
        # between the two bottoms.  Was a bare `except:`, which also masked
        # the NameError from the (previously) missing numpy import.
        print("Error in finding the valleys or peaks in the Double Bottom pattern")
        return
288
+
289
+
290
+
291
+ # Commenting out all plotting functions
292
+ """
293
+ import matplotlib.pyplot as plt
294
+ import mplfinance as mpf
295
+ import pandas as pd
296
+ import numpy as np
297
+ import pandas as pd
298
+ import matplotlib.pyplot as plt
299
+ import mplfinance as mpf
300
+ from scipy.signal import argrelextrema
301
+ from scipy.signal import find_peaks
302
+
303
+ def draw_head_and_shoulders_top(ax, ohlc_data, pat_start_idx,row):
304
+
305
+ Draws a Head and Shoulders pattern on an existing mplfinance plot and visualizes detected peaks and valleys.
306
+
307
+ Parameters:
308
+ ax (matplotlib.axes.Axes): The candlestick chart's axis.
309
+ ohlc_data (pd.DataFrame): Data containing 'High' and 'Low' columns.
310
+
311
+ # reset the index of the ohlc_data
312
+ ohlc_data.reset_index(drop=True, inplace=True)
313
+ high_prices = ohlc_data['High'].values
314
+ low_prices = ohlc_data['Low'].values
315
+
316
+ # check if 'Peak_Dates' and 'Valley_Dates' columns are present in the row
317
+ if 'Peak_Dates' in row and 'Valley_Dates' in row:
318
+
319
+ peak_days = row['Peak_Dates']
320
+ valley_days = row['Valley_Dates']
321
+
322
+
323
+ peak_indices = ohlc_data[ohlc_data['Date'].isin(peak_days)].index
324
+ # add the pat_start_idx to the peak_indices
325
+ peak_indices = peak_indices
326
+
327
+ valley_indices = ohlc_data[ohlc_data['Date'].isin(valley_days)].index
328
+ # add the pat_start_idx to the valley_indices
329
+ valley_indices = valley_indices
330
+
331
+ # Debugging visualization: Plot detected peaks and valleys
332
+ ax.scatter(peak_indices , high_prices[peak_indices], color='green', marker='^', label='Peaks', zorder=3)
333
+ ax.scatter(valley_indices, low_prices[valley_indices], color='red', marker='v', label='Valleys', zorder=3)
334
+
335
+ calc_start_date = row['Calc_Start']
336
+ calc_end_date = row['Calc_End']
337
+
338
+ calc_start_idx = ohlc_data[ohlc_data['Date']== calc_start_date].index
339
+ calc_end_idx = ohlc_data[ohlc_data['Date']== calc_end_date].index
340
+
341
+ # drow a pink dotted vertical line at calc_start_idx and calc_end_idx
342
+ ax.axvline(x=calc_start_idx, color='blue', linestyle='dotted', linewidth=1)
343
+ ax.axvline(x=calc_end_idx, color='blue', linestyle='dotted', linewidth=1)
344
+
345
+ LS_idx = ohlc_data[ohlc_data['Date']== row['HS_Left_Shoulder']].index
346
+ H_idx = ohlc_data[ohlc_data['Date']== row['HS_Head']].index
347
+ RS_idx = ohlc_data[ohlc_data['Date']== row['HS_Right_Shoulder']].index
348
+ NL1_idx = ohlc_data[ohlc_data['Date']== row['HS_Neckline_1']].index
349
+ NL2_idx = ohlc_data[ohlc_data['Date']== row['HS_Neckline_2']].index
350
+
351
+ # Draw the head and shoulders
352
+ ax.plot([LS_idx, H_idx, RS_idx], [high_prices[LS_idx], high_prices[H_idx], high_prices[RS_idx]],
353
+ linestyle="solid", marker="o", color="blue", linewidth=1, label="H&S Pattern")
354
+
355
+ # Use NL1_idx and NL2_idx as the x-range to keep the line within bounds
356
+ x_min, x_max = min(NL1_idx, NL2_idx), max(NL1_idx, NL2_idx)
357
+
358
+ # Compute the y-values using the line equation (y = mx + c)
359
+ slope = (low_prices[NL2_idx] - low_prices[NL1_idx]) / (NL2_idx - NL1_idx)
360
+ y_min = low_prices[NL1_idx] + slope * (x_min - NL1_idx)
361
+ y_max = low_prices[NL1_idx] + slope * (x_max - NL1_idx)
362
+
363
+ # Plot the line within the original graph size
364
+ ax.plot([x_min, x_max], [y_min, y_max],
365
+ linestyle="dashed", color="red", linewidth=1, label="Neckline")
366
+
367
+
368
+
369
+
370
+
371
+
372
+
373
+ def draw_head_and_shoulders_bottom(ax, ohlc_data, pat_start_idx,row):
374
+
375
+ Draws a Head and Shoulders pattern on an existing mplfinance plot and visualizes detected peaks and valleys.
376
+
377
+ Parameters:
378
+ ax (matplotlib.axes.Axes): The candlestick chart's axis.
379
+ ohlc_data (pd.DataFrame): Data containing 'High' and 'Low' columns.
380
+
381
+ # reset the index of the ohlc_data
382
+ ohlc_data.reset_index(drop=True, inplace=True)
383
+ high_prices = ohlc_data['High'].values
384
+ low_prices = ohlc_data['Low'].values
385
+
386
+ # check if 'Peak_Dates' and 'Valley_Dates' columns are present in the row
387
+ if 'Peak_Dates' in row and 'Valley_Dates' in row:
388
+ peak_days = row['Peak_Dates']
389
+ valley_days = row['Valley_Dates']
390
+
391
+
392
+ peak_indices = ohlc_data[ohlc_data['Date'].isin(peak_days)].index
393
+ # add the pat_start_idx to the peak_indices
394
+ peak_indices = peak_indices
395
+
396
+ valley_indices = ohlc_data[ohlc_data['Date'].isin(valley_days)].index
397
+ # add the pat_start_idx to the valley_indices
398
+ valley_indices = valley_indices
399
+
400
+ # Debugging visualization: Plot detected peaks and valleys
401
+ ax.scatter(peak_indices , high_prices[peak_indices], color='green', marker='^', label='Peaks', zorder=3)
402
+ ax.scatter(valley_indices, low_prices[valley_indices], color='red', marker='v', label='Valleys', zorder=3)
403
+
404
+ calc_start_date = row['Calc_Start']
405
+ calc_end_date = row['Calc_End']
406
+
407
+ calc_start_idx = ohlc_data[ohlc_data['Date']== calc_start_date].index
408
+ calc_end_idx = ohlc_data[ohlc_data['Date']== calc_end_date].index
409
+
410
+ # drow a pink dotted vertical line at calc_start_idx and calc_end_idx
411
+ ax.axvline(x=calc_start_idx, color='blue', linestyle='dotted', linewidth=1)
412
+ ax.axvline(x=calc_end_idx, color='blue', linestyle='dotted', linewidth=1)
413
+
414
+
415
+ LS_idx = ohlc_data[ohlc_data['Date']== row['HS_Left_Shoulder']].index
416
+ H_idx = ohlc_data[ohlc_data['Date']== row['HS_Head']].index
417
+ RS_idx = ohlc_data[ohlc_data['Date']== row['HS_Right_Shoulder']].index
418
+ NL1_idx = ohlc_data[ohlc_data['Date']== row['HS_Neckline_1']].index
419
+ NL2_idx = ohlc_data[ohlc_data['Date']== row['HS_Neckline_2']].index
420
+
421
+ # Draw the head and shoulders
422
+ ax.plot([LS_idx, H_idx, RS_idx], [low_prices[LS_idx], low_prices[H_idx], low_prices[RS_idx]],
423
+ linestyle="solid", marker="o", color="blue", linewidth=1, label="H&S Pattern")
424
+
425
+ # Use NL1_idx and NL2_idx as the x-range to keep the line within bounds
426
+ x_min, x_max = min(NL1_idx, NL2_idx), max(NL1_idx, NL2_idx)
427
+
428
+ # Compute the y-values using the line equation (y = mx + c)
429
+ slope = (high_prices[NL2_idx] - high_prices[NL1_idx]) / (NL2_idx - NL1_idx)
430
+ y_min = high_prices[NL1_idx] + slope * (x_min - NL1_idx)
431
+ y_max = high_prices[NL1_idx] + slope * (x_max - NL1_idx)
432
+
433
+ # Plot the line within the original graph size
434
+ ax.plot([x_min, x_max], [y_min, y_max],
435
+ linestyle="dashed", color="red", linewidth=1, label="Neckline")
436
+
437
+
438
+
439
+ def draw_double_top_aa(ax, ohlc_data, pat_start_idx,row):
440
+
441
+ Draws a Double Top pattern on an existing mplfinance plot and visualizes detected peaks and valleys.
442
+
443
+ Parameters:
444
+ ax (matplotlib.axes.Axes): The candlestick chart's axis.
445
+ ohlc_data (pd.DataFrame): Data containing 'High' and 'Low' columns.
446
+
447
+ # reset the index of the ohlc_data
448
+ ohlc_data.reset_index(drop=True, inplace=True)
449
+ high_prices = ohlc_data['High'].values
450
+ low_prices = ohlc_data['Low'].values
451
+
452
+ # check if 'Peak_Dates' and 'Valley_Dates' columns are present in the row
453
+ if 'Peak_Dates' in row and 'Valley_Dates' in row:
454
+
455
+
456
+ peak_days = row['Peak_Dates']
457
+ valley_days = row['Valley_Dates']
458
+
459
+
460
+ peak_indices = ohlc_data[ohlc_data['Date'].isin(peak_days)].index
461
+ # add the pat_start_idx to the peak_indices
462
+ peak_indices = peak_indices
463
+
464
+ valley_indices = ohlc_data[ohlc_data['Date'].isin(valley_days)].index
465
+ # add the pat_start_idx to the valley_indices
466
+ valley_indices = valley_indices
467
+
468
+ # Debugging visualization: Plot detected peaks and valleys
469
+ ax.scatter(peak_indices , high_prices[peak_indices], color='green', marker='^', label='Peaks', zorder=3)
470
+ ax.scatter(valley_indices, low_prices[valley_indices], color='red', marker='v', label='Valleys', zorder=3)
471
+
472
+
473
+
474
+ DT_Peak_1_idx = ohlc_data[ohlc_data['Date']== row['DT_Peak_1']].index
475
+ DT_Peak_2_idx = ohlc_data[ohlc_data['Date']== row['DT_Peak_2']].index
476
+ DT_Valley_idx = ohlc_data[ohlc_data['Date']== row['DT_Valley']].index
477
+
478
+ # draw the double peaks
479
+ ax.plot([DT_Peak_1_idx,DT_Valley_idx, DT_Peak_2_idx], [high_prices[DT_Peak_1_idx],high_prices[DT_Valley_idx], high_prices[DT_Peak_2_idx]],
480
+ linestyle="solid", marker="o", color="blue", linewidth=1, label="Double Top Pattern")
481
+ # Draw the neckline
482
+ ax.hlines(y=low_prices[DT_Valley_idx], xmin=ax.get_xlim()[0], xmax=ax.get_xlim()[1], color='red', linestyle='dotted', linewidth=1)
483
+
484
+ def draw_double_bottom_aa(ax, ohlc_data, pat_start_idx,row):
485
+
486
+ Draws a Double Bottom pattern on an existing mplfinance plot and visualizes detected peaks and valleys.
487
+
488
+ Parameters:
489
+ ax (matplotlib.axes.Axes): The candlestick chart's axis.
490
+ ohlc_data (pd.DataFrame): Data containing 'High' and 'Low' columns.
491
+
492
+ # reset the index of the ohlc_data
493
+ ohlc_data.reset_index(drop=True, inplace=True)
494
+ high_prices = ohlc_data['High'].values
495
+ low_prices = ohlc_data['Low'].values
496
+
497
+ # check if 'Peak_Dates' and 'Valley_Dates' columns are present in the row
498
+ if 'Peak_Dates' in row and 'Valley_Dates' in row:
499
+
500
+
501
+ peak_days = row['Peak_Dates']
502
+ valley_days = row['Valley_Dates']
503
+
504
+
505
+ peak_indices = ohlc_data[ohlc_data['Date'].isin(peak_days)].index
506
+ # add the pat_start_idx to the peak_indices
507
+ peak_indices = peak_indices
508
+
509
+ valley_indices = ohlc_data[ohlc_data['Date'].isin(valley_days)].index
510
+ # add the pat_start_idx to the valley_indices
511
+ valley_indices = valley_indices
512
+
513
+ # Debugging visualization: Plot detected peaks and valleys
514
+ ax.scatter(peak_indices , high_prices[peak_indices], color='green', marker='^', label='Peaks', zorder=3)
515
+ ax.scatter(valley_indices, low_prices[valley_indices], color='red', marker='v', label='Valleys', zorder=3)
516
+
517
+ DB_Valley_1_idx = ohlc_data[ohlc_data['Date']== row['DB_Valley_1']].index
518
+ DB_Valley_2_idx = ohlc_data[ohlc_data['Date']== row['DB_Valley_2']].index
519
+ DB_Peak_idx = ohlc_data[ohlc_data['Date']== row['DB_Peak']].index
520
+
521
+ # draw the double peaks
522
+ ax.plot([DB_Valley_1_idx,DB_Peak_idx, DB_Valley_2_idx], [low_prices[DB_Valley_1_idx],low_prices[DB_Peak_idx], low_prices[DB_Valley_2_idx]],
523
+ linestyle="solid", marker="o", color="blue", linewidth=1, label="Double Bottom Pattern")
524
+ # Draw the neckline
525
+ ax.hlines(y=high_prices[DB_Peak_idx], xmin=ax.get_xlim()[0], xmax=ax.get_xlim()[1], color='red', linestyle='dotted', linewidth=1)
526
+
527
+ def plot_pattern_clusters( test_pattern_segment_wise, ohcl_data_given=None, padding_days=0,draw_lines = False):
528
+ colors = ["blue", "green", "red", "cyan", "magenta", "yellow", "purple", "orange", "brown", "pink", "lime", "teal"]
529
+
530
+ group = test_pattern_segment_wise
531
+
532
+ if ohcl_data_given is None:
533
+ symbol = group['Symbol'].iloc[0]
534
+ ohcl_data = pd.read_csv(path + '/' + symbol + '.csv')
535
+ else:
536
+ ohcl_data = ohcl_data_given
537
+
538
+ ohcl_data['Date'] = pd.to_datetime(ohcl_data['Date'])
539
+ ohcl_data['Date'] = ohcl_data['Date'].dt.tz_localize(None)
540
+
541
+ seg_start = group['Seg_Start'].iloc[0] - pd.to_timedelta(padding_days, unit='D')
542
+ seg_end = group['Seg_End'].iloc[0] + pd.to_timedelta(padding_days, unit='D')
543
+
544
+ ohcl_data = ohcl_data[(ohcl_data['Date'] >= seg_start) & (ohcl_data['Date'] <= seg_end)]
545
+ if ohcl_data.empty:
546
+ print("OHLC Data set is empty")
547
+ return
548
+
549
+ ohlc_for_mpf = ohcl_data[['Open', 'High', 'Low', 'Close']].copy()
550
+ ohlc_for_mpf.index = pd.to_datetime(ohcl_data['Date'])
551
+
552
+ fig, axes = mpf.plot(ohlc_for_mpf, type='candle', style='charles', datetime_format='%Y-%m-%d', returnfig=True)
553
+ ax = axes[0]
554
+
555
+ for _, row in group.iterrows():
556
+ pattern_name = row['Chart Pattern']
557
+ cluster = row['Cluster']
558
+ color = "gray" if cluster == -1 else colors[cluster % len(colors)]
559
+
560
+ pattern_start_date = pd.to_datetime(row['Start']).tz_localize(None)
561
+ pattern_end_date = pd.to_datetime(row['End']).tz_localize(None)
562
+
563
+ num_start = len(ohcl_data[ohcl_data['Date'] < pattern_start_date])
564
+ num_end = num_start + len(ohcl_data[(ohcl_data['Date'] >= pattern_start_date) & (ohcl_data['Date'] <= pattern_end_date)])
565
+
566
+ ax.axvspan(num_start, num_end, color=color, alpha=0.1, label=pattern_name)
567
+
568
+
569
+ if draw_lines:
570
+ # error = row['Error'] check only if the column is present
571
+ error = False
572
+ if 'Error' in row and row['Error'] != np.nan:
573
+ error = row['Error']
574
+ if error != True:
575
+ calc_start_date = row['Calc_Start']
576
+ calc_end_date = row['Calc_End']
577
+
578
+ # reset the index of the ohlc_data
579
+ ohcl_data.reset_index(drop=True, inplace=True)
580
+
581
+ calc_start_idx = ohcl_data[ohcl_data['Date']== calc_start_date].index
582
+ calc_end_idx = ohcl_data[ohcl_data['Date']== calc_end_date].index
583
+
584
+ # drow a pink dotted vertical line at calc_start_idx and calc_end_idx
585
+ ax.axvline(x=calc_start_idx, color='blue', linestyle='dotted', linewidth=1)
586
+ ax.axvline(x=calc_end_idx, color='blue', linestyle='dotted', linewidth=1)
587
+
588
+ # # If detected pattern is Head and Shoulders, plot indicator lines
589
+ # if pattern_name == "Head-and-shoulders top":
590
+ # # get the ohlc segment of where the date is between the pattern start and end from ohlc_for_mpf data set where the index is the date
591
+ # ohlc_segment_head_and_sholder = ohlc_for_mpf.loc[pattern_start_date:pattern_end_date]
592
+ # draw_head_and_shoulders_top(ax, ohcl_data, num_start,row)
593
+ # elif pattern_name == "Head-and-shoulders bottom":
594
+ # # get the ohlc segment of where the date is between the pattern start and end from ohlc_for_mpf data set where the index is the date
595
+ # ohlc_segment_head_and_sholder = ohlc_for_mpf.loc[pattern_start_date:pattern_end_date]
596
+ # draw_head_and_shoulders_bottom(ax, ohcl_data, num_start,row)
597
+ # elif pattern_name == "Double Top, Adam and Adam":
598
+ # # get the ohlc segment of where the date is between the pattern start and end from ohlc_for_mpf data set where the index is the date
599
+ # ohlc_segment_double_top = ohlc_for_mpf.loc[pattern_start_date:pattern_end_date]
600
+ # draw_double_top_aa(ax, ohcl_data, num_start,row)
601
+ # elif pattern_name == "Double Bottom, Adam and Adam":
602
+ # ohlc_segment_double_top = ohlc_for_mpf.loc[pattern_start_date:pattern_end_date]
603
+ # draw_double_bottom_aa(ax, ohcl_data, num_start,row)
604
+ # elif pattern_name == "Double Bottom, Eve and Adam":
605
+ # ohlc_segment_double_top = ohlc_for_mpf.loc[pattern_start_date:pattern_end_date]
606
+ # draw_double_bottom_aa(ax, ohcl_data, num_start,row)
607
+
608
+
609
+ if draw_lines:
610
+ # Get unique legend handles and labels
611
+ handles, labels = ax.get_legend_handles_labels()
612
+ unique_labels = {}
613
+ unique_handles = []
614
+
615
+
616
+
617
+ # Initialize storage for unique handles/labels
618
+ unique_labels = {}
619
+ unique_handles = []
620
+ i= 1
621
+
622
+ for handle, label in zip(handles, labels):
623
+ # print(label)
624
+
625
+ # Allow duplication if the label is in pattern_encoding
626
+ if label in pattern_encoding or label not in unique_labels:
627
+ if label not in unique_labels:
628
+ unique_labels[label] = handle
629
+ unique_handles.append(handle)
630
+ else:
631
+ unique_labels[label + f"_{i}"] = handle
632
+ unique_handles.append(handle)
633
+ i += 1
634
+
635
+
636
+ ax.legend(unique_handles, unique_labels.keys())
637
+
638
+
639
+
640
+ ax.grid(True)
641
+ plt.show()
642
+
643
+ def plot_pattern_groups_and_finalized_sections(located_patterns_and_other_info, cluster_labled_windows_df ,ohcl_data_given=None):
644
+ # for each unique Chart Pattern in located_patterns_and_other_info plot the patterns
645
+ for pattern, group in located_patterns_and_other_info.groupby('Chart Pattern'):
646
+ # pattern = 'Head-and-shoulders top'
647
+ print (pattern ," :")
648
+ print(" Clustered Windows :")
649
+ plot_pattern_clusters( cluster_labled_windows_df[cluster_labled_windows_df['Chart Pattern'] == pattern],ohcl_data_given=ohcl_data_given)
650
+ print(" Finalized Section :")
651
+ plot_pattern_clusters( located_patterns_and_other_info[located_patterns_and_other_info['Chart Pattern'] == pattern],draw_lines=True,ohcl_data_given=ohcl_data_given)
652
+ """
653
+
utils/patternLocating.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from tqdm import tqdm
3
+ from utils.eval import intersection_over_union
4
+ from utils.formatAndPreprocessNewPatterns import get_patetrn_name_by_encoding, get_pattern_encoding_by_name, get_reverse_pattern_encoding
5
+ import pandas as pd
6
+ import numpy as np
7
+ from joblib import Parallel, delayed
8
+ import math
9
+ from sklearn.cluster import DBSCAN
10
+
11
+ path = 'Datasets/OHLC data'
12
+
13
+
14
+
15
def process_window(i, ohlc_data_segment, rocket_model, probability_threshold, pattern_encoding_reversed, seg_start, seg_end, window_size, padding_proportion, prob_threshold_of_no_pattern_to_mark_as_no_pattern=1):
    """Classify one sliding window of the OHLC segment with the rocket model.

    Parameters
    ----------
    i : int
        Anchor bar index of the window in `ohlc_data_segment`.
    ohlc_data_segment : pd.DataFrame
        Bars with 'Date', 'Open', 'High', 'Low', 'Close', 'Volume' columns.
    rocket_model : object
        Fitted classifier exposing `predict_proba` on a
        (n_instances=1, n_channels=5, series_length) array.
    probability_threshold : float or list
        Unused here; kept for signature compatibility with callers.
    pattern_encoding_reversed : dict
        Unused here; kept for signature compatibility with callers.
    seg_start, seg_end : datetime-like
        Segment boundary dates, copied verbatim into the result row.
    window_size : int
        Window length in bars.
    padding_proportion : float
        Fraction of the window placed before the anchor index `i`.
    prob_threshold_of_no_pattern_to_mark_as_no_pattern : float, optional
        If the 'No Pattern' class probability is strictly above this value,
        the window is labelled 'No Pattern' regardless of the argmax class.
        The default of 1 effectively disables this override.

    Returns
    -------
    dict or None
        Result row (window dates, predicted pattern, probability), or None
        for empty windows / prediction failures.
    """
    # Shift the window back so `padding_proportion` of it precedes index i,
    # then clamp to the segment bounds.
    start_index = i - math.ceil(window_size * padding_proportion)
    end_index = start_index + window_size

    start_index = max(start_index, 0)
    end_index = min(end_index, len(ohlc_data_segment))

    ohlc_segment = ohlc_data_segment[start_index:end_index]
    if len(ohlc_segment) == 0:
        return None  # Skip empty segments
    win_start_date = ohlc_segment['Date'].iloc[0]
    win_end_date = ohlc_segment['Date'].iloc[-1]

    # Reshape to (1, window_len, 5) and transpose to (1, 5, window_len) —
    # channels-first, as the rocket pipeline's predict_proba expects.
    ohlc_array_for_rocket = ohlc_segment[['Open', 'High', 'Low', 'Close','Volume']].to_numpy().reshape(1, len(ohlc_segment), 5)
    ohlc_array_for_rocket = np.transpose(ohlc_array_for_rocket, (0, 2, 1))
    try:
        pattern_probabilities = rocket_model.predict_proba(ohlc_array_for_rocket)
    except Exception as e:
        print(f"Error in prediction: {e}")
        return None
    max_probability = np.max(pattern_probabilities)
    # Probability assigned to the dedicated 'No Pattern' class.
    no_pattern_proba = pattern_probabilities[0][get_pattern_encoding_by_name ('No Pattern')]
    pattern_index = np.argmax(pattern_probabilities)

    pred_proba = max_probability
    pred_pattern = get_patetrn_name_by_encoding(pattern_index)
    # Override with 'No Pattern' when its class probability is high enough.
    if no_pattern_proba > prob_threshold_of_no_pattern_to_mark_as_no_pattern:
        pred_proba = no_pattern_proba
        pred_pattern = 'No Pattern'

    new_row = {
        'Start': win_start_date, 'End': win_end_date, 'Chart Pattern': pred_pattern, 'Seg_Start': seg_start, 'Seg_End': seg_end ,
        'Probability': pred_proba
    }
    return new_row
57
+
58
+
59
+
60
def parallel_process_sliding_window(ohlc_data_segment, rocket_model, probability_threshold, stride, pattern_encoding_reversed, window_size, padding_proportion, prob_threshold_of_no_pattern_to_mark_as_no_pattern=1, parallel=True, num_cores=-1):
    """Classify every stride-spaced sliding window of an OHLC segment.

    Parameters
    ----------
    ohlc_data_segment : pd.DataFrame
        Bars with a 'Date' column plus the OHLCV columns read by
        `process_window`.
    rocket_model : object
        Fitted classifier forwarded to `process_window`.
    probability_threshold : float or list
        Forwarded to `process_window` (unused there; kept for compatibility).
    stride : int
        Step, in bars, between consecutive window anchors.
    pattern_encoding_reversed : dict
        Forwarded to `process_window`.
    window_size : int
        Window length in bars.
    padding_proportion : float
        Fraction of each window placed before its anchor index.
    prob_threshold_of_no_pattern_to_mark_as_no_pattern : float, optional
        'No Pattern' override threshold, forwarded to `process_window`.
    parallel : bool, optional
        Dispatch windows through joblib when True, run sequentially otherwise.
    num_cores : int, optional
        joblib `n_jobs`; -1 uses all available cores.

    Returns
    -------
    pd.DataFrame
        One row per non-empty, successfully classified window.
    """
    # Segment boundary dates are copied verbatim into every result row.
    seg_start = ohlc_data_segment['Date'].iloc[0]
    seg_end = ohlc_data_segment['Date'].iloc[-1]

    # Shared keyword arguments so both code paths call process_window
    # identically.
    window_kwargs = dict(
        ohlc_data_segment=ohlc_data_segment,
        rocket_model=rocket_model,
        probability_threshold=probability_threshold,
        pattern_encoding_reversed=pattern_encoding_reversed,
        window_size=window_size,
        seg_start=seg_start,
        seg_end=seg_end,
        padding_proportion=padding_proportion,
        prob_threshold_of_no_pattern_to_mark_as_no_pattern=prob_threshold_of_no_pattern_to_mark_as_no_pattern,
    )

    if parallel:
        # Use Parallel as a context manager to ensure worker cleanup.
        # Renamed the context variable: it previously shadowed the boolean
        # `parallel` parameter.
        with Parallel(n_jobs=num_cores, verbose=1) as executor:
            results = executor(
                delayed(process_window)(i=i, **window_kwargs)
                for i in range(0, len(ohlc_data_segment), stride)
            )
    else:
        # Sequential fallback (same computation, with progress prints).
        results = []
        total_iterations = len(range(0, len(ohlc_data_segment), stride))
        for i_idx, i in enumerate(range(0, len(ohlc_data_segment), stride)):
            # Fix: the no-pattern threshold is now forwarded here too; the
            # sequential path previously fell back to the default (1) and
            # diverged from the parallel path.
            res = process_window(i=i, **window_kwargs)
            if res is not None:
                results.append(res)
            # Progress print statement
            print(f"Processing window {i_idx + 1} of {total_iterations}...")

    # Filter out None values (empty windows / failed predictions).
    return pd.DataFrame([res for res in results if res is not None])
101
+
102
+
103
def prepare_dataset_for_cluster(ohlc_data_segment, win_results_df):
    """Add index-based position features to window results for clustering.

    Parameters
    ----------
    ohlc_data_segment : pd.DataFrame
        OHLC rows with a datetime 'Date' column, one row per bar.
    win_results_df : pd.DataFrame
        Window classification results with 'Start'/'End' date columns.

    Returns
    -------
    pd.DataFrame
        Copy of `win_results_df` with 'Center', 'Pattern_Start_pos' and
        'Pattern_End_pos' columns expressed as bar indices into the segment.
    """
    predicted_patterns = win_results_df.copy()
    # Removed the unused `origin_date` computation (dead code); positions are
    # derived purely from date comparisons against the segment's bars.
    dates = ohlc_data_segment['Date']

    for index, row in predicted_patterns.iterrows():
        pattern_start = row['Start']
        pattern_end = row['End']

        # Bar index of the window start = number of bars strictly before it.
        start_point_index = int((dates < pattern_start).sum())
        # Number of bars covered by the window (inclusive on both ends).
        pattern_len = int(((dates >= pattern_start) & (dates <= pattern_end)).sum())

        # Midpoint of the window in bar coordinates (may be fractional).
        predicted_patterns.at[index, 'Center'] = start_point_index + (pattern_len / 2)
        predicted_patterns.at[index, 'Pattern_Start_pos'] = start_point_index
        predicted_patterns.at[index, 'Pattern_End_pos'] = start_point_index + pattern_len

    return predicted_patterns
123
+
124
def cluster_windows(predicted_patterns , probability_threshold, window_size, eps = 0.05 , min_samples = 2):
    """Filter low-probability windows and group the survivors with DBSCAN.

    Parameters
    ----------
    predicted_patterns : pd.DataFrame
        Window rows with 'Chart Pattern', 'Probability', 'Center',
        'Start'/'End' and 'Seg_Start'/'Seg_End' columns.
    probability_threshold : float or list
        A single threshold applied to every pattern, or a list indexed by
        pattern encoding giving one threshold per pattern.
    window_size : int
        Currently unused; kept for signature compatibility with callers.
    eps : float, optional
        DBSCAN neighbourhood radius on the normalized window centers.
    min_samples : int, optional
        DBSCAN `min_samples` parameter.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame) or (None, None)
        Windows labelled with a 'Cluster' column, and one summary row per
        cluster whose Start/End span the dates covered by at least two
        member windows. (None, None) when nothing survives filtering or
        clustering.
    """
    df = predicted_patterns.copy()

    # check if the probability_threshold is a list or a float
    if isinstance(probability_threshold, list):
        # the list contains the probability threshold for each chart pattern,
        # indexed by the pattern encoding; drop rows below their threshold
        for i in range(len(probability_threshold)):
            pattern_name = get_patetrn_name_by_encoding(i)
            df.drop(df[(df['Chart Pattern'] == pattern_name) & (df['Probability'] < probability_threshold[i])].index, inplace=True)
    else:
        # only keep rows with probability strictly above the single threshold
        df = df[df['Probability'] > probability_threshold]

    cluster_labled_windows = []
    interseced_clusters = []

    # Normalization bounds are taken over ALL patterns so that eps is
    # comparable across the per-pattern groups below.
    min_center = df['Center'].min()
    max_center = df['Center'].max()

    # Group by 'Chart Pattern' and apply clustering to each group
    for pattern, group in df.groupby('Chart Pattern'):
        centers = group['Center'].values.reshape(-1, 1)

        # centers normalization to [0, 1]
        if min_center < max_center:  # Avoid division by zero
            norm_centers = (centers - min_center) / (max_center - min_center)
        else:
            # If all values are the same, set to a constant
            norm_centers = np.ones_like(centers)

        db = DBSCAN(eps=eps, min_samples=min_samples).fit(norm_centers)
        # NOTE(review): `group` is a groupby slice; assigning a column here
        # may raise a SettingWithCopyWarning in pandas — confirm intended.
        group['Cluster'] = db.labels_

        cluster_labled_windows.append(group)

        # Filter out noise (-1) and summarize each cluster
        for cluster_id, cluster_group in group[group['Cluster'] != -1].groupby('Cluster'):
            # Expand every member window into its per-day date range
            expanded_dates = []
            for _, row in cluster_group.iterrows():
                dates = pd.date_range(row["Start"], row["End"])
                expanded_dates.extend(dates)

            # Count occurrences of each date across the cluster's windows
            date_counts = pd.Series(expanded_dates).value_counts().sort_index()

            # Cluster start/end = first/last date where at least 2 windows
            # overlap (fixed factor, independent of `min_samples`)
            cluster_start = date_counts[date_counts >= 2].index.min()
            cluster_end = date_counts[date_counts >= 2].index.max()

            interseced_clusters.append({
                'Chart Pattern': pattern,
                'Cluster': cluster_id,
                'Start': cluster_start,
                'End': cluster_end,
                'Seg_Start': cluster_group['Seg_Start'].iloc[0],
                'Seg_End': cluster_group['Seg_End'].iloc[0],
                'Avg_Probability': cluster_group['Probability'].mean(),
            })

    if len(cluster_labled_windows) == 0 or len(interseced_clusters) == 0:
        return None,None
    # Combine all per-pattern results into the final DataFrames
    cluster_labled_windows_df = pd.concat(cluster_labled_windows)
    interseced_clusters_df = pd.DataFrame(interseced_clusters)

    # sort by the index to restore the original row order
    cluster_labled_windows_df = cluster_labled_windows_df.sort_index()
    return cluster_labled_windows_df,interseced_clusters_df
212
+
213
+
214
# ========================= Advance Locator ==========================

# Encoding -> pattern-name reverse lookup shared by the functions above.
pattern_encoding_reversed = get_reverse_pattern_encoding()
# load the joblib model at Models/Width Aug OHLC_mini_rocket_xgb.joblib to use
model = joblib.load('Models/Width Aug OHLC_mini_rocket_xgb.joblib')
plot_count = 0

# Candidate window-size divisors, log-spaced between 1 and 20.
win_size_proportions = np.round(np.logspace(0, np.log10(20), num=10), 2).tolist()
padding_proportion = 0.6  # fraction of each window placed before its anchor index
stride = 1  # bars between consecutive window anchors
probab_threshold_list = 0.5  # single threshold (a per-pattern list is also accepted)
prob_threshold_of_no_pattern_to_mark_as_no_pattern = 0.5
target_len = 30  # NOTE(review): unused in this module — confirm before removing

eps=0.04 # in the dbscan clustering
min_samples=3 # in the dbscan clustering
win_width_proportion=10 # in the dbscan clustering from what amount to divide the width related feature
231
+
232
def locate_patterns(ohlc_data, patterns_to_return= None, model = model , pattern_encoding_reversed= pattern_encoding_reversed, plot_count = 10):
    """Locate chart patterns in OHLC data via multi-scale sliding windows.

    For each candidate window size, classifies every sliding window,
    clusters overlapping detections per pattern, then deduplicates the
    per-size results across scales using an IoU-based comparison.

    Parameters
    ----------
    ohlc_data : pd.DataFrame
        Bars with a 'Date' column (convertible to datetime) and OHLCV data.
    patterns_to_return : list, optional
        When given, only detections for these pattern names are returned.
    model : object, optional
        Classifier; defaults to the module-level model (bound at def time).
    pattern_encoding_reversed : dict, optional
        Encoding lookup; defaults to the module-level mapping.
    plot_count : int, optional
        Unused (plotting code removed); kept for signature compatibility.

    Returns
    -------
    pd.DataFrame or None
        Deduplicated detections, or None when no window size produced any.
    """
    ohlc_data_segment = ohlc_data.copy()
    # convert date to datetime
    ohlc_data_segment['Date'] = pd.to_datetime(ohlc_data_segment['Date'])
    seg_len = len(ohlc_data_segment)

    if ohlc_data_segment is None or len(ohlc_data_segment) == 0:
        print("OHLC Data segment is empty")
        raise Exception("OHLC Data segment is empty")

    # Per-window-size accumulators.
    win_results_for_each_size = []
    located_patterns_and_other_info_for_each_size = []
    cluster_labled_windows_list = []

    used_win_sizes = []
    win_iteration = 0  # running offset keeping cluster ids globally unique

    for win_size_proportion in win_size_proportions:
        # Window size = segment length divided by the proportion, floored
        # at 10 bars.
        window_size = seg_len // win_size_proportion
        if window_size < 10:
            window_size = 10

        # convert to int (floor-dividing by a float proportion yields a float)
        window_size = int(window_size)
        if window_size in used_win_sizes:
            continue  # different proportions may collapse to the same size
        used_win_sizes.append(window_size)

        win_results_df = parallel_process_sliding_window(ohlc_data_segment, model, probab_threshold_list, stride, pattern_encoding_reversed, window_size, padding_proportion, prob_threshold_of_no_pattern_to_mark_as_no_pattern, parallel=True)

        if win_results_df is None or len(win_results_df) == 0:
            print("Window results dataframe is empty")
            continue
        win_results_df['Window_Size'] = window_size
        win_results_for_each_size.append(win_results_df)

        # Index-based position features required by the clustering step.
        predicted_patterns = prepare_dataset_for_cluster(ohlc_data_segment, win_results_df)
        if predicted_patterns is None or len(predicted_patterns) == 0:
            print("Predicted patterns dataframe is empty")
            # NOTE(review): no `continue` here — clustering still runs on the
            # empty result; confirm whether that is intended.
        cluster_labled_windows_df , interseced_clusters_df = cluster_windows(predicted_patterns, probab_threshold_list, window_size)
        if cluster_labled_windows_df is None or interseced_clusters_df is None or len(cluster_labled_windows_df) == 0 or len(interseced_clusters_df) == 0:
            print("Clustered windows dataframe is empty")
            continue

        # Offset cluster ids so they stay unique across window sizes; noise
        # rows labelled -1 are left untouched.
        mask = cluster_labled_windows_df['Cluster'] != -1
        cluster_labled_windows_df.loc[mask, 'Cluster'] = cluster_labled_windows_df.loc[mask, 'Cluster'].astype(int) + win_iteration
        interseced_clusters_df['Cluster'] = interseced_clusters_df['Cluster'].astype(int) + win_iteration
        num_of_unique_clusters = interseced_clusters_df[interseced_clusters_df['Cluster']!=-1]['Cluster'].nunique()
        win_iteration += num_of_unique_clusters
        cluster_labled_windows_list.append(cluster_labled_windows_df)

        # Calc_Start/Calc_End currently mirror the clustered Start/End dates.
        interseced_clusters_df['Calc_Start'] = interseced_clusters_df['Start']
        interseced_clusters_df['Calc_End'] = interseced_clusters_df['End']
        located_patterns_and_other_info = interseced_clusters_df.copy()

        if located_patterns_and_other_info is None or len(located_patterns_and_other_info) == 0:
            # NOTE(review): stray ']' at the start of this message.
            print("]Located patterns and other info dataframe is empty")
            continue
        located_patterns_and_other_info['Window_Size'] = window_size

        located_patterns_and_other_info_for_each_size.append(located_patterns_and_other_info)

    if located_patterns_and_other_info_for_each_size is None or len(located_patterns_and_other_info_for_each_size) == 0 or win_results_for_each_size is None or len(win_results_for_each_size) == 0:
        print("Located patterns and other info for each size is empty")
        return None
    located_patterns_and_other_info_for_each_size_df = pd.concat(located_patterns_and_other_info_for_each_size)
    win_results_for_each_size_df = pd.concat(win_results_for_each_size, ignore_index=True)

    # get the unique window sizes / patterns for the dedup pass
    unique_window_sizes = located_patterns_and_other_info_for_each_size_df['Window_Size'].unique()
    unique_patterns = located_patterns_and_other_info_for_each_size_df['Chart Pattern'].unique()

    # sort the unique_window_sizes in descending order (largest scale first)
    unique_window_sizes = np.sort(unique_window_sizes)[::-1]

    filtered_loc_pat_and_info_rows_list = []

    # Cross-scale deduplication: for each pattern, walk detections from the
    # largest window size down and drop rows that substantially overlap
    # (IoU > 0.6) an already-preferred detection.
    for chart_pattern in unique_patterns:
        located_patterns_and_other_info_for_each_size_df_chart_pattern = located_patterns_and_other_info_for_each_size_df[located_patterns_and_other_info_for_each_size_df['Chart Pattern'] == chart_pattern]
        for win_size in unique_window_sizes:
            located_patterns_and_other_info_for_each_size_df_win_size_chart_pattern = located_patterns_and_other_info_for_each_size_df_chart_pattern[located_patterns_and_other_info_for_each_size_df_chart_pattern['Window_Size'] == win_size]
            for idx , row in located_patterns_and_other_info_for_each_size_df_win_size_chart_pattern.iterrows():
                start_date = row['Calc_Start']
                end_date = row['Calc_End']
                is_already_included = False
                # rows of the same chart pattern whose date spans intersect
                # the current row's span (the row itself included)
                intersecting_rows = located_patterns_and_other_info_for_each_size_df_chart_pattern[
                    (located_patterns_and_other_info_for_each_size_df_chart_pattern['Calc_Start'] <= end_date) &
                    (located_patterns_and_other_info_for_each_size_df_chart_pattern['Calc_End'] >= start_date)
                ]
                is_already_included = False
                for idx2, row2 in intersecting_rows.iterrows():
                    iou = intersection_over_union(start_date, end_date, row2['Calc_Start'], row2['Calc_End'])

                    if iou > 0.6:
                        # Case 1: Larger window already exists
                        if row2['Window_Size'] > row['Window_Size']:
                            # Case 1A: smaller one has significantly higher
                            # probability, keep it instead
                            if (row['Avg_Probability'] - row2['Avg_Probability']) > 0.1:
                                is_already_included = False
                            else:
                                is_already_included = True
                                break # Keep large, skip current(small)

                        # Case 2: Equal or smaller window exists, possibly overlapping
                        elif row['Window_Size'] >= row2['Window_Size']:
                            # If the other row has significantly better
                            # probability, drop the current one
                            if (row2['Avg_Probability'] - row['Avg_Probability']) > 0.1:
                                is_already_included = True
                                break # remove current (large) , keep small
                            else:
                                is_already_included = False
                            # break

                if not is_already_included:
                    filtered_loc_pat_and_info_rows_list.append(row)

    # convert the filtered rows to a dataframe
    filtered_loc_pat_and_info_df = pd.DataFrame(filtered_loc_pat_and_info_rows_list)

    if cluster_labled_windows_list is None or len(cluster_labled_windows_list) == 0:
        # NOTE(review): pd.concat below raises on an empty list — this branch
        # only prints; confirm whether it should also return early.
        print("Clustered windows list is empty")
    cluster_labled_windows_df_conc = pd.concat(cluster_labled_windows_list)
    # Plotting removed; kept for reference:
    """
    if plot_count > 0:
        plot_pattern_groups_and_finalized_sections(filtered_loc_pat_and_info_df, cluster_labled_windows_df_conc,ohcl_data_given=ohlc_data_segment)
        plot_count -= 1
    """

    if patterns_to_return is None or len(patterns_to_return) == 0:
        return filtered_loc_pat_and_info_df
    else:
        # filter the results down to the requested pattern names
        filtered_loc_pat_and_info_df = filtered_loc_pat_and_info_df[filtered_loc_pat_and_info_df['Chart Pattern'].isin(patterns_to_return)]
        return filtered_loc_pat_and_info_df
379
+
380
+
utils/patternLocatingGemni.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from utils.eval import intersection_over_union
3
+ from utils.formatAndPreprocessNewPatterns import get_patetrn_name_by_encoding, get_pattern_encoding_by_name, get_reverse_pattern_encoding
4
+ import pandas as pd
5
+ import numpy as np
6
+ import math
7
+ from sklearn.cluster import DBSCAN
8
+ from joblib import Parallel, delayed
9
+ # Remove matplotlib imports and plotting function import
10
+ # import matplotlib.pyplot as plt
11
+ # from utils.functionalPatternLocateAndPlot import plot_pattern_groups_and_finalized_sections
12
+
13
# --- Global Configuration & Model Loading ---
# Load the pre-trained model and pattern encodings.
# It's assumed 'Models/Width Aug OHLC_mini_rocket_xgb.joblib' is in the correct path.
MODEL_PATH = 'Models/Width Aug OHLC_mini_rocket_xgb.joblib'
try:
    rocket_model_global = joblib.load(MODEL_PATH)
except FileNotFoundError:
    print(f"Error: Model file not found at {MODEL_PATH}. Please ensure the path is correct.")
    # Callers must handle a None model (locate_patterns checks for it).
    rocket_model_global = None

# Reverse lookup: encoding -> pattern name.
pattern_encoding_reversed_global = get_reverse_pattern_encoding()

# Default parameters for the pattern location logic.
# Candidate window-size divisors, log-spaced between 1 and 20.
WIN_SIZE_PROPORTIONS = np.round(np.logspace(0, np.log10(20), num=10), 2).tolist()
PADDING_PROPORTION = 0.6  # fraction of each window placed before its anchor
STRIDE = 1  # bars between consecutive window anchors
# Default per-pattern probability thresholds, indexed by pattern encoding.
PROBABILITY_THRESHOLD_LIST = [0.8884, 0.8676, 0.5620, 0.5596, 0.5132, 0.8367, 0.7635]
PROB_THRESHOLD_NO_PATTERN = 0.5  # Threshold to mark a window as 'No Pattern'

# DBSCAN Clustering parameters
DBSCAN_EPS = 0.04
DBSCAN_MIN_SAMPLES = 3
37
+
38
+ # --- Private Helper Functions ---
39
+
40
def _process_window(i, ohlc_data_segment, rocket_model, probability_threshold, pattern_encoding_reversed, seg_start, seg_end, window_size, padding_proportion, prob_threshold_of_no_pattern_to_mark_as_no_pattern=1):
    """Classify a single sliding window of OHLC data.

    The window is anchored at bar index `i`, shifted back so that
    `padding_proportion` of it precedes the anchor, and clamped to the
    segment bounds.

    Parameters
    ----------
    i : int
        Anchor bar index within `ohlc_data_segment`.
    ohlc_data_segment : pd.DataFrame
        Bars with 'Date', 'Open', 'High', 'Low', 'Close', 'Volume' columns.
    rocket_model : object
        Fitted classifier exposing `predict_proba`.
    probability_threshold : float or list
        Unused here; kept for signature compatibility.
    pattern_encoding_reversed : dict
        Unused here; kept for signature compatibility.
    seg_start, seg_end : datetime-like
        Segment boundary dates copied verbatim into the result.
    window_size : int
        Window length in bars.
    padding_proportion : float
        Fraction of the window placed before the anchor.
    prob_threshold_of_no_pattern_to_mark_as_no_pattern : float, optional
        If the 'No Pattern' class probability reaches this value (>=, unlike
        the strict > used in utils/patternLocating.py), the window is
        labelled 'No Pattern' regardless of the argmax class.

    Returns
    -------
    dict or None
        Result row, or None for empty windows / prediction errors.
    """
    start_index = i - math.ceil(window_size * padding_proportion)
    end_index = start_index + window_size

    start_index = max(start_index, 0)
    end_index = min(end_index, len(ohlc_data_segment))

    ohlc_segment = ohlc_data_segment[start_index:end_index]
    if len(ohlc_segment) == 0:
        return None

    win_start_date = ohlc_segment['Date'].iloc[0]
    win_end_date = ohlc_segment['Date'].iloc[-1]

    # Prepare data for the Rocket model: (1, window_len, 5) reshaped, then
    # transposed to channels-first (1, 5, window_len).
    ohlc_array_for_rocket = ohlc_segment[['Open', 'High', 'Low', 'Close', 'Volume']].to_numpy().reshape(1, len(ohlc_segment), 5)
    ohlc_array_for_rocket = np.transpose(ohlc_array_for_rocket, (0, 2, 1))

    try:
        pattern_probabilities = rocket_model.predict_proba(ohlc_array_for_rocket)
    except Exception as e:
        # Prediction failures are swallowed; the caller drops None results.
        return None

    max_probability = np.max(pattern_probabilities)
    # Assuming get_pattern_encoding_by_name returns a valid index or None.
    no_pattern_encoding = get_pattern_encoding_by_name('No Pattern')
    if no_pattern_encoding is None:  # 'No Pattern' not in the encoding map
        no_pattern_proba = 0
    else:
        no_pattern_proba = pattern_probabilities[0][no_pattern_encoding]

    pattern_index = np.argmax(pattern_probabilities)

    pred_proba = max_probability
    pred_pattern = get_patetrn_name_by_encoding(pattern_index)

    # Override with 'No Pattern' when its class probability is high enough.
    if no_pattern_proba >= prob_threshold_of_no_pattern_to_mark_as_no_pattern:  # Use >= for consistency
        pred_proba = no_pattern_proba
        pred_pattern = 'No Pattern'

    return {
        'Start': win_start_date, 'End': win_end_date, 'Chart Pattern': pred_pattern,
        'Seg_Start': seg_start, 'Seg_End': seg_end, 'Probability': pred_proba
    }
87
+
88
def _parallel_process_sliding_window(ohlc_data_segment, rocket_model, probability_threshold, stride, pattern_encoding_reversed, window_size, padding_proportion, prob_threshold_of_no_pattern_to_mark_as_no_pattern=1, parallel=True, num_cores=16, verbose_level=1):
    """Run `_process_window` at every stride-spaced offset, optionally in parallel.

    Returns a DataFrame with one row per successfully classified window;
    None results (empty windows, prediction errors) are dropped.
    """
    first_date = ohlc_data_segment['Date'].iloc[0]
    last_date = ohlc_data_segment['Date'].iloc[-1]

    # One kwargs dict shared by both execution paths so they stay in sync.
    window_kwargs = dict(
        ohlc_data_segment=ohlc_data_segment,
        rocket_model=rocket_model,
        probability_threshold=probability_threshold,
        pattern_encoding_reversed=pattern_encoding_reversed,
        window_size=window_size,
        seg_start=first_date,
        seg_end=last_date,
        padding_proportion=padding_proportion,
        prob_threshold_of_no_pattern_to_mark_as_no_pattern=prob_threshold_of_no_pattern_to_mark_as_no_pattern,
    )

    offsets = range(0, len(ohlc_data_segment), stride)

    if parallel:
        # Context manager guarantees joblib worker cleanup.
        with Parallel(n_jobs=num_cores, verbose=verbose_level) as executor:
            outcomes = executor(
                delayed(_process_window)(i=offset, **window_kwargs)
                for offset in offsets
            )
    else:
        outcomes = []
        for step, offset in enumerate(offsets):
            outcome = _process_window(i=offset, **window_kwargs)
            if outcome is not None:
                outcomes.append(outcome)
            if verbose_level > 0:  # Basic progress for sequential runs
                print(f"Processing window {step + 1} of {len(offsets)}...")

    return pd.DataFrame([res for res in outcomes if res is not None])
122
+
123
def _prepare_dataset_for_cluster(ohlc_data_segment, win_results_df):
    """Annotate window results with index-based position features for clustering."""
    enriched = win_results_df.copy()
    dates = ohlc_data_segment['Date']

    for idx, win in enriched.iterrows():
        win_start = win['Start']
        win_end = win['End']

        # Bar index of the window start = bars strictly before it.
        start_pos = len(ohlc_data_segment[dates < win_start])
        # Bars covered by the window, inclusive on both ends.
        span = len(ohlc_data_segment[(dates >= win_start) & (dates <= win_end)])

        # Midpoint in bar coordinates (float division — may be fractional).
        enriched.at[idx, 'Center'] = start_pos + (span / 2.0)
        enriched.at[idx, 'Pattern_Start_pos'] = start_pos
        enriched.at[idx, 'Pattern_End_pos'] = start_pos + span

    return enriched
140
+
141
def _cluster_windows(predicted_patterns, probability_threshold, eps=0.05, min_samples_dbscan=2):
    """Clusters detected pattern windows using DBSCAN.

    `min_samples_dbscan` is the min_samples for the DBSCAN algorithm itself;
    the per-date overlap check for intersected clusters also uses this value.

    Parameters
    ----------
    predicted_patterns : pd.DataFrame
        Window rows with 'Chart Pattern', 'Probability', 'Center',
        'Start'/'End' and 'Seg_Start'/'Seg_End' columns.
    probability_threshold : float or list
        Single threshold for all patterns, or a list indexed by pattern
        encoding with one threshold per pattern.
    eps : float, optional
        DBSCAN neighbourhood radius on the normalized window centers.
    min_samples_dbscan : int, optional
        DBSCAN min_samples (clamped per group to the group size).

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Cluster-labelled windows and per-cluster summary rows; both empty
        when nothing survives filtering.
    """
    df = predicted_patterns.copy()

    if isinstance(probability_threshold, list):
        temp_dfs = []
        # The threshold list is assumed to be indexed by pattern encoding
        # (0..N-1); rows of each pattern below their threshold are dropped.
        for i, p_thresh in enumerate(probability_threshold):
            pattern_name = get_patetrn_name_by_encoding(i)
            if pattern_name:
                temp_dfs.append(df[(df['Chart Pattern'] == pattern_name) & (df['Probability'] >= p_thresh)])
        if temp_dfs:
            # NOTE(review): the inner ternary is redundant inside this branch.
            df = pd.concat(temp_dfs) if temp_dfs else pd.DataFrame(columns=df.columns)
        else:
            df = pd.DataFrame(columns=df.columns)
    else:  # single float threshold
        df = df[df['Probability'] >= probability_threshold]  # Changed > to >=

    if df.empty:
        return pd.DataFrame(), pd.DataFrame()

    cluster_labled_windows_list = []
    interseced_clusters_list = []

    # Normalization bounds span ALL patterns so eps is comparable across the
    # per-pattern groups below.
    min_center_val = df['Center'].min()
    max_center_val = df['Center'].max()

    for pattern, group in df.groupby('Chart Pattern'):
        if group.empty:
            continue

        centers = group['Center'].values.reshape(-1, 1)

        if min_center_val < max_center_val:  # Avoid division by zero if all centers are equal
            norm_centers = (centers - min_center_val) / (max_center_val - min_center_val)
        elif len(centers) > 0:  # All centers identical: no distance variance
            norm_centers = np.zeros_like(centers)  # Treat as a single point for clustering
        else:  # Empty group; should be unreachable after the group.empty check
            norm_centers = np.array([])

        if len(norm_centers) == 0:
            # NOTE(review): `group` is a groupby slice; column assignment may
            # raise a SettingWithCopyWarning in pandas — confirm intended.
            group['Cluster'] = -1
            cluster_labled_windows_list.append(group)
            continue

        # Clamp min_samples to the group size so DBSCAN never rejects input.
        current_min_samples_for_dbscan = min(min_samples_dbscan, len(norm_centers))
        if current_min_samples_for_dbscan < 1 and len(norm_centers) > 0:
            current_min_samples_for_dbscan = 1
        elif len(norm_centers) == 0:  # unreachable: handled above
            group['Cluster'] = -1
            cluster_labled_windows_list.append(group)
            continue

        db = DBSCAN(eps=eps, min_samples=current_min_samples_for_dbscan).fit(norm_centers)
        group['Cluster'] = db.labels_
        cluster_labled_windows_list.append(group)

        # Summarize each non-noise cluster by the date span where at least
        # `min_samples_dbscan` member windows overlap.
        for cluster_id, cluster_group in group[group['Cluster'] != -1].groupby('Cluster'):
            expanded_dates = []
            for _, row_cg in cluster_group.iterrows():
                # Ensure Start and End are valid datetime objects before
                # expanding the window to its per-day range.
                try:
                    dates = pd.date_range(start=pd.to_datetime(row_cg["Start"]), end=pd.to_datetime(row_cg["End"]))
                    expanded_dates.extend(dates)
                except Exception as e:
                    # Unparseable dates are skipped (best-effort expansion).
                    continue

            if not expanded_dates:
                continue

            date_counts = pd.Series(expanded_dates).value_counts().sort_index()

            # Use min_samples_dbscan for defining a significant overlap
            overlapping_dates = date_counts[date_counts >= min_samples_dbscan]
            if overlapping_dates.empty:
                continue

            cluster_start = overlapping_dates.index.min()
            cluster_end = overlapping_dates.index.max()

            interseced_clusters_list.append({
                'Chart Pattern': pattern,
                'Cluster': cluster_id,  # This ID is local to the (pattern, window_size) batch
                'Start': cluster_start,
                'End': cluster_end,
                'Seg_Start': cluster_group['Seg_Start'].iloc[0],
                'Seg_End': cluster_group['Seg_End'].iloc[0],
                'Avg_Probability': cluster_group['Probability'].mean(),
            })

    final_cluster_labled_df = pd.concat(cluster_labled_windows_list) if cluster_labled_windows_list else pd.DataFrame(columns=df.columns if not df.empty else [])
    if 'Cluster' not in final_cluster_labled_df.columns and not final_cluster_labled_df.empty:
        final_cluster_labled_df['Cluster'] = -1  # Default when no clusters formed but df had data

    final_interseced_df = pd.DataFrame(interseced_clusters_list)

    return final_cluster_labled_df, final_interseced_df
246
+
247
+ # --- Public API Function ---
248
+
249
def locate_patterns(ohlc_data: pd.DataFrame,
                    patterns_to_return: list = None,
                    model=None,
                    pattern_encoding_reversed=None,
                    win_size_proportions: list = None,
                    padding_proportion: float = PADDING_PROPORTION,
                    stride: int = STRIDE,
                    probability_threshold=None,
                    prob_threshold_of_no_pattern_to_mark_as_no_pattern: float = PROB_THRESHOLD_NO_PATTERN,
                    dbscan_eps: float = DBSCAN_EPS,
                    dbscan_min_samples: int = DBSCAN_MIN_SAMPLES,
                    enable_plotting: bool = False,  # retained for API compatibility; plotting was removed
                    parallel_processing: bool = True,
                    num_cores_parallel: int = 16,
                    parallel_verbose_level: int = 1
                    ):
    """
    Locate financial chart patterns in OHLC data using sliding windows and clustering.

    For each window size derived from ``win_size_proportions``, classifies sliding
    windows with the model, clusters overlapping detections per pattern via DBSCAN,
    and finally greedily filters overlapping cluster detections across window sizes
    (larger windows are given priority; a smaller-window detection survives only if
    its average probability is more than 0.1 higher).

    Parameters
    ----------
    ohlc_data : pd.DataFrame
        OHLC data; must contain a 'Date' column convertible via ``pd.to_datetime``.
    patterns_to_return : list, optional
        If given, the result is restricted to these 'Chart Pattern' labels.
    model : optional
        Classifier to use; falls back to the module-level ``rocket_model_global``.
    pattern_encoding_reversed : optional
        Class-index -> pattern-name mapping; falls back to the module-level default.
    win_size_proportions : list, optional
        Divisors of the segment length used to derive window sizes (min size 10).
    padding_proportion, stride, probability_threshold,
    prob_threshold_of_no_pattern_to_mark_as_no_pattern : sliding-window parameters
        forwarded to ``_parallel_process_sliding_window``; ``None`` thresholds fall
        back to module-level defaults.
    dbscan_eps, dbscan_min_samples : DBSCAN parameters forwarded to ``_cluster_windows``.
    enable_plotting : bool
        Ignored (plotting support was removed); kept so existing callers don't break.
    parallel_processing, num_cores_parallel, parallel_verbose_level :
        Parallelization controls forwarded to ``_parallel_process_sliding_window``.

    Returns
    -------
    pd.DataFrame
        One row per located pattern with (at least) 'Chart Pattern', 'Cluster',
        'Start', 'End', 'Calc_Start', 'Calc_End', 'Avg_Probability', 'Window_Size'.
        Empty DataFrame if no model is available, input is empty, or nothing is found.
    """
    # Resolve overridable arguments against module-level defaults.
    active_model = model if model is not None else rocket_model_global
    active_pattern_encoding_rev = pattern_encoding_reversed if pattern_encoding_reversed is not None else pattern_encoding_reversed_global
    active_win_size_proportions = win_size_proportions if win_size_proportions is not None else WIN_SIZE_PROPORTIONS
    active_probability_threshold = probability_threshold if probability_threshold is not None else PROBABILITY_THRESHOLD_LIST

    if active_model is None:
        print("Error: Pattern detection model is not loaded. Cannot proceed.")
        return pd.DataFrame()

    # Guard against empty input BEFORE touching the 'Date' column: an empty frame
    # may not have the column at all, which would raise a KeyError.
    if ohlc_data.empty:
        return pd.DataFrame()

    ohlc_data_segment = ohlc_data.copy()
    ohlc_data_segment['Date'] = pd.to_datetime(ohlc_data_segment['Date'])
    seg_len = len(ohlc_data_segment)

    located_patterns_and_other_info_for_each_size = []
    cluster_labled_windows_list = []  # all clustered windows from all window sizes
    used_win_sizes = []
    # Offset added to batch-local DBSCAN labels so cluster IDs are unique
    # across all (window size, pattern) batches.
    global_cluster_id_offset = 0

    for win_prop in active_win_size_proportions:
        window_size = seg_len // win_prop if win_prop > 0 else seg_len  # avoid division by zero
        window_size = int(max(10, window_size))

        # Different proportions can collapse to the same size; process each size once.
        if window_size in used_win_sizes:
            continue
        used_win_sizes.append(window_size)

        win_results_df = _parallel_process_sliding_window(
            ohlc_data_segment, active_model, active_probability_threshold, stride,
            active_pattern_encoding_rev, window_size, padding_proportion,
            prob_threshold_of_no_pattern_to_mark_as_no_pattern,
            parallel=parallel_processing, num_cores=num_cores_parallel,
            verbose_level=parallel_verbose_level
        )

        if win_results_df.empty:
            continue
        win_results_df['Window_Size'] = window_size

        predicted_patterns_for_cluster = _prepare_dataset_for_cluster(ohlc_data_segment, win_results_df)
        if predicted_patterns_for_cluster.empty:
            continue

        temp_cluster_labled_windows_df, temp_interseced_clusters_df = _cluster_windows(
            predicted_patterns_for_cluster, active_probability_threshold,
            eps=dbscan_eps, min_samples_dbscan=dbscan_min_samples
        )

        if temp_cluster_labled_windows_df.empty or temp_interseced_clusters_df.empty:
            continue

        # Shift batch-local cluster IDs by the running offset (noise label -1 is kept).
        non_noise_clusters_mask_labeled = temp_cluster_labled_windows_df['Cluster'] != -1
        if non_noise_clusters_mask_labeled.any():
            temp_cluster_labled_windows_df.loc[non_noise_clusters_mask_labeled, 'Cluster'] = \
                temp_cluster_labled_windows_df.loc[non_noise_clusters_mask_labeled, 'Cluster'].astype(int) + global_cluster_id_offset

        # 'Cluster' in the intersected frame is already noise-free by construction.
        if not temp_interseced_clusters_df.empty:
            temp_interseced_clusters_df['Cluster'] = temp_interseced_clusters_df['Cluster'].astype(int) + global_cluster_id_offset

        current_max_cluster_id_in_batch = -1
        if not temp_interseced_clusters_df.empty and 'Cluster' in temp_interseced_clusters_df.columns:
            valid_clusters = temp_interseced_clusters_df[temp_interseced_clusters_df['Cluster'] != -1]['Cluster']
            if not valid_clusters.empty:
                current_max_cluster_id_in_batch = valid_clusters.max()

        cluster_labled_windows_list.append(temp_cluster_labled_windows_df)

        temp_interseced_clusters_df['Calc_Start'] = temp_interseced_clusters_df['Start']
        temp_interseced_clusters_df['Calc_End'] = temp_interseced_clusters_df['End']
        located_patterns_info = temp_interseced_clusters_df.copy()
        located_patterns_info['Window_Size'] = window_size
        located_patterns_and_other_info_for_each_size.append(located_patterns_info)

        # Advance the offset past the highest ID handed out in this batch.
        if current_max_cluster_id_in_batch > -1:
            global_cluster_id_offset = current_max_cluster_id_in_batch + 1
        elif non_noise_clusters_mask_labeled.any():  # intersected empty but labeled had clusters
            max_labeled_cluster = temp_cluster_labled_windows_df.loc[non_noise_clusters_mask_labeled, 'Cluster'].max()
            global_cluster_id_offset = max_labeled_cluster + 1

    if not located_patterns_and_other_info_for_each_size:
        return pd.DataFrame()

    all_located_patterns_df = pd.concat(located_patterns_and_other_info_for_each_size, ignore_index=True)
    if all_located_patterns_df.empty:
        return pd.DataFrame()

    # --- Greedy overlap filtering across window sizes ---
    unique_chart_patterns = all_located_patterns_df['Chart Pattern'].unique()
    # Iterate window sizes descending so larger windows claim regions first.
    sorted_unique_window_sizes = np.sort(all_located_patterns_df['Window_Size'].unique())[::-1]

    final_filtered_patterns_list = []
    candidate_patterns_df = all_located_patterns_df.copy()
    # Working flag: True once a candidate is either accepted or suppressed.
    candidate_patterns_df['taken'] = False

    for cp_val in unique_chart_patterns:
        for ws_val in sorted_unique_window_sizes:
            # Candidates of this pattern at this window size not yet claimed.
            current_batch_indices = candidate_patterns_df[
                (candidate_patterns_df['Chart Pattern'] == cp_val) &
                (candidate_patterns_df['Window_Size'] == ws_val) &
                (~candidate_patterns_df['taken'])
            ].index

            for current_idx in current_batch_indices:
                if candidate_patterns_df.loc[current_idx, 'taken']:  # claimed meanwhile
                    continue

                current_row_data = candidate_patterns_df.loc[current_idx]
                final_filtered_patterns_list.append(current_row_data.drop('taken'))
                candidate_patterns_df.loc[current_idx, 'taken'] = True

                # Suppress lower-priority candidates whose date range overlaps this one.
                overlapping_candidates_indices = candidate_patterns_df[
                    (candidate_patterns_df.index != current_idx) &
                    (candidate_patterns_df['Chart Pattern'] == cp_val) &
                    (~candidate_patterns_df['taken']) &
                    (candidate_patterns_df['Calc_Start'] <= current_row_data['Calc_End']) &
                    (candidate_patterns_df['Calc_End'] >= current_row_data['Calc_Start'])
                ].index

                for ov_idx in overlapping_candidates_indices:
                    ov_row_data = candidate_patterns_df.loc[ov_idx]
                    iou = intersection_over_union(current_row_data['Calc_Start'], current_row_data['Calc_End'],
                                                  ov_row_data['Calc_Start'], ov_row_data['Calc_End'])
                    if iou > 0.6:  # significant overlap
                        # The overlapping candidate survives only if it comes from a
                        # smaller window AND beats the current one's probability by > 0.1;
                        # it will then be considered on its own window-size pass.
                        is_ov_preferred = (ov_row_data['Window_Size'] < current_row_data['Window_Size']) and \
                                          ((ov_row_data['Avg_Probability'] - current_row_data['Avg_Probability']) > 0.1)
                        if not is_ov_preferred:
                            candidate_patterns_df.loc[ov_idx, 'taken'] = True

    filtered_loc_pat_and_info_df = pd.DataFrame(final_filtered_patterns_list)
    if not filtered_loc_pat_and_info_df.empty:
        # De-duplicate exact (pattern, start, end) triples, keeping the candidate
        # with the larger window (then higher probability) thanks to the pre-sort.
        filtered_loc_pat_and_info_df = filtered_loc_pat_and_info_df.sort_values(
            by=['Chart Pattern', 'Calc_Start', 'Window_Size', 'Avg_Probability'],
            ascending=[True, True, False, False]
        ).drop_duplicates(
            subset=['Chart Pattern', 'Calc_Start', 'Calc_End'],
            keep='first'
        ).sort_values(by='Calc_Start').reset_index(drop=True)

    if patterns_to_return and not filtered_loc_pat_and_info_df.empty:
        return filtered_loc_pat_and_info_df[filtered_loc_pat_and_info_df['Chart Pattern'].isin(patterns_to_return)]

    return filtered_loc_pat_and_info_df
452
+