Commit ·
51c6c3d
1
Parent(s): 0c285af
add files
Browse files- handler.py +393 -0
- requirements.txt +0 -0
- utils/__init__.py +0 -0
- utils/eval.py +474 -0
- utils/formatAndPreprocessNewPatterns.py +477 -0
handler.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# handler.py
|
| 2 |
+
import joblib
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import math
|
| 6 |
+
from joblib import Parallel, delayed
|
| 7 |
+
from sklearn.cluster import DBSCAN
|
| 8 |
+
import os # For accessing model path
|
| 9 |
+
|
| 10 |
+
# Import your utility functions
|
| 11 |
+
# Make sure your utils directory is alongside handler.py
|
| 12 |
+
# and contains __init__.py, eval.py, formatAndPreprocessNewPatterns.py
|
| 13 |
+
from utils.eval import intersection_over_union
|
| 14 |
+
from utils.formatAndPreprocessNewPatterns import get_patetrn_name_by_encoding, get_pattern_encoding_by_name, get_reverse_pattern_encoding
|
| 15 |
+
|
| 16 |
+
# --- Global Model Loading (Crucial for performance) ---
|
| 17 |
+
# This model will be loaded ONLY ONCE when the server starts.
|
| 18 |
+
# Ensure the path is correct relative to where handler.py runs in the container.
|
| 19 |
+
# The `MODEL_DIR` env var is automatically set by Inference Endpoints.
|
| 20 |
+
# If you place 'Models/' directly in your repo root, it will be at /repository/Models/
|
| 21 |
+
# If you place it outside (not recommended), you'd need to adjust paths.
|
| 22 |
+
# For simplicity, assume `Models/` is in the root of your HF repo.
|
| 23 |
+
# Path of the serialized MiniRocket + XGB pipeline. MODEL_DIR is injected by
# the hosting environment (Hugging Face Inference Endpoints); falls back to
# the current directory for local runs.
MODEL_PATH = os.path.join(os.environ.get("MODEL_DIR", "."), "Models", "Width Aug OHLC_mini_rocket_xgb.joblib")

# Load the model globally (once per process, at import time).
try:
    print(f"Loading model from: {MODEL_PATH}")
    rocket_model = joblib.load(MODEL_PATH)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # In a real scenario, you might want to raise an exception to prevent the server from starting.
    # Left as None here; InferenceHandler.__init__ raises if it stayed None.
    rocket_model = None

# --- Helper functions (from your provided code) ---
# process_window, parallel_process_sliding_window,
# prepare_dataset_for_cluster and cluster_windows are defined below,
# before locate_patterns/InferenceHandler which depend on them.

# --- Module-level tuning constants (snapshotted onto InferenceHandler) ---
# Reverse lookup {encoding -> pattern name}; built once from the utils module.
pattern_encoding_reversed = get_reverse_pattern_encoding()
# model is now `rocket_model` loaded globally
# plot_count is handled by the API input now

# Log-spaced divisors of the segment length; each yields one candidate
# sliding-window size (segment_len // proportion), from 1x down to 1/20th.
win_size_proportions = np.round(np.logspace(0, np.log10(20), num=10), 2).tolist()
# Fraction of the window placed *before* the anchor index i (see process_window).
padding_proportion = 0.6
# Step (in bars) between consecutive window anchors.
stride = 1
# NOTE(review): despite the name this is a scalar, not a list; cluster_windows
# accepts either form (per-class list or single float) — confirm intent.
probab_threshold_list = 0.5
# If P('No Pattern') exceeds this, the window is forced to 'No Pattern'.
prob_threshold_of_no_pattern_to_mark_as_no_pattern = 0.5
target_len = 30 # Not used in your current code

# DBSCAN parameters for clustering normalized window centers.
eps=0.04
min_samples=3
win_width_proportion=10 # Not used in your current code
|
| 57 |
+
def process_window(i, ohlc_data_segment, rocket_model, probability_threshold, pattern_encoding_reversed,seg_start, seg_end, window_size, padding_proportion,prob_threshold_of_no_pattern_to_mark_as_no_pattern=1):
    """Classify one sliding window anchored at index ``i``.

    The window starts ``ceil(window_size * padding_proportion)`` rows before
    ``i`` and spans ``window_size`` rows, clipped to the segment bounds, so
    windows near the edges may be shorter than ``window_size``.

    Returns a dict with the window's date span, predicted pattern name,
    probability and the enclosing segment's date span, or ``None`` when the
    window is empty or the model call fails.

    Note: ``probability_threshold`` and ``pattern_encoding_reversed`` are
    accepted for signature compatibility but not used here.
    """
    lo = i - math.ceil(window_size * padding_proportion)
    hi = lo + window_size

    # Clip to the segment; done after computing hi, so edge windows shrink.
    lo = max(lo, 0)
    hi = min(hi, len(ohlc_data_segment))

    window = ohlc_data_segment[lo:hi]
    if len(window) == 0:
        return None  # Skip empty segments

    # Shape (1, channels=5, timesteps) as expected by the rocket pipeline.
    features = window[['Open', 'High', 'Low', 'Close','Volume']].to_numpy().reshape(1, len(window), 5)
    features = np.transpose(features, (0, 2, 1))

    try:
        probas = rocket_model.predict_proba(features)
    except Exception as e:
        print(f"Error in prediction: {e}")
        return None

    pred_proba = np.max(probas)
    pred_pattern = get_patetrn_name_by_encoding(np.argmax(probas))

    # Override with 'No Pattern' when its own probability clears the
    # dedicated threshold (default 1 => override disabled).
    no_pattern_proba = probas[0][get_pattern_encoding_by_name ('No Pattern')]
    if no_pattern_proba > prob_threshold_of_no_pattern_to_mark_as_no_pattern:
        pred_proba = no_pattern_proba
        pred_pattern = 'No Pattern'

    return {
        'Start': window['Date'].iloc[0],
        'End': window['Date'].iloc[-1],
        'Chart Pattern': pred_pattern,
        'Seg_Start': seg_start,
        'Seg_End': seg_end,
        'Probability': pred_proba,
    }
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def parallel_process_sliding_window(ohlc_data_segment, rocket_model, probability_threshold, stride, pattern_encoding_reversed, window_size, padding_proportion,prob_threshold_of_no_pattern_to_mark_as_no_pattern=1,parallel=True,num_cores=-1):
    """Run ``process_window`` at every stride position over the segment.

    Args:
        ohlc_data_segment: DataFrame with at least a 'Date' column plus the
            OHLCV columns consumed by ``process_window``.
        rocket_model: classifier exposing ``predict_proba``.
        probability_threshold: forwarded to ``process_window`` (unused there).
        stride: step in rows between consecutive window anchors.
        pattern_encoding_reversed: forwarded to ``process_window``.
        window_size: window length in rows.
        padding_proportion: fraction of the window placed before the anchor.
        prob_threshold_of_no_pattern_to_mark_as_no_pattern: override threshold
            for the 'No Pattern' class (default 1 disables the override).
        parallel: when True, fan out across joblib workers. On small single
            instances inner parallelism may not help; prefer instance scaling.
        num_cores: joblib ``n_jobs`` (-1 = all available cores).

    Returns:
        DataFrame of the non-None rows produced by ``process_window``.
    """
    seg_start = ohlc_data_segment['Date'].iloc[0]
    seg_end = ohlc_data_segment['Date'].iloc[-1]

    anchors = range(0, len(ohlc_data_segment), stride)

    if parallel:
        # verbose=0 to reduce log spam. Named `executor` (not `parallel`) so
        # the boolean parameter is not shadowed.
        with Parallel(n_jobs=num_cores, verbose=0) as executor:
            results = executor(
                delayed(process_window)(
                    i=i,
                    ohlc_data_segment=ohlc_data_segment,
                    rocket_model=rocket_model,
                    probability_threshold=probability_threshold,
                    pattern_encoding_reversed=pattern_encoding_reversed,
                    window_size=window_size,
                    seg_start=seg_start,
                    seg_end=seg_end,
                    padding_proportion=padding_proportion,
                    prob_threshold_of_no_pattern_to_mark_as_no_pattern=prob_threshold_of_no_pattern_to_mark_as_no_pattern
                )
                for i in anchors
            )
    else:
        # BUG FIX: the sequential path previously omitted
        # prob_threshold_of_no_pattern_to_mark_as_no_pattern, so it silently
        # used the default (1) and behaved differently from the parallel path.
        results = [
            process_window(
                i, ohlc_data_segment, rocket_model, probability_threshold,
                pattern_encoding_reversed, seg_start, seg_end, window_size,
                padding_proportion,
                prob_threshold_of_no_pattern_to_mark_as_no_pattern
            )
            for i in anchors
        ]

    return pd.DataFrame([res for res in results if res is not None])
|
| 130 |
+
|
| 131 |
+
def prepare_dataset_for_cluster(ohlc_data_segment, win_results_df):
    """Augment window results with positional columns for clustering.

    For each predicted window, converts its [Start, End] date span into row
    positions within ``ohlc_data_segment`` and records:
    'Pattern_Start_pos' (rows strictly before Start), 'Pattern_End_pos'
    (start pos + span length) and 'Center' (midpoint, possibly .5).

    Returns a copy of ``win_results_df`` with the three columns added.
    """
    enriched = win_results_df.copy()
    dates = ohlc_data_segment['Date']

    for idx, win in enriched.iterrows():
        # Rows preceding the window and rows covered by it (inclusive bounds).
        n_before = int((dates < win['Start']).sum())
        span = int(((dates >= win['Start']) & (dates <= win['End'])).sum())

        enriched.at[idx, 'Center'] = n_before + span / 2
        enriched.at[idx, 'Pattern_Start_pos'] = n_before
        enriched.at[idx, 'Pattern_End_pos'] = n_before + span

    return enriched
|
| 144 |
+
|
| 145 |
+
def cluster_windows(predicted_patterns , probability_threshold, window_size,eps = 0.05 , min_samples = 2):
    """Filter low-probability windows, then cluster the survivors per pattern.

    Windows are grouped by 'Chart Pattern'; within each group their 'Center'
    positions (min-max normalized across the whole filtered frame) are
    clustered with DBSCAN. For every non-noise cluster, the reported span is
    the range of dates covered by at least two member windows.

    Args:
        predicted_patterns: output of prepare_dataset_for_cluster (needs
            'Chart Pattern', 'Probability', 'Center', 'Start', 'End',
            'Seg_Start', 'Seg_End').
        probability_threshold: either a per-class list indexed by pattern
            encoding, or a single float applied to every window.
        window_size: accepted for signature compatibility; not used here.
        eps, min_samples: DBSCAN parameters (eps is in normalized-center units).

    Returns:
        (cluster_labled_windows_df, interseced_clusters_df) or (None, None)
        when filtering/clustering leaves nothing.
    """
    df = predicted_patterns.copy()

    # Per-class thresholds: drop rows of class i whose probability is below
    # probability_threshold[i]; scalar: keep rows strictly above it.
    if isinstance(probability_threshold, list):
        for i in range(len(probability_threshold)):
            pattern_name = get_patetrn_name_by_encoding(i)
            df.drop(df[(df['Chart Pattern'] == pattern_name) & (df['Probability'] < probability_threshold[i])].index, inplace=True)
    else:
        df = df[df['Probability'] > probability_threshold]

    cluster_labled_windows = []
    interseced_clusters = []

    if df.empty: # Handle case where df might be empty after filtering
        return None, None

    # Normalization bounds come from the whole filtered frame, not per group,
    # so eps means the same distance for every pattern class.
    min_center = df['Center'].min()
    max_center = df['Center'].max()

    for pattern, group in df.groupby('Chart Pattern'):
        centers = group['Center'].values.reshape(-1, 1)

        if min_center < max_center:
            norm_centers = (centers - min_center) / (max_center - min_center)
        else:
            # All centers identical: any constant works; DBSCAN sees one point.
            norm_centers = np.ones_like(centers)

        db = DBSCAN(eps=eps, min_samples=min_samples).fit(norm_centers)
        # NOTE(review): assigning into a groupby slice can raise
        # SettingWithCopyWarning; behavior kept as-is.
        group['Cluster'] = db.labels_
        cluster_labled_windows.append(group)

        # Label -1 is DBSCAN noise; summarize each real cluster.
        for cluster_id, cluster_group in group[group['Cluster'] != -1].groupby('Cluster'):
            expanded_dates = []
            for _, row in cluster_group.iterrows():
                dates = pd.date_range(row["Start"], row["End"])
                expanded_dates.extend(dates)

            # Cluster span = dates covered by >= 2 member windows (their
            # pairwise intersection region).
            date_counts = pd.Series(expanded_dates).value_counts().sort_index()
            cluster_start = date_counts[date_counts >= 2].index.min()
            cluster_end = date_counts[date_counts >= 2].index.max()

            interseced_clusters.append({
                'Chart Pattern': pattern,
                'Cluster': cluster_id,
                'Start': cluster_start,
                'End': cluster_end,
                'Seg_Start': cluster_group['Seg_Start'].iloc[0],
                'Seg_End': cluster_group['Seg_End'].iloc[0],
                'Avg_Probability': cluster_group['Probability'].mean(),
            })

    if len(cluster_labled_windows) == 0 or len(interseced_clusters) == 0:
        return None, None

    cluster_labled_windows_df = pd.concat(cluster_labled_windows)
    interseced_clusters_df = pd.DataFrame(interseced_clusters)
    # Restore the original row order disturbed by the groupby.
    cluster_labled_windows_df = cluster_labled_windows_df.sort_index()
    return cluster_labled_windows_df, interseced_clusters_df
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# ========================= locate_patterns function ==========================
|
| 206 |
+
|
| 207 |
+
# This will be your primary inference function called by the HF endpoint.
|
| 208 |
+
class InferenceHandler:
    """Inference entry point that locates chart patterns in OHLCV data.

    Per request: run multi-scale sliding-window classification, cluster each
    scale's detections with DBSCAN, then de-duplicate overlapping detections
    across window sizes by IoU, preferring larger windows unless a smaller one
    is clearly more confident.
    """

    def __init__(self):
        # Model is loaded globally, so it's accessible here
        self.model = rocket_model
        if self.model is None:
            raise ValueError("ML model failed to load during initialization.")

        # Initialize other global parameters here as well
        # (snapshot of the module-level tuning constants).
        self.pattern_encoding_reversed = pattern_encoding_reversed
        self.win_size_proportions = win_size_proportions
        self.padding_proportion = padding_proportion
        self.stride = stride
        self.probab_threshold_list = probab_threshold_list
        self.prob_threshold_of_no_pattern_to_mark_as_no_pattern = prob_threshold_of_no_pattern_to_mark_as_no_pattern
        self.eps = eps
        self.min_samples = min_samples

    def __call__(self, inputs):
        """
        Main inference method for the Hugging Face Inference Endpoint.
        Args:
            inputs: A dictionary or list of dictionaries representing the input data.
                    For your case, this will be the OHLC data sent from Django.
                    Expected format: [{"Date": "YYYY-MM-DD", "Open": ..., "High": ..., ...}, ...]
        Returns:
            A list of dictionaries representing the detected patterns.
        Raises:
            ValueError: when the model is missing, the payload lacks required
                columns, or the resulting frame is empty.
        """
        if not self.model:
            raise ValueError("ML model is not loaded. Cannot perform inference.")

        # Ensure inputs is a list of dictionaries if not already
        if isinstance(inputs, dict):
            inputs = [inputs] # Handle single input dict if needed

        # Convert input (list of dicts) to pandas DataFrame
        try:
            ohlc_data = pd.DataFrame(inputs)
            # Ensure 'Date' is datetime, it might come as string from JSON
            ohlc_data['Date'] = pd.to_datetime(ohlc_data['Date'])
            # Ensure proper columns exist
            required_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
            if not all(col in ohlc_data.columns for col in required_cols):
                raise ValueError(f"Missing required columns in input data. Expected: {required_cols}, Got: {ohlc_data.columns.tolist()}")

        except Exception as e:
            print(f"Error processing input data: {e}")
            raise ValueError(f"Invalid input data format: {e}")

        ohlc_data_segment = ohlc_data.copy()
        seg_len = len(ohlc_data_segment)

        if ohlc_data_segment.empty:
            raise ValueError("OHLC Data segment is empty or invalid after processing.")

        win_results_for_each_size = []
        located_patterns_and_other_info_for_each_size = []
        cluster_labled_windows_list = []

        # Deduplicate window sizes derived from the proportions, and keep a
        # running offset so cluster ids stay unique across window sizes.
        used_win_sizes = []
        win_iteration = 0

        for win_size_proportion in self.win_size_proportions:
            # Window size is the segment length divided by the proportion,
            # floored at 10 bars.
            window_size = seg_len // win_size_proportion
            if window_size < 10:
                window_size = 10
            window_size = int(window_size)
            if window_size in used_win_sizes:
                continue
            used_win_sizes.append(window_size)

            # Pass the globally loaded model `self.model`
            win_results_df = parallel_process_sliding_window(
                ohlc_data_segment,
                self.model,
                self.probab_threshold_list,
                self.stride,
                self.pattern_encoding_reversed,
                window_size,
                self.padding_proportion,
                self.prob_threshold_of_no_pattern_to_mark_as_no_pattern,
                parallel=True, # You might want to test with False/num_cores=1 on HF to avoid internal parallelism issues
                num_cores=-1 # -1 means all available cores; on HF, this will be limited by the instance type
            )

            if win_results_df is None or win_results_df.empty:
                print(f"Window results dataframe is empty for window size {window_size}")
                continue
            win_results_df['Window_Size'] = window_size
            win_results_for_each_size.append(win_results_df)

            predicted_patterns = prepare_dataset_for_cluster(ohlc_data_segment, win_results_df)
            if predicted_patterns is None or predicted_patterns.empty:
                print("Predicted patterns dataframe is empty")
                continue

            # Pass eps and min_samples from handler's state
            cluster_labled_windows_df , interseced_clusters_df = cluster_windows(
                predicted_patterns,
                self.probab_threshold_list,
                window_size,
                eps=self.eps,
                min_samples=self.min_samples
            )

            if cluster_labled_windows_df is None or interseced_clusters_df is None or cluster_labled_windows_df.empty or interseced_clusters_df.empty:
                print("Clustered windows dataframe is empty")
                continue
            # Offset cluster ids (except noise, -1) so ids from different
            # window sizes never collide.
            mask = cluster_labled_windows_df['Cluster'] != -1
            cluster_labled_windows_df.loc[mask, 'Cluster'] = cluster_labled_windows_df.loc[mask, 'Cluster'].astype(int) + win_iteration
            interseced_clusters_df['Cluster'] = interseced_clusters_df['Cluster'].astype(int) + win_iteration
            num_of_unique_clusters = interseced_clusters_df[interseced_clusters_df['Cluster']!=-1]['Cluster'].nunique()
            win_iteration += num_of_unique_clusters
            cluster_labled_windows_list.append(cluster_labled_windows_df)

            # Calc_Start/Calc_End mirror Start/End; downstream IoU filtering
            # operates on these columns.
            interseced_clusters_df['Calc_Start'] = interseced_clusters_df['Start']
            interseced_clusters_df['Calc_End'] = interseced_clusters_df['End']
            located_patterns_and_other_info = interseced_clusters_df.copy()

            if located_patterns_and_other_info is None or located_patterns_and_other_info.empty:
                print("Located patterns and other info dataframe is empty")
                continue
            located_patterns_and_other_info['Window_Size'] = window_size

            located_patterns_and_other_info_for_each_size.append(located_patterns_and_other_info)

        if located_patterns_and_other_info_for_each_size is None or not located_patterns_and_other_info_for_each_size:
            print("Located patterns and other info for each size is empty")
            return [] # Return empty list if no patterns found

        located_patterns_and_other_info_for_each_size_df = pd.concat(located_patterns_and_other_info_for_each_size)

        unique_window_sizes = located_patterns_and_other_info_for_each_size_df['Window_Size'].unique()
        unique_patterns = located_patterns_and_other_info_for_each_size_df['Chart Pattern'].unique()
        # Largest windows first: bigger detections get first claim on a span.
        unique_window_sizes = np.sort(unique_window_sizes)[::-1]

        filtered_loc_pat_and_info_rows_list = []

        # Cross-scale de-duplication: for each detection, look at same-pattern
        # detections whose spans overlap; drop this one when a detection at a
        # larger window size covers it (IoU > 0.6) unless this one is clearly
        # more confident (+0.1 Avg_Probability), and symmetrically defer to a
        # clearly more confident smaller/equal-size detection.
        for chart_pattern in unique_patterns:
            located_patterns_and_other_info_for_each_size_df_chart_pattern = located_patterns_and_other_info_for_each_size_df[located_patterns_and_other_info_for_each_size_df['Chart Pattern'] == chart_pattern]
            for win_size in unique_window_sizes:
                located_patterns_and_other_info_for_each_size_df_win_size_chart_pattern = located_patterns_and_other_info_for_each_size_df_chart_pattern[located_patterns_and_other_info_for_each_size_df_chart_pattern['Window_Size'] == win_size]
                for idx , row in located_patterns_and_other_info_for_each_size_df_win_size_chart_pattern.iterrows():
                    start_date = row['Calc_Start']
                    end_date = row['Calc_End']
                    is_already_included = False
                    # Candidate rows whose span overlaps [start_date, end_date]
                    # (includes the row itself; self-comparison is harmless).
                    intersecting_rows = located_patterns_and_other_info_for_each_size_df_chart_pattern[
                        (located_patterns_and_other_info_for_each_size_df_chart_pattern['Calc_Start'] <= end_date) &
                        (located_patterns_and_other_info_for_each_size_df_chart_pattern['Calc_End'] >= start_date)
                    ]
                    # (redundant re-initialization kept as-is)
                    is_already_included = False
                    for idx2, row2 in intersecting_rows.iterrows():
                        iou = intersection_over_union(start_date, end_date, row2['Calc_Start'], row2['Calc_End'])

                        if iou > 0.6:
                            if row2['Window_Size'] > row['Window_Size']:
                                if (row['Avg_Probability'] - row2['Avg_Probability']) > 0.1:
                                    is_already_included = False
                                else:
                                    is_already_included = True
                                    break
                            elif row['Window_Size'] >= row2['Window_Size']:
                                if (row2['Avg_Probability'] - row['Avg_Probability']) > 0.1:
                                    is_already_included = True
                                    break
                                else:
                                    is_already_included = False

                    if not is_already_included:
                        filtered_loc_pat_and_info_rows_list.append(row)

        filtered_loc_pat_and_info_df = pd.DataFrame(filtered_loc_pat_and_info_rows_list)

        # Convert datetime columns to string format for serialization before returning
        datetime_columns = ['Start', 'End', 'Seg_Start', 'Seg_End', 'Calc_Start', 'Calc_End']
        for col in datetime_columns:
            if col in filtered_loc_pat_and_info_df.columns:
                if pd.api.types.is_datetime64_any_dtype(filtered_loc_pat_and_info_df[col]):
                    filtered_loc_pat_and_info_df[col] = pd.to_datetime(filtered_loc_pat_and_info_df[col]).dt.strftime('%Y-%m-%d')
                elif not filtered_loc_pat_and_info_df[col].empty and isinstance(filtered_loc_pat_and_info_df[col].iloc[0], str):
                    pass
                else:
                    filtered_loc_pat_and_info_df[col] = filtered_loc_pat_and_info_df[col].astype(str)

        # Return as a list of dictionaries (JSON serializable)
        return filtered_loc_pat_and_info_df.to_dict('records')
|
requirements.txt
ADDED
|
Binary file (2.71 kB). View file
|
|
|
utils/__init__.py
ADDED
|
File without changes
|
utils/eval.py
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
# from matplotlib import pyplot as plt
|
| 3 |
+
# from matplotlib.gridspec import GridSpec
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def intersection_over_union(start1, end1, start2, end2):
    """
    Compute Intersection over Union (IoU) between two date ranges.

    Ranges are inclusive on both ends and measured in whole days; disjoint
    ranges yield 0, and a zero-length union also yields 0.
    """
    # Inclusive overlap in days; negative means the ranges are disjoint.
    inter_days = (min(end1, end2) - max(start1, start2)).days + 1
    if inter_days < 0:
        inter_days = 0

    # |A u B| = |A| + |B| - |A n B|, with inclusive lengths.
    len1 = (end1 - start1).days + 1
    len2 = (end2 - start2).days + 1
    union_days = len1 + len2 - inter_days

    if union_days > 0:
        return inter_days / union_days
    return 0  # Avoid division by zero
|
| 17 |
+
|
| 18 |
+
def mean_abselute_error(start1, end1, start2, end2):
    """
    Compute Mean Absolute Error (MAE) between two date ranges.

    The MAE is the average, in days, of the start-date error and the
    end-date error. Returns None (after logging) when any date is missing.

    Note: the function name's spelling is kept for caller compatibility.
    """
    # BUG FIX: `x is pd.NaT` only catches the pandas NaT singleton; pd.isna
    # also catches np.datetime64('NaT'), None and float NaN.
    if pd.isna(start1) or pd.isna(end1) or pd.isna(start2) or pd.isna(end2):
        print("One of the dates is NaT")
        print(f"start1: {start1}, end1: {end1}, start2: {start2}, end2: {end2}")
        return None
    return (abs(start1 - start2).days + abs(end1 - end2).days) / 2
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_model_eval_res(located_patterns_and_other_info_updated_dict,window_results_dict,selected_models,selected_test_patterns_without_no_pattern):
    """Score each model's located patterns against the ground-truth test set.

    For every model and every ground-truth pattern occurrence, a detection
    counts as "properly located" when at least one same-symbol, same-pattern
    detection reaches IoU >= 0.25. For each hit, the best IoU and the best
    (lowest) MAE among the qualifying detections are accumulated per class.

    Args:
        located_patterns_and_other_info_updated_dict: {model_name: DataFrame}
            of detections ('Symbol', 'Chart Pattern', 'Calc_Start', 'Calc_End').
        window_results_dict: {model_name: DataFrame}; looked up per model but
            not otherwise used here (kept for the KeyError side effect /
            interface compatibility).
        selected_models: model names to evaluate.
        selected_test_patterns_without_no_pattern: ground-truth DataFrame with
            'Symbol', 'Chart Pattern', 'Start', 'End'.

    Returns:
        {model_name: {'number_of_properly_located_patterns': {pattern: int},
                      'iou_for_each_properly_detected_pattern': {pattern: float},
                      'mae_for_each_properly_detected_pattern': {pattern: float}}}
        where IoU/MAE values are per-class sums (divide by the count to get means).
    """
    model_eval_results_dict = {}
    for model_name in selected_models:
        print(f"\n Selected model: {model_name}")

        located_patterns_and_other_info_updated_df = located_patterns_and_other_info_updated_dict[model_name]
        window_results_df = window_results_dict[model_name]

        # Per-class accumulators for this model.
        number_of_properly_located_patterns = {}
        iou_for_each_properly_detected_pattern = {}
        mae_for_each_properly_detected_pattern = {}

        # Convert date columns to datetime (once, outside the loop for efficiency)
        located_patterns_and_other_info_updated_df['Calc_Start'] = pd.to_datetime(located_patterns_and_other_info_updated_df['Calc_Start'])
        located_patterns_and_other_info_updated_df['Calc_End'] = pd.to_datetime(located_patterns_and_other_info_updated_df['Calc_End'])

        # Iterate over test patterns with a lightweight progress line
        for index, row in selected_test_patterns_without_no_pattern.iterrows():
            sys.stdout.write(f"\rProcessing row {index + 1}/{len(selected_test_patterns_without_no_pattern)}")
            sys.stdout.flush()
            symbol = row['Symbol']
            chart_pattern = row['Chart Pattern']
            start_date = pd.to_datetime(row['Start']).tz_localize(None)
            end_date = pd.to_datetime(row['End']).tz_localize(None)

            # Filter for matching symbol and chart pattern
            located_patterns_for_this = located_patterns_and_other_info_updated_df[
                (located_patterns_and_other_info_updated_df['Symbol'] == symbol) &
                (located_patterns_and_other_info_updated_df['Chart Pattern'] == chart_pattern)
            ].copy() # Use `.copy()` to avoid SettingWithCopyWarning

            if located_patterns_for_this.empty:
                continue # Skip if no matching rows

            # Compute IoU for each candidate detection
            located_patterns_for_this.loc[:, 'IoU'] = located_patterns_for_this.apply(
                lambda x: intersection_over_union(start_date, end_date, x['Calc_Start'], x['Calc_End']),
                axis=1
            )

            # Compute MAE for each candidate detection
            located_patterns_for_this.loc[:, 'MAE'] = located_patterns_for_this.apply(
                lambda x: mean_abselute_error(start_date, end_date, x['Calc_Start'], x['Calc_End']),
                axis=1
            )

            # Keep detections that pass the IoU acceptance threshold (>= 0.25)
            located_patterns_for_this_proper = located_patterns_for_this[located_patterns_for_this['IoU'] >= 0.25]

            if not located_patterns_for_this_proper.empty:
                number_of_properly_located_patterns[chart_pattern] = number_of_properly_located_patterns.get(chart_pattern, 0) + 1
                iou_for_each_properly_detected_pattern[chart_pattern] = iou_for_each_properly_detected_pattern.get(chart_pattern, 0) + max(located_patterns_for_this_proper['IoU'])
                mae_for_each_properly_detected_pattern[chart_pattern] = mae_for_each_properly_detected_pattern.get(chart_pattern, 0) + min(located_patterns_for_this_proper['MAE'])

        model_eval_results_dict[model_name] = {
            'number_of_properly_located_patterns': number_of_properly_located_patterns,
            'iou_for_each_properly_detected_pattern': iou_for_each_properly_detected_pattern,
            'mae_for_each_properly_detected_pattern': mae_for_each_properly_detected_pattern
        }
    return model_eval_results_dict
|
| 97 |
+
|
| 98 |
+
############################################################################################
|
| 99 |
+
# Evaluate multiple models and plot
|
| 100 |
+
############################################################################################
|
| 101 |
+
# Commenting out plotting functions
|
| 102 |
+
"""
|
| 103 |
+
def create_comprehensive_model_comparison(all_models_metrics):
|
| 104 |
+
|
| 105 |
+
Create a comprehensive visualization comparing all models across all metrics,
|
| 106 |
+
using nested concentric pie charts for Precision and Recall.
|
| 107 |
+
|
| 108 |
+
Parameters:
|
| 109 |
+
-----------
|
| 110 |
+
all_models_metrics : dict
|
| 111 |
+
Dictionary containing metrics for each model
|
| 112 |
+
|
| 113 |
+
models = list(all_models_metrics.keys())
|
| 114 |
+
n_models = len(models)
|
| 115 |
+
|
| 116 |
+
# Define the metrics to include
|
| 117 |
+
key_metrics = {
|
| 118 |
+
'total_recall': 'Recall',
|
| 119 |
+
'total_precision': 'Precision',
|
| 120 |
+
'overall_f1': 'F1 Score',
|
| 121 |
+
'overall_iou': 'IoU',
|
| 122 |
+
'overall_mae': 'MAE'
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
# Create figure with GridSpec for flexible layout
|
| 126 |
+
fig = plt.figure(figsize=(20, 14))
|
| 127 |
+
|
| 128 |
+
# Add main title with enough space for legend below it
|
| 129 |
+
plt.suptitle('Comprehensive Model Evaluation', fontsize=16, y=0.98)
|
| 130 |
+
|
| 131 |
+
# Define a color palette for models
|
| 132 |
+
colors = plt.cm.tab10(np.linspace(0, 1, n_models))
|
| 133 |
+
|
| 134 |
+
# Create a master legend below the title
|
| 135 |
+
legend_handles = [plt.Line2D([0], [0], color=colors[i], lw=4, label=model) for i, model in enumerate(models)]
|
| 136 |
+
fig.legend(
|
| 137 |
+
handles=legend_handles,
|
| 138 |
+
labels=models,
|
| 139 |
+
loc='upper center',
|
| 140 |
+
bbox_to_anchor=(0.5, 0.93), # Moved down from 0.98 to 0.93
|
| 141 |
+
ncol=n_models,
|
| 142 |
+
fontsize=12
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
# Adjust GridSpec to account for the title and legend
|
| 146 |
+
gs = GridSpec(3, 3, figure=fig, height_ratios=[1.2, 1.2, 1], top=0.88) # Reduced top from 0.95 to 0.88
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# 1. Precision Nested Pie Chart - top left
|
| 151 |
+
ax1 = fig.add_subplot(gs[0, 0])
|
| 152 |
+
|
| 153 |
+
# Create a multi-layer nested pie chart for precision
|
| 154 |
+
# Each ring represents a different model
|
| 155 |
+
precision_values = [metrics['total_precision'] for metrics in all_models_metrics.values()]
|
| 156 |
+
|
| 157 |
+
# Calculate radii for each ring (outermost ring is largest)
|
| 158 |
+
radii = np.linspace(0.5, 1.0, n_models+1)[1:] # start from second element to skip 0.5
|
| 159 |
+
|
| 160 |
+
# Plot each model as a ring, outermost = first model
|
| 161 |
+
for i, model in enumerate(models):
|
| 162 |
+
# Create data for this model's ring [precision, 1-precision]
|
| 163 |
+
data = [precision_values[i], 1-precision_values[i]]
|
| 164 |
+
colors_ring = [colors[i], 'lightgray']
|
| 165 |
+
|
| 166 |
+
# Create pie chart for this ring
|
| 167 |
+
wedges, texts = ax1.pie(
|
| 168 |
+
data,
|
| 169 |
+
radius=radii[i],
|
| 170 |
+
colors=colors_ring,
|
| 171 |
+
startangle=90,
|
| 172 |
+
counterclock=False,
|
| 173 |
+
wedgeprops=dict(width=0.15, edgecolor='w')
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
# Add only the value (no model name) to the pie chart wedge
|
| 177 |
+
angle = (wedges[0].theta1 + wedges[0].theta2) / 2
|
| 178 |
+
x = (radii[i] - 0.075) * np.cos(np.radians(angle))
|
| 179 |
+
y = (radii[i] - 0.075) * np.sin(np.radians(angle))
|
| 180 |
+
ax1.text(x, y, f"{precision_values[i]:.3f}",
|
| 181 |
+
ha='center', va='center', fontsize=10, fontweight='bold')
|
| 182 |
+
|
| 183 |
+
# Create center circle for donut effect
|
| 184 |
+
centre_circle = plt.Circle((0, 0), 0.25, fc='white')
|
| 185 |
+
ax1.add_patch(centre_circle)
|
| 186 |
+
|
| 187 |
+
ax1.set_title('Precision Comparison (Higher is Better)')
|
| 188 |
+
ax1.set_aspect('equal')
|
| 189 |
+
|
| 190 |
+
# 2. Recall Nested Pie Chart - top middle
|
| 191 |
+
ax2 = fig.add_subplot(gs[0, 1])
|
| 192 |
+
|
| 193 |
+
# Create a multi-layer nested pie chart for recall
|
| 194 |
+
recall_values = [metrics['total_recall'] for metrics in all_models_metrics.values()]
|
| 195 |
+
|
| 196 |
+
# Plot each model as a ring, outermost = first model
|
| 197 |
+
for i, model in enumerate(models):
|
| 198 |
+
# Create data for this model's ring [recall, 1-recall]
|
| 199 |
+
data = [recall_values[i], 1-recall_values[i]]
|
| 200 |
+
colors_ring = [colors[i], 'lightgray']
|
| 201 |
+
|
| 202 |
+
# Create pie chart for this ring
|
| 203 |
+
wedges, texts = ax2.pie(
|
| 204 |
+
data,
|
| 205 |
+
radius=radii[i],
|
| 206 |
+
colors=colors_ring,
|
| 207 |
+
startangle=90,
|
| 208 |
+
counterclock=False,
|
| 209 |
+
wedgeprops=dict(width=0.15, edgecolor='w')
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# Add only the value (no model name) to the pie chart wedge
|
| 213 |
+
angle = (wedges[0].theta1 + wedges[0].theta2) / 2
|
| 214 |
+
x = (radii[i] - 0.075) * np.cos(np.radians(angle))
|
| 215 |
+
y = (radii[i] - 0.075) * np.sin(np.radians(angle))
|
| 216 |
+
ax2.text(x, y, f"{recall_values[i]:.3f}",
|
| 217 |
+
ha='center', va='center', fontsize=10, fontweight='bold')
|
| 218 |
+
|
| 219 |
+
# Create center circle for donut effect
|
| 220 |
+
centre_circle = plt.Circle((0, 0), 0.25, fc='white')
|
| 221 |
+
ax2.add_patch(centre_circle)
|
| 222 |
+
|
| 223 |
+
ax2.set_title('Recall Comparison (Higher is Better)')
|
| 224 |
+
ax2.set_aspect('equal')
|
| 225 |
+
|
| 226 |
+
# 3. F1 Score and IoU - top right
|
| 227 |
+
ax3 = fig.add_subplot(gs[0, 2])
|
| 228 |
+
|
| 229 |
+
# Prepare data for grouped bar chart
|
| 230 |
+
metrics_to_plot = ['overall_f1', 'overall_iou']
|
| 231 |
+
x = np.arange(len(metrics_to_plot))
|
| 232 |
+
width = 0.8 / n_models
|
| 233 |
+
|
| 234 |
+
# Plot grouped bars for each model
|
| 235 |
+
for i, (model_name, metrics) in enumerate(all_models_metrics.items()):
|
| 236 |
+
values = [metrics[key] for key in metrics_to_plot]
|
| 237 |
+
bars = ax3.bar(x + i*width - width*(n_models-1)/2, values, width, color=colors[i])
|
| 238 |
+
|
| 239 |
+
# Add value labels above each bar
|
| 240 |
+
for bar, value in zip(bars, values):
|
| 241 |
+
height = bar.get_height()
|
| 242 |
+
ax3.text(bar.get_x() + bar.get_width()/2., height + 0.01,
|
| 243 |
+
f'{value:.3f}', ha='center', va='bottom', fontsize=9, rotation=0)
|
| 244 |
+
|
| 245 |
+
# Customize the plot
|
| 246 |
+
ax3.set_xticks(x)
|
| 247 |
+
ax3.set_xticklabels([key_metrics[key] for key in metrics_to_plot])
|
| 248 |
+
ax3.set_ylabel('Score')
|
| 249 |
+
ax3.set_title('F1 Score & IoU Comparison (Higher is Better)')
|
| 250 |
+
ax3.set_ylim(0, 1.0)
|
| 251 |
+
ax3.grid(axis='y', linestyle='--', alpha=0.7)
|
| 252 |
+
|
| 253 |
+
# 4. MAE comparison (separate bar chart) - middle left
|
| 254 |
+
ax4 = fig.add_subplot(gs[1, 0])
|
| 255 |
+
|
| 256 |
+
mae_values = [metrics['overall_mae'] for metrics in all_models_metrics.values()]
|
| 257 |
+
bars = ax4.bar(models, mae_values, color=colors)
|
| 258 |
+
|
| 259 |
+
# Add value labels above MAE bars
|
| 260 |
+
for bar, value in zip(bars, mae_values):
|
| 261 |
+
height = bar.get_height()
|
| 262 |
+
ax4.text(bar.get_x() + bar.get_width()/2., height + 0.01,
|
| 263 |
+
f'{value:.3f}', ha='center', va='bottom', fontsize=9)
|
| 264 |
+
|
| 265 |
+
ax4.set_ylabel('Error')
|
| 266 |
+
ax4.set_title('Mean Absolute Error (Lower is Better)')
|
| 267 |
+
ax4.grid(axis='y', linestyle='--', alpha=0.7)
|
| 268 |
+
|
| 269 |
+
# 5. Model metrics radar chart - middle center
|
| 270 |
+
ax5 = fig.add_subplot(gs[1, 1], polar=True)
|
| 271 |
+
|
| 272 |
+
# Setup for radar chart
|
| 273 |
+
metrics_for_radar = ['total_recall', 'total_precision', 'overall_f1', 'overall_iou']
|
| 274 |
+
num_vars = len(metrics_for_radar)
|
| 275 |
+
angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False).tolist()
|
| 276 |
+
angles += angles[:1] # Close the loop
|
| 277 |
+
|
| 278 |
+
# Plot each model on the radar chart
|
| 279 |
+
for i, (model_name, metrics) in enumerate(all_models_metrics.items()):
|
| 280 |
+
values = [metrics[metric] for metric in metrics_for_radar]
|
| 281 |
+
values += values[:1] # Close the loop
|
| 282 |
+
|
| 283 |
+
ax5.plot(angles, values, linewidth=2, linestyle='solid', color=colors[i])
|
| 284 |
+
ax5.fill(angles, values, alpha=0.1, color=colors[i])
|
| 285 |
+
|
| 286 |
+
# Set radar chart labels
|
| 287 |
+
ax5.set_xticks(angles[:-1])
|
| 288 |
+
ax5.set_xticklabels([key_metrics[metric] for metric in metrics_for_radar])
|
| 289 |
+
ax5.set_ylim(0, 1)
|
| 290 |
+
ax5.set_title('Model Performance Radar Chart')
|
| 291 |
+
|
| 292 |
+
# 6. Model comparison bar - middle right
|
| 293 |
+
ax6 = fig.add_subplot(gs[1, 2])
|
| 294 |
+
|
| 295 |
+
# Calculate the average of the four main metrics for an overall score
|
| 296 |
+
# (excluding MAE which is inverse, lower is better)
|
| 297 |
+
overall_scores = []
|
| 298 |
+
for model_name, metrics in all_models_metrics.items():
|
| 299 |
+
score = (metrics['total_recall'] + metrics['total_precision'] +
|
| 300 |
+
metrics['overall_f1'] + metrics['overall_iou']) / 4
|
| 301 |
+
overall_scores.append(score)
|
| 302 |
+
|
| 303 |
+
# Create horizontal bar chart
|
| 304 |
+
y_pos = np.arange(len(models))
|
| 305 |
+
ax6.barh(y_pos, overall_scores, color=colors)
|
| 306 |
+
ax6.set_yticks(y_pos)
|
| 307 |
+
ax6.set_yticklabels(models)
|
| 308 |
+
ax6.invert_yaxis() # labels read top-to-bottom
|
| 309 |
+
ax6.set_xlabel('Overall Performance Score')
|
| 310 |
+
ax6.set_title('Overall Model Comparison (Higher is Better)')
|
| 311 |
+
|
| 312 |
+
# Add value labels
|
| 313 |
+
for i, v in enumerate(overall_scores):
|
| 314 |
+
ax6.text(v + 0.01, i, f'{v:.3f}', va='center')
|
| 315 |
+
|
| 316 |
+
# 7. Detailed per-model metrics table - bottom span all columns
|
| 317 |
+
ax7 = fig.add_subplot(gs[2, :])
|
| 318 |
+
ax7.axis('tight')
|
| 319 |
+
ax7.axis('off')
|
| 320 |
+
|
| 321 |
+
# Prepare table data
|
| 322 |
+
table_data = []
|
| 323 |
+
for model_name, metrics in all_models_metrics.items():
|
| 324 |
+
row = [model_name]
|
| 325 |
+
for key in key_metrics:
|
| 326 |
+
row.append(f"{metrics[key]:.4f}")
|
| 327 |
+
table_data.append(row)
|
| 328 |
+
|
| 329 |
+
# Create table
|
| 330 |
+
column_labels = ['Model'] + list(key_metrics.values())
|
| 331 |
+
table = ax7.table(
|
| 332 |
+
cellText=table_data,
|
| 333 |
+
colLabels=column_labels,
|
| 334 |
+
loc='center',
|
| 335 |
+
cellLoc='center'
|
| 336 |
+
)
|
| 337 |
+
table.auto_set_font_size(False)
|
| 338 |
+
table.set_fontsize(10)
|
| 339 |
+
table.scale(1, 1.5)
|
| 340 |
+
ax7.set_title('Model Metrics Summary Table')
|
| 341 |
+
|
| 342 |
+
plt.tight_layout(rect=[0, 0.03, 1, 0.88]) # Adjusted rect to account for title and legend
|
| 343 |
+
|
| 344 |
+
plt.show()
|
| 345 |
+
|
| 346 |
+
return fig
|
| 347 |
+
|
| 348 |
+
# The evaluate_model and evaluate_all_models functions remain unchanged
|
| 349 |
+
# The evaluate_model and evaluate_all_models functions remain unchanged
|
| 350 |
+
# The evaluate_model function remains unchanged from your second code snippet
|
| 351 |
+
def evaluate_model(model_name, model_eval_results_dict, pattern_row_count, test_patterns, located_patterns_and_other_info_updated_dict):
|
| 352 |
+
Evaluate a model and calculate metrics without redundant plots
|
| 353 |
+
print(f"\n{'='*20} Model: {model_name} {'='*20}")
|
| 354 |
+
|
| 355 |
+
# Extract model results
|
| 356 |
+
number_of_properly_located_patterns = model_eval_results_dict[model_name]['number_of_properly_located_patterns']
|
| 357 |
+
located_patterns_df = located_patterns_and_other_info_updated_dict[model_name]
|
| 358 |
+
mae_for_each_properly_detected_pattern = model_eval_results_dict[model_name]['mae_for_each_properly_detected_pattern']
|
| 359 |
+
iou_for_each_properly_detected_pattern = model_eval_results_dict[model_name]['iou_for_each_properly_detected_pattern']
|
| 360 |
+
|
| 361 |
+
# Calculate metrics without plotting
|
| 362 |
+
# Recall
|
| 363 |
+
total_number_of_all_patterns = sum(pattern_row_count.values())
|
| 364 |
+
total_number_of_properly_located_patterns = sum(number_of_properly_located_patterns.values())
|
| 365 |
+
total_recall = total_number_of_properly_located_patterns / total_number_of_all_patterns if total_number_of_all_patterns > 0 else 0
|
| 366 |
+
|
| 367 |
+
per_pattern_recall = {}
|
| 368 |
+
for pattern, count in number_of_properly_located_patterns.items():
|
| 369 |
+
pattern_count = test_patterns[test_patterns['Chart Pattern'] == pattern].shape[0]
|
| 370 |
+
if pattern_count > 0:
|
| 371 |
+
per_pattern_recall[pattern] = count / pattern_count
|
| 372 |
+
else:
|
| 373 |
+
per_pattern_recall[pattern] = 0
|
| 374 |
+
|
| 375 |
+
# Precision
|
| 376 |
+
total_number_of_all_located_patterns = len(located_patterns_df)
|
| 377 |
+
total_precision = total_number_of_properly_located_patterns / total_number_of_all_located_patterns if total_number_of_all_located_patterns > 0 else 0
|
| 378 |
+
|
| 379 |
+
per_pattern_precision = {}
|
| 380 |
+
for pattern, count in number_of_properly_located_patterns.items():
|
| 381 |
+
pattern_predictions = located_patterns_df[located_patterns_df['Chart Pattern'] == pattern].shape[0]
|
| 382 |
+
if pattern_predictions > 0:
|
| 383 |
+
per_pattern_precision[pattern] = count / pattern_predictions
|
| 384 |
+
else:
|
| 385 |
+
per_pattern_precision[pattern] = 0
|
| 386 |
+
|
| 387 |
+
# F1 Score
|
| 388 |
+
per_pattern_f1 = {}
|
| 389 |
+
for pattern in per_pattern_recall.keys():
|
| 390 |
+
precision = per_pattern_precision.get(pattern, 0)
|
| 391 |
+
recall = per_pattern_recall.get(pattern, 0)
|
| 392 |
+
if precision + recall > 0:
|
| 393 |
+
per_pattern_f1[pattern] = 2 * (precision * recall) / (precision + recall)
|
| 394 |
+
else:
|
| 395 |
+
per_pattern_f1[pattern] = 0
|
| 396 |
+
|
| 397 |
+
all_precisions = list(per_pattern_precision.values())
|
| 398 |
+
all_recalls = list(per_pattern_recall.values())
|
| 399 |
+
avg_precision = sum(all_precisions) / len(all_precisions) if all_precisions else 0
|
| 400 |
+
avg_recall = sum(all_recalls) / len(all_recalls) if all_recalls else 0
|
| 401 |
+
|
| 402 |
+
if avg_precision + avg_recall == 0:
|
| 403 |
+
overall_f1 = 0
|
| 404 |
+
else:
|
| 405 |
+
overall_f1 = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
|
| 406 |
+
|
| 407 |
+
# MAE
|
| 408 |
+
per_pattern_mae = {}
|
| 409 |
+
for pattern, count in number_of_properly_located_patterns.items():
|
| 410 |
+
if count > 0:
|
| 411 |
+
per_pattern_mae[pattern] = mae_for_each_properly_detected_pattern.get(pattern, 0) / count
|
| 412 |
+
else:
|
| 413 |
+
per_pattern_mae[pattern] = 0
|
| 414 |
+
|
| 415 |
+
total_mae_sum = sum(mae_for_each_properly_detected_pattern.values())
|
| 416 |
+
total_proper_patterns = sum(number_of_properly_located_patterns.values())
|
| 417 |
+
overall_mae = total_mae_sum / total_proper_patterns if total_proper_patterns > 0 else 0
|
| 418 |
+
|
| 419 |
+
# IoU
|
| 420 |
+
per_pattern_iou = {}
|
| 421 |
+
for pattern, count in number_of_properly_located_patterns.items():
|
| 422 |
+
if count > 0:
|
| 423 |
+
per_pattern_iou[pattern] = iou_for_each_properly_detected_pattern.get(pattern, 0) / count
|
| 424 |
+
else:
|
| 425 |
+
per_pattern_iou[pattern] = 0
|
| 426 |
+
|
| 427 |
+
total_iou_sum = sum(iou_for_each_properly_detected_pattern.values())
|
| 428 |
+
overall_iou = total_iou_sum / total_proper_patterns if total_proper_patterns > 0 else 0
|
| 429 |
+
|
| 430 |
+
# Print summary of metrics
|
| 431 |
+
print(f"Overall Recall: {total_recall:.4f}")
|
| 432 |
+
print(f"Overall Precision: {total_precision:.4f}")
|
| 433 |
+
print(f"Overall F1 Score: {overall_f1:.4f}")
|
| 434 |
+
print(f"Overall Mean Absolute Error: {overall_mae:.4f}")
|
| 435 |
+
print(f"Overall Mean Intersection over Union: {overall_iou:.4f}")
|
| 436 |
+
|
| 437 |
+
# Store all metrics in one place for easy access
|
| 438 |
+
metrics_summary = {
|
| 439 |
+
'total_recall': total_recall,
|
| 440 |
+
'per_pattern_recall': per_pattern_recall,
|
| 441 |
+
'total_precision': total_precision,
|
| 442 |
+
'per_pattern_precision': per_pattern_precision,
|
| 443 |
+
'overall_f1': overall_f1,
|
| 444 |
+
'per_pattern_f1': per_pattern_f1,
|
| 445 |
+
'overall_mae': overall_mae,
|
| 446 |
+
'per_pattern_mae': per_pattern_mae,
|
| 447 |
+
'overall_iou': overall_iou,
|
| 448 |
+
'per_pattern_iou': per_pattern_iou
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
return metrics_summary
|
| 452 |
+
|
| 453 |
+
# Updated evaluate_all_models function that only creates the comprehensive plot
|
| 454 |
+
def evaluate_all_models(model_eval_results_dict, pattern_row_count, test_patterns, located_patterns_and_other_info_updated_dict):
|
| 455 |
+
Evaluate all models and return metrics summary with comprehensive plot only
|
| 456 |
+
all_models_metrics = {}
|
| 457 |
+
|
| 458 |
+
for model_name in model_eval_results_dict.keys():
|
| 459 |
+
all_models_metrics[model_name] = evaluate_model(
|
| 460 |
+
model_name,
|
| 461 |
+
model_eval_results_dict,
|
| 462 |
+
pattern_row_count,
|
| 463 |
+
test_patterns,
|
| 464 |
+
located_patterns_and_other_info_updated_dict
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
# Only create the comprehensive visualization
|
| 468 |
+
if len(model_eval_results_dict) > 0:
|
| 469 |
+
print("\n--- Comprehensive Model Comparison ---")
|
| 470 |
+
# figure = create_comprehensive_model_comparison(all_models_metrics)
|
| 471 |
+
|
| 472 |
+
return all_models_metrics, None # Return None instead of figure
|
| 473 |
+
"""
|
| 474 |
+
###########################################################################################################
|
utils/formatAndPreprocessNewPatterns.py
ADDED
|
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import the necessary libraries
|
| 2 |
+
from multiprocessing import Manager, Value
|
| 3 |
+
import os
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from joblib import Parallel, delayed
|
| 7 |
+
import math
|
| 8 |
+
from scipy import interpolate
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
from utils.drawPlots import plot_ohlc_segment
|
| 12 |
+
|
| 13 |
+
original_pattern_name_list = [
|
| 14 |
+
'Double Top, Adam and Adam',
|
| 15 |
+
'Double Top, Adam and Eve',
|
| 16 |
+
'Double Top, Eve and Eve',
|
| 17 |
+
'Double Top, Eve and Adam',
|
| 18 |
+
'Double Bottom, Adam and Adam',
|
| 19 |
+
'Double Bottom, Eve and Adam',
|
| 20 |
+
'Double Bottom, Eve and Eve',
|
| 21 |
+
'Double Bottom, Adam and Eve',
|
| 22 |
+
'Triangle, symmetrical',
|
| 23 |
+
'Head-and-shoulders top',
|
| 24 |
+
'Head-and-shoulders bottom',
|
| 25 |
+
'Flag, high and tight'
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
# Updated pattern encoding
|
| 29 |
+
pattern_encoding = {
|
| 30 |
+
'Double Top': 0,
|
| 31 |
+
'Double Bottom': 1,
|
| 32 |
+
'Triangle, symmetrical': 2,
|
| 33 |
+
'Head-and-shoulders top': 3,
|
| 34 |
+
'Head-and-shoulders bottom': 4,
|
| 35 |
+
'Flag, high and tight': 5,
|
| 36 |
+
'No Pattern': 6
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
def get_pattern_encoding():
|
| 40 |
+
return pattern_encoding
|
| 41 |
+
|
| 42 |
+
def get_reverse_pattern_encoding():
|
| 43 |
+
return {v: k for k, v in pattern_encoding.items()}
|
| 44 |
+
|
| 45 |
+
def get_patetrn_name_by_encoding(encoding):
|
| 46 |
+
"""
|
| 47 |
+
Get the pattern name by encoding.
|
| 48 |
+
|
| 49 |
+
# Input:
|
| 50 |
+
- encoding (int): The encoding of the pattern.
|
| 51 |
+
|
| 52 |
+
# Returns:
|
| 53 |
+
- str: The name of the pattern.
|
| 54 |
+
"""
|
| 55 |
+
return get_reverse_pattern_encoding().get(encoding, 'Unknown Pattern')
|
| 56 |
+
|
| 57 |
+
def get_pattern_encoding_by_name(name):
|
| 58 |
+
"""
|
| 59 |
+
Get the pattern encoding by name.
|
| 60 |
+
|
| 61 |
+
# Input:
|
| 62 |
+
- name (str): The name of the pattern.
|
| 63 |
+
|
| 64 |
+
# Returns:
|
| 65 |
+
- int: The encoding of the pattern.
|
| 66 |
+
"""
|
| 67 |
+
return get_pattern_encoding().get(name, -1)
|
| 68 |
+
|
| 69 |
+
def get_pattern_list():
|
| 70 |
+
return list(pattern_encoding.keys())
|
| 71 |
+
|
| 72 |
+
def filter_to_get_selected_patterns(df):
|
| 73 |
+
# Filter dataframe to only include selected patterns
|
| 74 |
+
df = df[df['Chart Pattern'].isin(original_pattern_name_list)].copy() # Explicit copy to avoid warning
|
| 75 |
+
|
| 76 |
+
# Replace all variations of Double Top and Double Bottom with simplified names
|
| 77 |
+
double_top_variations = {
|
| 78 |
+
'Double Top, Adam and Adam': 'Double Top',
|
| 79 |
+
'Double Top, Adam and Eve': 'Double Top',
|
| 80 |
+
'Double Top, Eve and Eve': 'Double Top',
|
| 81 |
+
'Double Top, Eve and Adam': 'Double Top'
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
double_bottom_variations = {
|
| 85 |
+
'Double Bottom, Adam and Adam': 'Double Bottom',
|
| 86 |
+
'Double Bottom, Eve and Adam': 'Double Bottom',
|
| 87 |
+
'Double Bottom, Eve and Eve': 'Double Bottom',
|
| 88 |
+
'Double Bottom, Adam and Eve': 'Double Bottom'
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
# Combine all variations into a single mapping
|
| 92 |
+
pattern_mapping = {**double_top_variations, **double_bottom_variations}
|
| 93 |
+
|
| 94 |
+
# Use .loc[] to modify the dataframe safely
|
| 95 |
+
df.loc[:, 'Chart Pattern'] = df['Chart Pattern'].replace(pattern_mapping)
|
| 96 |
+
|
| 97 |
+
return df
|
| 98 |
+
|
| 99 |
+
def normalize_dataset(dataset):
|
| 100 |
+
# calculate the min values from Low column and max values from High column for each instance
|
| 101 |
+
min_low = dataset.groupby(level='Instance')['Low'].transform('min')
|
| 102 |
+
max_high = dataset.groupby(level='Instance')['High'].transform('max')
|
| 103 |
+
|
| 104 |
+
# OHLC columns to normalize
|
| 105 |
+
ohlc_columns = ['Open', 'High', 'Low', 'Close']
|
| 106 |
+
|
| 107 |
+
dataset_normalized = dataset.copy()
|
| 108 |
+
|
| 109 |
+
# Apply the normalization formula to all columns in one go
|
| 110 |
+
dataset_normalized[ohlc_columns] = (dataset_normalized[ohlc_columns] - min_low.values[:, None]) / (max_high.values[:, None] - min_low.values[:, None])
|
| 111 |
+
|
| 112 |
+
# if there is a Volume column normalize it
|
| 113 |
+
if 'Volume' in dataset.columns:
|
| 114 |
+
# calculate the min values from Volume column and max values from Volume column for each instance
|
| 115 |
+
min_volume = dataset.groupby(level='Instance')['Volume'].transform('min')
|
| 116 |
+
max_volume = dataset.groupby(level='Instance')['Volume'].transform('max')
|
| 117 |
+
|
| 118 |
+
# Normalize the Volume column
|
| 119 |
+
dataset_normalized['Volume'] = (dataset_normalized['Volume'] - min_volume.values) / (max_volume.values - min_volume)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
return dataset_normalized
|
| 123 |
+
|
| 124 |
+
def normalize_ohlc_segment(dataset):
|
| 125 |
+
# calculate the min values from Low column and max values from High column for each instance
|
| 126 |
+
min_low = dataset['Low'].min()
|
| 127 |
+
max_high = dataset['High'].max()
|
| 128 |
+
|
| 129 |
+
# OHLC columns to normalize
|
| 130 |
+
ohlc_columns = ['Open', 'High', 'Low', 'Close']
|
| 131 |
+
|
| 132 |
+
dataset_normalized = dataset.copy()
|
| 133 |
+
|
| 134 |
+
if (max_high - min_low) != 0:
|
| 135 |
+
# Apply the normalization formula to all columns in one go
|
| 136 |
+
dataset_normalized[ohlc_columns] = (dataset_normalized[ohlc_columns] - min_low) / (max_high - min_low)
|
| 137 |
+
else :
|
| 138 |
+
print("Error: Max high and min low are equal")
|
| 139 |
+
|
| 140 |
+
# if there is a Volume column normalize it
|
| 141 |
+
if 'Volume' in dataset.columns:
|
| 142 |
+
# calculate the min values from Volume column and max values from Volume column for each instance
|
| 143 |
+
min_volume = dataset['Volume'].min()
|
| 144 |
+
max_volume = dataset['Volume'].max()
|
| 145 |
+
|
| 146 |
+
if (max_volume - min_volume) != 0:
|
| 147 |
+
# Normalize the Volume column
|
| 148 |
+
dataset_normalized['Volume'] = (dataset_normalized['Volume'] - min_volume) / (max_volume - min_volume)
|
| 149 |
+
else:
|
| 150 |
+
print("Error: Max volume and min volume are equal")
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
return dataset_normalized
|
| 154 |
+
|
| 155 |
+
def process_row_improved(idx, row, ohlc_df, instance_counter, lock, successful_instances, instance_index_mapping):
|
| 156 |
+
try:
|
| 157 |
+
# Extract info and filter data
|
| 158 |
+
start_date = pd.to_datetime(row['Start'])
|
| 159 |
+
end_date = pd.to_datetime(row['End'])
|
| 160 |
+
|
| 161 |
+
symbol_df_filtered = ohlc_df[(ohlc_df['Date'] >= start_date) &
|
| 162 |
+
(ohlc_df['Date'] <= end_date)]
|
| 163 |
+
|
| 164 |
+
if symbol_df_filtered.empty:
|
| 165 |
+
print(f"Empty result for {row['Symbol']} from {start_date} to {end_date}")
|
| 166 |
+
return None
|
| 167 |
+
|
| 168 |
+
# Get unique instance ID
|
| 169 |
+
with lock:
|
| 170 |
+
unique_instance = instance_counter.value
|
| 171 |
+
instance_counter.value += 1
|
| 172 |
+
|
| 173 |
+
# Explicitly add to instance_index_mapping using string key conversion
|
| 174 |
+
instance_index_mapping[unique_instance] = idx
|
| 175 |
+
|
| 176 |
+
# Track successful instances
|
| 177 |
+
successful_instances.append(unique_instance)
|
| 178 |
+
|
| 179 |
+
# Setup MultiIndex
|
| 180 |
+
symbol_df_filtered = symbol_df_filtered.reset_index(drop=True)
|
| 181 |
+
multi_index = pd.MultiIndex.from_arrays(
|
| 182 |
+
[[unique_instance] * len(symbol_df_filtered), range(len(symbol_df_filtered))],
|
| 183 |
+
names=["Instance", "Time"]
|
| 184 |
+
)
|
| 185 |
+
symbol_df_filtered.index = multi_index
|
| 186 |
+
|
| 187 |
+
# Set index levels to proper types
|
| 188 |
+
symbol_df_filtered.index = symbol_df_filtered.index.set_levels(
|
| 189 |
+
symbol_df_filtered.index.levels[0].astype('int'), level=0
|
| 190 |
+
)
|
| 191 |
+
symbol_df_filtered.index = symbol_df_filtered.index.set_levels(
|
| 192 |
+
symbol_df_filtered.index.levels[1].astype('int64'), level=1
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# Add pattern and clean up
|
| 196 |
+
symbol_df_filtered['Pattern'] = pattern_encoding[row['Chart Pattern']]
|
| 197 |
+
symbol_df_filtered.drop('Date', axis=1, inplace=True)
|
| 198 |
+
if 'Adj Close' in symbol_df_filtered.columns:
|
| 199 |
+
symbol_df_filtered.drop('Adj Close', axis=1, inplace=True)
|
| 200 |
+
|
| 201 |
+
# Normalize
|
| 202 |
+
symbol_df_filtered = normalize_ohlc_segment(symbol_df_filtered)
|
| 203 |
+
|
| 204 |
+
return symbol_df_filtered
|
| 205 |
+
|
| 206 |
+
except Exception as e:
|
| 207 |
+
print(f"Error processing {row['Symbol']}: {str(e)}")
|
| 208 |
+
return None
|
| 209 |
+
|
| 210 |
+
def dataset_format(filteredPatternDf, give_instance_index_mapping=False):
|
| 211 |
+
"""
|
| 212 |
+
Formats and preprocesses the dataset with better tracking of successful instances.
|
| 213 |
+
"""
|
| 214 |
+
# Get symbol list from files
|
| 215 |
+
folder_path = 'Datasets/OHLC data/'
|
| 216 |
+
file_list = os.listdir(folder_path)
|
| 217 |
+
symbol_list = [file[:-4] for file in file_list if file.endswith('.csv')]
|
| 218 |
+
|
| 219 |
+
# Check for missing symbols
|
| 220 |
+
symbols_in_df = filteredPatternDf['Symbol'].unique()
|
| 221 |
+
missing_symbols = set(symbols_in_df) - set(symbol_list)
|
| 222 |
+
if missing_symbols:
|
| 223 |
+
print("Missing symbols: ", missing_symbols)
|
| 224 |
+
|
| 225 |
+
# Create a list of tasks (symbol, row pairs)
|
| 226 |
+
tasks = []
|
| 227 |
+
for symbol in symbols_in_df:
|
| 228 |
+
if symbol in symbol_list: # Skip missing symbols
|
| 229 |
+
filteredPatternDf_for_symbol = filteredPatternDf[filteredPatternDf['Symbol'] == symbol]
|
| 230 |
+
file_path = os.path.join(folder_path, f"{symbol}.csv")
|
| 231 |
+
|
| 232 |
+
# Pre-load symbol data
|
| 233 |
+
try:
|
| 234 |
+
symbol_df = pd.read_csv(file_path)
|
| 235 |
+
symbol_df['Date'] = pd.to_datetime(symbol_df['Date'])
|
| 236 |
+
symbol_df['Date'] = symbol_df['Date'].dt.tz_localize(None)
|
| 237 |
+
|
| 238 |
+
for idx, row in filteredPatternDf_for_symbol.iterrows():
|
| 239 |
+
tasks.append((idx, row, symbol_df))
|
| 240 |
+
except Exception as e:
|
| 241 |
+
print(f"Error loading {symbol}: {str(e)}")
|
| 242 |
+
|
| 243 |
+
print(f"Processing {len(tasks)} tasks in parallel...")
|
| 244 |
+
|
| 245 |
+
# Process all tasks with instance tracking
|
| 246 |
+
with Manager() as manager:
|
| 247 |
+
instance_counter = manager.Value('i', 0)
|
| 248 |
+
lock = manager.Lock()
|
| 249 |
+
successful_instances = manager.list() # Track which instances succeed
|
| 250 |
+
instance_index_mapping = manager.dict() # Mapping from instance ID to index
|
| 251 |
+
|
| 252 |
+
results = Parallel(n_jobs=-1, verbose=1)(
|
| 253 |
+
delayed(process_row_improved)(task_idx, row, df, instance_counter, lock, successful_instances, instance_index_mapping)
|
| 254 |
+
for task_idx, row, df in tasks
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
# Filter out None results
|
| 258 |
+
results = [result for result in results if result is not None]
|
| 259 |
+
|
| 260 |
+
print(f"Total tasks: {len(tasks)}, Successful: {len(results)}")
|
| 261 |
+
print(f"Instance counter final value: {instance_counter.value}")
|
| 262 |
+
print(f"Number of successful instances: {len(successful_instances)}")
|
| 263 |
+
|
| 264 |
+
# # Debug print for mapping
|
| 265 |
+
# print("Debug - Instance Index Mapping:")
|
| 266 |
+
# for k, v in instance_index_mapping.items():
|
| 267 |
+
# print(f"Key: {k}, Value: {v}")
|
| 268 |
+
|
| 269 |
+
if len(successful_instances) < instance_counter.value:
|
| 270 |
+
print("Warning: Some instances were assigned but their tasks failed")
|
| 271 |
+
|
| 272 |
+
# Concatenate results and renumber instances if needed
|
| 273 |
+
if results:
|
| 274 |
+
dataset = pd.concat(results)
|
| 275 |
+
dataset = dataset.sort_index(level=0)
|
| 276 |
+
|
| 277 |
+
# Replace inf/nan values
|
| 278 |
+
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
|
| 279 |
+
dataset.fillna(method='ffill', inplace=True)
|
| 280 |
+
|
| 281 |
+
if give_instance_index_mapping:
|
| 282 |
+
# Convert manager.dict to a regular dictionary
|
| 283 |
+
instance_index_mapping_dict = dict(instance_index_mapping)
|
| 284 |
+
|
| 285 |
+
print("Converted Mapping:", instance_index_mapping_dict)
|
| 286 |
+
return dataset, instance_index_mapping_dict
|
| 287 |
+
else:
|
| 288 |
+
return dataset
|
| 289 |
+
else:
|
| 290 |
+
return pd.DataFrame()
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def width_augmentation (filteredPatternDf, min_aug_len , aug_len_fraction, make_duplicates = False , keep_original = False):
    """
    Perform width augmentation on the filtered pattern DataFrame by randomly
    widening each pattern's [Start, End] date window on both sides.

    # Input:
    - filteredPatternDf (pd.DataFrame): The filtered pattern DataFrame; must
      contain 'Symbol', 'Start', 'End' and 'Chart Pattern' columns.
    - min_aug_len (int): The minimum length of the augmented data.
    - aug_len_fraction (float): The fraction of the original data size used to
      determine the maximum length of the augmented data.
    - make_duplicates (bool): Whether to emit multiple augmented copies of
      under-represented patterns to reduce dataset imbalance (keep this False
      on test data).
    - keep_original (bool): Whether to also keep the original patterns in the
      augmented DataFrame.

    # Returns:
    - filteredPattern_width_aug_df (pd.DataFrame): The DataFrame with
      width-augmented patterns.
    """
    print('Performing width augmentation...')

    # Per-pattern row counts are loop-invariant -> compute once, not per row.
    pattern_counts = filteredPatternDf['Chart Pattern'].value_counts()
    max_num_of_rows_for_pattern = pattern_counts.max()

    # Collect augmented rows in a list and build the frame once at the end;
    # pd.concat inside the loop is quadratic in the number of rows.
    augmented_rows = []

    for index, row in tqdm(filteredPatternDf.iterrows(), total=len(filteredPatternDf), desc="Processing"):

        symbol = row['Symbol']
        pattern = row['Chart Pattern']

        ohlc_df = pd.read_csv(f'Datasets/OHLC data/{symbol}.csv')
        # Ensure all datetime objects are timezone-naive before comparing.
        ohlc_df['Date'] = pd.to_datetime(ohlc_df['Date']).dt.tz_localize(None)
        start_date = pd.to_datetime(row['Start']).tz_localize(None)
        end_date = pd.to_datetime(row['End']).tz_localize(None)

        ohlc_of_interest = ohlc_df[(ohlc_df['Date'] >= start_date) & (ohlc_df['Date'] <= end_date)]
        data_size = len(ohlc_of_interest)

        if data_size <= 0:
            print(f'No data for {symbol} between {start_date} and {end_date}')
            continue

        # Positional bounds of the pattern inside the full OHLC series
        # (read_csv yields a RangeIndex, so labels double as positions).
        start_index = ohlc_of_interest.index[0]
        end_index = ohlc_of_interest.index[-1]
        min_possible_index = 0
        max_possible_index = len(ohlc_df) - 1

        if make_duplicates:
            # Oversample under-represented patterns towards the majority
            # class size to reduce dataset imbalance.
            number_of_rows_for_pattern = pattern_counts[pattern]
            num_row_diff = (max_num_of_rows_for_pattern - number_of_rows_for_pattern) * 2
            multiplier = math.ceil(num_row_diff / number_of_rows_for_pattern) + 2
            # Random copy count in [1, multiplier); multiplier >= 2 always.
            m = np.random.randint(1, multiplier)
        else:
            m = 1

        for _ in range(m):
            max_aug_len = math.ceil(data_size * aug_len_fraction)
            if max_aug_len < min_aug_len:
                max_aug_len = min_aug_len
            # Guard: np.random.randint(low, high) needs high > low; with
            # max_aug_len == 1 the unguarded call raises ValueError.
            high = max(max_aug_len, 2)
            aug_len_l = np.random.randint(1, high)
            aug_len_r = np.random.randint(1, high)

            # Widen the window on both sides, clipped to the available data.
            start_index_aug = max(start_index - aug_len_l, min_possible_index)
            end_index_aug = min(end_index + aug_len_r, max_possible_index)

            # Create a new row covering the widened date range.
            new_row = row.copy()
            new_row['Start'] = ohlc_df.iloc[start_index_aug]['Date']
            new_row['End'] = ohlc_df.iloc[end_index_aug]['Date']
            augmented_rows.append(new_row)

        if keep_original:
            # Keep the original (unaugmented) row too.
            augmented_rows.append(row)

    if augmented_rows:
        filteredPattern_width_aug_df = pd.DataFrame(augmented_rows).reset_index(drop=True)
    else:
        filteredPattern_width_aug_df = pd.DataFrame(columns=filteredPatternDf.columns)

    return filteredPattern_width_aug_df
|
| 391 |
+
|
| 392 |
+
def normalize_ohlc_len(df, target_len=30 , plot_count= 0):
    """
    Resample every instance of a MultiIndexed OHLCV frame to a fixed length.

    # Input:
    - df (pd.DataFrame): MultiIndex (instance, step) frame with 'Open',
      'High', 'Low', 'Close', 'Volume' and optionally 'Pattern' columns.
    - target_len (int): Number of rows every instance is interpolated to.
    - plot_count (int): How many randomly chosen instances to plot
      (original vs normalized) for visual inspection.

    # Returns:
    - pd.DataFrame: MultiIndex frame where each instance has exactly
      target_len rows and OHLC invariants (High max, Low min, positive) hold.
    """
    instances_list = df.index.get_level_values(0).unique()
    normalized_df_list = []

    # Pick the instances to plot once, up front; resolving membership from a
    # set avoids re-indexing instances_list on every loop iteration.
    random_indices = np.random.choice(len(instances_list), plot_count, replace=False)
    instances_to_plot = set(instances_list[random_indices])

    for instance in instances_list:

        pattern_df = df.loc[instance].copy()
        new_data = {}
        orig_indices = pattern_df.index.values
        new_indices = np.linspace(0, len(orig_indices) - 1, target_len)

        # The interpolation order depends only on the series length, so
        # choose it once per instance instead of once per column.
        n_points = len(orig_indices)
        if n_points >= 4:        # enough points for cubic
            kind = 'cubic'
        elif n_points == 3:      # can use quadratic
            kind = 'quadratic'
        elif n_points == 2:      # can use linear
            kind = 'linear'
        else:                    # not enough points, use nearest
            kind = 'nearest'

        # Interpolate all numerical columns onto the target grid.
        for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
            f = interpolate.interp1d(np.arange(n_points), pattern_df[col].values,
                                     kind=kind, bounds_error=False, fill_value='extrapolate')
            new_data[col] = f(new_indices)

        # Keep all OHLC values positive; cubic extrapolation can overshoot
        # below zero, so clamp to a small positive value instead.
        for col in ['Open', 'High', 'Low', 'Close']:
            new_data[col] = np.maximum(new_data[col], 0.001)

        # Restore OHLC invariants in one vectorized pass:
        # High is the bar maximum, Low the bar minimum.
        new_data['High'] = np.maximum.reduce([new_data['High'], new_data['Open'], new_data['Close']])
        new_data['Low'] = np.minimum.reduce([new_data['Low'], new_data['Open'], new_data['Close']])

        # Categorical data: nearest-neighbour resampling keeps labels intact.
        if 'Pattern' in pattern_df.columns:
            f = interpolate.interp1d(np.arange(n_points), pattern_df['Pattern'].values,
                                     kind='nearest', bounds_error=False, fill_value=pattern_df['Pattern'].iloc[0])
            new_data['Pattern'] = f(new_indices)

        result_df = pd.DataFrame(new_data)
        result_df.index = pd.MultiIndex.from_product([[instance], result_df.index])
        normalized_df_list.append(result_df)

        if instance in instances_to_plot:
            # Plot original vs normalized segment for visual sanity checking.
            plot_ohlc_segment(pattern_df)
            plot_ohlc_segment(result_df)

    # Guard the empty case: pd.concat([]) raises ValueError.
    if not normalized_df_list:
        return pd.DataFrame()
    return pd.concat(normalized_df_list, axis=0)
|
| 455 |
+
|
| 456 |
+
# Feature columns, label column, and the fixed per-instance series length
# expected by ROCKET-style classifiers.
features = ['Open', 'High', 'Low', 'Close', 'Volume']
target = 'Pattern'
series_length = 100

def prepare_rocket_data(dataset, features = features, target = target, series_length = series_length):
    """
    Build (X, y) arrays for ROCKET-style classifiers from a MultiIndexed frame.

    Each instance (level-0 index group) is truncated or zero-padded to
    series_length rows, then stacked into X of shape
    (n_instances, n_features, series_length). y holds the first target value
    of each instance, in the same (sorted) instance order as X.
    """
    def _fixed_length(group):
        # Truncate long series; zero-pad short ones to series_length rows.
        values = group[features].values
        if len(values) >= series_length:
            return values[:series_length]
        pad_rows = np.zeros((series_length - len(values), values.shape[1]))
        return np.vstack([values, pad_rows])

    grouped = dataset.groupby(level=0)
    per_instance = grouped.apply(_fixed_length)

    # (n_instances, series_length, n_features) -> (n_instances, n_features, series_length)
    X = np.stack(per_instance.values).transpose(0, 2, 1)

    y = grouped[target].first().values
    return X, y
|