younadi commited on
Commit
8c274bd
·
1 Parent(s): 002ca5d

Updated create_dataset.py in d_18_02_2026_14_43_50_historic_netball/ and generated the corresponding dataset

Browse files
datasets/d_18_02_2026_14_43_50_historic_netball/create_dataset.py ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import itertools
import json
import math
import os
import time

import numpy as np
import tqdm
7
+
8
+
9
def generate_random_pfsp_instance(nb_jobs, nb_machines, time_min, time_max, seed=97):
    # TODO: add the possibility to simply load an instance from a file
    """
    Generates a random instance of the Permutation Flow Shop Problem (PFSP).

    Parameters:
    - nb_jobs: Number of jobs (n).
    - nb_machines: Number of machines (m).
    - time_min: Minimum processing time for any job on any machine.
    - time_max: Maximum processing time for any job on any machine.
    - seed: Random seed for reproducibility; pass None to leave the global
      numpy RNG state untouched.
    Returns:
    - A 2D numpy array of shape (nb_jobs x nb_machines) where each entry is a
      random processing time between time_min and time_max (inclusive).
    """
    if seed is not None:
        np.random.seed(seed)
    # randint's upper bound is exclusive, hence time_max + 1
    return np.random.randint(time_min, time_max + 1, size=(nb_jobs, nb_machines))
23
+
24
+
25
def fit_palmer(pfsp_instance: np.ndarray):
    """
    Implements Palmer's heuristic (slope index) for the flowshop scheduling
    problem. Returns a schedule and its corresponding makespan.
    For now this wraps an old Palmer implementation; it should eventually be
    refactored to be cleaner and more efficient.

    Parameters:
    - pfsp_instance: A 2D numpy array where pfsp_instance[i][j] is the processing time of job i on machine j.
    Returns:
    - A tuple (schedule, makespan) where:
        - schedule: np.int32 array of job indices representing the order of jobs (e.g., [0, 2, 1]).
        - makespan: np.int32 total completion time for the returned schedule.
    """

    # =====================================================================================
    # Legacy Palmer implementation, wrapped here until it gets refactored.
    class Palmer:
        def __init__(self, jobs_list: list):
            self.jobs_list = jobs_list
            self.nb_jobs = len(jobs_list)
            self.nb_machines = len(jobs_list[0])
            self.seq_star = None        # best sequence found by optim()
            self.make_span_star = None  # makespan of seq_star

        # Completion times of `job` on every machine, given the completion
        # times of the previously scheduled job (classic flowshop recursion).
        def cumulate(self, job: list, previous_cumul=None):
            res = [0] * len(job)
            if previous_cumul is None:
                res[0] = job[0]
                for i in range(1, len(job)):
                    res[i] = res[i - 1] + job[i]
            else:
                res[0] = previous_cumul[0] + job[0]
                for i in range(1, len(job)):
                    res[i] = max(res[i - 1], previous_cumul[i]) + job[i]
            return res

        # Completion times after running a whole sequence of jobs (quadratic if
        # called per candidate; the heuristic only calls it once at the end).
        def cumulate_seq(self, seq: list):
            cumulated = None
            for i in seq:
                cumulated = self.cumulate(self.jobs_list[i], cumulated)
            return cumulated

        # Compute Palmer slope indices and order jobs by decreasing index.
        def optim(self, debug=False):
            jobs_weights = []
            for i, job in enumerate(self.jobs_list):
                weight = 0
                for j in range(self.nb_machines):
                    if debug:
                        print(
                            f">job {i} mach {j} first term: {(2*(j+1) - 1) - self.nb_machines}"
                        )
                        print(f">job {i} mach {j} second term: {job[j]}")
                        print(
                            "------------------------------------------------------------------"
                        )
                    # Palmer slope index: machines later in the line get
                    # positive coefficients, earlier ones negative.
                    weight += ((2 * (j + 1) - 1) - self.nb_machines) * job[j]
                if debug:
                    print(f"===>> job {i} weight: {weight}")
                jobs_weights.append((weight, i))

            # Descending tuple sort; ties on weight break toward the higher job index.
            self.seq_star = [tu[1] for tu in sorted(jobs_weights, reverse=True)]
            self.make_span_star = self.cumulate_seq(self.seq_star)[-1]

            return (self.seq_star, self.make_span_star)

    # =====================================================================================

    # Interfacing with the underlying old palmer code
    jobs_list = pfsp_instance.tolist()
    palmer_schedule, palmer_makespan = Palmer(jobs_list).optim()

    # Returning the schedule and makespan as numpy values of type int32
    return np.array(palmer_schedule, dtype=np.int32), np.int32(palmer_makespan)
102
+
103
+
104
def fit_cds(pfsp_instance: np.ndarray):
    """
    Implements the CDS (Campbell, Dudek and Smith) heuristic for the flowshop
    scheduling problem. Returns a schedule and its corresponding makespan.
    For now this wraps an old CDS implementation; it should eventually be
    refactored to be cleaner and more efficient.

    Parameters:
    - pfsp_instance: A 2D numpy array where pfsp_instance[i][j] is the processing time of job i on machine j.
    Returns:
    - A tuple (schedule, makespan) where:
        - schedule: np.int32 array of job indices representing the order of jobs (e.g., [0, 2, 1]).
        - makespan: np.int32 total completion time for the returned schedule.
    """

    # =====================================================================================
    # Completion times of `job` on every machine, given the completion times
    # of the previously scheduled job (classic flowshop recursion).
    def cumulate(job, previous_cumul=None):
        res = [0] * len(job)
        if previous_cumul is None:
            res[0] = job[0]
            for i in range(1, len(job)):
                res[i] = res[i - 1] + job[i]
        else:
            res[0] = previous_cumul[0] + job[0]
            for i in range(1, len(job)):
                res[i] = max(res[i - 1], previous_cumul[i]) + job[i]
        return res

    # Completion times after running a whole sequence of jobs.
    def cumulate_seq(seq, jobs_list):
        cumulated = None
        for i in seq:
            cumulated = cumulate(jobs_list[i], cumulated)
        return cumulated

    # Makespan of a sequence: completion time on the last machine.
    def makespan(sequence, job_list):
        return cumulate_seq(sequence, job_list)[-1]

    # Johnson's algorithm for the two-machine flow shop.
    def johnson_algorithm(matrix):
        machines = [[], []]

        # Jobs faster on machine 1 are scheduled first; the rest last.
        for i in range(matrix.shape[0]):
            if matrix[i][0] < matrix[i][1]:  # if time(m1) < time(m2)
                machines[0].append((matrix[i][0], i))
            else:
                machines[1].append((matrix[i][1], i))

        # Ascending sort for the first machine, descending for the second
        machines[0] = sorted(machines[0], key=lambda x: x[0])
        machines[1] = sorted(machines[1], key=lambda x: x[0], reverse=True)

        # Merging the two sorted lists yields the optimal 2-machine sequence
        merged = machines[0] + machines[1]
        return [index for _, index in merged]

    # Applies Johnson's algorithm on the 2-machine surrogate and evaluates the
    # resulting sequence on the *full* instance.
    def johnson(job_matrix, data_matrix):
        sequence = johnson_algorithm(job_matrix)
        return sequence, makespan(sequence, data_matrix)

    # CDS heuristic: build m-1 two-machine surrogate problems and keep the best.
    def cds_heuristic(matrix):
        m = matrix.shape[1]
        best_makespan = float("inf")
        best_sequences = []

        for i in range(1, m):
            # Surrogate machine 1 = first i machines, surrogate machine 2 = last i.
            machine_subset_1 = matrix[:, :i].sum(axis=1)
            machine_subset_2 = matrix[:, -i:].sum(axis=1)
            job_matrix = np.column_stack((machine_subset_1, machine_subset_2))

            # Apply Johnson's algorithm to the surrogate and evaluate the makespan
            sequence, makespan_value = johnson(job_matrix, matrix)

            # Track the best makespan and every sequence achieving it
            if makespan_value < best_makespan:
                best_makespan = makespan_value
                best_sequences = [sequence]
            elif makespan_value == best_makespan:
                best_sequences.append(sequence)

        return best_sequences[0], best_makespan

    # =====================================================================================

    # Interfacing with the underlying old cds code
    cds_schedule, cds_makespan = cds_heuristic(pfsp_instance)

    # Returning the schedule and makespan as numpy values of type int32
    return np.array(cds_schedule, dtype=np.int32), np.int32(cds_makespan)
207
+
208
+
209
def fit_neh(pfsp_instance: np.ndarray):
    """
    Implements the NEH (Nawaz, Enscore, Ham) heuristic for the flowshop
    scheduling problem. Returns a schedule and its corresponding makespan.
    For now this wraps an old NEH implementation; it should eventually be
    refactored to be cleaner and more efficient.

    Parameters:
    - pfsp_instance: A 2D numpy array where pfsp_instance[i][j] is the processing time of job i on machine j.
    Returns:
    - A tuple (schedule, makespan) where:
        - schedule: np.int32 array of job indices representing the order of jobs (e.g., [0, 2, 1]).
        - makespan: np.int32 total completion time for the returned schedule.
    Raises:
    - ValueError: if the instance has fewer than 2 jobs.
    """

    # =====================================================================================
    # Legacy NEH implementation, wrapped here until it gets refactored.
    class Inst:
        # Lightweight container expected by the legacy NEH code.
        # Note: `matrix` is machine-major, i.e. matrix[machine][job].
        def __init__(
            self,
            jobs: int,
            machines: int,
            seed: int,
            ub: int,
            lb: int,
            matrix: list[list[int]],
        ):
            self.jobs = jobs
            self.machines = machines
            self.seed = seed  # kept for legacy interface compatibility; unused here
            self.ub = ub      # upper-bound placeholder; unused here
            self.lb = lb      # lower-bound placeholder; unused here
            self.matrix = matrix

        def __repr__(self) -> str:
            return f"Inst(jobs={self.jobs}, machines={self.machines}, seed={self.seed}, ub={self.ub}, lb={self.lb}, matrix={self.matrix})"

    class NEH:
        def __init__(self, instance: Inst, debug: bool = False):
            self.instance = instance
            self.debug = debug

        # Total processing time of `job` across all machines.
        def calculate_sj(self, job: int) -> int:
            sj = 0
            for machine in range(self.instance.machines):
                sj += self.instance.matrix[machine][job]
            return sj

        # NOTE(review): canonical NEH sorts jobs by *decreasing* total time;
        # this legacy code sorts ascending by default. Kept as-is so previously
        # generated datasets remain reproducible — confirm before changing.
        def sort_jobs(self, reverse: bool = False) -> list[int]:
            return sorted(
                range(self.instance.jobs),
                key=lambda job: self.calculate_sj(job),
                reverse=reverse,
            )

        # Simulates the flowshop for a (partial) job order and returns the
        # per-machine completion times; the last entry is the makespan.
        def emulate(self, jobs: list[int]) -> list[int]:
            machines_exec = [0] * self.instance.machines
            for job in jobs:
                for current_machine in range(self.instance.machines):
                    # Add the job's execution time on the current machine
                    machines_exec[current_machine] += self.instance.matrix[
                        current_machine
                    ][job]

                    # Downstream machines cannot start before this one finishes
                    for machine in range(current_machine + 1, self.instance.machines):
                        machines_exec[machine] = max(
                            machines_exec[current_machine], machines_exec[machine]
                        )

            return machines_exec

        def calculate_cmax(self, jobs: list[int]) -> int:
            return self.emulate(jobs)[-1]

        # Evaluates candidate orders and keeps the one with the smallest Cmax
        # (first one wins on ties).
        def get_best_order(self, orders: list[list[int]]) -> tuple[int, list[int]]:
            min_cmax = float("inf")
            min_order = None
            for order in orders:
                cmax = self.calculate_cmax(order)
                if cmax < min_cmax:
                    min_cmax = cmax
                    min_order = order

            return min_cmax, min_order

        # Tries inserting `job` at every position of `order`; keeps the best.
        def get_best_position(
            self, order: list[int], job: int
        ) -> tuple[int, list[int]]:
            possible_orders = [
                order[:pos] + [job] + order[pos:] for pos in range(len(order) + 1)
            ]
            return self.get_best_order(possible_orders)

        def __call__(self) -> tuple[int, list[int]]:
            if self.instance.jobs < 2:
                # Message fixed: the check is `< 2`, so two jobs are allowed.
                raise ValueError("Number of jobs must be at least 2")

            # Seed the partial order with the best arrangement of the first two jobs
            sorted_jobs = self.sort_jobs()
            current_cmax, current_order = self.get_best_order(
                [sorted_jobs[:2], sorted_jobs[:2][::-1]]
            )

            if self.debug:
                print(current_cmax, current_order)

            if self.instance.jobs == 2:
                return current_cmax, current_order

            # Greedily insert each remaining job at its best position
            for job in sorted_jobs[2:]:
                current_cmax, current_order = self.get_best_position(current_order, job)
                if self.debug:
                    print(current_cmax, current_order)

            return current_cmax, current_order

    # =====================================================================================

    # Interfacing with the underlying old neh code, which expects a
    # machine-major matrix — hence the transpose.
    neh_instance = Inst(
        pfsp_instance.shape[0],
        pfsp_instance.shape[1],
        seed=0,
        ub=0,
        lb=0,
        matrix=pfsp_instance.T.tolist(),
    )
    neh_makespan, neh_schedule = NEH(neh_instance)()

    # Returning the schedule and makespan as numpy values of type int32
    return np.array(neh_schedule, dtype=np.int32), np.int32(neh_makespan)
340
+
341
+
342
+ def evaluate_makespan(pfsp_instance, schedule):
343
+ """
344
+ Evaluates the makespan (completion time) of a given schedule for a given pfsp_instance.
345
+ Parameters:
346
+ - pfsp_instance: A list of lists, where pfsp_instance[i][j] is the processing time of job i on machine j.
347
+ - schedule: A list/tuple indicating the order of jobs (e.g., [0, 2, 1]).
348
+ Returns:
349
+ - The makespan (total completion time) for the given schedule.
350
+ """
351
+
352
+ def cumulate(job: list, previous_cumul=None):
353
+ # Calculate the cumulative completion times for a job
354
+
355
+ res = [0] * len(job)
356
+ if previous_cumul == None:
357
+ res[0] = job[0]
358
+ for i in range(1, len(job)):
359
+ res[i] = res[i - 1] + job[i]
360
+ else:
361
+ res[0] = previous_cumul[0] + job[0]
362
+ for i in range(1, len(job)):
363
+ res[i] = max(res[i - 1], previous_cumul[i]) + job[i]
364
+ return res
365
+
366
+ def cumulate_seq(pfsp_instance: list, schedule: list):
367
+ # Calculates the cumulative time for a sequence of jobs on machines.
368
+
369
+ cumulated = None
370
+ for i in schedule:
371
+ cumulated = cumulate(pfsp_instance[i], cumulated)
372
+ return cumulated
373
+
374
+ cumulative = cumulate_seq(pfsp_instance, schedule)
375
+ return cumulative[-1]
376
+
377
+
378
def create_dataset(
    pfsp_instance,
    nb_base_samples,
    duplication_factor=0.0,
    init_type="random",
    data_folder_location="./",
    data_folder_name=None,
    seed=97
):
    """
    Generates a dataset of (schedule, makespan) pairs for a PFSP instance and
    persists it to disk (memmapped binaries + the instance + JSON metadata).

    Parameters:
    - pfsp_instance: 2D numpy array (nb_jobs x nb_machines) of processing times.
    - nb_base_samples: Number of base samples to generate; ignored and replaced
      by nb_jobs! when init_type == "exhaustive".
    - duplication_factor: Fraction of extra samples obtained by duplicating
      randomly chosen base samples (0.1 => 10% more rows).
    - init_type: One of "exhaustive", "cds", "palmer", "neh", "heuristics", "random".
    - data_folder_location: Parent directory for the dataset folder.
    - data_folder_name: Optional folder name (prefixed with "ftd_"); a
      timestamped name is generated when None.
    - seed: Random seed for reproducibility (None leaves the RNG untouched).
    Returns:
    - (schedules, makespans): the two memmapped numpy arrays (int32).
    Raises:
    - ValueError: for an unknown init_type.
    """
    if init_type == "exhaustive":
        # Exhaustive enumeration fixes the number of base samples to nb_jobs!
        nb_base_samples = math.factorial(pfsp_instance.shape[0])

    nb_samples = nb_base_samples + int(nb_base_samples * duplication_factor)

    if seed is not None:
        np.random.seed(seed)

    def perturb_schedule(schedule):
        # Copy explicitly: slicing a numpy array (schedule[:]) returns a *view*,
        # so swapping in a view would silently mutate the base schedule and make
        # perturbations compound across calls instead of being single swaps.
        perturbed_schedule = np.array(schedule)
        i, j = np.random.choice(perturbed_schedule.shape[0], size=2, replace=False)
        perturbed_schedule[[i, j]] = perturbed_schedule[[j, i]]
        return perturbed_schedule, evaluate_makespan(pfsp_instance, perturbed_schedule)

    def fill_from_base(base_schedule, base_makespan, start, count, desc):
        # Writes the base sample at index `start`, then count-1 single-swap
        # perturbations of it. No-op when count <= 0 (avoids the unbound-index
        # bug of the previous bookkeeping for tiny nb_base_samples).
        if count <= 0:
            return
        schedules[start] = base_schedule
        makespans[start] = base_makespan
        for i in tqdm.tqdm(range(start + 1, start + count), desc=desc):
            schedules[i], makespans[i] = perturb_schedule(base_schedule)

    # Create the folder if it doesn't exist
    if data_folder_name is None:
        data_folder_name = f"ftd_{time.strftime('%d_%m_%Y_%H_%M_%S')}"
    else:
        data_folder_name = f"ftd_{data_folder_name}"
    data_path = os.path.join(data_folder_location, data_folder_name)
    os.makedirs(data_path, exist_ok=True)

    # Memmapped outputs so very large datasets do not have to fit in RAM
    nb_jobs = pfsp_instance.shape[0]
    schedules = np.memmap(os.path.join(data_path, "schedules.bin"), dtype=np.int32, mode='w+', shape=(nb_samples, nb_jobs))
    makespans = np.memmap(os.path.join(data_path, "makespans.bin"), dtype=np.int32, mode='w+', shape=(nb_samples,))

    # Save the pfsp instance as a numpy file
    np.save(os.path.join(data_path, "pfsp_instance.npy"), pfsp_instance)

    # Create a metadata dictionary and save it as a json file
    metadata_dict = {
        "nb_base_samples": nb_base_samples,
        "duplication_factor": duplication_factor,
        "nb_samples": nb_samples,
        "nb_jobs": nb_jobs,
        "nb_machines": pfsp_instance.shape[1],
        "init_type": init_type,
        "data_path": data_path,
        "seed": seed,
        "date_time": time.strftime('%d_%m_%Y_%H_%M_%S')
    }

    with open(os.path.join(data_path, "metadata.json"), "w") as f:
        json.dump(metadata_dict, f, indent=4)

    if init_type == "exhaustive":
        # Enumerate every permutation of the jobs
        for i, schedule in tqdm.tqdm(enumerate(itertools.permutations(range(nb_jobs))), total=math.factorial(nb_jobs)):
            schedules[i] = schedule
            makespans[i] = evaluate_makespan(pfsp_instance, schedule)

    elif init_type == "cds":
        cds_schedule, cds_makespan = fit_cds(pfsp_instance)
        fill_from_base(cds_schedule, cds_makespan, 0, nb_base_samples, "Generating CDS samples")

    elif init_type == "palmer":
        palmer_schedule, palmer_makespan = fit_palmer(pfsp_instance)
        fill_from_base(palmer_schedule, palmer_makespan, 0, nb_base_samples, "Generating Palmer samples")

    elif init_type == "neh":
        neh_schedule, neh_makespan = fit_neh(pfsp_instance)
        fill_from_base(neh_schedule, neh_makespan, 0, nb_base_samples, "Generating NEH samples")

    elif init_type == "heuristics":
        # Split the base samples roughly evenly across the three heuristics;
        # explicit offsets replace the previous error-prone `i += 1` chain.
        cds_size = nb_base_samples // 3
        palmer_size = nb_base_samples // 3
        neh_size = nb_base_samples - cds_size - palmer_size
        cds_schedule, cds_makespan = fit_cds(pfsp_instance)
        fill_from_base(cds_schedule, cds_makespan, 0, cds_size, "Generating CDS heuristic samples")
        palmer_schedule, palmer_makespan = fit_palmer(pfsp_instance)
        fill_from_base(palmer_schedule, palmer_makespan, cds_size, palmer_size, "Generating Palmer heuristic samples")
        neh_schedule, neh_makespan = fit_neh(pfsp_instance)
        fill_from_base(neh_schedule, neh_makespan, cds_size + palmer_size, neh_size, "Generating NEH heuristic samples")

    elif init_type == "random":
        for i in tqdm.tqdm(range(nb_base_samples), desc="Generating Random samples"):
            schedule = np.random.permutation(pfsp_instance.shape[0])
            schedules[i] = schedule
            makespans[i] = evaluate_makespan(pfsp_instance, schedule)

    else:
        raise ValueError("Invalid initialization type")

    # Add the duplicated samples: sample with repetition from the base samples
    duplicated_schedules_idx = np.random.choice(nb_base_samples, size=nb_samples - nb_base_samples, replace=True)
    schedules[nb_base_samples:nb_samples] = schedules[duplicated_schedules_idx]
    makespans[nb_base_samples:nb_samples] = makespans[duplicated_schedules_idx]

    # Flush to disk and return the memmaps
    schedules.flush()
    makespans.flush()
    return schedules, makespans
494
+
495
+
496
+ if __name__ == "__main__":
497
+ # Parse arguments and call create_dataset with the appropriate parameters
498
+ import argparse
499
+ import time
500
+ parser = argparse.ArgumentParser(description="Create a dataset for the flowshop scheduling problem")
501
+ parser.add_argument("--nb_jobs", type=int, default=4, help="Number of jobs")
502
+ parser.add_argument("--nb_machines", type=int, default=2, help="Number of machines")
503
+ parser.add_argument("--time_min", type=int, default=1, help="Minimum processing time")
504
+ parser.add_argument("--time_max", type=int, default=100, help="Maximum processing time")
505
+ parser.add_argument("--nb_base_samples", type=int, default=1000, help="Number of base samples to generate before duplication")
506
+ parser.add_argument("--duplication_factor", type=float, default=0.1, help="Factor by which to duplicate the base samples (e.g., 0.5 means 50% more samples will be created by duplicating the base samples)")
507
+ parser.add_argument("--init_type", type=str, default="exhaustive", choices=["exhaustive", "cds", "palmer", "neh", "heuristics", "random"], help="Initialization type for the base samples")
508
+ parser.add_argument("--data_folder_location", type=str, default="./", help="Location where the dataset folder will be created")
509
+ parser.add_argument("--data_folder_name", type=str, default=None, help="Name of the dataset folder (if not provided, a name with the current date and time will be generated)")
510
+ parser.add_argument("--seed", type=int, default=97, help="Random seed for reproducibility (set to None for no seeding)")
511
+ args = parser.parse_args()
512
+ pfsp_instance = generate_random_pfsp_instance(args.nb_jobs, args.nb_machines, args.time_min, args.time_max, seed=args.seed)
513
+ schedules, makespans = create_dataset(
514
+ pfsp_instance=pfsp_instance,
515
+ nb_base_samples=args.nb_base_samples,
516
+ duplication_factor=args.duplication_factor,
517
+ init_type=args.init_type,
518
+ data_folder_location=args.data_folder_location,
519
+ data_folder_name=args.data_folder_name,
520
+ seed=args.seed
521
+ )
datasets/d_18_02_2026_14_43_50_historic_netball/d_desc.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset ID
2
+
3
+ d_18_02_2026_14_43_50_historic_netball
4
+
5
+ # Dataset Description
6
+
7
+ This dataset focuses on ...
8
+
9
+ # Dataset Tags
10
+
11
+ - d:sports
12
+ - ...
datasets/d_18_02_2026_14_43_50_historic_netball/ftd_20_02_2026_00_58_13/makespans.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbb6c637a3c570e3bf9a72d5e1b3b53ed61c156f23a3d66fe62e67c52fd44ea9
3
+ size 22176
datasets/d_18_02_2026_14_43_50_historic_netball/ftd_20_02_2026_00_58_13/metadata.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nb_base_samples": 5040,
3
+ "duplication_factor": 0.1,
4
+ "nb_samples": 5544,
5
+ "nb_jobs": 7,
6
+ "nb_machines": 2,
7
+ "init_type": "exhaustive",
8
+ "data_path": "./ftd_20_02_2026_00_58_13",
9
+ "seed": 97,
10
+ "date_time": "20_02_2026_00_58_13"
11
+ }
datasets/d_18_02_2026_14_43_50_historic_netball/ftd_20_02_2026_00_58_13/pfsp_instance.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aec110fb8e0df28666bd9adbde900bd67199d34c33b2f060a44a2d5746b4b82b
3
+ size 240
datasets/d_18_02_2026_14_43_50_historic_netball/ftd_20_02_2026_00_58_13/schedules.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a5221ad691e9eb37fdd730e3c66845c23fed5e15008f5ed09b8d62e7edd66a
3
+ size 155232
datasets/d_18_02_2026_14_43_50_historic_netball/script.bash ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Generate the exhaustive 7-job / 2-machine dataset in the current directory.
# Fixes: a space is required before each continuation backslash (otherwise the
# token and the next line can glue together), and the final line must not end
# with a dangling backslash.
python create_dataset.py --nb_jobs 7 \
    --nb_machines 2 \
    --time_min 1 \
    --time_max 10 \
    --nb_base_samples 1000 \
    --duplication_factor 0.1 \
    --init_type exhaustive \
    --data_folder_location ./
utils/create_dataset.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import json
4
+
5
+
6
def generate_random_pfsp_instance(nb_jobs, nb_machines, time_min, time_max, seed=None):
    """
    Generates a random instance of the Permutation Flow Shop Problem (PFSP).

    Parameters:
    - nb_jobs: Number of jobs (n).
    - nb_machines: Number of machines (m).
    - time_min: Minimum processing time for any job on any machine.
    - time_max: Maximum processing time for any job on any machine.
    - seed: Optional random seed; the default None preserves the previous
      behavior of leaving the global numpy RNG state untouched.
    Returns:
    - A 2D numpy array of shape (nb_jobs x nb_machines) where each entry is a
      random processing time between time_min and time_max (inclusive).
    """
    if seed is not None:
        np.random.seed(seed)
    # randint's upper bound is exclusive, hence time_max + 1
    return np.random.randint(time_min, time_max + 1, size=(nb_jobs, nb_machines))
18
+
19
+
20
def fit_palmer(pfsp_instance: np.ndarray):
    """
    Implements Palmer's heuristic (slope index) for the flowshop scheduling
    problem. Returns a schedule and its corresponding makespan.
    For now this wraps an old Palmer implementation; it should eventually be
    refactored to be cleaner and more efficient.

    Parameters:
    - pfsp_instance: A 2D numpy array where pfsp_instance[i][j] is the processing time of job i on machine j.
    Returns:
    - A tuple (schedule, makespan) where:
        - schedule: np.int32 array of job indices representing the order of jobs (e.g., [0, 2, 1]).
        - makespan: np.int32 total completion time for the returned schedule.
    """

    # =====================================================================================
    # Legacy Palmer implementation, wrapped here until it gets refactored.
    class Palmer:
        def __init__(self, jobs_list: list):
            self.jobs_list = jobs_list
            self.nb_jobs = len(jobs_list)
            self.nb_machines = len(jobs_list[0])
            self.seq_star = None        # best sequence found by optim()
            self.make_span_star = None  # makespan of seq_star

        # Completion times of `job` on every machine, given the completion
        # times of the previously scheduled job (classic flowshop recursion).
        def cumulate(self, job: list, previous_cumul=None):
            res = [0] * len(job)
            if previous_cumul is None:
                res[0] = job[0]
                for i in range(1, len(job)):
                    res[i] = res[i - 1] + job[i]
            else:
                res[0] = previous_cumul[0] + job[0]
                for i in range(1, len(job)):
                    res[i] = max(res[i - 1], previous_cumul[i]) + job[i]
            return res

        # Completion times after running a whole sequence of jobs.
        def cumulate_seq(self, seq: list):
            cumulated = None
            for i in seq:
                cumulated = self.cumulate(self.jobs_list[i], cumulated)
            return cumulated

        # Compute Palmer slope indices and order jobs by decreasing index.
        def optim(self, debug=False):
            jobs_weights = []
            for i, job in enumerate(self.jobs_list):
                weight = 0
                for j in range(self.nb_machines):
                    if debug:
                        print(
                            f">job {i} mach {j} first term: {(2*(j+1) - 1) - self.nb_machines}"
                        )
                        print(f">job {i} mach {j} second term: {job[j]}")
                        print(
                            "------------------------------------------------------------------"
                        )
                    # Palmer slope index: machines later in the line get
                    # positive coefficients, earlier ones negative.
                    weight += ((2 * (j + 1) - 1) - self.nb_machines) * job[j]
                if debug:
                    print(f"===>> job {i} weight: {weight}")
                jobs_weights.append((weight, i))

            # Descending tuple sort; ties on weight break toward the higher job index.
            self.seq_star = [tu[1] for tu in sorted(jobs_weights, reverse=True)]
            self.make_span_star = self.cumulate_seq(self.seq_star)[-1]

            return (self.seq_star, self.make_span_star)

    # =====================================================================================

    # Interfacing with the underlying old palmer code
    jobs_list = pfsp_instance.tolist()
    palmer_schedule, palmer_makespan = Palmer(jobs_list).optim()

    # Returning the schedule and makespan as numpy values of type int32
    return np.array(palmer_schedule, dtype=np.int32), np.int32(palmer_makespan)
97
+
98
+
99
+ def fit_cds(pfsp_instance: np.ndarray):
100
+ """
101
+ Implements CDS heuristic for the flowshop scheduling problem. Returns a schedule and its corresponding makespan.
102
+ For now I am using an old code that performs cds by interfacing with it, but it should be refactored to be cleaner and more efficient.
103
+ Parameters:
104
+ - pfsp_instance: A 2D numpy array where pfsp_instance[i][j] is the processing time of job i on machine j.
105
+ Returns:
106
+ - A tuple (schedule, makespan) where:
107
+ - schedule: A list of job indices representing the order of jobs (e.g., [0, 2, 1]).
108
+ - makespan: The total completion time for the given schedule.
109
+ """
110
+
111
+ # =====================================================================================
112
+ # Function to cumulate job processing times
113
+ def cumulate(job, previous_cumul=None):
114
+ res = [0] * len(job)
115
+ if previous_cumul is None:
116
+ res[0] = job[0]
117
+ for i in range(1, len(job)):
118
+ res[i] = res[i - 1] + job[i]
119
+ else:
120
+ res[0] = previous_cumul[0] + job[0]
121
+ for i in range(1, len(job)):
122
+ res[i] = max(res[i - 1], previous_cumul[i]) + job[i]
123
+ return res
124
+
125
+ # Function to cumulate processing times for a given sequence of jobs
126
+ def cumulate_seq(seq, jobs_list):
127
+ cumulated = None
128
+ for i in seq:
129
+ cumulated = cumulate(jobs_list[i], cumulated)
130
+ return cumulated
131
+
132
+ # Function to compute the makespan given a sequence of jobs and the job list
133
+ def makespan(sequence, job_list):
134
+ return cumulate_seq(sequence, job_list)[-1]
135
+
136
+ # Function to perform the Johnson's algorithm for the flow shop problem
137
+ def johnson_algorithm(matrix):
138
+ n = matrix.shape[0]
139
+ sequence = []
140
+ machines = [[], []]
141
+
142
+ # Preprocessing to determine the order of jobs
143
+ for i in range(n):
144
+ if matrix[i][0] < matrix[i][1]: # if time(m1) < time(m2)
145
+ machines[0].append((matrix[i][0], i))
146
+ else:
147
+ machines[1].append((matrix[i][1], i))
148
+
149
+ # Sorting jobs for each machine
150
+ machines[0] = sorted(
151
+ machines[0], key=lambda x: x[0]
152
+ ) # ascending sort for the first machine
153
+ machines[1] = sorted(
154
+ machines[1], key=lambda x: x[0], reverse=True
155
+ ) # descending sort for the second machine
156
+
157
+ # Merging the two sorted lists
158
+ merged = machines[0] + machines[1]
159
+
160
+ # Constructing the optimal sequence
161
+ sequence = [index for _, index in merged]
162
+
163
+ return sequence
164
+
165
+ # Function that applies Johnson's algorithm and computes the makespan
166
+ def johnson(job_matrix, data_matrix):
167
+ sequence = johnson_algorithm(job_matrix)
168
+ return sequence, makespan(sequence, data_matrix)
169
+
170
+ # CDS heuristic
171
+ def cds_heuristic(matrix):
172
+ n = matrix.shape[0]
173
+ m = matrix.shape[1]
174
+ best_makespan = float("inf")
175
+ best_sequences = []
176
+
177
+ # Step 1: Generate matrices of all possible job lists
178
+ for i in range(1, m):
179
+ machine_subset_1 = matrix[:, :i].sum(axis=1)
180
+ machine_subset_2 = matrix[:, -i:].sum(axis=1)
181
+ job_matrix = np.column_stack((machine_subset_1, machine_subset_2))
182
+
183
+ # Step 2: Apply Johnson's algorithm to the job matrix abd calculate the makespan
184
+ sequence, makespan_value = johnson(job_matrix, matrix)
185
+
186
+ # Step 3: Update the best makespan and corresponding sequences
187
+ if makespan_value < best_makespan:
188
+ best_makespan = makespan_value
189
+ best_sequences = [sequence]
190
+ elif makespan_value == best_makespan:
191
+ best_sequences.append(sequence)
192
+
193
+ return best_sequences[0], best_makespan
194
+
195
    # =====================================================================================

    # Interfacing with the underlying old cds code: run the CDS heuristic on
    # the raw (jobs x machines) processing-time matrix.
    cds_schedule, cds_makespan = cds_heuristic(pfsp_instance)

    # Returning the schedule and makespan as numpy arrays of type int32
    return np.array(cds_schedule, dtype=np.int32), np.int32(cds_makespan)
202
+
203
+
204
def fit_neh(pfsp_instance: np.ndarray):
    """
    Implements the NEH heuristic for the flowshop scheduling problem. Returns a schedule and its corresponding makespan.
    For now I am using an old code that performs neh by interfacing with it, but it should be refactored to be cleaner and more efficient.
    Parameters:
    - pfsp_instance: A 2D numpy array where pfsp_instance[i][j] is the processing time of job i on machine j.
    Returns:
    - A tuple (schedule, makespan) where:
        - schedule: A numpy int32 array of job indices representing the order of jobs (e.g., [0, 2, 1]).
        - makespan: The total completion time (np.int32) for the given schedule.
    Raises:
    - ValueError: if the instance has fewer than 2 jobs.
    """

    # =====================================================================================
    # Legacy NEH implementation, wrapped below.
    # NOTE: the legacy code stores the matrix transposed (machines x jobs),
    # i.e. matrix[machine][job] — unlike pfsp_instance which is jobs x machines.
    class Inst:
        # Plain container for a PFSP instance in the legacy layout.
        # seed/ub/lb are metadata fields carried over from the legacy format;
        # they are not used by the NEH computation itself.
        def __init__(
            self,
            jobs: int,
            machines: int,
            seed: int,
            ub: int,
            lb: int,
            matrix: list[list[int]],
        ):
            self.jobs = jobs
            self.machines = machines
            self.seed = seed
            self.ub = ub
            self.lb = lb
            self.matrix = matrix

        def __repr__(self) -> str:
            return f"Inst(jobs={self.jobs}, machines={self.machines}, seed={self.seed}, ub={self.ub}, lb={self.lb}, matrix={self.matrix})"

    class NEH:
        def __init__(self, instance: Inst, debug: bool = False):
            self.instance = instance
            self.debug = debug

        def calculate_sj(self, job: int) -> int:
            # Total processing time of `job` across all machines (NEH sort key).
            sj = 0
            for machine in range(self.instance.machines):
                sj += self.instance.matrix[machine][job]
            return sj

        def sort_jobs(self, reverse: bool = False) -> list[int]:
            # Jobs ordered by total processing time, ascending by default.
            # NOTE(review): the classic NEH sorts by DECREASING total time;
            # this legacy code defaults to ascending — kept as-is to preserve
            # the behavior existing datasets were built with.
            return sorted(
                range(self.instance.jobs),
                key=lambda job: self.calculate_sj(job),
                reverse=reverse,
            )

        def emulate(self, jobs: list[int]) -> list[int]:
            # Simulate the flow shop for the given job order; returns the
            # completion time of the schedule on every machine.
            machines_exec = [0] * self.instance.machines
            for job in jobs:
                for current_machine in range(self.instance.machines):
                    # Add jobs execution time to current machine
                    machines_exec[current_machine] += self.instance.matrix[
                        current_machine
                    ][job]

                    # Sync other machines if they are behind current time
                    for machine in range(current_machine + 1, self.instance.machines):
                        machines_exec[machine] = max(
                            machines_exec[current_machine], machines_exec[machine]
                        )

            return machines_exec

        def calculate_cmax(self, jobs: list[int]) -> int:
            # Makespan = completion time on the last machine.
            return self.emulate(jobs)[-1]

        def get_best_order(self, orders: list[list[int]]) -> tuple[int, list[int]]:
            # Evaluate every candidate order; keep the first one with minimal Cmax.
            min_cmax = float("inf")
            min_order = None
            for order in orders:
                cmax = self.calculate_cmax(order)
                if cmax < min_cmax:
                    min_cmax = cmax
                    min_order = order

            return min_cmax, min_order

        def get_best_position(
            self, order: list[int], job: int
        ) -> tuple[int, list[int]]:
            # Try inserting `job` at every possible position of `order` and
            # return the best resulting (cmax, order).
            possible_orders: list[list[int]] = []
            for pos in range(len(order) + 1):
                possible_orders.append(order[:pos] + [job] + order[pos:])

            return self.get_best_order(possible_orders)

        def __call__(self) -> tuple[int, list[int]]:
            if self.instance.jobs < 2:
                # BUGFIX: the message previously said "greater than 2" although
                # the check accepts exactly 2 jobs.
                raise ValueError("Number of jobs must be at least 2")

            # Seed the partial schedule with the better ordering of the first
            # two jobs of the sorted list.
            sorted_jobs = self.sort_jobs()
            current_cmax, current_order = self.get_best_order(
                [sorted_jobs[:2], sorted_jobs[:2][::-1]]
            )

            if self.debug:
                print(current_cmax, current_order)

            if self.instance.jobs == 2:
                return current_cmax, current_order

            # Insert each remaining job at its best position in the partial order.
            for job in sorted_jobs[2:]:
                current_cmax, current_order = self.get_best_position(current_order, job)
                if self.debug:
                    print(current_cmax, current_order)

            return current_cmax, current_order

    # =====================================================================================

    # Interfacing with the underlying old neh code: transpose the instance to
    # the legacy (machines x jobs) layout.
    neh_instance_jobs = pfsp_instance.shape[0]
    neh_instance_machines = pfsp_instance.shape[1]
    neh_instance_matrix = pfsp_instance.T.tolist()
    neh_instance = Inst(
        neh_instance_jobs,
        neh_instance_machines,
        seed=0,
        ub=0,
        lb=0,
        matrix=neh_instance_matrix,
    )
    neh_makespan, neh_schedule = NEH(neh_instance)()

    # Returning the schedule and makespan as numpy arrays of type int32
    return np.array(neh_schedule, dtype=np.int32), np.int32(neh_makespan)
335
+
336
+
337
def evaluate_makespan(pfsp_instance, schedule):
    """
    Evaluates the makespan (completion time) of a given schedule for a given pfsp_instance.
    Parameters:
    - pfsp_instance: A list of lists (or 2D array), where pfsp_instance[i][j] is the processing time of job i on machine j.
    - schedule: A list/tuple indicating the order of jobs (e.g., [0, 2, 1]).
    Returns:
    - The makespan (total completion time) for the given schedule.
    """

    def cumulate(job: list, previous_cumul=None):
        # Calculate the cumulative completion times for a job, chained onto
        # the completion profile of the previously scheduled job (if any).

        res = [0] * len(job)
        # BUGFIX: use `is None` (identity check) instead of `== None` — the
        # equality form is un-idiomatic and misbehaves for array-like inputs
        # (elementwise comparison).
        if previous_cumul is None:
            res[0] = job[0]
            for i in range(1, len(job)):
                res[i] = res[i - 1] + job[i]
        else:
            # Each machine starts when both the previous machine for this job
            # and the previous job on this machine are finished.
            res[0] = previous_cumul[0] + job[0]
            for i in range(1, len(job)):
                res[i] = max(res[i - 1], previous_cumul[i]) + job[i]
        return res

    def cumulate_seq(pfsp_instance: list, schedule: list):
        # Calculates the cumulative time for a sequence of jobs on machines.

        cumulated = None
        for i in schedule:
            cumulated = cumulate(pfsp_instance[i], cumulated)
        return cumulated

    # Makespan = completion time on the last machine.
    cumulative = cumulate_seq(pfsp_instance, schedule)
    return cumulative[-1]
371
+
372
+
373
def create_dataset(
    pfsp_instance,
    nb_samples,
    init_type,
    data_folder_location,
    data_folder_name=None,
    seed=97
):
    """
    Builds a dataset of (schedule, makespan) samples for one PFSP instance and
    writes it to disk.

    Parameters:
    - pfsp_instance: 2D array where pfsp_instance[i][j] is the processing time of job i on machine j.
    - nb_samples: total number of samples to generate.
    - init_type: "cds" / "palmer" / "neh" (one heuristic seed plus random
      swap perturbations of it), "heuristics" (samples split roughly evenly
      across the three heuristics) or "random" (uniform random permutations).
    - data_folder_location: parent directory for the dataset folder.
    - data_folder_name: dataset folder name; defaults to a timestamped name.
    - seed: numpy RNG seed for reproducibility.

    Returns:
    - (schedules, makespans): int32 np.memmap arrays of shapes
      (nb_samples, nb_jobs) and (nb_samples,), flushed to disk.

    Raises:
    - ValueError: if init_type is not one of the supported values.

    Side effects: creates the dataset folder containing schedules.bin,
    makespans.bin, pfsp_instance.npy and metadata.json.
    """
    np.random.seed(seed)

    def perturb_schedule(schedule):
        # Swap two random distinct positions in a COPY of `schedule`.
        # BUGFIX: the original used `schedule[:]`, which on a numpy array is a
        # VIEW — the swap mutated the heuristic's base schedule in place and
        # corrupted every subsequent sample derived from it.
        perturbed_schedule = np.array(schedule)
        i, j = np.random.choice(perturbed_schedule.shape[0], size=2, replace=False)
        perturbed_schedule[[i, j]] = perturbed_schedule[[j, i]]
        return perturbed_schedule, evaluate_makespan(pfsp_instance, perturbed_schedule)

    def fill_segment(start, size, fit_function):
        # Fill rows [start, start + size): the heuristic's own schedule first,
        # followed by (size - 1) single-swap perturbations of it.
        base_schedule, base_makespan = fit_function(pfsp_instance)
        schedules[start] = base_schedule
        makespans[start] = base_makespan
        for row in range(start + 1, start + size):
            schedules[row], makespans[row] = perturb_schedule(base_schedule)

    heuristic_fitters = {"cds": fit_cds, "palmer": fit_palmer, "neh": fit_neh}
    # Validate init_type up front, before any filesystem side effects.
    if init_type not in heuristic_fitters and init_type not in ("heuristics", "random"):
        raise ValueError("Invalid initialization type")

    # Create the folder if it doesn't exist
    if data_folder_name is None:
        data_folder_name = f"ftdataset_{str(np.datetime64('now'))}"
    data_path = os.path.join(data_folder_location, data_folder_name)
    os.makedirs(data_path, exist_ok=True)

    # Create the np memmap files for schedules and makespans
    nb_jobs = pfsp_instance.shape[0]
    schedules = np.memmap(os.path.join(data_path, "schedules.bin"), dtype=np.int32, mode='w+', shape=(nb_samples, nb_jobs))
    makespans = np.memmap(os.path.join(data_path, "makespans.bin"), dtype=np.int32, mode='w+', shape=(nb_samples,))

    # Save the pfsp instance as a numpy file
    np.save(os.path.join(data_path, "pfsp_instance.npy"), pfsp_instance)

    # Create a metadata dictionary and save it as a json file
    metadata_dict = {
        "nb_samples": nb_samples,
        "nb_jobs": nb_jobs,
        "nb_machines": pfsp_instance.shape[1],
        "init_type": init_type,
        "data_path": data_path,
        "seed": seed,
        "date_time": str(np.datetime64('now'))
    }

    with open(os.path.join(data_path, "metadata.json"), "w") as f:
        json.dump(metadata_dict, f, indent=4)

    if init_type in heuristic_fitters:
        # Single heuristic: one seed sample, the rest perturbations.
        fill_segment(0, nb_samples, heuristic_fitters[init_type])

    elif init_type == "heuristics":
        # Split samples roughly evenly across the three heuristics; the last
        # segment absorbs the remainder. BUGFIX: the original index juggling
        # (`i += 1` after each for-loop) raised NameError whenever
        # nb_samples // 3 <= 1, because the loop variable was never bound.
        third = nb_samples // 3
        start = 0
        for fitter, size in zip(
            (fit_cds, fit_palmer, fit_neh),
            (third, third, nb_samples - 2 * third),
        ):
            if size > 0:
                fill_segment(start, size, fitter)
                start += size

    elif init_type == "random":
        # Independent uniform random permutations.
        for row in range(nb_samples):
            schedule = np.random.permutation(nb_jobs)
            schedules[row] = schedule
            makespans[row] = evaluate_makespan(pfsp_instance, schedule)

    # Persist the memmaps to disk before handing them back.
    schedules.flush()
    makespans.flush()
    return schedules, makespans