Spaces:

NMR-CeNT-UW
/

ShimNet-development

Sleeping

ShimNet-development / shimnet /generators.py

Marek Bukowicki

remove torchdata from dependencies

5fd2416 2 months ago

49.2 kB

	from enum import Enum
	from copy import deepcopy
	from typing import Optional
	import numpy as np
	import pandas as pd
	import torch
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from abc import ABC, abstractmethod

	def random_uniform(min_value, max_value, generator=None):
	return (min_value + torch.rand(1, generator=generator) * (max_value - min_value)).item()
	random_value = random_uniform

	def random_loguniform(min_value, max_value, generator=None):
	return (min_value * torch.exp(torch.rand(1, generator=generator) * (torch.log(torch.tensor(max_value)) - torch.log(torch.tensor(min_value))))).item()

	def random_uniform_vector(min_value, max_value, size, generator=None):
	return min_value + torch.rand(size, generator=generator) * (max_value - min_value)

	def random_loguniform_vector(min_value, max_value, size, generator=None):
	return min_value * torch.exp(torch.rand(size, generator=generator) * (torch.log(torch.tensor(max_value)) - torch.log(torch.tensor(min_value))))

	def spectrum_from_peaks_data(peaks_parameters: dict \| list, frq_frq:torch.Tensor, relative_frequency=False):

	if isinstance(peaks_parameters, dict):
	peaks_parameters = [peaks_parameters]

	spectrum = torch.zeros((1, frq_frq.shape[0]))
	for peak_params in peaks_parameters:
	# extract parameters
	if relative_frequency:
	tff_lin = frq_frq[0] + peak_params["tff_relative"]*(frq_frq[1]-frq_frq[0])
	else:
	tff_lin = peak_params["tff_lin"]
	twf_lin = peak_params["twf_lin"]
	thf_lin = peak_params["thf_lin"]
	trf_lin = peak_params["trf_lin"]

	lwf_lin = twf_lin
	lhf_lin = thf_lin * (1. - trf_lin)
	gwf_lin = twf_lin
	gdf_lin = gwf_lin / torch.tensor(2.).log().mul(2.).sqrt()
	ghf_lin = thf_lin * trf_lin
	# calculate Lorenz peaks contriubutions
	lsf_linfrq = lwf_lin[:, None] 2 / (lwf_lin[:, None] 2 + (frq_frq - tff_lin[:, None]) ** 2) * lhf_lin[:, None]
	# calculate Gaussian peaks contriubutions
	gsf_linfrq = torch.exp(-(frq_frq - tff_lin[:, None]) 2 / gdf_lin[:, None] 2 / 2.) * ghf_lin[:, None]
	tsf_linfrq = lsf_linfrq + gsf_linfrq
	# sum peaks contriubutions
	spectrum += tsf_linfrq.sum(0, keepdim = True)
	return spectrum

	calculate_theoretical_spectrum = spectrum_from_peaks_data # Alias for backward compatibility

	pascal_triangle = [(1,), (1,1), (1,2,1), (1,3,3,1), (1,4,6,4,1), (1,5,10,10,5,1), (1,6,15,20,15,6,1), (1,7, 21,35,35,21,7,1)]
	normalized_pascal_triangle = [torch.tensor(x)/sum(x) for x in pascal_triangle]

	def pascal_multiplicity(multiplicity):
	intensities = normalized_pascal_triangle[multiplicity-1]
	n_peaks = len(intensities)
	shifts = torch.arange(n_peaks)-((n_peaks-1)/2)
	return shifts, intensities

	def double_multiplicity(multiplicity1, multiplicity2, j1=1, j2=1):
	shifts1, intensities1 = pascal_multiplicity(multiplicity1)
	shifts2, intensities2 = pascal_multiplicity(multiplicity2)

	shifts = (j1shifts1.reshape(-1,1) + j2shifts2.reshape(1,-1)).flatten()
	intensities = (intensities1.reshape(-1,1) * intensities2.reshape(1,-1)).flatten()
	return shifts, intensities

	def generate_multiplet_parameters(multiplicity, tff_lin, thf_lin, twf_lin, trf_lin, j1, j2):
	shifts, intensities = double_multiplicity(multiplicity[0], multiplicity[1], j1, j2)
	n_peaks = len(shifts)

	return {
	"tff_lin": shifts + tff_lin,
	"thf_lin": intensities * thf_lin,
	"twf_lin": torch.full((n_peaks,), twf_lin),
	"trf_lin": torch.full((n_peaks,), trf_lin),
	}

	def value_to_index(values, table):
	span = table[-1] - table[0]
	indices = ((values - table[0])/span * (len(table)-1)) #.round().type(torch.int64)
	return indices

	def generate_theoretical_spectrum(
	number_of_signals_min, number_of_signals_max,
	spectrum_width_min, spectrum_width_max,
	relative_width_min, relative_width_max,
	tff_min, tff_max,
	thf_min, thf_max,
	trf_min, trf_max,
	relative_height_min, relative_height_max,
	multiplicity_j1_min, multiplicity_j1_max,
	multiplicity_j2_min, multiplicity_j2_max,
	atom_groups_data,
	frq_frq,
	generator=None
	):
	number_of_signals = torch.randint(number_of_signals_min, number_of_signals_max+1, [], generator=generator)
	atom_group_indices = torch.randint(0, len(atom_groups_data), [number_of_signals], generator=generator)
	width_spectrum = random_loguniform(spectrum_width_min, spectrum_width_max, generator=generator)
	height_spectrum = random_loguniform(thf_min, thf_max, generator=generator)

	peak_parameters_data = []
	theoretical_spectrum = None
	for atom_group_index in atom_group_indices:
	relative_intensity, multiplicity1, multiplicity2 = atom_groups_data[atom_group_index]
	position = random_value(tff_min, tff_max, generator=generator)
	j1 = random_value(multiplicity_j1_min, multiplicity_j1_max, generator=generator)
	j2 = random_value(multiplicity_j2_min, multiplicity_j2_max, generator=generator)
	width = width_spectrum*random_loguniform(relative_width_min, relative_width_max, generator=generator)
	height = height_spectrumrelative_intensityrandom_loguniform(relative_height_min, relative_height_max, generator=generator)
	gaussian_contribution = random_value(trf_min, trf_max, generator=generator)

	peaks_parameters = generate_multiplet_parameters(multiplicity=(multiplicity1, multiplicity2), tff_lin=position, thf_lin=height, twf_lin= width, trf_lin= gaussian_contribution, j1=j1, j2=j2)
	peaks_parameters["tff_relative"] = value_to_index(peaks_parameters["tff_lin"], frq_frq)
	peak_parameters_data.append(peaks_parameters)
	spectrum_contribution = calculate_theoretical_spectrum(peaks_parameters, frq_frq)
	if theoretical_spectrum is None:
	theoretical_spectrum = spectrum_contribution
	else:
	theoretical_spectrum += spectrum_contribution
	return theoretical_spectrum, peak_parameters_data


	def theoretical_generator(
	atom_groups_data,
	pixels=2048, frq_step=11160.7142857 / 32768,
	number_of_signals_min=1, number_of_signals_max=8,
	spectrum_width_min=0.2, spectrum_width_max=1,
	relative_width_min=1, relative_width_max=2,
	relative_height_min=1, relative_height_max=1,
	relative_frequency_min=-0.4, relative_frequency_max=0.4,
	thf_min=1/16, thf_max=16,
	trf_min=0, trf_max=1,
	multiplicity_j1_min=0, multiplicity_j1_max=15,
	multiplicity_j2_min=0, multiplicity_j2_max=15,
	):
	tff_min = relative_frequency_min * pixels * frq_step
	tff_max = relative_frequency_max * pixels * frq_step
	frq_frq = torch.arange(-pixels // 2, pixels // 2) * frq_step

	while True:
	yield generate_theoretical_spectrum(
	number_of_signals_min=number_of_signals_min,
	number_of_signals_max=number_of_signals_max,
	spectrum_width_min=spectrum_width_min,
	spectrum_width_max=spectrum_width_max,
	relative_width_min=relative_width_min,
	relative_width_max=relative_width_max,
	relative_height_min=relative_height_min,
	relative_height_max=relative_height_max,
	tff_min=tff_min, tff_max=tff_max,
	thf_min=thf_min, thf_max=thf_max,
	trf_min=trf_min, trf_max=trf_max,
	multiplicity_j1_min=multiplicity_j1_min,
	multiplicity_j1_max=multiplicity_j1_max,
	multiplicity_j2_min=multiplicity_j2_min,
	multiplicity_j2_max=multiplicity_j2_max,
	atom_groups_data=atom_groups_data,
	frq_frq=frq_frq
	)

	class ResponseLibrary:
	def __init__(self, response_files, normalize=True):
	self.data = [torch.load(f, map_location='cpu', weights_only=True).flatten(0,-4) for f in response_files]
	if normalize:
	self.data = [data/torch.sum(data, dim=(-1,), keepdim=True) for data in self.data]
	lengths = [len(data) for data in self.data]
	self.start_indices = torch.cumsum(torch.tensor([0] + lengths[:-1]), 0)
	self.total_length = sum(lengths)

	def __getitem__(self, idx):
	if idx >= self.total_length:
	raise ValueError(f'index {idx} out of range')
	tensor_index = torch.searchsorted(self.start_indices, idx, right=True) - 1
	return self.data[tensor_index][idx - self.start_indices[tensor_index]]

	def __len__(self):
	return self.total_length

	@property
	def max_response_length(self):
	return max([data.shape[-1] for data in self.data])

	def generator(
	theoretical_generator_params,
	response_function_library,
	response_function_stretch_min=0.5,
	response_function_stretch_max=2.0,
	response_function_noise=0.,
	spectrum_noise_min=0.,
	spectrum_noise_max=1/64,
	include_spectrum_data=False,
	include_peak_mask=False,
	include_response_function=False,
	flip_response_function=False

	):
	for theoretical_spectrum, theoretical_spectrum_data in theoretical_generator(**theoretical_generator_params):
	# get response function
	response_function = response_function_library[torch.randint(0, len(response_function_library), [1])][0]
	# stretch response function
	padding_size = (response_function.shape[-1] - 1)//2
	padding_size = round(random_loguniform(response_function_stretch_min, response_function_stretch_max)padding_size) #torch.randint(round(padding_sizeresponse_function_stretch_min), round(paddingSize*response_function_stretch_max), [1]).item()
	response_function = torch.nn.functional.interpolate(response_function, size=2*padding_size+1, mode='linear')
	response_function /= response_function.sum() # normalize sum of response function to 1
	# add noise to response function
	response_function += torch.randn(response_function.shape) * response_function_noise
	response_function /= response_function.sum() # normalize sum of response function to 1
	if flip_response_function and (torch.rand(1).item() < 0.5):
	response_function = response_function.flip(-1)
	# disturbed spectrum
	disturbed_spectrum = torch.nn.functional.conv1d(theoretical_spectrum, response_function, padding=padding_size)
	# add noise
	noised_spectrum = disturbed_spectrum + torch.randn(disturbed_spectrum.shape) * random_value(spectrum_noise_min, spectrum_noise_max)

	out = {
	# 'response_function': response_function,
	'theoretical_spectrum': theoretical_spectrum,
	'disturbed_spectrum': disturbed_spectrum,
	'noised_spectrum': noised_spectrum,
	}
	if include_response_function:
	out['response_function'] = response_function
	if include_spectrum_data:
	out["theoretical_spectrum_data"] = theoretical_spectrum_data
	if include_peak_mask:
	all_peaks_rel = torch.cat([peak_data["tff_relative"] for peak_data in theoretical_spectrum_data])
	peaks_indices = all_peaks_rel.round().type(torch.int64)
	out["peaks_mask"] = torch.scatter(torch.zeros(out["theoretical_spectrum"].shape[1]), 0, peaks_indices, 1.).unsqueeze(0)

	yield out


	def collate_with_spectrum_data(batch):
	tensor_keys = set(batch[0].keys())
	tensor_keys.remove('theoretical_spectrum_data')
	out = {k: torch.stack([item[k] for item in batch]) for k in tensor_keys}
	out["theoretical_spectrum_data"] = [item["theoretical_spectrum_data"] for item in batch]
	return out

	class RngGetter:
	def __init__(self, seed=42):
	self.rng = torch.Generator()
	if seed is not None:
	self.rng.manual_seed(seed)
	else:
	self.rng.seed()

	def get_rng(self, seed=None):
	# Use provided seed or fall back to instance RNG
	if seed is not None:
	rng = torch.Generator()
	rng.manual_seed(seed)
	else:
	rng = self.rng
	return rng


	class PeaksParameterDataGenerator:
	"""
	Generates peak parameter data for NMR multiplets.

	This class is responsible for generating the parameters that describe individual peaks
	in an NMR spectrum (frequencies, heights, widths, Gaussian/Lorentzian ratio).
	"""
	def __init__(self,
	tff_min=None, #may be assigned after initialization
	tff_max=None, #may be assigned after initialization
	atom_groups_data_file=None,
	number_of_signals_min=1,
	number_of_signals_max=8,
	relative_frequency_min=-0.4,
	relative_frequency_max=0.4,
	spectrum_width_min=0.2,
	spectrum_width_max=1,
	relative_width_min=1,
	relative_width_max=2,
	relative_height_min=1,
	relative_height_max=1,
	thf_min=1/16,
	thf_max=16,
	trf_min=0,
	trf_max=1,
	multiplicity_j1_min=0,
	multiplicity_j1_max=15,
	multiplicity_j2_min=0,
	multiplicity_j2_max=15,
	seed=42
	):
	# Read atom_groups_data from file
	if atom_groups_data_file is None:
	self.atom_groups_data = np.ones((1,3), dtype=int)
	else:
	self.atom_groups_data = np.atleast_2d(np.loadtxt(atom_groups_data_file, usecols=(1,2,3), dtype=int))

	self.tff_min = tff_min
	self.tff_max = tff_max
	self.number_of_signals_min = number_of_signals_min
	self.number_of_signals_max = number_of_signals_max
	self.relative_frequency_min = relative_frequency_min
	self.relative_frequency_max = relative_frequency_max

	self.spectrum_width_min = spectrum_width_min
	self.spectrum_width_max = spectrum_width_max
	self.relative_width_min = relative_width_min
	self.relative_width_max = relative_width_max
	self.relative_height_min = relative_height_min
	self.relative_height_max = relative_height_max
	self.thf_min = thf_min
	self.thf_max = thf_max
	self.trf_min = trf_min
	self.trf_max = trf_max
	self.multiplicity_j1_min = multiplicity_j1_min
	self.multiplicity_j1_max = multiplicity_j1_max
	self.multiplicity_j2_min = multiplicity_j2_min
	self.multiplicity_j2_max = multiplicity_j2_max

	self.rng_getter = RngGetter(seed=seed)

	def set_frq_range(self, frq_min, frq_max):
	frq_amplitude = frq_max - frq_min
	frq_center = (frq_max + frq_min) / 2
	self.tff_min = frq_center + frq_amplitude * self.relative_frequency_min
	self.tff_max = frq_center + frq_amplitude * self.relative_frequency_max

	def __call__(self, seed=None):
	"""
	Generate peak parameters data.

	Args:
	seed: Optional seed for reproducibility

	Returns:
	List of dicts containing peak parameters (without tff_relative)
	"""
	if self.tff_min is None or self.tff_max is None:
	raise ValueError("tff_min and tff_max must be set before calling the generator.")

	rng = self.rng_getter.get_rng(seed=seed)

	number_of_signals = torch.randint(
	self.number_of_signals_min,
	self.number_of_signals_max + 1,
	[],
	generator=rng
	)
	atom_group_indices = torch.randint(
	0,
	len(self.atom_groups_data),
	[number_of_signals],
	generator=rng
	)
	width_spectrum = random_loguniform(
	self.spectrum_width_min,
	self.spectrum_width_max,
	generator=rng
	)
	height_spectrum = random_loguniform(
	self.thf_min,
	self.thf_max,
	generator=rng
	)

	peaks_parameters_data = []
	for atom_group_index in atom_group_indices:
	relative_intensity, multiplicity1, multiplicity2 = self.atom_groups_data[atom_group_index]
	position = random_value(self.tff_min, self.tff_max, generator=rng)
	j1 = random_value(self.multiplicity_j1_min, self.multiplicity_j1_max, generator=rng)
	j2 = random_value(self.multiplicity_j2_min, self.multiplicity_j2_max, generator=rng)
	width = width_spectrum * random_loguniform(
	self.relative_width_min,
	self.relative_width_max,
	generator=rng
	)
	height = height_spectrum * relative_intensity * random_loguniform(
	self.relative_height_min,
	self.relative_height_max,
	generator=rng
	)
	gaussian_contribution = random_value(self.trf_min, self.trf_max, generator=rng)

	peak_parameters = generate_multiplet_parameters(
	multiplicity=(multiplicity1, multiplicity2),
	tff_lin=position,
	thf_lin=height,
	twf_lin=width,
	trf_lin=gaussian_contribution,
	j1=j1,
	j2=j2
	)
	peaks_parameters_data.append(peak_parameters)

	return peaks_parameters_data

	class TheoreticalMultipletSpectraGenerator:
	"""
	Generates theoretical NMR multiplet spectra.

	This class combines peak parameter generation with spectrum calculation.
	It can accept either a PeaksParameterDataGenerator instance or parameters to create one.
	"""
	def __init__(self,
	peaks_parameter_generator,
	pixels=2048,
	frq_step=11160.7142857 / 32768,
	relative_frequency_min=-0.4,
	relative_frequency_max=0.4,
	frequency_min=None, #if None, the 0 will be in the center of spectrum
	frequency_max=None,
	include_tff_relative=False,
	seed=42
	):

	# Spectrum-level parameters
	self.pixels = pixels
	self.frq_step = frq_step
	self.relative_frequency_min = relative_frequency_min
	self.relative_frequency_max = relative_frequency_max
	self.include_tff_relative = include_tff_relative
	# Frequency axis
	self.frq_frq, frq_min, frq_max = self._frequency_axis_from_parameters(frq_step, pixels, frequency_min, frequency_max)

	self.peaks_parameter_generator = peaks_parameter_generator
	self.peaks_parameter_generator.set_frq_range(frq_min, frq_max)

	# self.rng_getter = RngGetter(seed=seed) # self.rng_getter.get_rng(seed=seed) to get random generator

	def _frequency_axis_from_parameters(self, frq_step, pixels, frequency_min, frequency_max):
	"""frq_step is never None, pixels, frequency_min or frequency_max can be None
	"""
	# Option 1: from pixels and frq_step
	if pixels is not None:
	assert (frequency_min is None) or (frequency_max is None)
	if (frequency_min is None) and (frequency_max is None): # if both are None, center at 0
	frequency_min = -(pixels // 2) * frq_step
	elif frequency_min is None: # frequency_max is not None, use it to calculate frequency_min
	frequency_min = frequency_max - pixels * frq_step
	frq_frq = torch.arange(0, pixels) * frq_step + frequency_min
	# Option 2: from frequency_min and frequency_max
	elif (frequency_min is not None) and (frequency_max is not None):
	pixels = round((frequency_max - frequency_min) / frq_step)
	frq_frq = torch.arange(0, pixels) * frq_step + frequency_min
	else:
	raise ValueError("Insufficient parameters to determine frequency axis.")
	return frq_frq, frq_frq[0], frq_frq[-1]


	def __call__(self, seed=None):
	"""
	Generate a theoretical spectrum.

	Args:
	seed: Optional seed for reproducibility

	Returns:
	Tuple of (spectrum, dict with spectrum_data and frq_frq)
	"""
	# Generate peak parameters (peaks_parameter_generator has its own RngGetter)
	peaks_parameters_data = self.peaks_parameter_generator(seed=seed)

	# Add tff_relative if requested
	if self.include_tff_relative:
	for peak_params in peaks_parameters_data:
	peak_params["tff_relative"] = value_to_index(peak_params["tff_lin"], self.frq_frq)

	# Create spectrum from peaks
	spectrum = spectrum_from_peaks_data(peaks_parameters_data, self.frq_frq)

	return spectrum, {"spectrum_data": peaks_parameters_data, "frq_frq": self.frq_frq}


	class PeaksParametersNames(Enum):
	"""Enum for standardized peak parameter names."""
	position_hz ="tff_lin"
	height = "thf_lin"
	halfwidth_hz = "twf_lin"
	gaussian_fraction = "trf_lin"

	@classmethod
	def keys(cls):
	return [member.value for member in cls]

	@classmethod
	def values(cls):
	return [member.name for member in cls]

	class PeaksParametersParser:
	"""class to convert peaks parameters from `{"width_hz": [...], "height": ..., ...}` format to `{"twf_lin": torch.tensor([...]), "thf_lin": ..., ...}` format."""
	def __init__(self,
	alias_position_hz = None,
	alias_height = None,
	alias_width_hz = None,
	alias_gaussian_fraction = None,
	default_position_hz = None,
	default_height = None,
	default_width_hz = None,
	default_gaussian_fraction = 0.,
	convert_width_to_halfwidth = True
	):
	self.alias_position_hz = alias_position_hz if alias_position_hz is not None else "position_hz"
	self.alias_height = alias_height if alias_height is not None else "height"
	self.alias_width_hz = alias_width_hz if alias_width_hz is not None else "width_hz"
	self.alias_gaussian_fraction = alias_gaussian_fraction if alias_gaussian_fraction is not None else "gaussian_fraction"
	self.default_position_hz = default_position_hz
	self.default_height = default_height
	self.default_width_hz = default_width_hz
	self.default_gaussian_fraction = default_gaussian_fraction
	self.convert_width_to_halfwidth = convert_width_to_halfwidth

	def transform_single_peak(self, peak: dict) -> dict:
	parsed_peak = {
	PeaksParametersNames.position_hz.value: peak.get(self.alias_position_hz, self.default_position_hz),
	PeaksParametersNames.height.value: peak.get(self.alias_height, self.default_height),
	PeaksParametersNames.halfwidth_hz.value: (0.5 if self.convert_width_to_halfwidth else 1.) * peak.get(self.alias_width_hz, self.default_width_hz),
	PeaksParametersNames.gaussian_fraction.value: peak.get(self.alias_gaussian_fraction, self.default_gaussian_fraction),
	}
	# Validate and convert other peak parameters
	for k, v in parsed_peak.items():
	if v is None:
	raise ValueError(f"Peak parameter '{k}' is None.")
	parsed_peak[k] = torch.atleast_1d(v.float() if isinstance(v, torch.Tensor) else torch.tensor(v, dtype=torch.float32))
	return parsed_peak

	def transform(self, spectrum_peaks: list[dict]) -> list[dict]:
	parsed_peaks = []
	for peak in spectrum_peaks:
	parsed_peaks.append(self.transform_single_peak(peak))
	return parsed_peaks

	def csv_file_to_multiplets_dict(file_path: str) -> list[dict]:
	peaks_data = pd.read_csv(file_path)
	multiplets = {k: v.drop(columns="multiplet_name").to_dict(orient='list') for k, v in peaks_data.groupby("multiplet_name")}
	return multiplets

	def combine_multiplets(multiplets_list: list[dict]) -> dict:
	composed_multiplets = {}
	for multiplets in multiplets_list:
	for k, v in multiplets.items():
	if not k in composed_multiplets:
	composed_multiplets[k] = v
	else:
	composed_multiplets[k].extend(v)
	return composed_multiplets

	class MultipletsLibrary:
	def __init__(self, csv_files_paths: list[str], peak_data_parser: PeaksParametersParser = None, return_name=False):
	self.csv_files_paths = csv_files_paths
	self.multiplets_data = {}
	self.peak_data_parser = peak_data_parser
	for file_path in csv_files_paths:
	self.multiplets_data.update(self._get_multiplet_data_from_file(file_path))

	self.names = sorted(self.multiplets_data.keys())
	self.return_name = return_name

	def _get_multiplet_data_from_file(self, file_path: str) -> dict:
	multiplets = csv_file_to_multiplets_dict(file_path) # dict[dict]
	multiplets_out = {}
	for k, v in multiplets.items():
	multiplets_out[f"{file_path}/{k}"] = self.peak_data_parser.transform([v])[0] if self.peak_data_parser else v
	return multiplets_out

	def get_by_name(self, name: str) -> dict:
	return self.multiplets_data.get(name, None)

	def __getitem__(self, idx: int) -> dict:
	name = self.names[idx]
	multiplet_data = deepcopy(self.multiplets_data[name])
	if self.return_name:
	return name, multiplet_data
	return multiplet_data

	def __len__(self):
	return len(self.multiplets_data)

	class SectraLibrary(MultipletsLibrary):
	def _get_multiplet_data_from_file(self, file_path: str) -> dict:
	multiplets = csv_file_to_multiplets_dict(file_path) # dict[dict]
	combined_multiplet = combine_multiplets(multiplets.values()) # dict
	return {f"{file_path}": self.peak_data_parser.transform([combined_multiplet])[0]}

	class MultipletDataFromMultipletsLibrary:
	def __init__(self,
	multiplets_library,
	tff_min=None, #may be assigned after initialization if the original peak positions are not used
	tff_max=None, #may be assigned after initialization if the original peak positions are not used
	use_original_peak_position=True,
	number_of_signals_min=None,
	number_of_signals_max=None,
	relative_frequency_min=None,
	relative_frequency_max=None,
	spectrum_width_factor_min=1,
	spectrum_width_factor_max=1,
	multiplet_width_factor_min=1,
	multiplet_width_factor_max=1,
	multiplet_width_additive_min=0,
	multiplet_width_additive_max=0,
	spectrum_height_factor_min=1,
	spectrum_height_factor_max=1,
	multiplet_height_factor_min=1,
	multiplet_height_factor_max=1,
	multiplet_height_additive_min=0,
	multiplet_height_additive_max=0,
	position_shift_min=0,
	position_shift_max=0,
	gaussian_fraction_change_min=None,
	gaussian_fraction_change_max=None,
	gaussian_fraction_change_additive_min=0.,
	gaussian_fraction_change_additive_max=0.,
	seed=42
	):

	if (number_of_signals_min is None) != (number_of_signals_max is None):
	raise ValueError("Both number_of_signals_min and number_of_signals_max should be provided or both should be None.")

	self.multiplets_library = multiplets_library
	self.rng_getter = RngGetter(seed=seed)
	self.tff_min = tff_min
	self.tff_max = tff_max
	self.relative_frequency_min = relative_frequency_min
	self.relative_frequency_max = relative_frequency_max
	self.use_original_peak_position = use_original_peak_position
	self.number_of_signals_min = number_of_signals_min
	self.number_of_signals_max = number_of_signals_max
	self.spectrum_width_factor_min = spectrum_width_factor_min
	self.spectrum_width_factor_max = spectrum_width_factor_max
	self.multiplet_width_factor_min = multiplet_width_factor_min
	self.multiplet_width_factor_max = multiplet_width_factor_max
	self.multiplet_width_additive_min = multiplet_width_additive_min
	self.multiplet_width_additive_max = multiplet_width_additive_max
	self.spectrum_height_factor_min = spectrum_height_factor_min
	self.spectrum_height_factor_max = spectrum_height_factor_max
	self.multiplet_height_factor_min = multiplet_height_factor_min
	self.multiplet_height_factor_max = multiplet_height_factor_max
	self.multiplet_height_additive_min = multiplet_height_additive_min
	self.multiplet_height_additive_max = multiplet_height_additive_max
	self.position_shift_min = position_shift_min
	self.position_shift_max = position_shift_max
	self.gaussian_fraction_change_min = gaussian_fraction_change_min
	self.gaussian_fraction_change_max = gaussian_fraction_change_max
	self.gaussian_fraction_change_additive_min = gaussian_fraction_change_additive_min
	self.gaussian_fraction_change_additive_max = gaussian_fraction_change_additive_max

	def set_frq_range(self, frq_min, frq_max):
	frq_amplitude = frq_max - frq_min
	frq_center = (frq_max + frq_min) / 2
	self.tff_min = frq_center + frq_amplitude * self.relative_frequency_min
	self.tff_max = frq_center + frq_amplitude * self.relative_frequency_max


	def __call__(self, seed=None):
	if (not self.use_original_peak_position) and (self.tff_min is None or self.tff_max is None):
	raise ValueError("for use_original_peak_position=False, tff_min and tff_max must be set before calling the generator.")

	rng = self.rng_getter.get_rng(seed=seed)

	# select number of signals and their indices
	if self.number_of_signals_min is None:
	number_of_signals = len(self.multiplets_library)
	multiplets_indices = list(range(len(self.multiplets_library)))
	else:
	number_of_signals = torch.randint(
	self.number_of_signals_min,
	self.number_of_signals_max + 1,
	[],
	generator=rng
	)

	multiplets_indices = torch.randint(
	0,
	len(self.multiplets_library),
	[number_of_signals],
	generator=rng
	)

	# spectrum width and height factors
	spectrum_width_factor = random_loguniform(
	self.spectrum_width_factor_min,
	self.spectrum_width_factor_max,
	generator=rng
	)

	spectrum_height_factor = random_loguniform(
	self.spectrum_height_factor_min,
	self.spectrum_height_factor_max,
	generator=rng
	)

	# get and modify peaks parameters data
	peaks_parameters_data = [self.multiplets_library[idx] for idx in multiplets_indices]
	for peak_parameters in peaks_parameters_data:

	# position
	if not self.use_original_peak_position:
	new_position_center = random_value(self.tff_min, self.tff_max, generator=rng)
	peak_parameters["tff_lin"] += new_position_center - torch.mean(peak_parameters["tff_lin"])
	else:
	position_shift = random_value(self.position_shift_min, self.position_shift_max, generator=rng)
	peak_parameters["tff_lin"] += position_shift

	# width
	multiplet_width_factor = random_loguniform(
	self.multiplet_width_factor_min,
	self.multiplet_width_factor_max,
	generator=rng
	)
	multiplet_width_additive = random_uniform(
	self.multiplet_width_additive_min,
	self.multiplet_width_additive_max,
	generator=rng
	)
	peak_parameters["twf_lin"] = peak_parameters["twf_lin"] * spectrum_width_factor * multiplet_width_factor + multiplet_width_additive

	# height
	multiplet_height_factor = random_loguniform(
	self.multiplet_height_factor_min,
	self.multiplet_height_factor_max,
	generator=rng
	)
	multiplet_height_additive = random_uniform(
	self.multiplet_height_additive_min,
	self.multiplet_height_additive_max,
	generator=rng
	)
	peak_parameters["thf_lin"] = peak_parameters["thf_lin"] * spectrum_height_factor * multiplet_height_factor + multiplet_height_additive

	# gaussian contribution
	if self.gaussian_fraction_change_min is not None:
	gaussian_contribution_shift = random_value(self.gaussian_fraction_change_min, self.gaussian_fraction_change_max, generator=rng)
	gaussian_contribution_additive = random_value(self.gaussian_fraction_change_additive_min, self.gaussian_fraction_change_additive_max, generator=rng)
	gaussian_contribution_shift += gaussian_contribution_additive
	peak_parameters["trf_lin"] = torch.clip(peak_parameters["trf_lin"] + gaussian_contribution_shift, 0., 1.)

	return peaks_parameters_data


	class ResponseGenerator:
	def __init__(self, response_function_library, response_function_stretch_min=1., response_function_stretch_max=1., pad_to=None,
	response_function_noise=0.0, flip_response_function=False, seed=42):
	self.response_function_library = response_function_library
	self.response_function_stretch_min = response_function_stretch_min
	self.response_function_stretch_max = response_function_stretch_max
	self.pad_to = pad_to
	self.response_function_noise = response_function_noise
	self.flip_response_function = flip_response_function
	self.rng_getter = RngGetter(seed=seed) # self.rng_getter.get_rng(seed=seed) to get random generator

	def __call__(self, seed=None):
	rng = self.rng_getter.get_rng(seed=seed)

	response_function = self.response_function_library[torch.randint(0, len(self.response_function_library), [1], generator=rng)][0]
	padding_size = (response_function.shape[-1] - 1)//2
	padding_size = round(random_loguniform(self.response_function_stretch_min, self.response_function_stretch_max, generator=rng)*padding_size)
	response_function = torch.nn.functional.interpolate(response_function, size=2*padding_size+1, mode='linear')
	response_function /= response_function.sum()
	response_function += torch.randn(response_function.shape, generator=rng) * self.response_function_noise
	response_function /= response_function.sum()
	if self.flip_response_function and (torch.rand(1, generator=rng).item() < 0.5):
	response_function = response_function.flip(-1)
	if self.pad_to is not None:
	pad_size_left = (self.pad_to - response_function.shape[-1]) // 2
	pad_size_right = self.pad_to - response_function.shape[-1] - pad_size_left
	response_function = torch.nn.functional.pad(response_function, (pad_size_left, pad_size_right))
	return response_function

	class NoiseGenerator:
	def __init__(self, spectrum_noise_min=0., spectrum_noise_max=1/64, seed=42):
	self.spectrum_noise_min = spectrum_noise_min
	self.spectrum_noise_max = spectrum_noise_max
	self.rng_getter = RngGetter(seed=seed) # self.rng_getter.get_rng(seed=seed) to get random generator

	def __call__(self, disturbed_spectrum, seed=None):
	rng = self.rng_getter.get_rng(seed=seed)
	return disturbed_spectrum + torch.randn(disturbed_spectrum.shape, generator=rng) * random_value(self.spectrum_noise_min, self.spectrum_noise_max, generator=rng)

	class BaseGenerator(ABC):
	"""
	Single-threaded base generator.

	For this workload, single-threaded execution is typically faster because:
	- Thread creation/synchronization overhead > computation time
	- Python GIL contention during object creation
	- Memory allocator contention when multiple threads allocate tensors
	- CPU cache thrashing across cores
	- Small per-thread workload doesn't amortize thread overhead
	"""
	def __init__(self, batch_size=64, seed=None):
	self.batch_size = batch_size
	self.seed = seed

	def set_seed(self, seed):
	self.seed = seed

	@abstractmethod
	def _generate_element(self, seed):
	pass

	def __iter__(self):
	rng = torch.Generator()
	if self.seed is not None:
	rng.manual_seed(self.seed)
	else:
	rng.seed()

	while True:
	batch = []
	# Generate unique seeds for each element in the batch
	if self.seed is not None:
	element_seeds = [torch.randint(0, 2**31, (1,), generator=rng).item() for _ in range(self.batch_size)]
	else:
	element_seeds = [None] * self.batch_size

	# Single-threaded sequential generation
	for i in range(self.batch_size):
	batch.append(self._generate_element(element_seeds[i]))

	yield self.collate_fn(batch)

	@abstractmethod
	def collate_fn(self, batch):
	pass


	class BaseGeneratorMultithread(ABC):
	    """
	    Multithreaded base generator (backup option).

	    Use only if profiling shows benefit for your specific use case
	    (e.g., very large/slow generation functions, I/O-bound operations).

	    Subclasses provide ``_generate_element`` (build one sample, called from
	    worker threads) and ``collate_fn`` (merge a list of samples into a batch).

	    Args:
	        batch_size: number of elements generated per batch.
	        num_workers: maximum number of worker threads of the pool.
	        seed: master seed; ``None`` means elements receive ``None`` as seed.
	        ordered_batch: if True, elements are collated in submission order;
	            otherwise in completion order (order may vary between runs).
	    """

	    def __init__(self, batch_size=64, num_workers=4, seed=None, ordered_batch=False):
	        self.batch_size = batch_size
	        self.num_workers = num_workers
	        self.seed = seed
	        self.ordered_batch = ordered_batch

	    def set_seed(self, seed):
	        """Replace the master seed; ``None`` disables per-element seeding."""
	        self.seed = seed

	    def set_ordered_batch(self, ordered_batch):
	        """Choose between submission-order and completion-order collation."""
	        self.ordered_batch = ordered_batch

	    @abstractmethod
	    def _generate_element(self, seed):
	        """Build one batch element; ``seed`` is an int or ``None``."""

	    def __iter__(self):
	        """Yield collated batches forever, generating elements in a thread pool.

	        With a master seed set, every element receives its own integer seed
	        in ``[0, 2**31)`` drawn from a generator seeded with ``self.seed``;
	        otherwise every element receives ``None``.
	        """
	        rng = torch.Generator()
	        if self.seed is not None:
	            rng.manual_seed(self.seed)
	        else:
	            rng.seed()

	        # A single pool serves the whole iteration, so worker threads are
	        # reused instead of being spawned and joined again for every batch.
	        # The pool is shut down when the iterator is closed or finalized.
	        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
	            while True:
	                # Generate unique seeds for each element in the batch
	                if self.seed is not None:
	                    element_seeds = [
	                        torch.randint(0, 2**31, (1,), generator=rng).item()
	                        for _ in range(self.batch_size)
	                    ]
	                else:
	                    element_seeds = [None] * self.batch_size

	                futures = [
	                    executor.submit(self._generate_element, element_seed)
	                    for element_seed in element_seeds
	                ]
	                # Submission order keeps batches reproducible; completion
	                # order is faster but the element order may vary.
	                finished = futures if self.ordered_batch else as_completed(futures)
	                batch = [future.result() for future in finished]

	                yield self.collate_fn(batch)

	    @abstractmethod
	    def collate_fn(self, batch):
	        """Merge a list of elements into the object yielded as one batch."""

	class Generator(BaseGenerator):
	    """Generate batches of clean, disturbed and noised spectra.

	    For every element a clean spectrum is produced by
	    ``clean_spectra_generator``, convolved (``conv1d``) with a response
	    function from ``response_generator``, optionally normalized, and finally
	    passed through ``noise_generator``.

	    Args:
	        clean_spectra_generator: callable ``(seed=...)`` returning
	            ``(clean_spectrum, extra_clean_data)``; ``extra_clean_data`` may
	            be ``None`` or a dict with ``'spectrum_data'`` and ``'frq_frq'``.
	        response_generator: callable ``(seed=...)`` returning the conv kernel.
	        noise_generator: callable ``(spectrum, seed=...)`` returning the
	            noised spectrum.
	        batch_size: number of elements per batch.
	        include_spectrum_data: add ``'theoretical_spectrum_data'`` and
	            ``'frq_frq'`` to the output (when extra data is available).
	        include_peak_mask: add ``'peaks_mask'`` marking the bins closest to
	            each peak position (when extra data is available).
	        include_response_function: add ``'response_function'`` to the output.
	        input_normalization_height: if not None, clean and disturbed spectra
	            are rescaled so the disturbed spectrum maximum equals this value.
	        seed: master seed forwarded to ``BaseGenerator``.
	    """

	    def __init__(self, clean_spectra_generator, response_generator, noise_generator, batch_size=64,
	                 include_spectrum_data=False, include_peak_mask=False, include_response_function=False, input_normalization_height=None, seed=None):
	        super().__init__(batch_size=batch_size, seed=seed)
	        self.clean_spectra_generator = clean_spectra_generator
	        self.response_generator = response_generator
	        self.noise_generator = noise_generator
	        self.include_spectrum_data = include_spectrum_data
	        self.include_peak_mask = include_peak_mask
	        self.include_response_function = include_response_function
	        self.input_normalization_height = input_normalization_height

	    def _generate_element(self, seed):
	        """Build one sample dict.

	        Always contains ``'theoretical_spectrum'``, ``'disturbed_spectrum'``
	        and ``'noised_spectrum'``; the optional entries depend on the
	        ``include_*`` flags given to the constructor.
	        """
	        # Generate different seeds for each generator from the provided seed
	        if seed is not None:
	            rng = torch.Generator()
	            rng.manual_seed(seed)
	            clean_seed, response_seed, noise_seed = [
	                torch.randint(0, 2**31, (1,), generator=rng).item() for _ in range(3)
	            ]
	        else:
	            clean_seed = response_seed = noise_seed = None

	        clean_spectrum, extra_clean_data = self.clean_spectra_generator(seed=clean_seed)
	        response_function = self.response_generator(seed=response_seed)
	        # (k - 1) // 2 padding keeps the spectrum length for odd kernel sizes k
	        padding_size = (response_function.shape[-1] - 1)//2
	        disturbed_spectrum = torch.nn.functional.conv1d(clean_spectrum, response_function, padding=padding_size)

	        if self.input_normalization_height is not None:
	            max_val = torch.max(disturbed_spectrum)
	            clean_spectrum = clean_spectrum / max_val * self.input_normalization_height
	            disturbed_spectrum = disturbed_spectrum / max_val * self.input_normalization_height

	        # noise after normalization to better control noise level
	        noised_spectrum = self.noise_generator(disturbed_spectrum, seed=noise_seed)

	        out = {
	            'theoretical_spectrum': clean_spectrum,
	            'disturbed_spectrum': disturbed_spectrum,
	            'noised_spectrum': noised_spectrum,
	        }
	        # Both optional extras need the generator's extra data; they are
	        # skipped consistently when the clean generator provides none.
	        has_extra_data = extra_clean_data is not None
	        if self.include_spectrum_data and has_extra_data:
	            out['theoretical_spectrum_data'] = extra_clean_data['spectrum_data']
	            out['frq_frq'] = extra_clean_data['frq_frq']
	        if self.include_peak_mask and has_extra_data:
	            out["peaks_mask"] = self._peaks_mask(extra_clean_data['spectrum_data'], clean_spectrum.shape[1])
	        if self.include_response_function:
	            out['response_function'] = response_function
	        return out

	    @staticmethod
	    def _peaks_mask(spectrum_data, spectrum_length):
	        """Return a ``(1, spectrum_length)`` mask with 1. at every peak bin.

	        Peak positions are the ``"tff_relative"`` entries rounded to the
	        nearest bin index. Peaks whose index falls outside the spectrum are
	        ignored instead of raising an out-of-bounds indexing error, and an
	        empty peak list yields an all-zero mask.
	        """
	        mask = torch.zeros(spectrum_length)
	        positions = [peak_data["tff_relative"] for peak_data in spectrum_data]
	        if positions:
	            peaks_indices = torch.cat(positions).round().type(torch.int64)
	            inside = (peaks_indices >= 0) & (peaks_indices < spectrum_length)
	            mask[peaks_indices[inside]] = 1.
	        return mask.unsqueeze(0)

	    def collate_fn(self, batch):
	        """Stack tensor entries along a new first axis; keep the rest as lists.

	        ``'theoretical_spectrum_data'`` and ``'frq_frq'`` are collected into
	        plain Python lists (one item per element), every other key is stacked
	        with ``torch.stack``. Key presence is taken from the first element.
	        """
	        list_keys = ('theoretical_spectrum_data', 'frq_frq')
	        # Iterate the dict itself (not a set) so the output key order is stable.
	        out = {
	            key: torch.stack([item[key] for item in batch])
	            for key in batch[0]
	            if key not in list_keys
	        }
	        for key in list_keys:
	            if key in batch[0]:
	                out[key] = [item[key] for item in batch]
	        return out

	class PeaksParametersFromSinglets:
	    """Sample peak parameters from tables of measured singlets.

	    The CSV files are read with ``pandas.read_csv`` and concatenated. Every
	    call draws a random subset of rows and returns their parameters, either
	    taken from the table (optionally perturbed) or drawn at random.

	    For each quantity ``X`` (position, width, height, gaussian fraction):
	    if ``use_original_X`` is True the table column is used and perturbed with
	    the ``*_factor_*`` (multiplicative, width/height only) and ``*_change_*``
	    (additive) ranges; otherwise the value is drawn from ``[X_min, X_max]``
	    (uniform for position and gaussian fraction, log-uniform for width and
	    height).

	    Args:
	        singlets_files: CSV files (anything accepted by ``pandas.read_csv``).
	            Columns ``position_hz``, ``width_hz``, ``height`` and
	            ``gaussian_fraction`` are read when the matching
	            ``use_original_*`` flag is True.
	        number_of_signals_min, number_of_signals_max: bounds passed to
	            ``torch.randint`` as ``low``/``high`` (``high`` is exclusive and
	            additionally capped at ``number of rows + 1``).
	        position_hz_min, position_hz_max: range for random positions; must be
	            set here or through ``set_frq_range`` before positions are drawn
	            with ``use_original_position=False``.
	        relative_frequency_min, relative_frequency_max: fractions of the
	            frequency span used by ``set_frq_range`` to derive the position
	            range.
	        convert_width_to_halfwidth: if True, the original widths are divided
	            by 2.
	        seed: seed handed to ``RngGetter``.
	    """

	    def __init__(self,
	                 singlets_files: list[str],
	                 number_of_signals_min: int = 5,
	                 number_of_signals_max: int = 20,
	                 use_original_position: bool = True,
	                 position_hz_min: Optional[float] = None,
	                 position_hz_max: Optional[float] = None,
	                 position_hz_change_min: float = 0.0,
	                 position_hz_change_max: float = 0.0,
	                 relative_frequency_min: float = -0.4,  # used only if position_hz_min/max are None
	                 relative_frequency_max: float = 0.4,
	                 use_original_width: bool = True,
	                 width_hz_min: float = 0.2,
	                 width_hz_max: float = 2.0,
	                 width_factor_min: float = 1.0,
	                 width_factor_max: float = 1.0,
	                 width_hz_change_min: float = 0.0,
	                 width_hz_change_max: float = 0.0,
	                 convert_width_to_halfwidth: bool = True,
	                 use_original_height: bool = True,
	                 height_min: float = 0.1,
	                 height_max: float = 10.0,
	                 height_factor_min: float = 1.0,
	                 height_factor_max: float = 1.0,
	                 height_change_min: float = 0.0,
	                 height_change_max: float = 0.0,
	                 use_original_gaussian_fraction: bool = True,
	                 gaussian_fraction_min: float = 0.0,
	                 gaussian_fraction_max: float = 1.0,
	                 gaussian_fraction_change_min: float = 0.0,
	                 gaussian_fraction_change_max: float = 0.0,
	                 seed=42
	                 ):
	        self.peaks_rows = pd.concat([pd.read_csv(f) for f in singlets_files], ignore_index=True)

	        # number of signals
	        self.number_of_signals_min = number_of_signals_min
	        self.number_of_signals_max = number_of_signals_max
	        # position
	        self.use_original_position = use_original_position
	        self.position_hz_min = position_hz_min
	        self.position_hz_max = position_hz_max
	        self.position_hz_change_min = position_hz_change_min
	        self.position_hz_change_max = position_hz_change_max
	        self.relative_frequency_min = relative_frequency_min
	        self.relative_frequency_max = relative_frequency_max
	        # width
	        self.use_original_width = use_original_width
	        self.width_hz_min = width_hz_min
	        self.width_hz_max = width_hz_max
	        self.width_factor_min = width_factor_min
	        self.width_factor_max = width_factor_max
	        self.width_hz_change_min = width_hz_change_min
	        self.width_hz_change_max = width_hz_change_max
	        self.convert_width_to_halfwidth = convert_width_to_halfwidth  # if True, the original widths will be divided by 2
	        # height
	        self.use_original_height = use_original_height
	        self.height_min = height_min
	        self.height_max = height_max
	        self.height_factor_min = height_factor_min
	        self.height_factor_max = height_factor_max
	        self.height_change_min = height_change_min
	        self.height_change_max = height_change_max
	        # gaussian fraction
	        self.use_original_gaussian_fraction = use_original_gaussian_fraction
	        self.gaussian_fraction_min = gaussian_fraction_min
	        self.gaussian_fraction_max = gaussian_fraction_max
	        self.gaussian_fraction_change_min = gaussian_fraction_change_min
	        self.gaussian_fraction_change_max = gaussian_fraction_change_max

	        self.rng_getter = RngGetter(seed=seed)

	    def set_frq_range(self, frq_min, frq_max):
	        """Derive ``position_hz_min/max`` from a frequency window.

	        The bounds are placed at ``relative_frequency_min/max`` times the
	        window span, measured from the window center.
	        """
	        frq_amplitude = frq_max - frq_min
	        frq_center = (frq_max + frq_min) / 2
	        self.position_hz_min = frq_center + frq_amplitude * self.relative_frequency_min
	        self.position_hz_max = frq_center + frq_amplitude * self.relative_frequency_max

	    def __call__(self, seed=None) -> list[dict]:
	        """Return a one-element list holding a dict of peak-parameter tensors.

	        The dict is keyed by the ``PeaksParametersNames`` values for position,
	        half-width, height and gaussian fraction; every tensor has one entry
	        per selected peak. All random draws use the generator obtained from
	        ``self.rng_getter.get_rng(seed=seed)``, so a given ``seed`` fully
	        determines the result.
	        """
	        rng = self.rng_getter.get_rng(seed=seed)

	        number_of_signals = torch.randint(
	            low=self.number_of_signals_min,
	            high=min(self.number_of_signals_max, len(self.peaks_rows) + 1),
	            size=[],
	            generator=rng
	        ).item()
	        # An explicit seed keeps selecting the same rows as before; without
	        # one the row selection is tied to ``rng`` rather than to NumPy's
	        # global random state.
	        if seed is not None:
	            sampling_seed = seed
	        else:
	            sampling_seed = torch.randint(0, 2**31, (1,), generator=rng).item()
	        selected_peaks = self.peaks_rows.sample(n=number_of_signals, random_state=sampling_seed)
	        n_peaks = len(selected_peaks)

	        def column(name):
	            # Table column of the selected rows as a float32 tensor.
	            return torch.tensor(selected_peaks[name].values, dtype=torch.float32)

	        def uniform(low, high):
	            # One uniform draw per selected peak from the call's generator.
	            return random_uniform_vector(low, high, size=n_peaks, generator=rng)

	        def loguniform(low, high):
	            # One log-uniform draw per selected peak from the call's generator.
	            return random_loguniform_vector(low, high, size=n_peaks, generator=rng)

	        multiplet_data = {}
	        # position
	        if self.use_original_position:
	            position = column("position_hz") + uniform(self.position_hz_change_min, self.position_hz_change_max)
	        else:
	            position = uniform(self.position_hz_min, self.position_hz_max)
	        multiplet_data[PeaksParametersNames.position_hz.value] = position
	        # width
	        if self.use_original_width:
	            width_scale = 0.5 if self.convert_width_to_halfwidth else 1.
	            halfwidth = (
	                width_scale
	                * column("width_hz")
	                * uniform(self.width_factor_min, self.width_factor_max)
	                + uniform(self.width_hz_change_min, self.width_hz_change_max)
	            )
	        else:
	            halfwidth = loguniform(self.width_hz_min, self.width_hz_max)
	        multiplet_data[PeaksParametersNames.halfwidth_hz.value] = halfwidth
	        # height
	        if self.use_original_height:
	            height = (
	                column("height")
	                * uniform(self.height_factor_min, self.height_factor_max)
	                + uniform(self.height_change_min, self.height_change_max)
	            )
	        else:
	            height = loguniform(self.height_min, self.height_max)
	        multiplet_data[PeaksParametersNames.height.value] = height
	        # gaussian fraction
	        if self.use_original_gaussian_fraction:
	            # perturbed fractions are clamped back into the valid [0, 1] range
	            gaussian_fraction = torch.clamp(
	                column("gaussian_fraction")
	                + uniform(self.gaussian_fraction_change_min, self.gaussian_fraction_change_max),
	                0.0,
	                1.0,
	            )
	        else:
	            gaussian_fraction = uniform(self.gaussian_fraction_min, self.gaussian_fraction_max)
	        multiplet_data[PeaksParametersNames.gaussian_fraction.value] = gaussian_fraction

	        return [multiplet_data]