gpt2_base_prefix_682k / scripts /data /parallel_utils.py
augustocsc's picture
GPT-2 Base trained on prefix dataset (682K)
5faf2eb verified
# parallel_utils.py
from joblib import Parallel, delayed
import pandas as pd
from .data_augmentation import generate_expression_instructions, generate_expression_instruction
def augment_dataframe_parallel(df, expression_col="expression", n_jobs=-1):
"""
Parallelized augmentation of a DataFrame with math expressions.
Args:
df (pd.DataFrame): DataFrame with a column of expressions.
expression_col (str): Name of the column with expressions.
n_jobs (int): Number of parallel workers (-1 = all cores).
Returns:
pd.DataFrame: Original DataFrame with new instruction columns.
"""
expressions = df[expression_col].tolist()
augmented_data = Parallel(n_jobs=n_jobs)(
delayed(generate_expression_instruction)(expr) for expr in expressions
)
df_aug = df.copy()
df_aug["instruction"] = [item["instriction"] for item in augmented_data]
#df_aug["simple"] = [item["Simple_Instruct"] for item in augmented_data]
#df_aug["key_value"] = [item["Key_Value"] for item in augmented_data]
#df_aug["delimiter"] = [item["Delimiter_Based"] for item in augmented_data]
#df_aug["minimalist"] = [item["Minimalist"] for item in augmented_data]
return df_aug