|
|
import os

import pandas as pd
|
|
|
|
|
def cut_data(
    file_path="datasets/train_first_half.parquet",
    first_path="datasets/train_1_in_4.parquet",
    second_path="datasets/train_2_in_4.parquet",
):
    """Split a parquet dataset into two halves and write each half to disk.

    The first ``len(data) // 2`` rows are written to *first_path*; the
    remaining rows (one more than the first half when the row count is odd)
    are written to *second_path*. Both outputs are written without the index.

    Args:
        file_path: Parquet file to read.
        first_path: Destination parquet file for the first half.
        second_path: Destination parquet file for the second half.
    """
    data = pd.read_parquet(file_path)

    # Preview the first problem by position. ``data['problem'][0]`` is a
    # label lookup and raises KeyError when the frame is empty or when the
    # stored index has no label 0.
    if len(data):
        print(data["problem"].iloc[0])

    half_size = len(data) // 2
    data_first_half = data.iloc[:half_size]
    data_second_half = data.iloc[half_size:]

    print(f"First half length: {len(data_first_half)}")
    print(f"Second half length: {len(data_second_half)}")

    data_first_half.to_parquet(first_path, index=False)
    data_second_half.to_parquet(second_path, index=False)
|
|
|
|
|
|
|
|
def formatted_data(
    file_path="datasets/train_first_half.parquet",
    prefix="Return your final response within \\boxed{}. ",
):
    """Prepend an instruction prefix to every problem and save a new file.

    Reads *file_path*, prepends *prefix* to each value of the ``problem``
    column, prints the first rewritten problem, and writes the result next to
    the input with ``_formatted`` inserted before the file extension
    (``x.parquet`` becomes ``x_formatted.parquet``). The output is written
    without the index.

    Args:
        file_path: Parquet file to read.
        prefix: Text prepended to every entry of the ``problem`` column.
    """
    data = pd.read_parquet(file_path)

    data["problem"] = data["problem"].apply(lambda text: prefix + text)

    # Positional access: ``data['problem'][0]`` is a label lookup and raises
    # KeyError when the frame is empty or its index has no label 0.
    if len(data):
        print(data["problem"].iloc[0])

    # Split off only the final extension. ``str.replace`` would rewrite every
    # ".parquet" occurrence in the path, and would leave the path unchanged
    # (so the input would be overwritten) when that suffix is absent.
    root, ext = os.path.splitext(file_path)
    target_path = f"{root}_formatted{ext}"

    data.to_parquet(target_path, index=False)
|
|
|
|
|
|
|
|
def visualize_data(
    file_path="datasets/train-00000-of-00001_formatted.parquet",
    n=5,
):
    """Print the first *n* rows of a parquet dataset.

    Args:
        file_path: Parquet file to read.
        n: Number of leading rows to print; the default of 5 matches a bare
            ``DataFrame.head()`` call.
    """
    data = pd.read_parquet(file_path)
    print(data.head(n))
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the pipeline steps in order: write the formatted copy, print a
    # preview of the formatted dataset, then split the source file in two.
    for step in (formatted_data, visualize_data, cut_data):
        step()