File size: 4,680 Bytes
002bd9b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# %%
import pandas as pd
import os.path as osp
import os
import sys
import numpy as np
# Define the Excel file path
file_path = "exp-grit-all.csv.xlsx"
if len(sys.argv) > 1:
file_path = sys.argv[1]
print(f"file_path: {file_path}")
# List the sheet names you want to merge
sheets_to_merge = [
"scores-ciderd.csv.xlsx",
"scores-meteor.csv.xlsx",
"scores-spice.csv.xlsx",
"scores-bleu.csv.xlsx",
"scores-rouge.csv.xlsx",
"content.csv.xlsx",
"clip.csv.xlsx",
]
# Read the first sheet and store it in a DataFrame
merged_data = pd.read_excel(file_path, sheet_name=sheets_to_merge[0])
# Remove empty columns
merged_data = merged_data.dropna(axis=1, how="all")
merged_data.insert(loc=0, column="BaseLogPath", value=merged_data["LogPath"].apply(lambda x: osp.dirname(x)))
# %%
# Iterate through the rest of the sheets and append them to the merged_data DataFrame
def merge_and_check_df(merged_data, sheet, data, check="both"):
# NOTE: check the docs
merged_data = merged_data.merge(
data,
left_on=["BaseLogPath"],
right_on=["BaseLogPath"],
how="left",
validate="1:1",
indicator=True,
suffixes=("", f"_{sheet}"),
)
# Check indicator column is all "both", and remove the column
if isinstance(check, str):
check = [check]
if not isinstance(check, list):
raise ValueError(f"check should be str or list, but got {type(check)}")
merge_check = merged_data["_merge"] == check[0]
if len(check) > 1:
for c in check[1:]:
merge_check = merge_check | (merged_data["_merge"] == c)
assert all(merge_check)
print(f"merge check: {check} has passed.")
merged_data = merged_data.drop(columns=["_merge"])
return merged_data
for sheet in sheets_to_merge[1:]:
try:
data = pd.read_excel(file_path, sheet_name=sheet)
# Remove empty columns
data = data.dropna(axis=1, how="all")
data.insert(loc=0, column="BaseLogPath", value=data["LogPath"].apply(lambda x: osp.dirname(x)))
except Exception as e:
print(f"Error: {e}")
continue
print(f"sheet: {sheet}, len: {len(data)}")
if sheet == "clip.csv.xlsx":
assert len(data) % 2 == 0
data_1 = data.iloc[: len(data) // 2]
data_2 = data.iloc[len(data) // 2 :]
merged_data = merge_and_check_df(merged_data, f"{sheet}_pred", data_1, check=["both", "left_only"])
merged_data = merge_and_check_df(merged_data, f"{sheet}_gt", data_2, check=["both", "left_only"])
else:
merged_data = merge_and_check_df(merged_data, sheet, data, check="both")
# Preview the merged DataFrame
print(merged_data.head())
# %%
# Save the merged DataFrame to a new Excel file
# merged_data.to_excel('merged_data.xlsx', index=False)
# Save the merged DataFrame to a new Excel file with the header (column names)
# with pd.ExcelWriter("merged_data.xlsx", engine="openpyxl", mode="w") as writer:
# merged_data.to_excel(writer, index=False)
# %%
# Remove empty columns
merged_data = merged_data.dropna(axis=1, how="all")
# Remove columns whose names start with "Max" or "Min"
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith(("Max", "Min"))]
# Remove all the columns starts with "LogPath" or "Dataset"
logpath_dataset_mask = merged_data.columns.str.startswith(("LogPath", "Dataset"))
dataset_mask = merged_data.columns.str.startswith(("Dataset"))
first_datset_index = np.where(dataset_mask == True)[0][0]
logpath_dataset_mask[first_datset_index] = False
merged_data = merged_data.loc[:, ~logpath_dataset_mask]
# Remove the suffix
def remove_suffix(value):
if isinstance(value, str):
if "+/-" in value:
return value.split("+/-")[0].strip()
elif "±" in value:
return value.split("±")[0].strip()
return value
try:
# NOTE: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
merged_data = merged_data.map(remove_suffix)
except Exception as e:
merged_data = merged_data.applymap(remove_suffix)
for column_name in merged_data.columns:
try:
# if the first obj can be convert to number, we convert all of them
merged_data[column_name] = pd.to_numeric(merged_data[column_name])
except ValueError as e:
continue
# Save the merged DataFrame to a new Excel file with the header (column names)
file_path_wo_ext, ext = osp.splitext(file_path)
save_file_path = f"{file_path_wo_ext}.merged{ext}"
with pd.ExcelWriter(save_file_path, engine="openpyxl", mode="w") as writer:
merged_data.to_excel(writer, index=False)
print(f"save_file_path: {save_file_path}")
# %%
|