|
|
|
|
|
import pandas as pd |
|
|
import os.path as osp |
|
|
import os |
|
|
import sys |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
file_path = "exp-grit-all.csv.xlsx" |
|
|
if len(sys.argv) > 1: |
|
|
file_path = sys.argv[1] |
|
|
print(f"file_path: {file_path}") |
|
|
|
|
|
|
|
|
sheets_to_merge = [ |
|
|
"scores-ciderd.csv.xlsx", |
|
|
"scores-meteor.csv.xlsx", |
|
|
"scores-spice.csv.xlsx", |
|
|
"scores-bleu.csv.xlsx", |
|
|
"scores-rouge.csv.xlsx", |
|
|
"content.csv.xlsx", |
|
|
"clip.csv.xlsx", |
|
|
] |
|
|
|
|
|
|
|
|
merged_data = pd.read_excel(file_path, sheet_name=sheets_to_merge[0]) |
|
|
|
|
|
merged_data = merged_data.dropna(axis=1, how="all") |
|
|
merged_data.insert(loc=0, column="BaseLogPath", value=merged_data["LogPath"].apply(lambda x: osp.dirname(x))) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_and_check_df(merged_data, sheet, data, check="both"): |
|
|
|
|
|
merged_data = merged_data.merge( |
|
|
data, |
|
|
left_on=["BaseLogPath"], |
|
|
right_on=["BaseLogPath"], |
|
|
how="left", |
|
|
validate="1:1", |
|
|
indicator=True, |
|
|
suffixes=("", f"_{sheet}"), |
|
|
) |
|
|
|
|
|
if isinstance(check, str): |
|
|
check = [check] |
|
|
if not isinstance(check, list): |
|
|
raise ValueError(f"check should be str or list, but got {type(check)}") |
|
|
merge_check = merged_data["_merge"] == check[0] |
|
|
if len(check) > 1: |
|
|
for c in check[1:]: |
|
|
merge_check = merge_check | (merged_data["_merge"] == c) |
|
|
assert all(merge_check) |
|
|
print(f"merge check: {check} has passed.") |
|
|
merged_data = merged_data.drop(columns=["_merge"]) |
|
|
return merged_data |
|
|
|
|
|
|
|
|
for sheet in sheets_to_merge[1:]: |
|
|
try: |
|
|
data = pd.read_excel(file_path, sheet_name=sheet) |
|
|
|
|
|
data = data.dropna(axis=1, how="all") |
|
|
data.insert(loc=0, column="BaseLogPath", value=data["LogPath"].apply(lambda x: osp.dirname(x))) |
|
|
except Exception as e: |
|
|
print(f"Error: {e}") |
|
|
continue |
|
|
print(f"sheet: {sheet}, len: {len(data)}") |
|
|
|
|
|
if sheet == "clip.csv.xlsx": |
|
|
assert len(data) % 2 == 0 |
|
|
data_1 = data.iloc[: len(data) // 2] |
|
|
data_2 = data.iloc[len(data) // 2 :] |
|
|
merged_data = merge_and_check_df(merged_data, f"{sheet}_pred", data_1, check=["both", "left_only"]) |
|
|
merged_data = merge_and_check_df(merged_data, f"{sheet}_gt", data_2, check=["both", "left_only"]) |
|
|
else: |
|
|
merged_data = merge_and_check_df(merged_data, sheet, data, check="both") |
|
|
|
|
|
|
|
|
print(merged_data.head()) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
merged_data = merged_data.dropna(axis=1, how="all") |
|
|
|
|
|
|
|
|
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith(("Max", "Min"))] |
|
|
|
|
|
|
|
|
logpath_dataset_mask = merged_data.columns.str.startswith(("LogPath", "Dataset")) |
|
|
dataset_mask = merged_data.columns.str.startswith(("Dataset")) |
|
|
first_datset_index = np.where(dataset_mask == True)[0][0] |
|
|
logpath_dataset_mask[first_datset_index] = False |
|
|
merged_data = merged_data.loc[:, ~logpath_dataset_mask] |
|
|
|
|
|
|
|
|
|
|
|
def remove_suffix(value): |
|
|
if isinstance(value, str): |
|
|
if "+/-" in value: |
|
|
return value.split("+/-")[0].strip() |
|
|
elif "±" in value: |
|
|
return value.split("±")[0].strip() |
|
|
return value |
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
merged_data = merged_data.map(remove_suffix) |
|
|
except Exception as e: |
|
|
merged_data = merged_data.applymap(remove_suffix) |
|
|
|
|
|
for column_name in merged_data.columns: |
|
|
try: |
|
|
|
|
|
merged_data[column_name] = pd.to_numeric(merged_data[column_name]) |
|
|
except ValueError as e: |
|
|
continue |
|
|
|
|
|
|
|
|
file_path_wo_ext, ext = osp.splitext(file_path) |
|
|
save_file_path = f"{file_path_wo_ext}.merged{ext}" |
|
|
with pd.ExcelWriter(save_file_path, engine="openpyxl", mode="w") as writer: |
|
|
merged_data.to_excel(writer, index=False) |
|
|
|
|
|
print(f"save_file_path: {save_file_path}") |
|
|
|
|
|
|
|
|
|