|
|
import os |
|
|
import re |
|
|
from transformers import AutoTokenizer |
|
|
import json |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
|
# Root directory that is scanned for per-setting evaluation folders.
parent_folder = "/mnt/lyc/wuxinrui/Qwen2.5-Math/evaluation"

# Naming scheme for setting folders, compiled once at import time.
# NOTE(review): the script entry point currently selects folders by exact
# name and does not consult this pattern -- confirm whether it is still needed.
pattern = re.compile(r"MODEL-.*-TIP-.*-STAGE-add-DATA-.*")

# Parallel lists filled by the script entry point: entry_list[i] is the full
# path of a selected setting folder and setting_names[i] is its folder name.
entry_list = []
setting_names = []

# Tokenizer used to measure answer lengths. It is loaded at import time from a
# local checkpoint directory, so importing this module requires that path to
# exist.
# NOTE(review): trust_remote_code=True lets the checkpoint supply its own
# tokenizer code -- only point this at checkpoints you trust.
tokenizer = AutoTokenizer.from_pretrained("/mnt/lyc/wuxinrui/LLaMA-Factory/TCMv4_8ratio/1_5B_TCMv4_8ratio_models/models", trust_remote_code=True)
|
|
|
|
|
def calculate_token_length(text, encoder=None):
    """Return the number of token ids produced for ``text``.

    Args:
        text: The string to tokenize.
        encoder: Optional callable that takes the text and returns a mapping
            with an ``'input_ids'`` sequence. When omitted, the module-level
            ``tokenizer`` is used, which is the original behaviour.

    Returns:
        The length of the ``'input_ids'`` sequence.
    """
    encode = tokenizer if encoder is None else encoder
    # The encoder is called with its default settings, so the count is
    # whatever those defaults produce.
    # NOTE(review): that presumably includes any special tokens the tokenizer
    # adds by default -- confirm.
    return len(encode(text)['input_ids'])
|
|
|
|
|
# Marker whose occurrences are counted in each answer.
_THINK_END = "</think>"

# CSV layouts. The column names double as the keys of the metric dicts, so the
# header and the row values cannot drift apart.
_TOKEN_LENGTH_COLUMNS = (
    "budget_length",
    "avg_token_length",
    "total_tokens",
    "avg_delta_length",
    "valid_delta_count",
    "avg_delta_length/BUDGET",
)
_FINAL_ANSWER_COLUMNS = (
    "budget_length",
    "single_final_answer_count",
    "multiple_final_answer_count",
    "total_answer_count",
    "multiple_final_answer_ratio",
)


def _summarize_answers(lines, budget_length, count_tokens):
    """Aggregate length and ``</think>`` statistics over JSONL answer lines.

    Args:
        lines: Iterable of JSON strings; each object must have a ``'code'``
            list whose first element is the answer text. Blank lines are
            skipped.
        budget_length: Token budget the answers were generated under.
        count_tokens: Callable mapping an answer text to its token count.

    Returns:
        A ``(token_length_data, final_answer_data)`` pair of dicts keyed like
        the CSV columns.
    """
    total_tokens = 0
    answer_count = 0
    single_final_answer_count = 0
    multiple_final_answer_count = 0
    total_delta_length = 0
    valid_delta_count = 0

    for line in lines:
        if not line.strip():
            # A trailing or blank line would otherwise crash json.loads.
            continue
        answer_text = json.loads(line)['code'][0]
        token_length = count_tokens(answer_text)
        total_tokens += token_length
        answer_count += 1

        # Only answers that stay under the budget contribute to the delta.
        if token_length < budget_length:
            total_delta_length += budget_length - token_length
            valid_delta_count += 1

        # Count real occurrences. Cutting out the first marker and searching
        # again can fabricate a marker across the splice point.
        marker_count = answer_text.count(_THINK_END)
        if marker_count == 1:
            single_final_answer_count += 1
        elif marker_count > 1:
            multiple_final_answer_count += 1

    avg_token_length = total_tokens / answer_count if answer_count > 0 else 0
    avg_delta_length = (
        total_delta_length / valid_delta_count if valid_delta_count > 0 else 0
    )

    token_length_data = {
        'budget_length': budget_length,
        'avg_token_length': avg_token_length,
        'total_tokens': total_tokens,
        'avg_delta_length': avg_delta_length,
        # A budget of 0 would otherwise raise ZeroDivisionError.
        'avg_delta_length/BUDGET': (
            avg_delta_length / budget_length if budget_length else 0
        ),
        'valid_delta_count': valid_delta_count,
    }
    final_answer_data = {
        'budget_length': budget_length,
        'single_final_answer_count': single_final_answer_count,
        'multiple_final_answer_count': multiple_final_answer_count,
        'total_answer_count': answer_count,
        'multiple_final_answer_ratio': (
            multiple_final_answer_count / answer_count if answer_count > 0 else 0
        ),
    }
    return token_length_data, final_answer_data


def _format_csv(columns, rows):
    """Render ``rows`` (dicts) as CSV text with ``columns`` as the header."""
    lines = [",".join(columns)]
    lines.extend(",".join(str(row[name]) for name in columns) for row in rows)
    return "\n".join(lines) + "\n"


def _save_line_plot(x_values, y_values, color, title, y_label, out_path):
    """Draw one budget-versus-metric line chart and save it to ``out_path``."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(x_values, y_values, marker='o', color=color, linewidth=2)
    ax.set_title(title)
    ax.set_xlabel('Budget Length')
    ax.set_ylabel(y_label)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(out_path, dpi=300)
    plt.close(fig)


def main():
    """Collect per-budget answer statistics and write CSV and PNG reports.

    For every selected setting folder, each numerically named sub-directory is
    treated as one token budget. Every file below it whose name does not
    contain "metrics" is read as JSONL and summarized. The reports are written
    into the setting folder itself.
    """
    for folder in os.listdir(parent_folder):
        # Selection is by exact folder name.
        if folder == "MODEL-DS_QW_1_5B-TIP-prompt_v2-STAGE-2-DATA-math500":
            entry_list.append(os.path.join(parent_folder, folder))
            setting_names.append(folder)

    for entry, setting_name in zip(entry_list, setting_names):
        token_length_metrics = []
        final_answer_metrics = []

        for sub_entry in os.listdir(entry):
            sub_path = os.path.join(entry, sub_entry)
            if not os.path.isdir(sub_path):
                continue
            # The directory name is the budget. Parse it once per directory
            # and skip directories that are not budgets instead of crashing.
            try:
                budget_length = int(sub_entry)
            except ValueError:
                print(f"Skipping non-numeric budget directory: {sub_path}")
                continue

            for root, _dirs, files in os.walk(sub_path):
                for file in files:
                    if "metrics" in file:
                        continue
                    cot_answer_path = os.path.join(root, file)
                    with open(cot_answer_path, "r", encoding="utf-8") as f:
                        token_length_data, final_answer_data = _summarize_answers(
                            f, budget_length, calculate_token_length
                        )
                    token_length_metrics.append(token_length_data)
                    final_answer_metrics.append(final_answer_data)

        token_length_metrics.sort(key=lambda x: x['budget_length'])
        final_answer_metrics.sort(key=lambda x: x['budget_length'])

        budget_lengths = [data['budget_length'] for data in token_length_metrics]
        avg_lengths = [data['avg_token_length'] for data in token_length_metrics]
        avg_deltas = [data['avg_delta_length'] for data in token_length_metrics]

        # The header previously ended in a stray "]" and, like the rows, had a
        # space before the last column. Both are removed so the file parses as
        # plain CSV.
        length_csv = os.path.join(entry, "token_length_metrics.csv")
        with open(length_csv, "w", encoding="utf-8") as f:
            f.write(_format_csv(_TOKEN_LENGTH_COLUMNS, token_length_metrics))

        pic_name = os.path.join(entry, "token_length_metrics.png")
        _save_line_plot(
            budget_lengths,
            avg_lengths,
            'blue',
            f"Average Token Length by Budget Length - {setting_name}",
            'Average Token Length',
            pic_name,
        )
        print(f"Saved figure as {pic_name}")

        delta_pic_name = os.path.join(entry, "delta_length_metrics.png")
        _save_line_plot(
            budget_lengths,
            avg_deltas,
            'green',
            f"Average Delta Length by Budget Length - {setting_name}",
            'Average Delta Length (budget - actual)',
            delta_pic_name,
        )
        print(f"Saved delta figure as {delta_pic_name}")

        final_answer_csv = os.path.join(entry, "final_answer_metrics.csv")
        with open(final_answer_csv, "w", encoding="utf-8") as f:
            f.write(_format_csv(_FINAL_ANSWER_COLUMNS, final_answer_metrics))


if __name__ == "__main__":
    main()