File size: 1,983 Bytes
002bd9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
import json
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt


def extract_dicts(line):
    """Return every brace-delimited substring of *line*, in order.

    A match starts at a ``{`` and runs up to the first ``}`` that follows,
    so nested braces are not balanced: the match ends at the first closing
    brace encountered. Returns an empty list when nothing matches.
    """
    return [found.group(0) for found in re.finditer(r"{[^}]*}", line)]


def _collect_eval_records(log_lines):
    """Parse the eval-loss dicts found in *log_lines* into a list of dicts.

    Each brace-delimited fragment that starts with ``{'eval_loss'`` is
    converted to JSON (single quotes swapped for double quotes) and parsed.
    Fragments that fail to parse, or that carry no ``"epoch"`` key, are
    reported on stdout and skipped.

    The ``"epoch"`` value of every kept record is made cumulative: whenever
    the raw epoch drops below the previous raw epoch (the counter has reset),
    the previous raw epoch is added to a running offset that is applied to
    all later records.
    """
    eval_loss_pattern = r"{'eval_loss'"
    records = []

    last_epoch = 0
    epoch_offset = 0

    for line in log_lines:
        for d in extract_dicts(line):
            if not re.search(eval_loss_pattern, d):
                continue

            # The fragment uses single quotes; json only accepts double quotes.
            d = d.replace("'", '"')
            try:
                parsed_dict = json.loads(d)
            except json.JSONDecodeError:
                print(f"Could not parse line: {d}")
                continue

            # Without an epoch the record cannot be placed on the x axis.
            if "epoch" not in parsed_dict:
                print(f"No epoch in line, skipping: {d}")
                continue

            current_epoch = parsed_dict["epoch"]

            # Check if the epoch number has reset
            if current_epoch < last_epoch:
                epoch_offset += last_epoch

            parsed_dict["epoch"] += epoch_offset
            # Track the raw (un-offset) epoch so the next reset is detected.
            last_epoch = current_epoch

            records.append(parsed_dict)

    return records


def main(log_file_path, output_dir=None):
    """Extract eval-loss records from a log file, save them, and plot them.

    Writes ``output.csv`` (all extracted records) and ``plot.png``
    (``eval_loss`` against cumulative ``epoch``) into *output_dir*, then
    shows the plot.

    Args:
        log_file_path: Path of the log file to read.
        output_dir: Directory for the output files. Defaults to the directory
            containing *log_file_path*. It is created if it does not exist.

    If the log contains no usable eval-loss records, the CSV is still written
    but the plot is skipped and a message is printed instead.
    """
    if output_dir is None:
        output_dir = os.path.dirname(log_file_path)

    # dirname() yields "" for a bare file name, which makedirs rejects.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Stream the log line by line instead of loading it all into memory.
    with open(log_file_path, "r") as log_file:
        extracted_dicts = _collect_eval_records(log_file)

    # Save extracted data to a CSV file
    df = pd.DataFrame(extracted_dicts)
    csv_file_path = os.path.join(output_dir, "output.csv")
    df.to_csv(csv_file_path, index=False)

    # An empty frame has no "epoch"/"eval_loss" columns to plot.
    if df.empty:
        print(f"No eval_loss entries found in {log_file_path}; skipping plot.")
        return

    # Plot "eval_loss" values
    plt.plot(df["epoch"], df["eval_loss"])
    plt.xlabel("Epoch")
    plt.ylabel("Eval Loss")
    plt.title("Eval Loss vs Epoch")
    plt.grid()

    plot_file_path = os.path.join(output_dir, "plot.png")
    plt.savefig(plot_file_path)
    plt.show()


if __name__ == "__main__":
    # Command line: <log_file_path> [output_dir]
    # Exit with a usage message rather than an IndexError traceback when the
    # required log file argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <log_file_path> [output_dir]")

    log_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        output_dir = sys.argv[2]
        main(log_file_path, output_dir)
    else:
        # Let main() fall back to the log file's own directory.
        main(log_file_path)