File size: 3,258 Bytes
a555ead
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
import sys

def parse_log(log_path):
    with open(log_path, 'r') as f:
        content = f.read()

    # Task order as defined in script
    tasks = ["yelp", "amazon", "mnli", "cb", "copa", "qqp", "rte", "imdb", "sst2", "dbpedia", "agnews", "yahoo", "multirc", "boolq", "wic"]
    
    # Split content into segments, each ending with a "predict metrics" block
    # We look for "predict_exact_match_for_CL" as an anchor for each step evaluation
    segments = re.split(r'predict_exact_match_for_CL\s+=\s+\d+\.\d+', content)
    
    # The last segment might be empty if there's nothing after the final metrics
    if not segments[-1].strip():
        segments = segments[:-1]
    
    # We expect 15 evaluations
    print(f"Found {len(segments)} evaluation segments in {log_path}")
    
    matrix = []
    for seg in segments:
        scores = []
        for task in tasks:
            match = re.search(fr'predict_exact_match_for_{task}\s+=\s+(\d+\.\d+|\d+)', seg)
            if match:
                scores.append(float(match.group(1)))
            else:
                scores.append(0.0)
        if any(s > 0 for s in scores): # Only add if we found at least one score
            matrix.append(scores)

    # If it's the final evaluation only (like in some logs), we might have only 1 segment
    return matrix, tasks

def calculate_metrics(matrix):
    if not matrix:
        return None
    
    task_num = len(matrix[0])
    # final_scores is the last row provided (if the run ended at 15, it's matrix[14])
    final_scores = matrix[-1]
    AP = sum(final_scores) / task_num
    
    # Forgetting: max(history) - final
    fgt_list = []
    for t_idx in range(task_num - 1):
        history = [row[t_idx] for row in matrix]
        best = max(history)
        final = final_scores[t_idx]
        fgt_list.append(best - final)
    
    Fgt = sum(fgt_list) / len(fgt_list) if fgt_list else 0.0
    
    # User's definition of forgetting in markdown: Final - Initial?
    # Let's calculate that too just in case
    fgt_user_list = []
    for t_idx in range(task_num - 1):
        initial = matrix[t_idx][t_idx] if t_idx < len(matrix) else 0.0
        final = final_scores[t_idx]
        fgt_user_list.append(final - initial)
    Fgt_user = sum(fgt_user_list) / len(fgt_user_list) if fgt_user_list else 0.0

    return {
        "AP": AP,
        "Fgt (Best-Final)": Fgt,
        "Fgt_user (Final-Initial)": Fgt_user,
        "Final Scores": final_scores
    }

log_v10 = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/t5_small_improve/improve_gainlora_v10.log"
matrix, tasks = parse_log(log_v10)
metrics = calculate_metrics(matrix)
print("--- V10 Metrics ---")
if metrics:
    print(f"AP (EM): {metrics['AP']:.4f}")
    print(f"Fgt (Best-Final): {metrics['Fgt (Best-Final)']:.4f}")
    print(f"Fgt (Final-Initial): {metrics['Fgt_user (Final-Initial)']:.4f}")
    print("Final Scores:", metrics['Final Scores'])
else:
    print("Failed to parse matrix for V10")

# Also do V5 for comparison
log_v5_dir = "/Users/nnminh322/Desktop/personal/Continual/improve_gainlora/logs/t5_small_improve/gen_script_long_order3_t5_small_specroute_v5/"
# We need to find the log file inside v5 dir.
# It's likely in outputs/ or similar.